commit
3f8a7b55b8
@@ -0,0 +1,50 @@
INCLUDE(ExternalProject)

SET(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)

INCLUDE_DIRECTORIES(${NCCL_SOURCE_DIR}/src/extern_nccl/src)

if(WITH_DSO)
  # If we use DSO, we do not build nccl, just download the dependencies
  set(NCCL_BUILD_COMMAND "")
  set(NCCL_INSTALL_COMMAND "")
  set(NCCL_INSTALL_DIR "")
else()
  # otherwise, we build nccl and link it.
  set(NCCL_BUILD_COMMAND "make -j 8")
  set(NCCL_INSTALL_COMMAND "make install")
  SET(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
endif()

ExternalProject_Add(
  extern_nccl
  ${EXTERNAL_PROJECT_LOG_ARGS}
  GIT_REPOSITORY    "https://github.com/NVIDIA/nccl.git"
  GIT_TAG           "v1.3.4-1"
  PREFIX            "${NCCL_SOURCE_DIR}"
  UPDATE_COMMAND    ""
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
  INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
  INSTALL_DIR       "${NCCL_INSTALL_DIR}"
  TEST_COMMAND      ""
)

if (WITH_DSO)
  if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
    add_library(nccl STATIC ${dummyfile})
  else()
    add_library(nccl INTERFACE)
  endif()
else()
  ADD_LIBRARY(nccl STATIC IMPORTED GLOBAL)
  SET_PROPERTY(TARGET nccl PROPERTY IMPORTED_LOCATION
               ${NCCL_INSTALL_DIR}/lib/libnccl.a)
endif()

add_dependencies(nccl extern_nccl)

LIST(APPEND external_project_dependencies nccl)
@@ -0,0 +1,23 @@
# Executor Design Doc

## Motivation

We use an executor to do the runtime evaluation of a `ProgramDesc`.

## Overview

An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks, and each block contains the protobuf definitions of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent across different runs.

### What does the executor do?

It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.

### What does the executor NOT do?

It does not do runtime optimization, meaning it does not intelligently parse the dependencies of each op and choose which ones to run and in which order.

It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.

## Implementation

`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence, as sketched below. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
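As a rough illustration of that sequence, an executor's run loop could look like the following sketch. This is not the actual `executor.cc`; the accessor and helper names used here (`blocks`, `vars`, `ops`, `NewVar`, `OpRegistry::CreateOp`, `device_context_`) are assumptions for illustration only.

```cpp
// A minimal sketch, not the real implementation: instantiate the block's
// variables in the scope, then run its operators strictly in listed order.
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
  auto& block = pdesc.blocks(block_id);
  // Instantiate every variable declared in the block.
  for (auto& var : block.vars()) {
    scope->NewVar(var.name());  // assumed Scope API
  }
  // No dependency analysis, no reordering -- just run in sequence.
  for (auto& op_desc : block.ops()) {
    auto op = OpRegistry::CreateOp(op_desc);  // assumed operator factory
    op->Run(*scope, device_context_);
  }
}
```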
@@ -0,0 +1,232 @@
## Survey on Graph

Neural network frameworks often provide a symbolic API for users to write network topology conveniently. This doc mainly focuses on the symbolic APIs of the most popular neural network frameworks, and tries to find out how to parse a symbolic configuration into a portable file, such as protobuf or JSON.

### Mxnet

The core concept of the symbolic API is `Symbol`. Mxnet implements the `Symbol` class in C++ and exports it to Python through the C API. Please refer to the comments in Mxnet:

`Symbol` is a help class used to represent the operator node in Graph. `Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to the python front-end (while Graph is not) to enable quick test and deployment. Conceptually, a symbol is the final operation of a graph and thus includes all the information required (the graph) to evaluate its output value.

A simple network topology written with Symbol is as follows:

```python
def get_symbol(num_classes=10, **kwargs):
    data = mx.symbol.Variable('data')
    data = mx.symbol.Flatten(data=data)
    fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=128)
    act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type="relu")
    fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64)
    act2 = mx.symbol.Activation(data=fc2, name='relu2', act_type="relu")
    fc3 = mx.symbol.FullyConnected(data=act2, name='fc3', num_hidden=num_classes)
    mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
    return mlp
```

Variable here is actually a Symbol. Every basic Symbol corresponds to one Node, and every Node has its own NodeAttr. There is an op field in the NodeAttr class; when a Symbol represents a Variable (often input data), the op field is null.

Symbol contains a data member, `std::vector<NodeEntry> outputs`, and NodeEntry contains a pointer to Node. We can follow the Node pointers to recover the whole Graph.

A Symbol can also be saved to a JSON file.

Here is a detailed example:
```
>>> import mxnet as mx
>>> data = mx.symbol.Variable('data')
>>> print data.debug_str()
Variable:data

>>> data = mx.symbol.Flatten(data=data)
>>> print data.debug_str()
Symbol Outputs:
	output[0]=flatten0(0)
Variable:data
--------------------
Op:Flatten, Name=flatten0
Inputs:
	arg[0]=data(0) version=0

>>> fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
>>> print fc1.debug_str()
Symbol Outputs:
	output[0]=fc1(0)
Variable:data
--------------------
Op:Flatten, Name=flatten0
Inputs:
	arg[0]=data(0) version=0
Variable:fc1_weight
Variable:fc1_bias
--------------------
Op:FullyConnected, Name=fc1
Inputs:
	arg[0]=flatten0(0)
	arg[1]=fc1_weight(0) version=0
	arg[2]=fc1_bias(0) version=0
Attrs:
	num_hidden=128
```

### TensorFlow

The core concept of the symbolic API is `Tensor`. TensorFlow defines `Tensor` in Python. Please refer to the comments in TensorFlow:

A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session).

A simple example is as follows:

```python
# Build a dataflow graph.
c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
e = tf.matmul(c, d)

# Construct a `Session` to execute the graph.
sess = tf.Session()

# Execute the graph and store the value that `e` represents in `result`.
result = sess.run(e)
```

The main methods of `Tensor` are as follows:
```python
@property
def op(self):
  """The `Operation` that produces this tensor as an output."""
  return self._op

@property
def dtype(self):
  """The `DType` of elements in this tensor."""
  return self._dtype

@property
def graph(self):
  """The `Graph` that contains this tensor."""
  return self._op.graph

@property
def name(self):
  """The string name of this tensor."""
  if not self._op.name:
    raise ValueError("Operation was not named: %s" % self._op)
  return "%s:%d" % (self._op.name, self._value_index)

@property
def device(self):
  """The name of the device on which this tensor will be produced, or None."""
  return self._op.device
```

A Tensor can be passed to a session as a run target. A Tensor holds a reference to the whole Graph and tracks its data dependencies.

Here is a detailed example:
```
>>> import tensorflow as tf
>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
>>> print c.graph
<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
>>> print d.graph
<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
>>> e = tf.matmul(c, d)
>>> print e.graph
<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
```

### Dynet

The core concept of the symbolic API is `Expression`, and Dynet defines the `Expression` class in C++.

A simple example is as follows:

```cpp
ComputationGraph cg;
Expression W = parameter(cg, pW);

Expression in = input(cg, xs[i]);
Expression label = input(cg, ys[i]);
Expression pred = W * in;
Expression loss = square(pred - label);
```

Input data and parameters are also represented by Expressions. Every basic Expression corresponds to a Node, and input data is also a Node.

Expression has a data member ComputationGraph, which is modified as the user configures the network. An Expression can be a run target, because an Expression contains all of its dependencies.

Here is a detailed example:

Write the topology in C++:
```
ComputationGraph cg;
Expression W = parameter(cg, pW);
cg.print_graphviz();

Expression pred = W * xs[i];
cg.print_graphviz();

Expression loss = square(pred - ys[i]);
cg.print_graphviz();
```

Compile and print:
```
# first print
digraph G {
  rankdir=LR;
  nodesep=.05;
  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
}
# second print
digraph G {
  rankdir=LR;
  nodesep=.05;
  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
  N1 [label="v1 = v0 * -0.98"];
  N0 -> N1;
}
# third print
digraph G {
  rankdir=LR;
  nodesep=.05;
  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
  N1 [label="v1 = v0 * -0.98"];
  N0 -> N1;
  N2 [label="v2 = -1.88387 - v1"];
  N1 -> N2;
  N3 [label="v3 = -v2"];
  N2 -> N3;
  N4 [label="v4 = square(v3)"];
  N3 -> N4;
}
```

### Conclusion

Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are concepts at the same level. We use the unified name Expression here; this concept has the following features:

- Users write the topology with a symbolic API, and every return value is an Expression, including input data and parameters.
- An Expression corresponds to a global Graph, and Expressions can be composed.
- An Expression tracks all of its dependencies and can be taken as a run target (see the sketch after this list).
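As a rough, framework-neutral illustration of this shared concept (not any of the surveyed APIs), an Expression-style handle could be sketched as:

```cpp
#include <memory>
#include <string>
#include <vector>

// Illustrative only: an Expression is a thin handle to a node in a graph.
struct Node {
  std::string op;                             // empty when the node is a variable / input data
  std::vector<std::shared_ptr<Node>> inputs;  // data dependencies
};

struct Expression {
  std::shared_ptr<Node> node;  // following `inputs` recursively recovers the whole graph
};

// Composition creates a new node whose inputs are the operands' nodes,
// so every Expression automatically tracks all of its dependencies.
Expression matmul(const Expression& a, const Expression& b) {
  auto n = std::make_shared<Node>();
  n->op = "matmul";
  n->inputs = {a.node, b.node};
  return Expression{n};
}
```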
@@ -0,0 +1,78 @@
# Design Doc: InferVarType

## The Problem Posed

A variable in our design can hold variant types, such as `LoDTensor` and `SelectedRows`. An operator should be able to infer the variable types of its outputs.

For example, a `lookup table` operator takes two `LoDTensor`s: a float tensor as the embedding table and an int tensor as the word IDs. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs; it will generate a `LoDTensor` if any of its inputs is a `LoDTensor`, otherwise it will generate a `SelectedRows` as its output.

The variable type is constant at runtime. Every variable's type can either be set by the user (input data and parameters) or be inferred by the operator at compile time.

## Proposed Solution

`InferVarType` is a compile-time function which is registered to each operator. The interface of that function is:

```c++
using InferVarTypeFN = std::function<
    void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>;
```

It takes an operator description as its input, infers the output variable types, and stores them in the block description.

The `InferVarTypeFN` will be registered in `OpInfo` as its `infer_var_type_` field. The `OpInfo` should be

```cpp
struct OpInfo {
  InferVarTypeFN infer_var_type_;
  ...
};
```

The default `InferVarType` sets the output type to `LoDTensor`. It is applied through `GetInferVarType()`.
```cpp
void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) {
  // set the output type of variable as `LoDTensor`.
  // ...
}

struct OpInfo {
  InferVarTypeFN infer_var_type_;
  InferVarTypeFN GetInferVarType() const {
    if (infer_var_type_) {
      return infer_var_type_;
    } else {
      return DefaultInferVarType;
    }
  }
};
```

## Register InferVarType

We provide a thin base class for registering an `InferVarTypeFN`. Using a base class eases the implementation of the registry, since we can detect whether a registry entry is an `InferVarTypeFN` or not.

```cpp
class VarTypeInferer {
 public:
  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0;
};
```

Operator developers can write a specialized `VarTypeInferer` as follows.

```cpp
class SpecialVarTypeInferer : public VarTypeInferer {
 public:
  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
    // .. own logic
  }
};
```
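As a concrete (hypothetical) example, the `sum` rule from the problem statement — output a `LoDTensor` if any input is a `LoDTensor`, otherwise a `SelectedRows` — could be written as such an inferer. The accessors used below (`Input`, `Output`, `Var`, `GetType`, `SetType`) and the `VarDesc` enum names are assumptions, not the confirmed API.

```cpp
// Hypothetical sketch -- accessor and enum names are assumptions.
class SumVarTypeInferer : public VarTypeInferer {
 public:
  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
    auto out_type = VarDesc::SELECTED_ROWS;
    for (auto& in_name : op_desc.Input("X")) {
      if (block->Var(in_name)->GetType() == VarDesc::LOD_TENSOR) {
        out_type = VarDesc::LOD_TENSOR;  // any LoDTensor input promotes the output
      }
    }
    for (auto& out_name : op_desc.Output("Out")) {
      block->Var(out_name)->SetType(out_type);
    }
  }
};
```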
Then users can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`.

```cpp
REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...);
```
@@ -0,0 +1,36 @@
# Design Doc: Model Format

## Motivation

A model is the output of the training process. One complete model consists of two parts, namely, the **topology** and the **parameters**. To support industrial deployment, the model format must be self-contained and must not expose any training source code.

In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model, so we must support large parameters and efficient serialization/deserialization.

## Implementation

The topology is saved as plain text, specifically a self-contained protobuf file.

The parameters are saved as a binary file. As we all know, a protobuf message has a [64M size limit](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We did a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), whose result shows that protobuf is not suitable for this scenario.

As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and it has a description proto, [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte-string header; it contains the necessary information, such as the `dims` and the `name` of the tensor, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores its values in a contiguous memory buffer; for speed, we dump the raw memory to disk and save it as the byte-string content. So, the binary format of one tensor is:

|HeaderLength|ContentLength|**LoDTensorDesc**|**TensorValue**|

In detail, the tensor's byte view is as the table shows. Note that all signed values are written in little-endian.
```text
[offset]  [type]            [description]
0004      4 bytes integer   HeaderLength, the length of LoDTensorDesc
0008      4 bytes integer   ContentLength, the length of LodTensor Buffer
0009      1 bytes char      TensorDesc
00010     1 bytes char      TensorDesc
...
00100     1 bytes char      TensorValue
00101     1 bytes char      TensorValue
00102     1 bytes char      TensorValue ..
...
```
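A minimal sketch of writing one tensor in this layout, assuming the `LoDTensorDesc` proto has already been serialized into `desc_bytes` and the tensor memory is available as a raw buffer. The function name and signature are illustrative, not Paddle's actual API:

```cpp
#include <cstdint>
#include <fstream>
#include <string>

// Illustrative only: [HeaderLength | ContentLength | LoDTensorDesc | TensorValue],
// with lengths as 4-byte integers (assuming a little-endian host).
void WriteTensor(std::ofstream& os, const std::string& desc_bytes,
                 const char* value, uint32_t value_len) {
  uint32_t header_len = static_cast<uint32_t>(desc_bytes.size());
  os.write(reinterpret_cast<const char*>(&header_len), sizeof(header_len));
  os.write(reinterpret_cast<const char*>(&value_len), sizeof(value_len));
  os.write(desc_bytes.data(), header_len);  // serialized LoDTensorDesc
  os.write(value, value_len);               // raw tensor memory, dumped as-is
}
```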
## Summary

We introduced the model format: a `ProgramDesc` describes the **topology**, and a set of binary tensors in the particular format above describes the **parameters**.
@@ -0,0 +1,63 @@
# Prune

## Motivation

We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement a `void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc` and generates a pruned `ProgramDesc`.

## Challenge

Pruning needs to support both variables and operators being evaluation targets. Consider the following different situations.
```python
# Case 1: run the forward pass.
cost_np = session.run(target=cost)
# Case 2: run the backward pass.
opts_np, _ = session.run(target=[cost, opt])
# Case 3: run checkpointing.
_ = session.run(target=checkpoint)
```

## Solution

To support evaluation of operators, we add an `is_target` field to `OpDesc`.

```c++
message OpDesc {
  required string type = 3;
  repeated Var inputs = 1;
  repeated Var outputs = 2;
  repeated Attr attrs = 4;
  optional bool is_target = 5 [ default = false ];
};
```

To support evaluation of variables, we add a [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599). For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc`, with the variable as the `fetch_op`'s input. Then we also mark the `fetch_op` as a target, as sketched below.
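A sketch of that insertion follows; the `BlockDescBind` mutators used here (`AppendOp`, `SetType`, `SetInput`, `SetIsTarget`) are assumptions for illustration, not the confirmed API.

```c++
// Hypothetical helper: make variable `name` an evaluation target by
// appending a fetch_op that consumes it and marking that op as a target.
void AppendFetchOp(BlockDescBind* block, const std::string& name) {
  auto* op = block->AppendOp();   // assumed mutator on BlockDescBind
  op->SetType("fetch_op");
  op->SetInput("Input", {name});
  op->SetIsTarget(true);          // assumed setter for the is_target field
}
```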

### Algorithm

If an operator needs to be run, it must fall into one of the following cases:

1. It is a target.
2. It is depended on by some other ops, meaning its output is some other op's input.

The first case can be checked by `op_desc.is_target()`. The second case can be implemented as
```c++
bool HasDependentVar(const OpDesc& op_desc, const std::set<std::string>& dependent_vars) {
  for (auto& var : op_desc.outputs()) {
    for (auto& argu : var.arguments()) {
      if (dependent_vars.count(argu) != 0) {
        return true;
      }
    }
  }
  return false;
}
```

Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc).
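For orientation, the core of that algorithm could look like the following sketch: scan the ops from back to front, keep an op if it is a target or a kept op depends on its output, and accumulate the kept ops' inputs as new dependencies. This is a sketch under assumed proto-style accessors; the linked `prune.cc` is the authoritative version.

```c++
// Sketch only -- assumed proto-style accessors; see prune.cc for the real code.
void Prune(const ProgramDesc& input, ProgramDesc* output, int block_id) {
  auto& ops = input.blocks(block_id).ops();
  std::set<std::string> dependent_vars;
  std::vector<bool> should_run(ops.size(), false);
  // Back-to-front: a kept op registers its inputs before we visit the
  // ops that produce them.
  for (int i = static_cast<int>(ops.size()) - 1; i >= 0; --i) {
    auto& op_desc = ops.Get(i);
    if (op_desc.is_target() || HasDependentVar(op_desc, dependent_vars)) {
      should_run[i] = true;
      for (auto& var : op_desc.inputs()) {
        for (auto& argu : var.arguments()) {
          dependent_vars.insert(argu);
        }
      }
    }
  }
  // Finally, copy only the ops with should_run[i] == true into `output`.
}
```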