Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cross_channel_norm

cross_channel_norm
sweetsky0901 7 years ago
commit 333995a700

@@ -6,8 +6,18 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
     "train.list", None, module="provider", obj="process", args=args)
@@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
@@ -40,11 +50,11 @@ net = img_conv_layer(
     input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

 net = fc_layer(
@@ -59,6 +69,9 @@ net = fc_layer(
     layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(net)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=net, label=lab)
+    outputs(loss)

@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,

@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
     else:
         settings.data_size = settings.height * settings.width
     settings.is_infer = kwargs.get('is_infer', False)
+    settings.num_samples = kwargs.get('num_samples', 2560)
     if settings.is_infer:
         settings.slots = [dense_vector(settings.data_size)]
     else:
@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 @provider(
     init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(2560 if settings.is_infer else 1024):
+    for i in xrange(settings.num_samples):
         img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
         if settings.is_infer:
             yield img.astype('float32')

@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,

@@ -37,7 +37,7 @@ function infer() {
         --trainer_count=1 \
         --num_passes=1 \
         --save_dir="models/${topology}-${layer_num}" \
-        --config_args="batch_size=128,layer_num=${layer_num}" \
+        --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
         > /dev/null 2>&1
     echo "Done"
   fi
@@ -79,8 +79,9 @@ fi
 # inference benchmark
 for use_mkldnn in True False; do
   for batchsize in 1 2 4 8 16; do
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
     infer vgg 19 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer alexnet 2 $batchsize $use_mkldnn
   done
 done

@@ -47,5 +47,6 @@ for use_mkldnn in True False; do
     train vgg 19 $batchsize $use_mkldnn
     train resnet 50 $batchsize $use_mkldnn
     train googlenet v1 $batchsize $use_mkldnn
+    train alexnet 2 $batchsize $use_mkldnn
   done
 done

@@ -23,24 +23,25 @@ function infer() {
     echo "./run_mkl_infer.sh to save the model first"
     exit 0
   fi
-  log_period=$((256 / bs))
+  log_period=$((32 / bs))
   paddle train --job=test \
     --config="${topology}.py" \
+    --use_mkldnn=False \
     --use_gpu=False \
     --trainer_count=$thread \
     --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
     --init_model_path=$models_in \
     2>&1 | tee ${log}

-  # calculate the last 5 logs period time of 1280 samples,
+  # calculate the last 5 logs period time of 160(=32*5) samples,
   # the time before are burning time.
   start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
   end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
   start_sec=`clock_to_seconds $start`
   end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
   echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }
@@ -56,7 +57,8 @@ fi
 # inference benchmark
 for batchsize in 1 2 4 8 16; do
-  infer googlenet v1 $batchsize
-  infer resnet 50 $batchsize
   infer vgg 19 $batchsize
+  infer resnet 50 $batchsize
+  infer googlenet v1 $batchsize
+  infer alexnet 2 $batchsize
 done

@@ -12,10 +12,11 @@ function train() {
   config="${topology}.py"
   paddle train --job=time \
     --config=$config \
+    --use_mkldnn=False \
    --use_gpu=False \
     --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
+    --log_period=3 \
+    --test_period=30 \
     --config_args=$args \
     2>&1 | tee ${log}
@@ -36,4 +37,5 @@ for batchsize in 64 128 256; do
   train vgg 19 $batchsize
   train resnet 50 $batchsize
   train googlenet v1 $batchsize
+  train alexnet 2 $batchsize
 done

@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,

@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
     IF(WITH_C_API)
         INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
         IF(ANDROID)
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
         ELSE()
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
         ENDIF()
     ENDIF()

@@ -467,7 +467,7 @@ lambda_cost
     :noindex:

 square_error_cost
---------
+-----------------
 .. autoclass:: paddle.v2.layer.square_error_cost
     :noindex:

@@ -533,7 +533,7 @@ Miscs
 =====

 dropout
---------------
+--------
 .. autoclass:: paddle.v2.layer.dropout
     :noindex:


@@ -3,19 +3,19 @@ Nets
 ===========

 simple_img_conv_pool
------------
+--------------------
 .. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
     :noindex:

 img_conv_group
------------
+---------------
 .. autofunction:: paddle.v2.fluid.nets.img_conv_group
     :noindex:

 sequence_conv_pool
------------
+------------------
 .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
     :noindex:

@@ -18,7 +18,7 @@ SGDOptimizer

 MomentumOptimizer
------------
+-----------------
 .. automodule:: paddle.v2.fluid.optimizer
     :members: MomentumOptimizer
     :noindex:

@@ -26,14 +26,14 @@ MomentumOptimizer

 AdagradOptimizer
------------
+----------------
 .. automodule:: paddle.v2.fluid.optimizer
     :members: AdagradOptimizer
     :noindex:

 AdamOptimizer
------------
+-------------
 .. automodule:: paddle.v2.fluid.optimizer
     :members: AdamOptimizer
     :noindex:

@@ -47,7 +47,7 @@ AdamaxOptimizer

 DecayedAdagradOptimizer
------------
+-----------------------
 .. automodule:: paddle.v2.fluid.optimizer
     :members: DecayedAdagradOptimizer
     :noindex:

@@ -3,14 +3,14 @@ Regularizer
 ===========

 WeightDecayRegularizer
------------
+----------------------
 .. automodule:: paddle.v2.fluid.regularizer
     :members: WeightDecayRegularizer
     :noindex:

 L2DecayRegularizer
------------
+------------------
 .. automodule:: paddle.v2.fluid.regularizer
     :members: L2DecayRegularizer
     :noindex:

@@ -18,7 +18,7 @@ L2DecayRegularizer

 L1DecayRegularizer
------------
+-------------------
 .. automodule:: paddle.v2.fluid.regularizer
     :members: L1DecayRegularizer

@@ -0,0 +1,57 @@
## Problem
In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one operator may have multiple kernels. Users may prefer a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel, so we need to provide a way for users to make this choice.
In the current design, we use KernelType to describe one kernel.
```cpp
struct KernelType {
  Place place_;
  DataType data_type_;
  LayoutType layout_;
};
```
`place_`, `data_type_`, and `layout_` can be obtained from the operator's input tensors: `GetActualKernelType(inputs)` uses the inputs to infer the kernel key that fits the incoming data, but users cannot configure it directly.

The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that users can override to choose the KernelType they want to use.

So we should pass the user-defined information in the proto to `GetExpectedKernelType` so that it can choose a kernel.

The problem is: how should we define and pass this information for `GetExpectedKernelType` to use?
## Solution
### Potential choice
1. Do nothing: let users add whatever information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This works, but users may define many different hints for the same purpose, such as `force_cpu`, `use_cpu`, and `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn`, and `cudnn_kernel` to choose a CUDNN kernel.
2. Pre-define all the needed options and expose a single attribute key such as `kernel_hint` to the user. This is not flexible enough if users want to define new kinds of hints.
### Final choice
To provide enough flexibility while avoiding confusing definitions, we can define global constants for these attribute names, such as `force_cpu`, `use_cudnn`, and `use_mkldnn`, for users to choose from.
In C++
```cpp
const std::string kForceCPU = "force_cpu";
const std::string kUseCUDNN = "use_cudnn";
const std::string kUseMKLDNN = "use_mkldnn";

KernelType GetExpectedKernelType() {
  if (Attr<bool>(kForceCPU)) {
    return KernelType(CPUPlace, ...);
  } else {
    ...
  }
}
```
In Python code
```python
FORCE_CPU = core.kForceCPU()

def xx_layer(..., force_cpu=False):
    layer_helper = LayerHelper(...)
    layer_helper.append_op(
        type="xx",
        attr={FORCE_CPU: force_cpu})
```

@@ -0,0 +1,91 @@
# Design Doc: The Keys of Operator Kernel Type
## Problem
An operator can have different kernel implementations, and each operator maintains a map to store its kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain kernel must be chosen via an `OpKernelType` key. Currently, `OpKernelType` is defined as follows:
```cpp
struct OpKernelType {
  platform::Place place_;
  proto::DataType data_type_;
};
```
For more details, please refer to the [code](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) on GitHub.
It contains two keys, `Place` and `DataType`, which are hashed into a single key that identifies a certain type of kernel. However, these two keys are not enough; we need a more complete representation of `OpKernelType`.
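For illustration only, hashing the two keys into a single lookup key might look like the sketch below; the hash-combine scheme and the `PlaceToId` helper are assumptions for this example, not Fluid's actual implementation.
```cpp
#include <cstddef>
#include <functional>

// Sketch: fold the two keys into one hash value so the kernel map can be
// keyed by OpKernelType. PlaceToId is a hypothetical helper that numbers
// the place variant.
struct OpKernelTypeHash {
  size_t operator()(const OpKernelType& key) const {
    size_t seed = std::hash<int>()(static_cast<int>(key.data_type_));
    // boost-style hash_combine of the place id into the seed
    seed ^= std::hash<int>()(PlaceToId(key.place_)) + 0x9e3779b9 +
            (seed << 6) + (seed >> 2);
    return seed;
  }
};
```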
We often implement an operator's kernel with some computing library on a certain device (place). Note that computing libraries and devices do not correspond one-to-one: a device can host many computing libraries, and a computing library can support several devices.

For example, the Eigen library supports Nvidia GPU, AMD GPU, and CPU, while the MKLDNN library supports Intel CPU and Intel FPGA. So both `Place` and `Library` should be keys of `OpKernelType`.

Obviously, different DataTypes, like fp64/fp32/int8, need different kernels. But the data layout of a Tensor can also lead to different implementations; see the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data layout should therefore also be taken into consideration.
## Solution
There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
```cpp
struct OpKernelType {
  platform::Place place_;
  platform::Library library_;
  proto::DataType data_type_;
  framework::Layout layout_;
};
```
The details of each key are as follows.
### Place
`Place` is defined as follows:
```cpp
typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
```
`Place` represents the device memory where the data resides.
### Library
One operator kernel is usually implemented with one library. `Library` is defined as an enum:
```cpp
enum Library { Plain, MKLDNN, CUDNN };
```
We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented with the `Eigen` library, we treat `Eigen` as the `Plain` library.

A library usually has a corresponding `DeviceContext` that contains the handles needed for computation. Fluid currently has two default DeviceContexts, for CPU and CUDA: `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.

If we want to support a new library, a new enumerator needs to be added to `Library`, and a corresponding new `LibraryDeviceContext` needs to be created.
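As a minimal sketch, assuming a hypothetical library `FooLib` (the handle API below is invented for illustration and is not a real Fluid or vendor API), this could look like:
```cpp
// Hypothetical: extend the enum and pair the new library with a
// DeviceContext subclass that owns the handles its kernels need.
enum Library { Plain, MKLDNN, CUDNN, FooLib };

class FooLibDeviceContext : public DeviceContext {
 public:
  FooLibDeviceContext() : handle_(foolib_create_handle()) {}   // assumed API
  ~FooLibDeviceContext() { foolib_destroy_handle(handle_); }   // assumed API
  foolib_handle_t handle() const { return handle_; }

 private:
  foolib_handle_t handle_;  // passed to FooLib kernel launches
};
```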
### DataType
`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
### Layout
Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also need other descriptions of this block of memory, such as shape (ddim), stride, and layout.

Different layouts lead to different implementations of an operator kernel. There are four main principles we follow to support layout in the Fluid framework:

- We make layout a data member of Tensor. Layout is an enum; if Fluid is built with MKLDNN, the MKLDNN memory formats are added to this enum as well.
- Users have to set the layout of input data, and operators that generate data, like fill_constant/random, also have to set the layout of the data they produce. Of course, we can have a default layout, like NCHW.
- Layout is inferred at run-time, not compile-time.
- Every operator has to implement different kernels for different layouts. Taking MKLDNN as an example: to implement an MKLDNN convolution operator, we have to provide kernels for each of the layouts listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). A special macro will register the kernels of MKLDNN operators (see the sketch after the enum below).

`Layout` is also defined as an enum:
```cpp
enum Layout {
  kNCHW,
  kNHWC,
#ifdef PADDLE_WITH_MKLDNN
  knChw8c
  ...
#endif
};
```

@@ -0,0 +1,43 @@
# Design Doc: Execute the Program with Multi CPU
## Abstract
This design doc proposes an approach to run a user-defined Op graph on multiple CPUs: an automatic transpiler converts the user-defined Op graph into a multi-CPU Op graph, and a `ParallelDo` Op executes that graph.
## Transpiler
<img src="src/multi-threads/single-thread@3x.png" width="300">
After conversion:
<img src="src/multi-threads/multi-threads@3x.png" width="1000">
## Implementation
- `Multi-CPU Transpiler` converts the graph into a multi-CPU graph that is executed with multiple threads.
- `BlockingCounter` can `Init`/`Decrement` an atomic counter, and `Wait` blocks until the counter becomes `0`:
```cpp
BlockingCounter bc(thread_count);
for (int i = 0; i < thread_count; ++i) {
  thread_pool->Start([&bc] { bc.DecrementCount(); });
}
bc.Wait();
```
- `ParallelDo` Operator (a sketch follows this list):
  - Initializes a thread pool, which is a singleton.
  - Takes a block id as input and runs the specified Block on independent scopes with multiple threads.
  - Initializes a `BlockingCounter` instance and waits until all threads are done.
- The `Split` Operator splits the input Tensor into a TensorArray.
- `Merge` merges all the gradients calculated by the different threads with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W`.
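A minimal sketch of how `ParallelDo` could combine the singleton thread pool with `BlockingCounter` is shown below; `GetThreadPool`, `RunBlock`, and `Scope` are assumed names for illustration, not Fluid's actual API.
```cpp
#include <vector>

// Illustrative only: run one block on several independent scopes in
// parallel and block until every worker finishes.
void ParallelDoRun(int block_id, const std::vector<Scope*>& scopes) {
  const int thread_count = static_cast<int>(scopes.size());
  BlockingCounter bc(thread_count);
  for (int i = 0; i < thread_count; ++i) {
    // Each worker runs the same block on its own scope.
    GetThreadPool()->Start([&bc, block_id, scope = scopes[i]] {
      RunBlock(block_id, scope);
      bc.DecrementCount();  // signal that this worker is done
    });
  }
  bc.Wait();  // returns once the counter reaches 0
}
```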
## TODO
- Improve the optimizer stage with multiple threads, since we could assign the parameters to different threads and run the optimizer with multiple threads.


@@ -0,0 +1,66 @@
## Background
Every operator has many kernels because Fluid supports multiple data types, places, and data layouts. We use `KernelType` to describe the kernel types that operators can hold.
The `KernelType` is as follows:
```cpp
struct KernelType {
  Place place_;
  DataType data_type_;
  LayoutType layout_;
};
```
The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`.
The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types; however, one of them will be the major `data_type`. For example, `cross_entropy` takes `int64` as its label and `double`/`float` as its input logit and output cost; the major `data_type` of `cross_entropy` is `float`/`double`.

The `layout_` is useful for some computing libraries. For example, MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout invokes a different kernel.
## Problem
Ideally, we would register a kernel for every operator and every kernel type. However, this is impracticable in the following situations:

1. Some operators, like CRF, are complicated and inefficient to implement on GPU, so the CRF operator has only a CPU kernel.
2. Some operators use too much memory, so it is better to force them onto the CPU. However, the rest of the operators in the neural network should still run on the GPU, i.e., a model-parallel problem.
3. Some layouts and places are particular. For example, MKLDNN uses `nChw8c`, and no other library uses it.

The problems in these situations are similar. We can formalize the problem as follows:

We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator arrive with some kernel type $kt_{?}$, where $kt_{?} \notin KT$. How do we cast the inputs of this operator from $kt_{?}$ to one of the kernel types in $KT$?
## Solution
Clearly, transforming the inputs of an operator to adapt to another kernel type is not specific to any particular operator, so we should register these transformation methods as global methods.

We can infer a kernel type from the inputs of an operator. We call this the `actual kernel type`, meaning the kernel type that the operator's incoming data actually has.

We can also get a kernel type from 1) the configuration in the operator description (users may want to force `MKL` for the `conv` operator) and 2) the place of the current executor (e.g., the executor is running on GPU). This kernel type is the one we expect the operator to be performed with; we call it the `expected kernel type`.

We transform the input data from `actual` to `expected` if the expected kernel type is not the same as the actual kernel type.

The algorithm is described as follows:
```cpp
using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>;
using KernelTypePair = std::pair<KernelType, KernelType>;
map<KernelTypePair, DataTransformationFN> g_data_transformation_;
void OpWithKernel::Run() {
  vec<Tensor> inputs = ...
  auto actual_kernel_type = GetActualKernelType(inputs);

  // The expected kernel type is related to the actual kernel type.
  // For most operators, the expected kernel type is the same as the
  // actual kernel type.
  //
  // So we pass `actual_kernel_type` as a parameter of
  // GetExpectedKernelType.
  auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type);

  auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}];

  kernel.run(trans(inputs));
}
```

@@ -70,13 +70,13 @@ Building PaddlePaddle requires the following dependencies, including but not limited to:
     :header: "Dependency", "Version", "Description"
     :widths: 10, 15, 30

-    "CMake", ">=3.5", ""
+    "CMake", ">=3.2", ""
     "GCC", "4.8.2", "devtools2 for CentOS is recommended"
     "Python", "2.7.x", "Requires libpython2.7.so"
     "pip", ">=9.0", ""
     "numpy", "", ""
     "SWIG", ">=2.0", ""
     "Go", ">=1.8", "Optional"

 .. _build_options:

@@ -76,13 +76,13 @@ will be downloaded automatically.
     :header: "Dependency", "Version", "Description"
     :widths: 10, 15, 30

-    "CMake", ">=3.5", ""
+    "CMake", ">=3.2", ""
     "GCC", "4.8.2", "Recommend devtools2 for CentOS"
     "Python", "2.7.x", "Need libpython2.7.so"
     "pip", ">=9.0", ""
     "numpy", "", ""
     "SWIG", ">=2.0", ""
     "Go", ">=1.8", "Optional"

 .. _build_options:

@@ -128,7 +128,7 @@ PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
 AVX is a CPU instruction set that accelerates PaddlePaddle's computation. The latest
 PaddlePaddle Docker images are built with AVX enabled by default, so if your machine
 does not support AVX, you need to separately
-`build <./build_from_source_cn.rst>`_ PaddlePaddle as a no-avx version.
+`build <./build_from_source_cn.html>`_ PaddlePaddle as a no-avx version.

 The following command checks whether a Linux machine supports AVX:

