Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into profiler_tool

commit f03e73c8fa
@@ -0,0 +1,64 @@
set -e

function clock_to_seconds() {
  hours=`echo $1 | awk -F ':' '{print $1}'`
  mins=`echo $1 | awk -F ':' '{print $2}'`
  secs=`echo $1 | awk -F ':' '{print $3}'`
  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}

function infer() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  if [ $thread -gt $bs ]; then
    thread=$bs
  fi
  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d $models_in ]; then
    echo "Please run ./run_mkl_infer.sh first to save the model."
    exit 0
  fi
  log_period=$((32 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=$log_period \
    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
    --init_model_path=$models_in \
    2>&1 | tee ${log}

  # Calculate the time of the last 5 log periods (160 = 32 * 5 samples);
  # the periods before that are treated as warm-up time.
  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> ${log}
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -f "test.list" ]; then
  echo " " > test.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# inference benchmark
for batchsize in 1 2 4 8 16; do
  infer vgg 19 $batchsize
  infer resnet 50 $batchsize
  infer googlenet v1 $batchsize
  infer alexnet 2 $batchsize
done
@@ -0,0 +1,41 @@
set -e

function train() {
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  topology=$1
  layer_num=$2
  bs=$3
  thread=`nproc`
  # each trainer uses only 1 core to avoid conflicts
  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
  args="batch_size=${bs},layer_num=${layer_num}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=3 \
    --test_period=30 \
    --config_args=$args \
    2>&1 | tee ${log}

  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}

if [ ! -f "train.list" ]; then
  echo " " > train.list
fi
if [ ! -d "logs" ]; then
  mkdir logs
fi

# training benchmark
for batchsize in 64 128 256; do
  train vgg 19 $batchsize
  train resnet 50 $batchsize
  train googlenet v1 $batchsize
  train alexnet 2 $batchsize
done

File diff suppressed because it is too large
@@ -1,23 +1,29 @@
# Executor Design Doc

## Motivation
In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used; the executor explicitly executes the stored precompiled code.

## Overview

An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.

## Executor

The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not, however, have the stack frame pop process.

### The interface
```c++
  Executor(places);
```
An executor does not own any computing resources; a user can only construct an executor using the specified places.

### Running an Executor

```
  void Run(ProgramDesc, Scope, block_id, create_local_scope);
```
An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.
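
To make the two calls above concrete, here is a minimal usage sketch; the header paths and the exact `Run` signature follow this doc rather than the verified sources:

```c++
#include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
#include "paddle/platform/place.h"

void RunProgramOnce(const paddle::framework::ProgramDesc& program) {
  // the executor does not own this resource; it only runs on the given place
  paddle::platform::CPUPlace place;
  paddle::framework::Executor executor(place);

  // the scope persists across runs and holds the variable instances
  paddle::framework::Scope scope;

  // run block 0 (the entrance block); create_local_scope=true means the
  // temporary variables are dropped once the mini-batch finishes
  executor.Run(program, &scope, /*block_id=*/0, /*create_local_scope=*/true);
}
```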
@@ -0,0 +1,149 @@
# Design Doc: Add MKLDNN Kernel in Fluid Operator

## Principles

First of all, we should follow some basic principles, like:
1.  [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel to operators, so basically we should follow this doc.
2.  [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library for fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h).
3.  [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure data synchronization between different kernel types, which is discussed in this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override the `GetExpectedKernelType` and `trans` functions to support switching kernels.
4.  [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal concept which records the `Place`, `Library`, `DataType` and `Layout`.

## Solution

In general, there are four stages to run an MKL-DNN primitive, as sketched below:
-  Create a primitive descriptor that describes this operator
-  Create the primitive itself from the primitive descriptor and the engine
-  Create all the memory buffers that the primitive needs
-  Launch a stream to execute the created primitive

More details can be found [here](http://01org.github.io/mkl-dnn).
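
To make the four stages concrete, here is a minimal sketch against the raw MKL-DNN 0.x C++ API, using a ReLU (eltwise) primitive to keep it short; a convolution goes through exactly the same four stages with more parameters. The shapes are illustrative only.

```c++
#include <mkldnn.hpp>
#include <vector>

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);

  // 1. a primitive descriptor that describes this operator
  memory::desc src_md({1, 3, 224, 224}, memory::data_type::f32, memory::format::nchw);
  auto relu_pd = eltwise_forward::primitive_desc(
      eltwise_forward::desc(prop_kind::forward_inference, algorithm::eltwise_relu,
                            src_md, /*alpha=*/0.f, /*beta=*/0.f),
      eng);

  // 3. all the memory buffers the primitive needs
  std::vector<float> src_buf(1 * 3 * 224 * 224), dst_buf(src_buf.size());
  memory src(memory::primitive_desc(src_md, eng), src_buf.data());
  memory dst(relu_pd.dst_primitive_desc(), dst_buf.data());

  // 2. the primitive itself, created from the primitive descriptor and the engine
  eltwise_forward relu(relu_pd, src, dst);

  // 4. launch a stream to execute the created primitive
  stream(stream::kind::eager).submit({relu}).wait();
  return 0;
}
```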

It is better to avoid re-initializing the primitives and memory handles of the first three stages in every iteration. So we plan to create a map to record all the `primitive` and `memory` objects, which should not take up too much memory, as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822).

It is assumed that the following three conditions are satisfied.
1. there is a unique key for each operator instance, which may be the actual name of the `Output Tensor`.
2. the `Input Tensor` inside the `Compute` function is the one after conversion.
3. we can get the phase (e.g. `is_test`) inside the `Compute` function; otherwise we need to expose this attribute to the user.
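
For example, condition 1 could be satisfied by keying on the output variable's name; this is only an illustration and `ctx.op().Output("Output")` is an assumed accessor, not a settled API:

```c++
// a unique per-instance key derived from the output tensor's name,
// suffixed per cached object as in the pseudo-code below
std::string op_key = ctx.op().Output("Output");
std::string fwd_key = op_key + "_fwd";    // cached forward primitive
std::string in_key  = op_key + "_input";  // cached (converted) input memory
```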

### Compute
The algorithm of `Compute` is described as follows; let's take conv as an example.

```c++

  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace.");
  PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library.");

  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();

  // find the primitive by its unique key in the mkldnn context;
  // op_key should be a unique name of this op instance
  auto& p = dev_ctx.findPrimitive(op_key + "_fwd");

  // assuming the input tensor inside this compute function is the one after conversion;
  // this point should be guaranteed by another mechanism
  auto& i = dev_ctx.findMemory(op_key + "_input");

  if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) {
    auto fwd_primitive_desc = createPrimitiveDesc(ctx);
    auto* input = ctx.Input<Tensor>("Input");
    auto* filter = ctx.Input<Tensor>("Filter");
    auto* output = ctx.Output<Tensor>("Output");
    shared_ptr<mkldnn::memory> in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data<T>()));
    shared_ptr<mkldnn::memory> wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data<T>()));
    shared_ptr<mkldnn::memory> out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data<T>(ctx.GetPlace())));
    shared_ptr<mkldnn::conv_fwd> fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out));

    dev_ctx.addMemory(op_key+"_input", in);
    dev_ctx.addMemory(op_key+"_output", out);
    dev_ctx.addMemory(op_key+"_filter", wgt);
    dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive);
    dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc);
  }

  p = dev_ctx.findPrimitive(op_key + "_fwd");

  PADDLE_ENFORCE(p, "Should have forward Primitive");
  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_input"), "Should have input memory");
  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_output"), "Should have output memory");
  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_filter"), "Should have filter memory");
  PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_key+"_fwd_PD"), "Should have forward PrimitiveDesc");
  dev_ctx.submit(p);
  dev_ctx.execute();  // the reorder primitive, if any, has already been submitted.

```

The `createPrimitiveDesc` function returns the primitive descriptor of this operator and would look like this:
```c++
  auto* input = ctx.Input<Tensor>("Input");
  auto* filter = ctx.Input<Tensor>("Filter");
  auto* output = ctx.Output<Tensor>("Output");
  std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
  std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
  std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
  int groups = ctx.Attr<int>("groups");
  algorithm algo = static_cast<algorithm>(ctx.Attr<int>("convolution_algorithm_option"));
  prop_kind pk = ctx.Attr<bool>("is_test") ? prop_kind::forward_inference : prop_kind::forward_training;

  auto fwd_desc = mkldnn::conv_fwd::desc(/* all the settings above */);
  shared_ptr<mkldnn::conv_fwd::primitive_desc> fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine()));

  return fwd_primitive_desc;
```

### MKLDNNDeviceContext
`MKLDNNDeviceContext`, which is very straightforward, should contain some basic information such as the `stream`, the `engine` and the maps needed.
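
A possible shape for this class, matching the `findPrimitive`/`addMemory`/`submit`/`execute` calls used in the pseudo-code above, is sketched below; the layout is an assumption, not settled code, and `addPrimitiveDesc`/`findPrimitiveDesc` would follow the same map pattern.

```c++
#include <mkldnn.hpp>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

class MKLDNNDeviceContext {
 public:
  const mkldnn::engine& getEngine() const { return engine_; }

  void addPrimitive(const std::string& key, std::shared_ptr<mkldnn::primitive> p) {
    primitives_[key] = p;
  }
  std::shared_ptr<mkldnn::primitive> findPrimitive(const std::string& key) const {
    auto it = primitives_.find(key);
    return it == primitives_.end() ? nullptr : it->second;
  }

  void addMemory(const std::string& key, std::shared_ptr<mkldnn::memory> m) {
    memories_[key] = m;
  }
  std::shared_ptr<mkldnn::memory> findMemory(const std::string& key) const {
    auto it = memories_.find(key);
    return it == memories_.end() ? nullptr : it->second;
  }

  // queue a primitive now, flush the whole pipeline in one stream later
  void submit(std::shared_ptr<mkldnn::primitive> p) { pipeline_.push_back(*p); }
  void execute() {
    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline_).wait();
    pipeline_.clear();
  }

 private:
  mkldnn::engine engine_{mkldnn::engine::cpu, 0};
  std::vector<mkldnn::primitive> pipeline_;
  std::unordered_map<std::string, std::shared_ptr<mkldnn::primitive>> primitives_;
  std::unordered_map<std::string, std::shared_ptr<mkldnn::memory>> memories_;
};
```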

### mkldnn_helper
Some helper functions would be put in `paddle/platform/mkldnn_helper.h`, for example:
- creating MKLDNN memories
- creating MKLDNN primitives
- error-checking functions
- etc.
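
For instance, the memory-creation and error-checking helpers might look like this (a sketch; the function names are assumptions):

```c++
#include <mkldnn.hpp>
#include <memory>
#include <stdexcept>

namespace paddle {
namespace platform {

// wrap an existing buffer as an MKLDNN memory object for a given description
inline std::shared_ptr<mkldnn::memory> CreateMKLDNNMemory(
    const mkldnn::memory::desc& md, const mkldnn::engine& eng, void* data) {
  return std::make_shared<mkldnn::memory>(
      mkldnn::memory::primitive_desc(md, eng), data);
}

// PADDLE_ENFORCE-style check for MKLDNN status codes
inline void MKLDNNEnforce(mkldnn_status_t status, const char* msg) {
  if (status != mkldnn_success) throw std::runtime_error(msg);
}

}  // namespace platform
}  // namespace paddle
```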

### Kernel Switch
We should `reorder` data whose layout comes from another device, or is headed to another device. The `GetExpectedKernelType` and `trans` functions can help us implement this.

`GetExpectedKernelType` receives the context, from which the operator can return the best `KernelType`.
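
A sketch of such an override for a conv operator is given below; it reuses the `Library`/`Layout` keys from the `OpKernelType` design referenced above, and the constructor argument order is an assumption:

```c++
framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  // default to the Plain (Eigen) kernel ...
  framework::Library library = framework::Library::Plain;
#ifdef PADDLE_WITH_MKLDNN
  // ... but prefer the MKLDNN kernel when running on CPU with MKLDNN built in
  if (platform::is_cpu_place(ctx.GetPlace())) {
    library = framework::Library::MKLDNN;
  }
#endif
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
      ctx.GetPlace(), framework::Layout::kNCHW, library);
}
```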

`trans` would look like this:

```c++
void trans(inputs, ctx) override {
  if (NoNeedTrans()) {
    return;
  }
  // find the reorder primitive by op_key in the context
  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
  auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input");
  auto& i = dev_ctx.findMemory(op_key + "_src_input");

  if (p == nullptr || i == nullptr || changeSized(i, input)) {
    auto prim = createPrimitiveDesc(ctx);
    auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data);
    auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes());
    auto dst = createMemory(p->expected_desc(), newbuffer->data);
    auto reorder_primitive(new mkldnn::reorder(src, dst));

    dev_ctx.addMemory(op_key+"_src_input", src);
    dev_ctx.addMemory(op_key+"_input", dst);
    dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive);
  }

  p = dev_ctx.findPrimitive(op_key + "_reorder_input");
  PADDLE_ENFORCE(p, "Should have Reorder Primitive");
  dev_ctx.submit(p);
  if (!this->isMKLDNNKernel()) {
    // execute immediately only if this is not an mkldnn kernel function;
    // otherwise, it can be executed together with the operator primitive in Compute
    dev_ctx.stream();
  }
  // after submit, the input tensor in the ExecutionContext should be replaced by the converted one;
  // there should be another mechanism to ensure this
}
```

### Unit Test
All of the functions above should have corresponding unit tests.
TBD
@@ -0,0 +1,91 @@
# Design Doc: The Keys of Operator Kernel Type
## Problem
An operator can have different kernel implementations, and each operator has a map to store its related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:

```cpp
struct OpKernelType {
  platform::Place place_;
  proto::DataType data_type_;
};
```
For more details, please refer to the [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.

It contains two keys, `Place` and `DataType`, and these two keys are hashed into a unique key that represents a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`.

We often implement a kernel of an operator with some computing library on a certain device (place). Note that computing library and device are not in one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support several devices.

For example, the Eigen library can support Nvidia GPU/AMD GPU/CPU, and the MKLDNN library can support Intel CPU/Intel FPGA. So both `Place` and `Library` should be keys of `OpKernelType`.

It's obvious that different DataTypes, like fp64/fp32/int8, will have different kernels. But the data layout of a Tensor can also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data Layout should also be taken into consideration.
## Solution

There are four keys that determine the kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.

```cpp
struct OpKernelType {
  platform::Place place_;
  platform::Library library_;
  proto::DataType data_type_;
  framework::Layout layout_;
};
```
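
Since the kernel map is still looked up by one hashed key, the four members have to be combined into a single hash. A minimal sketch (the mixing recipe is the common boost-style one, not necessarily Fluid's actual implementation):

```cpp
#include <functional>

struct OpKernelTypeHash {
  size_t operator()(const OpKernelType& t) const {
    size_t seed = std::hash<int>()(static_cast<int>(t.data_type_));
    auto mix = [&seed](size_t v) {
      seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    };
    mix(std::hash<int>()(static_cast<int>(t.library_)));
    mix(std::hash<int>()(static_cast<int>(t.layout_)));
    mix(std::hash<int>()(t.place_.which()));  // variant index of the Place
    return seed;
  }
};
```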

The details are as follows:

### Place

`Place` is defined as follows:

```cpp
typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
```

`Place` represents the device memory where the data is located.
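
Because `Place` is a `boost::variant`, queries against it are naturally visitor-based. A minimal sketch of an `is_cpu_place`-style helper:

```cpp
#include <boost/variant.hpp>

struct IsCPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace&) const { return true; }
  template <typename OtherPlace>
  bool operator()(const OtherPlace&) const { return false; }
};

inline bool is_cpu_place(const Place& p) {
  return boost::apply_visitor(IsCPUPlace(), p);
}
```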

### Library

One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable:

```cpp
enum Library { Plain, MKLDNN, CUDNN };
```

We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take `Eigen` as the `Plain` enumerator.
A library usually has a corresponding `DeviceContext` which contains the handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.

If we want to support a new library, a new enumerator needs to be added to `Library` and a new corresponding `LibraryDeviceContext` should be created.

### DataType

`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.

### Layout

Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape (ddim), stride, and layout.

Different layouts lead to different implementations of the operator kernel. There are mainly 4 principles we have to follow to support layouts in our fluid framework.

- We take layout as a data member of Tensor. Layout is actually an enum variable. If fluid is built with MKLDNN, then the memory formats in MKLDNN will be added into this enum variable too.

- Users have to set the layout for input data. And some operators, like fill_constant/random, also have to set the layout for the data they generate. Of course, we can have some default layout, like NCHW.

- The inference of Layout is at run-time, not compile-time.

- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example: if we want to implement an MKLDNN convolution operator, we have to realize all the kernels for the different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro for registering kernels for MKLDNN operators.

`Layout` is also defined as an enum variable:

```cpp
enum Layout {
  kNCHW,
  kNHWC,
#ifdef PADDLE_WITH_MKLDNN
  knChw8c,
  ...
#endif
};
```

Some files were not shown because too many files have changed in this diff