Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cuda_profile
	
		
	
				
					
				
			
						commit
						ba8660a1bd
					
				| @ -0,0 +1,64 @@ | |||||||
|  | set -e | ||||||
|  | 
 | ||||||
|  | function clock_to_seconds() { | ||||||
|  |   # Convert the clock string in $1 (colon-separated hours:minutes:seconds) | ||||||
|  |   # into seconds and print the result with two decimals. | ||||||
|  |   # The string is handed to awk as input data instead of being spliced into | ||||||
|  |   # the awk program text, so its content is never parsed as awk code and a | ||||||
|  |   # single awk process does the whole conversion. | ||||||
|  |   echo "$1" | awk -F ':' '{printf "%.2f\n", $3 + $2 * 60 + $1 * 3600}' | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | function infer() { | ||||||
|  |   # Run one inference benchmark with --use_mkldnn=False and --use_gpu=False, | ||||||
|  |   # then append the measured FPS to the run's log file. | ||||||
|  |   #   $1 - topology name; "${topology}.py" is used as the config file | ||||||
|  |   #   $2 - layer_num value forwarded through --config_args | ||||||
|  |   #   $3 - batch size | ||||||
|  |   # Exits the whole script (status 0) when the saved model directory is missing. | ||||||
|  |   unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY | ||||||
|  |   topology=$1 | ||||||
|  |   layer_num=$2 | ||||||
|  |   bs=$3 | ||||||
|  |   # never use more trainer threads than there are samples in a batch | ||||||
|  |   thread=`nproc` | ||||||
|  |   if [ "$thread" -gt "$bs" ]; then | ||||||
|  |     thread=$bs | ||||||
|  |   fi | ||||||
|  |   log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log" | ||||||
|  | 
 | ||||||
|  |   models_in="models/${topology}-${layer_num}/pass-00000/" | ||||||
|  |   if [ ! -d "$models_in" ]; then | ||||||
|  |     echo "./run_mkl_infer.sh to save the model first" | ||||||
|  |     exit 0 | ||||||
|  |   fi | ||||||
|  |   # one log period covers 32 samples (for batch sizes that divide 32) | ||||||
|  |   log_period=$((32 / bs)) | ||||||
|  |   paddle train --job=test \ | ||||||
|  |     --config="${topology}.py" \ | ||||||
|  |     --use_mkldnn=False \ | ||||||
|  |     --use_gpu=False \ | ||||||
|  |     --trainer_count="$thread" \ | ||||||
|  |     --log_period="$log_period" \ | ||||||
|  |     --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \ | ||||||
|  |     --init_model_path="$models_in" \ | ||||||
|  |     2>&1 | tee "${log}" | ||||||
|  | 
 | ||||||
|  |   # calculate the last 5 logs period time of 160(=32*5) samples, | ||||||
|  |   # the time before are burning time. | ||||||
|  |   # Options are placed before the file operand so that non-GNU tail works too. | ||||||
|  |   start=`tail -n 7 "${log}" | head -n 1 | awk -F ' ' '{print $2}' | xargs` | ||||||
|  |   end=`tail -n 2 "${log}" | head -n 1 | awk -F ' ' '{print $2}' | xargs` | ||||||
|  |   start_sec=`clock_to_seconds "$start"` | ||||||
|  |   end_sec=`clock_to_seconds "$end"` | ||||||
|  |   # pass the timestamps as awk variables rather than splicing them into the program | ||||||
|  |   fps=`awk -v s="$start_sec" -v e="$end_sec" 'BEGIN{printf "%.2f", 160 / (e - s)}'` | ||||||
|  |   echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec);" >> "${log}" | ||||||
|  |   echo "FPS: $fps images/sec" 2>&1 | tee -a "${log}" | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | # create train.list / test.list (holding a single space) when they are missing | ||||||
|  | for list_file in train.list test.list; do | ||||||
|  |   [ -f "$list_file" ] || echo " " > "$list_file" | ||||||
|  | done | ||||||
|  | # the per-run log files are written below logs/ | ||||||
|  | [ -d "logs" ] || mkdir logs | ||||||
|  | 
 | ||||||
|  | # inference benchmark: every model at every batch size | ||||||
|  | for batchsize in 1 2 4 8 16; do | ||||||
|  |   for model in "vgg 19" "resnet 50" "googlenet v1" "alexnet 2"; do | ||||||
|  |     # $model is left unquoted on purpose: it expands to <topology> <layer_num> | ||||||
|  |     infer $model $batchsize | ||||||
|  |   done | ||||||
|  | done | ||||||
| @ -0,0 +1,41 @@ | |||||||
|  | set -e | ||||||
|  | 
 | ||||||
|  | function train() { | ||||||
|  |   # Time training of one model with --use_mkldnn=False and --use_gpu=False, | ||||||
|  |   # then append the measured FPS to the run's log file. | ||||||
|  |   #   $1 - topology name; "${topology}.py" is used as the config file | ||||||
|  |   #   $2 - layer_num value forwarded through --config_args | ||||||
|  |   #   $3 - batch size | ||||||
|  |   unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY | ||||||
|  |   topology=$1 | ||||||
|  |   layer_num=$2 | ||||||
|  |   bs=$3 | ||||||
|  |   thread=`nproc` | ||||||
|  |   # each trainer_count use only 1 core to avoid conflict | ||||||
|  |   log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log" | ||||||
|  |   args="batch_size=${bs},layer_num=${layer_num}" | ||||||
|  |   config="${topology}.py" | ||||||
|  |   paddle train --job=time \ | ||||||
|  |     --config="$config" \ | ||||||
|  |     --use_mkldnn=False \ | ||||||
|  |     --use_gpu=False \ | ||||||
|  |     --trainer_count="$thread" \ | ||||||
|  |     --log_period=3 \ | ||||||
|  |     --test_period=30 \ | ||||||
|  |     --config_args="$args" \ | ||||||
|  |     2>&1 | tee "${log}" | ||||||
|  | 
 | ||||||
|  |   # The 8th field of the last log line is read as "avg=<number>"; the FPS formula | ||||||
|  |   # below presumably treats that number as milliseconds per batch (TODO confirm). | ||||||
|  |   # Options are placed before the file operand so that non-GNU tail works too. | ||||||
|  |   avg_time=`tail -n 1 "${log}" | awk -F ' ' '{print $8}' | sed 's/avg=//'` | ||||||
|  |   # pass the values as awk variables rather than splicing them into the program | ||||||
|  |   fps=`awk -v bs="$bs" -v avg="$avg_time" 'BEGIN{printf "%.2f", bs / avg * 1000}'` | ||||||
|  |   echo "FPS: $fps images/sec" 2>&1 | tee -a "${log}" | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | # create train.list (holding a single space) when it is missing | ||||||
|  | [ -f "train.list" ] || echo " " > train.list | ||||||
|  | # the per-run log files are written below logs/ | ||||||
|  | [ -d "logs" ] || mkdir logs | ||||||
|  | 
 | ||||||
|  | # training benchmark: every model at every batch size | ||||||
|  | for batchsize in 64 128 256; do | ||||||
|  |   for model in "vgg 19" "resnet 50" "googlenet v1" "alexnet 2"; do | ||||||
|  |     # $model is left unquoted on purpose: it expands to <topology> <layer_num> | ||||||
|  |     train $model $batchsize | ||||||
|  |   done | ||||||
|  | done | ||||||
											
												
													File diff suppressed because it is too large
													Load Diff
												
											
										
									
								| @ -1,23 +1,29 @@ | |||||||
| # Executor Design Doc | # Executor Design Doc | ||||||
| 
 | 
 | ||||||
| ## Motivation | ## Motivation | ||||||
|  | In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message | ||||||
|  | [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree). | ||||||
| 
 | 
 | ||||||
| We use executor to do the runtime evaluation of a `ProgramDesc`. | The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, and the executor explicitly executes the stored precompiled code. | ||||||
| 
 | 
 | ||||||
| ## Overview | ## Overview | ||||||
| 
 | 
 | ||||||
| An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs. | An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs. | ||||||
| 
 | 
 | ||||||
| ### What does executor do? | ## Executor | ||||||
| 
 | 
 | ||||||
| It evaluates all the operators in the `block_id`th block of a `ProgramDesc`. | The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one. | ||||||
|  | It is very similar to pushing a stack frame when entering a block, after which it cleans up all the temporary variables when a mini-batch is finished. It does not, however, have a stack-frame pop process. | ||||||
| 
 | 
 | ||||||
| ### What does executor NOT do? | ### The interface | ||||||
|  | ```c++ | ||||||
|  |   Executor(places); | ||||||
|  | ``` | ||||||
|  | An executor does not own any computing resources; a user can only construct an executor using the specified places. | ||||||
| 
 | 
 | ||||||
| It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run. | ### Running an Executor | ||||||
| 
 | 
 | ||||||
| It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices. | ``` | ||||||
| 
 |   void Run(ProgramDesc, Scope, block_id, create_local_scope); | ||||||
| ## Implementation | ``` | ||||||
| 
 | An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished. | ||||||
| `Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc) |  | ||||||
|  | |||||||
| @ -0,0 +1,91 @@ | |||||||
|  | # Design Doc: The Keys of Operator Kernel Type | ||||||
|  | ## Problem | ||||||
|  | An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain kernel must be chosen by a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows: | ||||||
|  | 
 | ||||||
|  | ```cpp | ||||||
|  | struct OpKernelType { | ||||||
|  |   platform::Place place_; | ||||||
|  |   proto::DataType data_type_; | ||||||
|  | }; | ||||||
|  | ``` | ||||||
|  | For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github. | ||||||
|  | 
 | ||||||
|  | It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`.  | ||||||
|  | 
 | ||||||
|  | We often implement a kernel of an operator with some computing library on a certain device (place). Please note that computing libraries and devices do not correspond one-to-one. A device can have many computing libraries, and a computing library can also support several devices.  | ||||||
|  | 
 | ||||||
|  | For example, Eigen library can support Nvidia GPU/AMD GPU/CPU. And MKLDNN library can support Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`. | ||||||
|  | 
 | ||||||
|  | It's obvious that different DataTypes, like fp64/fp32/int8 will have different kernels. But the data layout of a Tensor will also lead to different implementation. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration. | ||||||
|  | 
 | ||||||
|  | ## Solution | ||||||
|  | 
 | ||||||
|  | There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`. | ||||||
|  | 
 | ||||||
|  | ```cpp | ||||||
|  | struct OpKernelType { | ||||||
|  |   platform::Place place_; | ||||||
|  |   platform::Library library_; | ||||||
|  |   proto::DataType data_type_; | ||||||
|  |   framework::Layout layout_; | ||||||
|  | }; | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | The details are as follows: | ||||||
|  | 
 | ||||||
|  | ### Place | ||||||
|  | 
 | ||||||
|  | `Place` is defined as follows: | ||||||
|  | 
 | ||||||
|  | ```cpp | ||||||
|  | typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place; | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | `Place` represents the device memory where the data is located. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ### Library | ||||||
|  | 
 | ||||||
|  | One operator kernel is usually implemented based on one library. `Library` is defined as an enum: | ||||||
|  | 
 | ||||||
|  | ```cpp | ||||||
|  | enum Library { Plain, MKLDNN, CUDNN }; | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on `Eigen` library, we take `Eigen` library as the `Plain` enumerator. | ||||||
|  | A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now has two default DeviceContexts for CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle. | ||||||
|  | 
 | ||||||
|  | If we want to support a new library, a new enumerator needs to be added to `Library` and a new corresponding `LibraryDeviceContext` needs to be created. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ### DataType | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | `DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported. | ||||||
|  | 
 | ||||||
|  | ### Layout | ||||||
|  | 
 | ||||||
|  | Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout. | ||||||
|  | 
 | ||||||
|  | Different layout leads to different implementation of operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework. | ||||||
|  | 
 | ||||||
|  | - We take layout as a data member of Tensor. Layout is actually an enum. If fluid is built with MKLDNN, then the memory formats in MKLDNN will be added into this enum too. | ||||||
|  | 
 | ||||||
|  | - Users have to set the layout for input data. Some operators, like fill_constant/random, also have to set the layout of the data they generate. Of course, we can have some default layout, like NCHW. | ||||||
|  | 
 | ||||||
|  | - The inference of Layout is at run-time, not compile-time. | ||||||
|  | 
 | ||||||
|  | - Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example: if we want to implement an MKLDNN convolution operator, we have to implement kernels for all the different layouts listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators. | ||||||
|  | 
 | ||||||
|  | `Layout` is also defined as an enum: | ||||||
|  | 
 | ||||||
|  | ```cpp | ||||||
|  | enum Layout { | ||||||
|  |   kNCHW, | ||||||
|  |   kNHWC, | ||||||
|  | #ifdef PADDLE_WITH_MKLDNN | ||||||
|  |   knChw8c | ||||||
|  |   ... | ||||||
|  | #endif | ||||||
|  | }; | ||||||
|  | ``` | ||||||
| @ -0,0 +1,43 @@ | |||||||
|  | # Design Doc: Execute the Program with Multi CPU | ||||||
|  | 
 | ||||||
|  | ## Abstract | ||||||
|  | 
 | ||||||
|  | This Design Doc proposes an approach to run the user-defined Op graph | ||||||
|  | with multi-CPU: we will use an auto transpiler to convert the user-defined | ||||||
|  | Op graph to a multi-CPU Op graph, and use the `ParallelDo` Op to run the graph. | ||||||
|  | 
 | ||||||
|  | ## Transpiler | ||||||
|  | 
 | ||||||
|  | <img src="src/multi-threads/single-thread@3x.png" width="300"> | ||||||
|  | 
 | ||||||
|  | After converted: | ||||||
|  | 
 | ||||||
|  | <img src="src/multi-threads/multi-threads@3x.png" width="1000"> | ||||||
|  | 
 | ||||||
|  | ## Implement | ||||||
|  | 
 | ||||||
|  | - `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph | ||||||
|  |   which would be executed with multi-threads. | ||||||
|  | - `BlockingCounter` will `Init/Decrement` an atomic counter, and its blocking `Wait` | ||||||
|  |   waits for the atomic counter to become `0`: | ||||||
|  |   ```cpp | ||||||
|  |   BlockingCounter bc(thread_count); | ||||||
|  |   for (int i = 0; i < thread_count; ++i) { | ||||||
|  |     thread_pool->Start([&bc] {bc.DecrementCount(); }) | ||||||
|  |   } | ||||||
|  |   bc.Wait(); | ||||||
|  |   ``` | ||||||
|  | - `ParallelDo` Operator | ||||||
|  |   - Initialize a thread pool which is a Singleton. | ||||||
|  |   - Use a block id as the input, and run the specified Block in an independent scope | ||||||
|  |     with multi-threads. | ||||||
|  |   - Initialize a `BlockingCounter` instance and wait until all threads are done. | ||||||
|  | - `Split` Operator will split the Input Tensor into a TensorArray. | ||||||
|  | - `Merge` merges all the gradients which are calculated in different threads | ||||||
|  |   with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W`. | ||||||
|  | 
 | ||||||
|  | ## TODO | ||||||
|  | 
 | ||||||
|  | - Improve the optimizer stage with multi-threads, since we could | ||||||
|  |   assign the parameters to the different threads and execute | ||||||
|  |   optimizer with multi-threads. | ||||||
											
												Binary file not shown.
											
										
									
								| After Width: | Height: | Size: 350 KiB | 
| After Width: | Height: | Size: 76 KiB | 
| @ -0,0 +1,66 @@ | |||||||
|  | ## Background | ||||||
|  | Every operator has many kernels because there are multiple data types, places, and data layouts that Fluid supports. We use the `KernelType` to describe kernel types that operators can hold.  | ||||||
|  | 
 | ||||||
|  | The `KernelType` is as follows. | ||||||
|  | 
 | ||||||
|  | ``` | ||||||
|  | struct KernelType { | ||||||
|  |   Place place_; | ||||||
|  |   DataType data_type_; | ||||||
|  |   LayoutType layout_; | ||||||
|  | }; | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`. | ||||||
|  | 
 | ||||||
|  | The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, there will be one major `data_type`. For example, the `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float`/`double`. | ||||||
|  | 
 | ||||||
|  | The `layout` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout will invoke a different kernel. | ||||||
|  | 
 | ||||||
|  | ## Problem | ||||||
|  | 
 | ||||||
|  | Ideally, we would register a kernel for every operator and every kernel type. However, this is impractical in the following situations. | ||||||
|  | 
 | ||||||
|  | 1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel. | ||||||
|  | 2. Some operators will take too much memory. It is better to force them onto CPU. However, the rest of the operators in this neural network will be performed on GPU, i.e., a model parallel problem. | ||||||
|  | 3. Some layouts and places are particular. One example is that MKLDNN uses `nChw8c` and no other library uses `nChw8c`. | ||||||
|  | 
 | ||||||
|  | Problems under these situations are similar. We can formalise this problem as follows. | ||||||
|  | 
 | ||||||
|  | We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, where $kt_{?} \notin KT$. The question is how to cast the input of this operator from $kt_{?}$ to any of the kernel types in $KT$. | ||||||
|  | 
 | ||||||
|  | ## Solution | ||||||
|  | 
 | ||||||
|  | It is clear that transforming the inputs of an operator to adapt to another kernel type is not related to the particular operator. So we should register these transformation methods as global methods. | ||||||
|  | 
 | ||||||
|  | We can infer a kernel type from the inputs of an operator. We call this kernel type the `actual kernel type`, which means this is the actual kernel type that the operator should be performed on. | ||||||
|  | 
 | ||||||
|  | We can get a kernel type from 1) the configuration of the operator description (users may want to force the use of `MKL` for the `conv` operator), and 2) the place of the current executor (e.g., the executor is running on GPU). This kernel type is what we expect the operator to be performed on. We call this kernel type the `expect kernel type`. | ||||||
|  | 
 | ||||||
|  | We transform the input data from `actual` to `expect` if the expect kernel type is not the same as the actual kernel type. | ||||||
|  | 
 | ||||||
|  | The algorithm is described as follows: | ||||||
|  | 
 | ||||||
|  | ```cpp | ||||||
|  | using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>; | ||||||
|  | using KernelTypePair = std::pair<KernelType, KernelType>; | ||||||
|  | 
 | ||||||
|  | map<KernelTypePair, DataTransformationFN> g_data_transformation_; | ||||||
|  | 
 | ||||||
|  | void OpWithKernel::Run() { | ||||||
|  |   vec<Tensor> inputs = ... | ||||||
|  |   auto actual_kernel_type = GetActualKernelType(inputs); | ||||||
|  |    | ||||||
|  |   // The expected kernel type is related to actual kernel type. | ||||||
|  |   // For the most operators, the expected kernel type is as same as | ||||||
|  |   // actual kernel type. | ||||||
|  |   // | ||||||
|  |   // So we pass `actual_kernel_type` as a parameter of  | ||||||
|  |   // GetExpectedKernelType | ||||||
|  |   auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type); | ||||||
|  |    | ||||||
|  |   auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}]; | ||||||
|  |    | ||||||
|  |   kernel.run(trans(inputs)); | ||||||
|  | } | ||||||
|  | ``` | ||||||
Some files were not shown because too many files have changed in this diff Show More
					Loading…
					
					
				
		Reference in new issue