Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into cross_channel_norm

cross_channel_norm
sweetsky0901 7 years ago
commit 333995a700

@@ -6,8 +6,18 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
     "train.list", None, module="provider", obj="process", args=args)
@@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
@@ -40,11 +50,11 @@ net = img_conv_layer(
     input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

 net = fc_layer(
@@ -59,6 +69,9 @@ net = fc_layer(
     layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(net)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=net, label=lab)
+    outputs(loss)

@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,

@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
     else:
         settings.data_size = settings.height * settings.width
     settings.is_infer = kwargs.get('is_infer', False)
+    settings.num_samples = kwargs.get('num_samples', 2560)
     if settings.is_infer:
         settings.slots = [dense_vector(settings.data_size)]
     else:
@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 @provider(
     init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(2560 if settings.is_infer else 1024):
+    for i in xrange(settings.num_samples):
         img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
         if settings.is_infer:
             yield img.astype('float32')

@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,

@@ -37,7 +37,7 @@ function infer() {
         --trainer_count=1 \
         --num_passes=1 \
         --save_dir="models/${topology}-${layer_num}" \
-        --config_args="batch_size=128,layer_num=${layer_num}" \
+        --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
         > /dev/null 2>&1
     echo "Done"
   fi
@@ -79,8 +79,9 @@ fi
 # inference benchmark
 for use_mkldnn in True False; do
   for batchsize in 1 2 4 8 16; do
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
     infer vgg 19 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer alexnet 2 $batchsize $use_mkldnn
   done
 done

@@ -47,5 +47,6 @@ for use_mkldnn in True False; do
     train vgg 19 $batchsize $use_mkldnn
     train resnet 50 $batchsize $use_mkldnn
     train googlenet v1 $batchsize $use_mkldnn
+    train alexnet 2 $batchsize $use_mkldnn
   done
 done

@@ -23,24 +23,25 @@ function infer() {
     echo "./run_mkl_infer.sh to save the model first"
     exit 0
   fi
-  log_period=$((256 / bs))
+  log_period=$((32 / bs))
   paddle train --job=test \
     --config="${topology}.py" \
+    --use_mkldnn=False \
     --use_gpu=False \
     --trainer_count=$thread \
     --log_period=$log_period \
-    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
     --init_model_path=$models_in \
     2>&1 | tee ${log}

-  # calculate the last 5 logs period time of 1280 samples,
+  # calculate the last 5 logs period time of 160(=32*5) samples,
   # the time before are burning time.
   start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
   end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
   start_sec=`clock_to_seconds $start`
   end_sec=`clock_to_seconds $end`
-  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
-  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
   echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }
@@ -56,7 +57,8 @@ fi
 # inference benchmark
 for batchsize in 1 2 4 8 16; do
-  infer googlenet v1 $batchsize
-  infer resnet 50 $batchsize
   infer vgg 19 $batchsize
+  infer resnet 50 $batchsize
+  infer googlenet v1 $batchsize
+  infer alexnet 2 $batchsize
 done

@@ -12,10 +12,11 @@ function train() {
   config="${topology}.py"
   paddle train --job=time \
     --config=$config \
+    --use_mkldnn=False \
    --use_gpu=False \
     --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
+    --log_period=3 \
+    --test_period=30 \
     --config_args=$args \
     2>&1 | tee ${log}
@@ -36,4 +37,5 @@ for batchsize in 64 128 256; do
   train vgg 19 $batchsize
   train resnet 50 $batchsize
   train googlenet v1 $batchsize
+  train alexnet 2 $batchsize
 done

@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
     'height': height,
     'width': width,
     'color': True,
     'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
     "train.list" if not is_infer else None,

@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
     IF(WITH_C_API)
         INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
         IF(ANDROID)
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
         ELSE()
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
         ENDIF()
     ENDIF()

@@ -467,7 +467,7 @@ lambda_cost
     :noindex:

 square_error_cost
---------
+-----------------
 .. autoclass:: paddle.v2.layer.square_error_cost
     :noindex:

@@ -533,7 +533,7 @@ Miscs
 =====

 dropout
---------------
+--------
 .. autoclass:: paddle.v2.layer.dropout
     :noindex:


@@ -3,19 +3,19 @@ Nets
 ===========

 simple_img_conv_pool
------------
+--------------------
 .. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
     :noindex:

 img_conv_group
------------
+---------------
 .. autofunction:: paddle.v2.fluid.nets.img_conv_group
     :noindex:

 sequence_conv_pool
------------
+------------------
 .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
     :noindex:

@@ -18,7 +18,7 @@ SGDOptimizer

 MomentumOptimizer
------------
+-----------------
 .. automodule:: paddle.v2.fluid.optimizer
     :members: MomentumOptimizer
     :noindex:

@@ -26,14 +26,14 @@ MomentumOptimizer

 AdagradOptimizer
------------
+----------------
 .. automodule:: paddle.v2.fluid.optimizer
     :members: AdagradOptimizer
     :noindex:

 AdamOptimizer
------------
+-------------
 .. automodule:: paddle.v2.fluid.optimizer
     :members: AdamOptimizer
     :noindex:

@@ -47,7 +47,7 @@ AdamaxOptimizer

 DecayedAdagradOptimizer
------------
+-----------------------
 .. automodule:: paddle.v2.fluid.optimizer
     :members: DecayedAdagradOptimizer
     :noindex:

@@ -3,14 +3,14 @@ Regularizer
 ===========

 WeightDecayRegularizer
------------
+----------------------
 .. automodule:: paddle.v2.fluid.regularizer
     :members: WeightDecayRegularizer
     :noindex:

 L2DecayRegularizer
------------
+------------------
 .. automodule:: paddle.v2.fluid.regularizer
     :members: L2DecayRegularizer
     :noindex:

@@ -18,7 +18,7 @@ L2DecayRegularizer

 L1DecayRegularizer
------------
+-------------------
 .. automodule:: paddle.v2.fluid.regularizer
     :members: L1DecayRegularizer

@@ -0,0 +1,57 @@
## Problem
In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one operator may have multiple kernels. Users may prefer a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel, so we need to provide a way for users to make this choice.
In the current design, we use KernelType to describe one kernel.
```cpp
struct KernelType {
  Place place_;
  DataType data_type_;
  LayoutType layout_;
};
```
`place_`, `data_type_`, and `layout_` can be obtained from the operator's input tensors: `GetActualKernelType(inputs)` uses the inputs to infer the kernel key that fits the incoming data, but users cannot configure it directly.

The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that users can override to choose the KernelType they want to use.

So we should pass the user-defined information in the proto to `GetExpectedKernelType` so that it can choose a kernel.

The problem is: how should we define and pass this information for `GetExpectedKernelType` to use?
## Solution
### Potential choice
1. Do nothing: let users add whatever information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This works, but users may define many different hints for the same purpose, such as `force_cpu`, `use_cpu`, and `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn`, and `cudnn_kernel` to choose a CUDNN kernel.
2. Pre-define all the needed options and expose a single attribute key such as `kernel_hint` to the user. This is not flexible enough if users want to define new kinds of hints.
### Final choice
To provide enough flexibility while avoiding confusing definitions, we can define global constants for these attribute names, such as `force_cpu`, `use_cudnn`, and `use_mkldnn`, for users to choose from.
In C++
```cpp
const std::string kForceCPU = "force_cpu";
const std::string kUseCUDNN = "use_cudnn";
const std::string kUseMKLDNN = "use_mkldnn";

KernelType GetExpectedKernelType() {
  if (Attr<bool>(kForceCPU)) {
    return KernelType(CPUPlace, ...);
  } else {
    ...
  }
}
```
In Python code
```python
FORCE_CPU = core.kForceCPU()

def xx_layer(..., force_cpu=False):
    layer_helper = LayerHelper(...)
    layer_helper.append_op(
        type="xx",
        attr={FORCE_CPU: force_cpu})
```

@@ -0,0 +1,91 @@
# Design Doc: The Keys of Operator Kernel Type
## Problem
An operator can have different kernel implementations, and each operator maintains a map to store its kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain kernel must be chosen via an `OpKernelType` key. Currently, `OpKernelType` is defined as follows:
```cpp
struct OpKernelType {
  platform::Place place_;
  proto::DataType data_type_;
};
```
For more details, please refer to the [code](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) on GitHub.
It contains two keys, `Place` and `DataType`, which are hashed into a single key that identifies a certain type of kernel. However, these two keys are not enough; we need a more complete representation of `OpKernelType`.
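For illustration only, hashing the two keys into a single lookup key might look like the sketch below; the hash-combine scheme and the `PlaceToId` helper are assumptions for this example, not Fluid's actual implementation.
```cpp
#include <cstddef>
#include <functional>

// Sketch: fold the two keys into one hash value so the kernel map can be
// keyed by OpKernelType. PlaceToId is a hypothetical helper that numbers
// the place variant.
struct OpKernelTypeHash {
  size_t operator()(const OpKernelType& key) const {
    size_t seed = std::hash<int>()(static_cast<int>(key.data_type_));
    // boost-style hash_combine of the place id into the seed
    seed ^= std::hash<int>()(PlaceToId(key.place_)) + 0x9e3779b9 +
            (seed << 6) + (seed >> 2);
    return seed;
  }
};
```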
We often implement an operator's kernel with some computing library on a certain device (place). Note that computing libraries and devices do not correspond one-to-one: a device can host many computing libraries, and a computing library can support several devices.

For example, the Eigen library supports Nvidia GPU, AMD GPU, and CPU, while the MKLDNN library supports Intel CPU and Intel FPGA. So both `Place` and `Library` should be keys of `OpKernelType`.

Obviously, different DataTypes, like fp64/fp32/int8, need different kernels. But the data layout of a Tensor can also lead to different implementations; see the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data layout should therefore also be taken into consideration.
## Solution
There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
```cpp
struct OpKernelType {
  platform::Place place_;
  platform::Library library_;
  proto::DataType data_type_;
  framework::Layout layout_;
};
```
The details of each key are as follows.
### Place
`Place` is defined as follows:
```cpp
typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
```
`Place` represents the device memory where the data resides.
### Library
One operator kernel is usually implemented with one library. `Library` is defined as an enum:
```cpp
enum Library { Plain, MKLDNN, CUDNN };
```
We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented with the `Eigen` library, we treat `Eigen` as the `Plain` library.

A library usually has a corresponding `DeviceContext` that contains the handles needed for computation. Fluid currently has two default DeviceContexts, for CPU and CUDA: `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.

If we want to support a new library, a new enumerator needs to be added to `Library`, and a corresponding new `LibraryDeviceContext` needs to be created.
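As a minimal sketch, assuming a hypothetical library `FooLib` (the handle API below is invented for illustration and is not a real Fluid or vendor API), this could look like:
```cpp
// Hypothetical: extend the enum and pair the new library with a
// DeviceContext subclass that owns the handles its kernels need.
enum Library { Plain, MKLDNN, CUDNN, FooLib };

class FooLibDeviceContext : public DeviceContext {
 public:
  FooLibDeviceContext() : handle_(foolib_create_handle()) {}   // assumed API
  ~FooLibDeviceContext() { foolib_destroy_handle(handle_); }   // assumed API
  foolib_handle_t handle() const { return handle_; }

 private:
  foolib_handle_t handle_;  // passed to FooLib kernel launches
};
```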
### DataType
`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
### Layout
Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also need other descriptions of this block of memory, such as shape (ddim), stride, and layout.

Different layouts lead to different implementations of an operator kernel. There are four main principles we follow to support layout in the Fluid framework:

- We make layout a data member of Tensor. Layout is an enum; if Fluid is built with MKLDNN, the MKLDNN memory formats are added to this enum as well.
- Users have to set the layout of input data, and operators that generate data, like fill_constant/random, also have to set the layout of the data they produce. Of course, we can have a default layout, like NCHW.
- Layout is inferred at run-time, not compile-time.
- Every operator has to implement different kernels for different layouts. Taking MKLDNN as an example: to implement an MKLDNN convolution operator, we have to provide kernels for each of the layouts listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). A special macro will register the kernels of MKLDNN operators (see the sketch after the enum below).

`Layout` is also defined as an enum:
```cpp
enum Layout {
  kNCHW,
  kNHWC,
#ifdef PADDLE_WITH_MKLDNN
  knChw8c
  ...
#endif
};
```

@@ -0,0 +1,43 @@
# Design Doc: Execute the Program with Multi CPU
## Abstract
This design doc proposes an approach to run a user-defined Op graph on multiple CPUs: an automatic transpiler converts the user-defined Op graph into a multi-CPU Op graph, and a `ParallelDo` Op executes that graph.
## Transpiler
<img src="src/multi-threads/single-thread@3x.png" width="300">
After conversion:
<img src="src/multi-threads/multi-threads@3x.png" width="1000">
## Implementation
- `Multi-CPU Transpiler` converts the graph into a multi-CPU graph that is executed with multiple threads.
- `BlockingCounter` can `Init`/`Decrement` an atomic counter, and `Wait` blocks until the counter becomes `0`:
```cpp
BlockingCounter bc(thread_count);
for (int i = 0; i < thread_count; ++i) {
  thread_pool->Start([&bc] { bc.DecrementCount(); });
}
bc.Wait();
```
- `ParallelDo` Operator (a sketch follows this list):
  - Initializes a thread pool, which is a singleton.
  - Takes a block id as input and runs the specified Block on independent scopes with multiple threads.
  - Initializes a `BlockingCounter` instance and waits until all threads are done.
- The `Split` Operator splits the input Tensor into a TensorArray.
- `Merge` merges all the gradients calculated by the different threads with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W`.
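A minimal sketch of how `ParallelDo` could combine the singleton thread pool with `BlockingCounter` is shown below; `GetThreadPool`, `RunBlock`, and `Scope` are assumed names for illustration, not Fluid's actual API.
```cpp
#include <vector>

// Illustrative only: run one block on several independent scopes in
// parallel and block until every worker finishes.
void ParallelDoRun(int block_id, const std::vector<Scope*>& scopes) {
  const int thread_count = static_cast<int>(scopes.size());
  BlockingCounter bc(thread_count);
  for (int i = 0; i < thread_count; ++i) {
    // Each worker runs the same block on its own scope.
    GetThreadPool()->Start([&bc, block_id, scope = scopes[i]] {
      RunBlock(block_id, scope);
      bc.DecrementCount();  // signal that this worker is done
    });
  }
  bc.Wait();  // returns once the counter reaches 0
}
```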
## TODO
- Improve the optimizer stage with multiple threads, since we could assign the parameters to different threads and run the optimizer with multiple threads.


@@ -0,0 +1,66 @@
## Background
Every operator has many kernels because Fluid supports multiple data types, places, and data layouts. We use `KernelType` to describe the kernel types that operators can hold.
The `KernelType` is as follows:
```cpp
struct KernelType {
  Place place_;
  DataType data_type_;
  LayoutType layout_;
};
```
The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`.
The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types; however, one of them will be the major `data_type`. For example, `cross_entropy` takes `int64` as its label and `double`/`float` as its input logit and output cost; the major `data_type` of `cross_entropy` is `float`/`double`.

The `layout_` is useful for some computing libraries. For example, MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout invokes a different kernel.
## Problem
Ideally, we would register a kernel for every operator and every kernel type. However, this is impracticable in the following situations:

1. Some operators, like CRF, are complicated and inefficient to implement on GPU, so the CRF operator has only a CPU kernel.
2. Some operators use too much memory, so it is better to force them onto the CPU. However, the rest of the operators in the neural network should still run on the GPU, i.e., a model-parallel problem.
3. Some layouts and places are particular. For example, MKLDNN uses `nChw8c`, and no other library uses it.

The problems in these situations are similar. We can formalize the problem as follows:

We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator arrive with some kernel type $kt_{?}$, where $kt_{?} \notin KT$. How do we cast the inputs of this operator from $kt_{?}$ to one of the kernel types in $KT$?
## Solution
Clearly, transforming the inputs of an operator to adapt to another kernel type is not specific to any particular operator, so we should register these transformation methods as global methods.

We can infer a kernel type from the inputs of an operator. We call this the `actual kernel type`, meaning the kernel type that the operator's incoming data actually has.

We can also get a kernel type from 1) the configuration in the operator description (users may want to force `MKL` for the `conv` operator) and 2) the place of the current executor (e.g., the executor is running on GPU). This kernel type is the one we expect the operator to be performed with; we call it the `expected kernel type`.

We transform the input data from `actual` to `expected` if the expected kernel type is not the same as the actual kernel type.

The algorithm is described as follows:
```cpp
using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>;
using KernelTypePair = std::pair<KernelType, KernelType>;
map<KernelTypePair, DataTransformationFN> g_data_transformation_;
void OpWithKernel::Run() {
  vec<Tensor> inputs = ...
  auto actual_kernel_type = GetActualKernelType(inputs);

  // The expected kernel type is related to the actual kernel type.
  // For most operators, the expected kernel type is the same as the
  // actual kernel type.
  //
  // So we pass `actual_kernel_type` as a parameter of
  // GetExpectedKernelType.
  auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type);

  auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}];

  kernel.run(trans(inputs));
}
```

@@ -70,13 +70,13 @@ Building PaddlePaddle requires the following dependencies, including but not limited to:
     :header: "Dependency", "Version", "Description"
     :widths: 10, 15, 30

-    "CMake", ">=3.5", ""
+    "CMake", ">=3.2", ""
     "GCC", "4.8.2", "devtools2 for CentOS is recommended"
     "Python", "2.7.x", "Requires libpython2.7.so"
     "pip", ">=9.0", ""
     "numpy", "", ""
     "SWIG", ">=2.0", ""
     "Go", ">=1.8", "Optional"

 .. _build_options:

@@ -76,13 +76,13 @@ will be downloaded automatically.
     :header: "Dependency", "Version", "Description"
     :widths: 10, 15, 30

-    "CMake", ">=3.5", ""
+    "CMake", ">=3.2", ""
     "GCC", "4.8.2", "Recommend devtools2 for CentOS"
     "Python", "2.7.x", "Need libpython2.7.so"
     "pip", ">=9.0", ""
     "numpy", "", ""
     "SWIG", ">=2.0", ""
     "Go", ">=1.8", "Optional"

 .. _build_options:

@@ -128,7 +128,7 @@ PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
 AVX is a CPU instruction set that accelerates PaddlePaddle's computation. The latest
 PaddlePaddle Docker images are built with AVX enabled by default, so if your machine
 does not support AVX, you need to separately
-`build <./build_from_source_cn.rst>`_ PaddlePaddle as a no-avx version.
+`build <./build_from_source_cn.html>`_ PaddlePaddle as a no-avx version.

 The following command checks whether a Linux machine supports AVX:

