Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into lstm_fix

8 years ago · 4098ce730b
parent ed2d30b5f9 3b32eb9e2d
commit 4098ce730b
113 changed files with 3190 additions and 573 deletions
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@ -0,0 +1,213 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg("layer_num", int, 50)
+is_test = get_config_arg("is_test", bool, False)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+
+#######################Network Configuration #############
+def conv_bn_layer(name,
+                  input,
+                  filter_size,
+                  num_filters,
+                  stride,
+                  padding,
+                  channels=None,
+                  active_type=ReluActivation()):
+    """
+    A wrapper for conv layer with batch normalization layers.
+    Note:
+    conv layer has no activation.
+    """
+
+    tmp = img_conv_layer(
+        name=name + "_conv",
+        input=input,
+        filter_size=filter_size,
+        num_channels=channels,
+        num_filters=num_filters,
+        stride=stride,
+        padding=padding,
+        act=LinearActivation(),
+        bias_attr=False)
+    return batch_norm_layer(
+        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
+
+
+def bottleneck_block(name, input, num_filters1, num_filters2):
+    """
+    A wrapper for bottlenect building block in ResNet.
+    Last conv_bn_layer has no activation.
+    Addto layer has activation of relu.
+    """
+    last_name = conv_bn_layer(
+        name=name + '_branch2a',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters1,
+        stride=1,
+        padding=0)
+    last_name = conv_bn_layer(
+        name=name + '_branch2b',
+        input=last_name,
+        filter_size=3,
+        num_filters=num_filters1,
+        stride=1,
+        padding=1)
+    last_name = conv_bn_layer(
+        name=name + '_branch2c',
+        input=last_name,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=1,
+        padding=0,
+        active_type=LinearActivation())
+
+    return addto_layer(
+        name=name + "_addto", input=[input, last_name], act=ReluActivation())
+
+
+def mid_projection(name, input, num_filters1, num_filters2, stride=2):
+    """
+    A wrapper for middile projection in ResNet.
+    projection shortcuts are used for increasing dimensions,
+    and other shortcuts are identity
+    branch1: projection shortcuts are used for increasing
+    dimensions, has no activation.
+    branch2x: bottleneck building block, shortcuts are identity.
+    """
+    # stride = 2
+    branch1 = conv_bn_layer(
+        name=name + '_branch1',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=stride,
+        padding=0,
+        active_type=LinearActivation())
+
+    last_name = conv_bn_layer(
+        name=name + '_branch2a',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters1,
+        stride=stride,
+        padding=0)
+    last_name = conv_bn_layer(
+        name=name + '_branch2b',
+        input=last_name,
+        filter_size=3,
+        num_filters=num_filters1,
+        stride=1,
+        padding=1)
+
+    last_name = conv_bn_layer(
+        name=name + '_branch2c',
+        input=last_name,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=1,
+        padding=0,
+        active_type=LinearActivation())
+
+    return addto_layer(
+        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
+
+
+img = data_layer(name='image', size=height * width * 3)
+
+
+def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
+    """
+    A wrapper for 50,101,152 layers of ResNet.
+    res2_num: number of blocks stacked in conv2_x
+    res3_num: number of blocks stacked in conv3_x
+    res4_num: number of blocks stacked in conv4_x
+    res5_num: number of blocks stacked in conv5_x
+    """
+    # For ImageNet
+    # conv1: 112x112
+    tmp = conv_bn_layer(
+        "conv1",
+        input=img,
+        filter_size=7,
+        channels=3,
+        num_filters=64,
+        stride=2,
+        padding=3)
+    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
+
+    # conv2_x: 56x56
+    tmp = mid_projection(
+        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
+    for i in xrange(2, res2_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
+
+    # conv3_x: 28x28
+    tmp = mid_projection(
+        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
+    for i in xrange(2, res3_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res3_" + str(i),
+            input=tmp,
+            num_filters1=128,
+            num_filters2=512)
+
+    # conv4_x: 14x14
+    tmp = mid_projection(
+        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
+    for i in xrange(2, res4_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res4_" + str(i),
+            input=tmp,
+            num_filters1=256,
+            num_filters2=1024)
+
+    # conv5_x: 7x7
+    tmp = mid_projection(
+        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
+    for i in xrange(2, res5_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res5_" + str(i),
+            input=tmp,
+            num_filters1=512,
+            num_filters2=2048)
+
+    tmp = img_pool_layer(
+        name='avgpool',
+        input=tmp,
+        pool_size=7,
+        stride=1,
+        pool_type=AvgPooling())
+
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+
+
+if layer_num == 50:
+    resnet = deep_res_net(3, 4, 6, 3)
+elif layer_num == 101:
+    resnet = deep_res_net(3, 4, 23, 3)
+elif layer_num == 152:
+    resnet = deep_res_net(3, 8, 36, 3)
+else:
+    print("Wrong layer number.")
+
+lbl = data_layer(name="label", size=num_class)
+loss = cross_entropy(name='loss', input=resnet, label=lbl)
+inputs(img, lbl)
+outputs(loss)
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@ -5,22 +5,23 @@ function train() {
  export OMP_DYNAMIC="FALSE"
  export KMP_AFFINITY="granularity=fine,compact,0,0"
  topology=$1
-  bs=$2
-  use_mkldnn=$3
-  if [ $3 == "True" ]; then
+  layer_num=$2
+  bs=$3
+  use_mkldnn=$4
+  if [ $4 == "True" ]; then
    thread=1
-    log="logs/${topology}-mkldnn-${bs}.log"
-  elif [ $3 == "False" ]; then
+    log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
+  elif [ $4 == "False" ]; then
    thread=`nproc`
    # each trainer_count use only 1 core to avoid conflict
    export OMP_NUM_THREADS=1
    export MKL_NUM_THREADS=1
-    log="logs/${topology}-${thread}mklml-${bs}.log"
+    log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
  else
    echo "Wrong input $3, use True or False."
    exit 0
  fi
-  args="batch_size=${bs}"
+  args="batch_size=${bs},layer_num=${layer_num}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
@ -40,12 +41,9 @@ if [ ! -d "logs" ]; then
  mkdir logs
 fi

-#========== mkldnn ==========#
-train vgg 64 True
-train vgg 128 True
-train vgg 256 True
-
-#========== mklml ===========#
-train vgg 64 False
-train vgg 128 False
-train vgg 256 False
+for use_mkldnn in True False; do
+  for batchsize in 64 128 256; do
+    train vgg 19 $batchsize $use_mkldnn
+    train resnet 50  $batchsize $use_mkldnn
+  done
+done
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@ -13,7 +13,7 @@ define_py_data_sources2(

 settings(
    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
+    learning_rate=0.001 / batch_size,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -46,16 +46,20 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
 ENDIF()

+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
 ExternalProject_Add(
    ${MKLDNN_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.10"
+    GIT_TAG             "v0.11"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
 )
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -27,8 +27,8 @@ ENDIF()
 INCLUDE(ExternalProject)

 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
+SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@ -2,112 +2,9 @@
 Data Reader Interface and DataSets
 ==================================

+..  toctree::
+    :maxdepth: 1

-DataTypes
-=========
-
-..  automodule:: paddle.v2.data_type
-    :members:
-    :noindex:
-
-DataFeeder
-==========
-
-..  automodule:: paddle.v2.data_feeder
-    :members:
-    :noindex:
-
-Reader
-======
-
-..  automodule:: paddle.v2.reader
-    :members:
-    :noindex:
-
-..  automodule:: paddle.v2.reader.creator
-    :members:
-    :noindex:
-
-minibatch
-=========
-
-..  automodule:: paddle.v2.minibatch
-    :members:
-    :noindex:
-
-Dataset
-=======
-
-..  automodule:: paddle.v2.dataset
-    :members:
-    :noindex:
-
-mnist
-+++++
-
-..  automodule:: paddle.v2.dataset.mnist
-    :members:
-    :noindex:
-
-cifar
-+++++
-
-..  automodule:: paddle.v2.dataset.cifar
-    :members:
-    :noindex:
-
-conll05
-+++++++
-
-..  automodule:: paddle.v2.dataset.conll05
-    :members: get_dict,get_embedding,test
-    :noindex:
-
-imdb
-++++
-
-..  automodule:: paddle.v2.dataset.imdb
-    :members:
-    :noindex:
-
-imikolov
-++++++++
-
-..  automodule:: paddle.v2.dataset.imikolov
-    :members:
-    :noindex:
-
-movielens
-+++++++++
-
-..  automodule:: paddle.v2.dataset.movielens
-    :members:
-    :noindex:
-
-..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
-    :noindex:
-    
-..  autoclass:: paddle.v2.dataset.movielens.UserInfo
-    :noindex:
-
-sentiment
-+++++++++
-
-..  automodule:: paddle.v2.dataset.sentiment
-    :members:
-    :noindex:
-
-uci_housing
-+++++++++++
-
-..  automodule:: paddle.v2.dataset.uci_housing
-    :members:
-    :noindex:
-
-wmt14
-+++++
-
-..  automodule:: paddle.v2.dataset.wmt14
-    :members:
-    :noindex:
-
+    data/data_reader.rst
+    data/image.rst
+    data/dataset.rst
--- a/doc/api/v2/data/data_reader.rst
+++ b/doc/api/v2/data/data_reader.rst
@ -0,0 +1,36 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+..  automodule:: paddle.v2.data_type
+    :members:
+    :noindex:
+
+DataFeeder
+==========
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+
+Reader
+======
+
+..  automodule:: paddle.v2.reader
+    :members:
+    :noindex:
+
+..  automodule:: paddle.v2.reader.creator
+    :members:
+    :noindex:
+
+minibatch
+=========
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@ -0,0 +1,75 @@
+Dataset
+=======
+
+..  automodule:: paddle.v2.dataset
+    :members:
+    :noindex:
+
+mnist
+++++
+
+..  automodule:: paddle.v2.dataset.mnist
+    :members:
+    :noindex:
+
+cifar
+++++
+
+..  automodule:: paddle.v2.dataset.cifar
+    :members:
+    :noindex:
+
+conll05
+++++++
+
+..  automodule:: paddle.v2.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+
+imdb
++++
+
+..  automodule:: paddle.v2.dataset.imdb
+    :members:
+    :noindex:
+
+imikolov
++++++++
+
+..  automodule:: paddle.v2.dataset.imikolov
+    :members:
+    :noindex:
+
+movielens
+++++++++
+
+..  automodule:: paddle.v2.dataset.movielens
+    :members:
+    :noindex:
+
+..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
+    :noindex:
+    
+..  autoclass:: paddle.v2.dataset.movielens.UserInfo
+    :noindex:
+
+sentiment
+++++++++
+
+..  automodule:: paddle.v2.dataset.sentiment
+    :members:
+    :noindex:
+
+uci_housing
+++++++++++
+
+..  automodule:: paddle.v2.dataset.uci_housing
+    :members:
+    :noindex:
+
+wmt14
+++++
+
+..  automodule:: paddle.v2.dataset.wmt14
+    :members:
+    :noindex:
--- a/doc/api/v2/data/image.rst
+++ b/doc/api/v2/data/image.rst
@ -0,0 +1,5 @@
+Image Interface
+===============
+
+..  automodule:: paddle.v2.image
+    :members:
--- a/doc/design/float16.md
+++ b/doc/design/float16.md
@ -0,0 +1,60 @@
+# Design Doc: float16
+
+## Why float16
+Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. 
+
+When high precision computation is not required, using float16 data type could potentially 
+
+- reduce storage space, memory bandwidth, and power usages; 
+- increase the chance of data fitting into a smaller cache of lower latency; 
+- provide arithmetic speed up if supported by hardware. 
+
+## Survey of current float16 support
+A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
+
+The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernel. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. 
+
+### Compiler
+- nvcc supports `__half` data type after CUDA 7.5.
+- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4.
+- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9.
+
+### Hardware
+- `__half` is supported on GPU with compute capability >= 5.3.
+- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above.
+- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018).
+
+### Libraries
+- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
+- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+
+
+## Implementation
+The float16 class holds a 16-bit `uint16_t` data internally.
+```
+struct float16 {
+  uint16_t x;
+};
+``` 
+
+float16 supports the following features:
+  - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. 
+  - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen.
+  - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. 
+  - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. 
+  
+To support the above features, two fundamental conversion functions are provided:
+```
+float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
+float half_to_float(float16 h);
+```
+which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion.
+
+## To do
+After float16 class is available, some of the future items are below:
+
+- Update pybind/tensor_py.h to bind c++ float16 with numpy float16. 
+
+- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
+
+- Create a type-casting operator that can convert the data type in tensor between float16 and other types.
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@ -21,7 +21,7 @@
 #include "paddle/framework/var_desc.h"
 #include "paddle/operators/net_op.h"

-USE_OP(fill_constant);
+USE_NO_KERNEL_OP(fill_constant);

 namespace paddle {
 namespace framework {
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@ -34,6 +34,21 @@ inline DataType ToDataType(std::type_index type) {
  }
 }

+inline std::type_index ToTypeIndex(DataType type) {
+  switch (type) {
+    case DataType::FP32:
+      return typeid(float);
+    case DataType::FP64:
+      return typeid(double);
+    case DataType::INT32:
+      return typeid(int);
+    case DataType::INT64:
+      return typeid(int64_t);
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+
 template <typename Visitor>
 inline void VisitDataType(DataType type, Visitor visitor) {
  switch (type) {
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@ -79,6 +79,13 @@ DDim make_ddim(const std::vector<int64_t>& dims) {
  return result;
 }

+DDim make_ddim(const std::vector<int>& dims) {
+  std::vector<int64_t> res(dims.size());
+  std::transform(dims.begin(), dims.end(), res.begin(),
+                 [](int d) { return static_cast<int64_t>(d); });
+  return make_ddim(res);
+}
+
 /// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
@ -117,7 +124,7 @@ int64_t DDim::operator[](int idx) const {
  return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }

-int64_t DDim::size() const { return arity(*this); }
+int DDim::size() const { return arity(*this); }

 bool DDim::operator==(DDim d) const {
  if (var.which() != d.getVar().which()) {
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@ -71,7 +71,7 @@ struct DDim {

  DDim operator*(DDim d) const;

-  int64_t size() const;
+  int size() const;
 };

 /**
@ -81,6 +81,8 @@ struct DDim {
 */
 DDim make_ddim(const std::vector<int64_t>& dims);

+DDim make_ddim(const std::vector<int>& dims);
+
 /**
 * \brief Make a DDim from an initializer list
 *
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
@ -31,6 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
    TableItem item;
    item.index = i;
    item.length = vec[i + 1] - vec[i];
+    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
    items_.emplace_back(item);
  }
  // NOTE(yuyang18):
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@ -27,6 +27,20 @@
 namespace paddle {
 namespace framework {

+std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+  os << "{";
+  for (auto& v : lod) {
+    os << "{";
+    for (auto& i : v) {
+      os << i << ",";
+    }
+    os << "}";
+  }
+  os << "}";
+
+  return os;
+}
+
 LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
  LoD new_lod;
  new_lod.reserve(level_end - level_begin);
@ -136,37 +150,35 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
  ShareDataWith(Slice(begin, end));
 }

-void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
-                             std::vector<std::vector<size_t>>* lod_length,
-                             size_t* start_offset) {
-  lod_length->clear();
-  PADDLE_ENFORCE(start_idx < lod.size() - 1,
-                 "start_idx should be >= 0 and < lod.size() - 1.");
-  PADDLE_ENFORCE(end_idx < lod.size(),
-                 "end_idx should be >= 0 and < lod.size().");
-  PADDLE_ENFORCE_LE(start_idx, end_idx,
-                    "start_idx should be less than end_idx.");
-  for (size_t level_idx = 0; level_idx < lod.size(); ++level_idx) {
+using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
+                                        size_t end_idx, size_t start_level) {
+  LoD sub_lod;
+
+  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
+    PADDLE_ENFORCE_LE(start_idx, end_idx);
+    PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
    std::vector<size_t> level_lens;
    for (size_t i = start_idx; i < end_idx; ++i) {
      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
    }
-    lod_length->emplace_back(level_lens);
+    sub_lod.emplace_back(level_lens);
    start_idx = lod[level_idx][start_idx];
    end_idx = lod[level_idx][end_idx];
  }
-  *start_offset = start_idx;
+
+  return LoDAndOffset{sub_lod, {start_idx, end_idx}};
 }

-void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length) {
-  PADDLE_ENFORCE_EQ(
-      lod->size(), lod_length.size(),
+void AppendLoD(LoD* lod, const LoD& lod_length) {
+  PADDLE_ENFORCE(
+      lod->empty() || lod->size() == lod_length.size(),
      "The lod_length should has the same size with the appended lod.");
+  if (lod->empty()) {
+    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
+  }
  for (size_t i = 0; i < lod->size(); ++i) {
    auto& level = (*lod)[i];
-    if (level.empty()) {
-      level.push_back(0);
-    }
    for (size_t len : lod_length[i]) {
      level.push_back(level.back() + len);
    }
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@ -56,6 +56,8 @@ using Vector = thrust::host_vector<
 */
 using LoD = std::vector<Vector<size_t>>;

+std::ostream& operator<<(std::ostream& os, const LoD& lod);
+
 /*
 * Slice levels from a LoD.
 * NOTE the lowest level should always be the absolute offsets of the underlying
@ -181,11 +183,10 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
  return tensor;
 }

-void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
-                             std::vector<std::vector<size_t>>* lod_length,
-                             size_t* start_offset);
+std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
+    const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);

-void AppendLoD(LoD* lod, const std::vector<std::vector<size_t>>& lod_length);
+void AppendLoD(LoD* lod, const LoD& lod_length);

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@ -146,43 +146,44 @@ TEST(LodExpand, test) {

 TEST(LoD, GetFineGrainedLoDLength) {
  LoD lod;
-  lod.push_back(std::vector<size_t>{0, 2, 4, 5});
-  lod.push_back(std::vector<size_t>{0, 1, 6, 8, 10, 11});
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
  lod.push_back(
-      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29});
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}));

-  std::vector<std::vector<size_t>> lod_length;
-  size_t start_offset;
-  paddle::framework::GetFineGrainedLoDLength(lod, 1, 2, &lod_length,
-                                             &start_offset);
+  auto lod_and_offset =
+      paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
+  LoD lod_length = lod_and_offset.first;
+  size_t start_offset = lod_and_offset.second.first;
+  size_t end_offset = lod_and_offset.second.second;

-  std::vector<std::vector<size_t>> expected;
+  LoD expected;
  expected.push_back(std::vector<size_t>{2});
  expected.push_back(std::vector<size_t>{2, 2});
  expected.push_back(std::vector<size_t>{2, 3, 4, 2});
  EXPECT_EQ(lod_length, expected);
  EXPECT_EQ(start_offset, 15UL);
+  EXPECT_EQ(end_offset, 26UL);
 }

 TEST(LoD, AppendLoD) {
-  std::vector<std::vector<size_t>> lod_lens;
-  lod_lens.push_back(std::vector<size_t>{2});
-  lod_lens.push_back(std::vector<size_t>{2, 2});
-  lod_lens.push_back(std::vector<size_t>{2, 3, 4, 2});
+  LoD lod_lens;
+  lod_lens.push_back(std::vector<size_t>({2}));
+  lod_lens.push_back(std::vector<size_t>({2, 2}));
+  lod_lens.push_back(std::vector<size_t>({2, 3, 4, 2}));

  LoD origin;
-  origin.push_back(std::vector<size_t>{0, 2});
-  origin.push_back(std::vector<size_t>{0, 1, 6});
-  origin.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15});
+  origin.push_back(std::vector<size_t>({0, 2}));
+  origin.push_back(std::vector<size_t>({0, 1, 6}));
+  origin.push_back(std::vector<size_t>({0, 2, 5, 7, 10, 12, 15}));

  paddle::framework::AppendLoD(&origin, lod_lens);

  LoD expected;
-  expected.push_back(std::vector<size_t>{0, 2, 4});
-  expected.push_back(std::vector<size_t>{0, 1, 6, 8, 10});
+  expected.push_back(std::vector<size_t>({0, 2, 4}));
+  expected.push_back(std::vector<size_t>({0, 1, 6, 8, 10}));
  expected.push_back(
-      std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26});
-
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26}));
  EXPECT_EQ(origin, expected);
 }

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@ -92,8 +92,7 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {

  void operator()(const char* op_type) const {
    using T = typename KERNEL_TYPE::ELEMENT_TYPE;
-    OperatorWithKernel::OpKernelKey key(ToDataType(std::type_index(typeid(T))),
-                                        PlaceType());
+    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType());
    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);

    constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -254,8 +254,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
  return res;
 }

-std::ostream& operator<<(std::ostream& os,
-                         const OperatorWithKernel::OpKernelKey& kernel_key) {
+std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) {
  os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
     << "]";
  return os;
@ -432,7 +431,7 @@ void OperatorWithKernel::Run(const Scope& scope,

  // check if op[type] have kernel for kernel_key
  OpKernelMap& kernels = kernels_iter->second;
-  auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
+  auto kernel_key = GetKernelType(ctx);
  auto kernel_iter = kernels.find(kernel_key);

  if (kernel_iter == kernels.end()) {
@ -440,6 +439,41 @@ void OperatorWithKernel::Run(const Scope& scope,
  }

  kernel_iter->second->Compute(ctx);
+
+  // throws errors if have.
+  dev_ctx.Finish();
+}
+OpKernelType OperatorWithKernel::GetKernelType(
+    const ExecutionContext& ctx) const {
+  return OpKernelType(IndicateDataType(ctx), ctx.device_context());
+}
+DataType OperatorWithKernel::IndicateDataType(
+    const ExecutionContext& ctx) const {
+  auto& scope = ctx.scope();
+  int data_type = -1;
+  for (auto& input : this->inputs_) {
+    for (auto& ipt_name : input.second) {
+      auto* var = scope.FindVar(ipt_name);
+      if (var != nullptr) {
+        const Tensor* t = nullptr;
+        if (var->IsType<Tensor>()) {
+          t = &var->Get<Tensor>();
+        } else if (var->IsType<LoDTensor>()) {
+          t = &var->Get<LoDTensor>();
+        } else if (var->IsType<SelectedRows>()) {
+          t = &(var->Get<SelectedRows>().value());
+        }
+        if (t != nullptr) {
+          int tmp = static_cast<int>(ToDataType(t->type()));
+          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+                         "DataType of Paddle Op %s must be the same.", Type());
+          data_type = tmp;
+        }
+      }
+    }
+  }
+  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
+  return static_cast<DataType>(data_type);
 }

 }  // namespace framework
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -345,27 +345,10 @@ class OpKernel : public OpKernelBase {
  using ELEMENT_TYPE = T;
 };

-class OperatorWithKernel : public OperatorBase {
- public:
-  struct OpKernelKey {
-    platform::Place place_;
-    DataType data_type_;
-
-    OpKernelKey(DataType data_type, platform::Place place)
-        : place_(place), data_type_(data_type) {}
-
-    OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
-        : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
-
-    bool operator==(const OpKernelKey& o) const {
-      return platform::places_are_same_class(place_, o.place_) &&
-             data_type_ == o.data_type_;
-    }
-  };
-
-  struct OpKernelHash {
+struct OpKernelType {
+  struct Hash {
    std::hash<int> hash_;
-    size_t operator()(const OpKernelKey& key) const {
+    size_t operator()(const OpKernelType& key) const {
      int place = key.place_.which();
      int data_type = static_cast<int>(key.data_type_);
      int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
@ -374,9 +357,26 @@ class OperatorWithKernel : public OperatorBase {
    }
  };

+  platform::Place place_;
+  DataType data_type_;
+
+  OpKernelType(DataType data_type, platform::Place place)
+      : place_(place), data_type_(data_type) {}
+
+  OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx)
+      : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
+
+  bool operator==(const OpKernelType& o) const {
+    return platform::places_are_same_class(place_, o.place_) &&
+           data_type_ == o.data_type_;
+  }
+};
+
+class OperatorWithKernel : public OperatorBase {
+ public:
  using OpKernelMap =
-      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>,
-                         OpKernelHash>;
+      std::unordered_map<OpKernelType, std::unique_ptr<OpKernelBase>,
+                         OpKernelType::Hash>;

  OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
                     const VariableNameMap& outputs, const AttributeMap& attrs)
@ -404,40 +404,15 @@ class OperatorWithKernel : public OperatorBase {
  }

 protected:
+  virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const;
+
+ private:
  // indicate kernel DataType by input data. Defaultly all input data must be
  // same.
-  virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
-    auto& scope = ctx.scope();
-    int data_type = -1;
-    for (auto& input : this->inputs_) {
-      for (auto& ipt_name : input.second) {
-        auto* var = scope.FindVar(ipt_name);
-        if (var != nullptr) {
-          const Tensor* t = nullptr;
-          if (var->IsType<Tensor>()) {
-            t = &var->Get<Tensor>();
-          } else if (var->IsType<LoDTensor>()) {
-            t = &var->Get<LoDTensor>();
-          } else if (var->IsType<SelectedRows>()) {
-            t = &(var->Get<SelectedRows>().value());
-          }
-          if (t != nullptr) {
-            int tmp = static_cast<int>(ToDataType(t->type()));
-            PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                           "DataType of Paddle Op %s must be the same.",
-                           Type());
-            data_type = tmp;
-          }
-        }
-      }
-    }
-    PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-    return static_cast<DataType>(data_type);
-  }
+  DataType IndicateDataType(const ExecutionContext& ctx) const;
 };

-std::ostream& operator<<(std::ostream& os,
-                         const OperatorWithKernel::OpKernelKey& kernel_key);
+std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);

 extern bool OpSupportGPU(const std::string& op_type);

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@ -114,8 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel {

 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {}
-  DataType IndicateDataType(const ExecutionContext& ctx) const override {
-    return DataType::FP32;
+  OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
+    return OpKernelType(DataType::FP32, ctx.device_context());
  }
 };

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };

 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor;
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool> functor;
  size_t size = functor(type);
  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
  return size;
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@ -45,7 +45,8 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) {
      desc_.mutable_tensor_array()->set_lod_level(lod_level);
      break;
    default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
  }
 }

@ -56,7 +57,8 @@ int32_t VarDescBind::GetLodLevel() const {
    case VarDesc::LOD_TENSOR_ARRAY:
      return desc_.tensor_array().lod_level();
    default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
  }
 }

--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@ -45,6 +45,7 @@ if(WITH_GPU)
    add_simple_unittest(BlockExpandOpTest)
    add_simple_unittest(CropOpTest)
    add_simple_unittest(SwitchOpTest)
+    add_simple_unittest(ScaleSubRegionOpTest)
 endif()

 add_simple_unittest(Im2ColTest)
--- a/Show More
+++ b/Show More