diff --git a/.clang-format b/.clang-format
index 9ba433b173..aff93435f5 100644
--- a/.clang-format
+++ b/.clang-format
@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
 BinPackArguments: false
 ...
-
diff --git a/.gitignore b/.gitignore
index 1512c1438e..ac56a3320e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,11 +21,11 @@ third_party/
 cmake-build-*
 
 # generated while compiling
-python/paddle/v2/framework/core.so
+python/paddle/v2/fluid/core.so
 paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/pybind/pybind.h
-python/paddle/v2/framework/tests/tmp/*
+python/paddle/version.py
diff --git a/.travis.yml b/.travis.yml
index c51e02eb79..e2d49daa19 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ before_install:
 script:
   - |
     timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
   - |
     if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
     if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 264420ad83..b309ff37e5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,10 +16,14 @@ cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 
 include(system)
 
 project(paddle CXX C Go)
+message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
+message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})
 
 find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)
@@ -36,8 +40,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -55,7 +58,9 @@ option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
+option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
+option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -68,9 +73,6 @@ if(ANDROID OR IOS)
     if(ANDROID)
         if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
             message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
-        elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
-            # TODO: support glog for Android api 16 ~ 19 in the future
-            message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
         endif()
     endif()
 
@@ -82,10 +84,10 @@ if(ANDROID OR IOS)
         "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
     set(WITH_RDMA OFF CACHE STRING
         "Disable RDMA when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLDNN OFF CACHE STRING
-        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLML OFF CACHE STRING
-        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
+    set(WITH_MKL OFF CACHE STRING
+        "Disable MKL when cross-compiling for Android and iOS" FORCE)
+    set(WITH_GOLANG OFF CACHE STRING
+        "Disable golang when cross-compiling for Android and iOS" FORCE)
 
     # Compile PaddlePaddle mobile inference library
     if (NOT WITH_C_API)
@@ -111,6 +113,14 @@ else()
     set(THIRD_PARTY_BUILD_TYPE Release)
 endif()
 
+set(WITH_MKLML ${WITH_MKL})
+if (WITH_MKL AND AVX2_FOUND)
+    set(WITH_MKLDNN ON)
+else()
+    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+    set(WITH_MKLDNN OFF)
+endif()
+
 ########################################################################################
 
 include(external/mklml)     # download mklml package
@@ -126,8 +136,10 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
-include(external/pybind11)    # download pybind11
+include(external/pybind11)  # download pybind11
 include(external/nccl)
+include(external/cares)
+include(external/grpc)
 
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
@@ -158,14 +170,15 @@ set(EXTERNAL_LIBS
 )
 
 if(WITH_GPU)
-    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
-    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
-    endif(NOT WITH_DSO)
+  include(cuda)
 endif(WITH_GPU)
 
+if(WITH_MKLML)
+    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
+endif()
+
 if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()
 
 if(USE_NNPACK)
diff --git a/Dockerfile b/Dockerfile
index 150344a811..857d3f3e5f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ RUN apt-get update && \
     automake locales clang-format swig doxygen cmake  \
     liblapack-dev liblapacke-dev libboost-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools && \
+    net-tools libtool && \
     apt-get clean -y
 
 # Install Go and glide
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index 040f5ffa41..26930a7637 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -2,21 +2,17 @@
 
 Machine:
 
-- Server
- 	- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
-- Laptop
- 	- DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD
- 	- i5 MacBook Pro (Retina, 13-inch, Early 2015)
-- Desktop
- 	- i7-6700k
+- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+- Laptop: TBD
 
 System: CentOS release 6.3 (Final), Docker 1.12.1.
 
-PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
-
-- MKL-DNN tag v0.10
-- MKLML 2018.0.20170720
-- OpenBLAS v0.2.20
+PaddlePaddle: (TODO: will rerun after 0.11.0)
+- paddlepaddle/paddle:latest (for MKLML and MKL-DNN)
+  - MKL-DNN tag v0.11
+  - MKLML 2018.0.1.20171007
+- paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+  - OpenBLAS v0.2.20
 	 
 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
 
@@ -31,18 +27,31 @@ Input image size - 3 * 224 * 224, Time: images/second
 
 | BatchSize    | 64    | 128  | 256     |
 |--------------|-------| -----| --------|
-| OpenBLAS     | 7.82  | 8.62  | 10.34  | 
-| MKLML        | 11.02 | 12.86 | 15.33  |
-| MKL-DNN      | 27.69 | 28.8 | 29.27  |
+| OpenBLAS     | 7.80  | 9.00  | 10.80  | 
+| MKLML        | 12.12 | 13.70 | 16.18  |
+| MKL-DNN      | 28.46 | 29.83 | 30.44  |
 
+<img src="figs/vgg-cpu-train.png" width="500">
 
-chart on batch size 128
-TBD
+ - ResNet-50
+
+| BatchSize    | 64    | 128   | 256    |
+|--------------|-------| ------| -------|
+| OpenBLAS     | 25.22 | 25.68 | 27.12  | 
+| MKLML        | 32.52 | 31.89 | 33.12  |
+| MKL-DNN      | 81.69 | 82.35 | 84.08  |
+
+<img src="figs/resnet-cpu-train.png" width="500">
 
- - ResNet
  - GoogLeNet
 
+| BatchSize    | 64    | 128   | 256    |
+|--------------|-------| ------| -------|
+| OpenBLAS     | 89.52 | 96.97 | 108.25 | 
+| MKLML        | 128.46| 137.89| 158.63 |
+| MKL-DNN      | 250.46| 264.83| 269.50 |
+
+<img src="figs/googlenet-cpu-train.png" width="500">
+
 ### Laptop
 TBD
-### Desktop
-TBD
diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png
new file mode 100644
index 0000000000..c3f67faf09
Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ
diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png
new file mode 100644
index 0000000000..b96ecd5ff9
Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ
diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png
new file mode 100644
index 0000000000..f830ca6a87
Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
index bc893bab98..7059c13bd2 100644
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -5,10 +5,22 @@ height = 224
 width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
-
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+use_gpu = get_config_arg('use_gpu', bool, True)
+is_infer = get_config_arg("is_infer", bool, False)
+
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 
 settings(
     batch_size=batch_size,
@@ -16,6 +28,8 @@ settings(
     learning_method=MomentumOptimizer(0.9),
     regularization=L2Regularization(0.0005 * batch_size))
 
+conv_projection = conv_projection if use_gpu else img_conv_layer
+
 def inception2(name, input, channels, \
     filter1,
     filter3R, filter3,
@@ -138,12 +152,11 @@ def inception(name, input, channels, \
     cat = concat_layer(
         name=name,
         input=[cov1, cov3, cov5, covprj],
-        bias_attr=True,
+        bias_attr=True if use_gpu else False,
         act=ReluActivation())
     return cat
 
 
-lab = data_layer(name="label", size=1000)
 data = data_layer(name="input", size=3 * height * width)
 
 # stage 1
@@ -221,6 +234,10 @@ pool5 = img_pool_layer(
 dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
 out3 = fc_layer(
     name="output3", input=dropout, size=1000, act=SoftmaxActivation())
-loss3 = cross_entropy(name='loss3', input=out3, label=lab)
 
-outputs(loss3)
+if is_infer:
+    outputs(out3)
+else:
+    lab = data_layer(name="label", size=num_class)
+    loss3 = cross_entropy(name='loss3', input=out3, label=lab)
+    outputs(loss3)
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
index 4703944c87..927b175994 100644
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs):
         settings.data_size = settings.height * settings.width * 3
     else:
         settings.data_size = settings.height * settings.width
-
-    settings.slots = [dense_vector(settings.data_size), integer_value(1)]
+    settings.is_infer = kwargs.get('is_infer', False)
+    if settings.is_infer:
+        settings.slots = [dense_vector(settings.data_size)]
+    else:
+        settings.slots = [dense_vector(settings.data_size), integer_value(1)]
 
 
 @provider(
     init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(1024):
+    for i in xrange(2560 if settings.is_infer else 1024):
         img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        lab = random.randint(0, settings.num_class - 1)
-        yield img.astype('float32'), int(lab)
+        if settings.is_infer:
+            yield img.astype('float32')
+        else:
+            lab = random.randint(0, settings.num_class - 1)
+            yield img.astype('float32'), int(lab)
diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
new file mode 100644
index 0000000000..4a14363ff1
--- /dev/null
+++ b/benchmark/paddle/image/resnet.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg("layer_num", int, 50)
+is_infer = get_config_arg("is_infer", bool, False)
+
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer
+}
+define_py_data_sources2(
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+
+#######################Network Configuration #############
+def conv_bn_layer(name,
+                  input,
+                  filter_size,
+                  num_filters,
+                  stride,
+                  padding,
+                  channels=None,
+                  active_type=ReluActivation()):
+    """
+    A wrapper for conv layer with batch normalization layers.
+    Note:
+    conv layer has no activation.
+    """
+
+    tmp = img_conv_layer(
+        name=name + "_conv",
+        input=input,
+        filter_size=filter_size,
+        num_channels=channels,
+        num_filters=num_filters,
+        stride=stride,
+        padding=padding,
+        act=LinearActivation(),
+        bias_attr=False)
+    return batch_norm_layer(
+        name=name + "_bn",
+        input=tmp,
+        act=active_type,
+        use_global_stats=is_infer)
+
+
+def bottleneck_block(name, input, num_filters1, num_filters2):
+    """
+    A wrapper for bottlenect building block in ResNet.
+    Last conv_bn_layer has no activation.
+    Addto layer has activation of relu.
+    """
+    last_name = conv_bn_layer(
+        name=name + '_branch2a',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters1,
+        stride=1,
+        padding=0)
+    last_name = conv_bn_layer(
+        name=name + '_branch2b',
+        input=last_name,
+        filter_size=3,
+        num_filters=num_filters1,
+        stride=1,
+        padding=1)
+    last_name = conv_bn_layer(
+        name=name + '_branch2c',
+        input=last_name,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=1,
+        padding=0,
+        active_type=LinearActivation())
+
+    return addto_layer(
+        name=name + "_addto", input=[input, last_name], act=ReluActivation())
+
+
+def mid_projection(name, input, num_filters1, num_filters2, stride=2):
+    """
+    A wrapper for middile projection in ResNet.
+    projection shortcuts are used for increasing dimensions,
+    and other shortcuts are identity
+    branch1: projection shortcuts are used for increasing
+    dimensions, has no activation.
+    branch2x: bottleneck building block, shortcuts are identity.
+    """
+    # stride = 2
+    branch1 = conv_bn_layer(
+        name=name + '_branch1',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=stride,
+        padding=0,
+        active_type=LinearActivation())
+
+    last_name = conv_bn_layer(
+        name=name + '_branch2a',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters1,
+        stride=stride,
+        padding=0)
+    last_name = conv_bn_layer(
+        name=name + '_branch2b',
+        input=last_name,
+        filter_size=3,
+        num_filters=num_filters1,
+        stride=1,
+        padding=1)
+
+    last_name = conv_bn_layer(
+        name=name + '_branch2c',
+        input=last_name,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=1,
+        padding=0,
+        active_type=LinearActivation())
+
+    return addto_layer(
+        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
+
+
+img = data_layer(name='image', size=height * width * 3)
+
+
+def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
+    """
+    A wrapper for 50,101,152 layers of ResNet.
+    res2_num: number of blocks stacked in conv2_x
+    res3_num: number of blocks stacked in conv3_x
+    res4_num: number of blocks stacked in conv4_x
+    res5_num: number of blocks stacked in conv5_x
+    """
+    # For ImageNet
+    # conv1: 112x112
+    tmp = conv_bn_layer(
+        "conv1",
+        input=img,
+        filter_size=7,
+        channels=3,
+        num_filters=64,
+        stride=2,
+        padding=3)
+    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
+
+    # conv2_x: 56x56
+    tmp = mid_projection(
+        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
+    for i in xrange(2, res2_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
+
+    # conv3_x: 28x28
+    tmp = mid_projection(
+        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
+    for i in xrange(2, res3_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res3_" + str(i),
+            input=tmp,
+            num_filters1=128,
+            num_filters2=512)
+
+    # conv4_x: 14x14
+    tmp = mid_projection(
+        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
+    for i in xrange(2, res4_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res4_" + str(i),
+            input=tmp,
+            num_filters1=256,
+            num_filters2=1024)
+
+    # conv5_x: 7x7
+    tmp = mid_projection(
+        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
+    for i in xrange(2, res5_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res5_" + str(i),
+            input=tmp,
+            num_filters1=512,
+            num_filters2=2048)
+
+    tmp = img_pool_layer(
+        name='avgpool',
+        input=tmp,
+        pool_size=7,
+        stride=1,
+        pool_type=AvgPooling())
+
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+
+
+if layer_num == 50:
+    resnet = deep_res_net(3, 4, 6, 3)
+elif layer_num == 101:
+    resnet = deep_res_net(3, 4, 23, 3)
+elif layer_num == 152:
+    resnet = deep_res_net(3, 8, 36, 3)
+else:
+    print("Wrong layer number.")
+
+if is_infer:
+    outputs(resnet)
+else:
+    lbl = data_layer(name="label", size=num_class)
+    loss = cross_entropy(name='loss', input=resnet, label=lbl)
+    outputs(loss)
diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh
deleted file mode 100755
index e31fec1cd8..0000000000
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-set -e
-
-function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS
-  export OMP_DYNAMIC="FALSE"
-  export KMP_AFFINITY="granularity=fine,compact,0,0"
-  topology=$1
-  bs=$2
-  use_mkldnn=$3
-  if [ $3 == "True" ]; then
-    thread=1
-    log="logs/${topology}-mkldnn-${bs}.log"
-  elif [ $3 == "False" ]; then
-    thread=`nproc`
-    # each trainer_count use only 1 core to avoid conflict
-    export OMP_NUM_THREADS=1
-    export MKL_NUM_THREADS=1
-    log="logs/${topology}-${thread}mklml-${bs}.log"
-  else
-    echo "Wrong input $3, use True or False."
-    exit 0
-  fi
-  args="batch_size=${bs}"
-  config="${topology}.py"
-  paddle train --job=time \
-    --config=$config \
-    --use_mkldnn=$use_mkldnn \
-    --use_gpu=False \
-    --trainer_count=$thread \
-    --log_period=10 \
-    --test_period=100 \
-    --config_args=$args \
-    2>&1 | tee ${log} 
-}
-
-if [ ! -d "train.list" ]; then
-  echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
-  mkdir logs
-fi
-
-#========== mkldnn ==========#
-train vgg 64 True
-train vgg 128 True
-train vgg 256 True
-
-#========== mklml ===========#
-train vgg 64 False
-train vgg 128 False
-train vgg 256 False
diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkldnn_infer.sh
new file mode 100755
index 0000000000..d795bcab1b
--- /dev/null
+++ b/benchmark/paddle/image/run_mkldnn_infer.sh
@@ -0,0 +1,86 @@
+set -e
+
+function clock_to_seconds() {
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  use_mkldnn=$4
+  if [ $4 == "True" ]; then
+    thread=1
+    log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
+  elif [ $4 == "False" ]; then
+    thread=`nproc`
+    if [ $thread -gt $bs ]; then
+      thread=$bs
+    fi
+    log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $4, use True or False."
+    exit 0
+  fi
+
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "Training model ${topology}_${layer_num}"
+    paddle train --job=train \
+      --config="${topology}.py" \
+      --use_mkldnn=True \
+      --use_gpu=False \
+      --trainer_count=1 \
+      --num_passes=1 \
+      --save_dir="models/${topology}-${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num}" \
+      > /dev/null 2>&1
+    echo "Done"
+  fi
+  log_period=$((256 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+
+  # calculate the last 5 logs period time of 1280 samples,
+  # the time before are burning time.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+if [ ! -d "models" ]; then
+  mkdir -p models
+fi
+
+# inference benchmark
+for use_mkldnn in True False; do
+  for batchsize in 1 2 4 8 16; do
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer vgg 19 $batchsize $use_mkldnn
+  done
+done
diff --git a/benchmark/paddle/image/run_mkldnn_train.sh b/benchmark/paddle/image/run_mkldnn_train.sh
new file mode 100755
index 0000000000..320206239a
--- /dev/null
+++ b/benchmark/paddle/image/run_mkldnn_train.sh
@@ -0,0 +1,47 @@
+set -e
+
+function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  use_mkldnn=$4
+  if [ $4 == "True" ]; then
+    thread=1
+    log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
+  elif [ $4 == "False" ]; then
+    thread=`nproc`
+    # each trainer_count use only 1 core to avoid conflict
+    log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $4, use True or False."
+    exit 0
+  fi
+  args="batch_size=${bs},layer_num=${layer_num}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args=$args \
+    2>&1 | tee ${log} 
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# training benchmark
+for use_mkldnn in True False; do
+  for batchsize in 64 128 256; do
+    train vgg 19 $batchsize $use_mkldnn
+    train resnet 50 $batchsize $use_mkldnn
+    train googlenet v1 $batchsize $use_mkldnn
+  done
+done
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
index b8429975f5..8d0a1e97a4 100644
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -6,14 +6,25 @@ width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
+is_infer = get_config_arg("is_infer", bool, False)
 
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 
 settings(
     batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
+    learning_rate=0.001 / batch_size,
     learning_method=MomentumOptimizer(0.9),
     regularization=L2Regularization(0.0005 * batch_size))
 
@@ -98,6 +109,9 @@ elif layer_num == 19:
 else:
     print("Wrong layer number.")
 
-lab = data_layer('label', num_class)
-loss = cross_entropy(input=vgg, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(vgg)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=vgg, label=lab)
+    outputs(loss)
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 8fdc382f0c..b21fc43904 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -1,17 +1,12 @@
 # Find the CBlas and lapack libraries
 #
-# It will search MKL, atlas, OpenBlas, reference-cblas in order.
+# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
 #
 # If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKL, ATLAS, OPENBLAS, REFERENCE
+#    CBLAS_PROVIDER  # one of MKLML, ATLAS, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
 #    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
-#
-# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
-# during cmake. If none of them set, it will try to find cblas implementation in
-# system paths.
-#
 
 set(CBLAS_FOUND OFF)
 
@@ -30,44 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
   return()
 endif()
 
-## Then find MKL.
-set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
-
-set(MKL_INCLUDE_SEARCH_PATHS
-  ${MKL_ROOT}/include
-  ${INTEL_MKL_ROOT}/include)
-set(MKL_LIB_SEARCH_PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64
-  ${INTEL_MKL_ROOT}/lib
-  ${INTEL_MKL_ROOT}/lib/intel64)
-
-find_path(MKL_INC_DIR mkl.h PATHS
-  ${MKL_INCLUDE_SEARCH_PATHS})
-find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
-  ${MKL_INCLUDE_SEARCH_PATHS})
-find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-  ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
-  ${MKL_LIB_SEARCH_PATHS})
-find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
-  ${MKL_LIB_SEARCH_PATHS})
-
-if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER MKL)
-  set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR})
-  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB})
-
-  add_definitions(-DPADDLE_USE_MKL)
-  add_definitions(-DLAPACK_FOUND)
-
-  message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
-  return()
-endif()
-
 ## Then find atlas.
 set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
 set(ATLAS_INCLUDE_SEARCH_PATHS
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 24ddb24399..5c6bcfde76 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 
+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
@@ -76,27 +81,14 @@ else()
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
-if(WITH_MKLDNN)
-    add_definitions(-DPADDLE_USE_MKLDNN)
-    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
-        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
-        set(OPENMP_FLAGS "-fopenmp")
-        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
-    else()
-        find_package(OpenMP)
-        if(OPENMP_FOUND)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        else()
-            message(WARNING "Can not find OpenMP."
-                 "Some performance features in MKLDNN may not be available")
-        endif()
-    endif()
-
-endif(WITH_MKLDNN)
+if (WITH_MKLML AND MKLML_IOMP_LIB)
+    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+    set(OPENMP_FLAGS "-fopenmp")
+    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+endif()
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
index 0b38943952..d3f5bf6852 100644
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -76,12 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
 # Set the architecture for iOS
 if(NOT DEFINED IOS_ARCH)
   if(IOS_PLATFORM STREQUAL "OS")
-    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
-    set(IOS_ARCH "arm64")
+    set(IOS_ARCH "armv7;armv7s;arm64")
   elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
     set(IOS_ARCH "i386;x86_64")
-  elseif(IOS_PLATFORM STREQUAL "WATCHOS")
-    set(IOS_ARCH armv7k)
   endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")
@@ -249,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
 
 # Hidden visibilty is required for cxx on iOS 
 set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
 
 set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
 
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
new file mode 100644
index 0000000000..6bea7cf302
--- /dev/null
+++ b/cmake/cuda.cmake
@@ -0,0 +1,188 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
+set(paddle_known_gpu_archs7 "30 35 50 52")
+set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
+
+######################################################################################
+# A function for automatic detection of GPUs installed  (if autodetection is enabled)
+# Usage:
+#   detect_installed_gpus(out_variable)
+function(detect_installed_gpus out_variable)
+  if(NOT CUDA_gpu_detect_output)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+
+    file(WRITE ${cufile} ""
+      "#include <cstdio>\n"
+      "int main() {\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device) {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
+                    "--run" "${cufile}"
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(nvcc_res EQUAL 0)
+      # only keep the last line of nvcc_out
+      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
+      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
+    endif()
+  endif()
+
+  if(NOT CUDA_gpu_detect_output)
+    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+  else()
+    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+  endif()
+endfunction()
+
+
+########################################################################
+# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
+# Usage:
+#   select_nvcc_arch_flags(out_variable)
+function(select_nvcc_arch_flags out_variable)
+  # List of arch names
+  set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
+  set(archs_name_default "All")
+  if(NOT CMAKE_CROSSCOMPILING)
+    list(APPEND archs_names "Auto")
+  endif()
+
+  # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
+  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
+  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
+  mark_as_advanced(CUDA_ARCH_NAME)
+
+  # verify CUDA_ARCH_NAME value
+  if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
+    string(REPLACE ";" ", " archs_names "${archs_names}")
+    message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
+  else()
+    unset(CUDA_ARCH_BIN CACHE)
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
+    set(cuda_arch_bin "30 35")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+    set(cuda_arch_bin "50")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    set(cuda_arch_bin "60 61")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
+    set(cuda_arch_bin "70")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
+    set(cuda_arch_bin ${paddle_known_gpu_archs})
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
+    detect_installed_gpus(cuda_arch_bin)
+  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(cuda_arch_bin ${CUDA_ARCH_BIN})
+  endif()
+
+  # remove dots and convert to lists
+  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
+  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+  list(REMOVE_DUPLICATES cuda_arch_bin)
+  list(REMOVE_DUPLICATES cuda_arch_ptx)
+
+  set(nvcc_flags "")
+  set(nvcc_archs_readable "")
+
+  # Tell NVCC to add binaries for the specified GPUs
+  foreach(arch ${cuda_arch_bin})
+    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
+      # User explicitly specified PTX for the concrete BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
+    else()
+      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
+      list(APPEND nvcc_archs_readable sm_${arch})
+    endif()
+  endforeach()
+
+  # Tell NVCC to add PTX intermediate code for the specified architectures
+  foreach(arch ${cuda_arch_ptx})
+    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
+    list(APPEND nvcc_archs_readable compute_${arch})
+  endforeach()
+
+  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
+  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
+  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
+endfunction()
+
+message(STATUS "CUDA detected: " ${CUDA_VERSION})
+if (${CUDA_VERSION} LESS 7.0)
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
+elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
+  # warning for now.
+  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+endif()
+
+include_directories(${CUDA_INCLUDE_DIRS})
+list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+if(NOT WITH_DSO)
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+endif(NOT WITH_DSO)
+
+# setting nvcc arch flags
+select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
+list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
+
+# Set C++11 support
+set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
+# So, don't set these flags here.
+list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# Set :expt-relaxed-constexpr to suppress Eigen warnings
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
+mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
new file mode 100644
index 0000000000..aec51410b3
--- /dev/null
+++ b/cmake/external/cares.cmake
@@ -0,0 +1,45 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: c-ares is needed when linking with grpc.
+
+SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
+SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
+SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
+
+ExternalProject_Add(
+    extern_cares
+    GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
+    GIT_TAG "cares-1_13_0"
+    PREFIX          ${CARES_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND   make -j8
+    INSTALL_COMMAND make install
+)
+
+ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
+             "${CARES_INSTALL_DIR}/lib/libcares.a")
+
+include_directories(${CARES_INCLUDE_DIR})
+ADD_DEPENDENCIES(cares extern_cares)
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index c819eb4d70..d4f252bb9f 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -28,15 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
     extern_gflags
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    # TODO(yiwang): The annoying warnings mentioned in
-    # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
-    # gflags.  I fired a PR https://github.com/gflags/gflags/pull/230
-    # to fix it.  Before it gets accepted by the gflags team, we use
-    # my personal fork, which contains above fix, temporarily.  Let's
-    # change this back to the official Github repo once my PR is
-    # merged.
-    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
-    GIT_TAG         986964c07427ecb9cdb5bd73f73ebbd40e54dadb
+    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 08bdc1e162..0c6b3aafcb 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -26,12 +26,21 @@ ENDIF(WIN32)
 
 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
 
+IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+  # Using the unofficial glog for Android API < 21
+  SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git")
+  SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8")
+ELSE()
+  SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
+  SET(GLOG_TAG "v0.3.5")
+ENDIF()
+
 ExternalProject_Add(
     extern_glog
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS gflags
-    GIT_REPOSITORY  "https://github.com/google/glog.git"
-    GIT_TAG         v0.3.5
+    GIT_REPOSITORY  ${GLOG_REPOSITORY}
+    GIT_TAG         ${GLOG_TAG}
     PREFIX          ${GLOG_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
new file mode 100644
index 0000000000..abee6698e3
--- /dev/null
+++ b/cmake/external/grpc.cmake
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
+SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
+SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
+SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+IF(APPLE)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+ELSE()
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+ENDIF()
+
+ExternalProject_Add(
+    extern_grpc
+    DEPENDS protobuf zlib
+    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
+    GIT_TAG "v1.7.x"
+    PREFIX          ${GRPC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    # NOTE(yuyang18):
+    # Disable -Werror, otherwise the compile will fail in MacOS.
+    # It seems that we cannot configure that by make command.
+    # Just dry run make command and remove `-Werror`, then use a shell to run make commands
+    BUILD_COMMAND  ${BUILD_CMD}
+    INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
+)
+
+# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
+ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
+             "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
+
+ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
+ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgpr.a")
+
+ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
+
+include_directories(${GRPC_INCLUDE_DIR})
+ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 9686df0021..fc52d339d7 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -40,28 +40,32 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
 
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
-    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
-    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
-    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
-    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
+    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
+ELSE()
+    MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
 
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
 ExternalProject_Add(
     ${MKLDNN_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.10"
+    GIT_TAG             "v0.11"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
+                        -DMKLROOT:PATH=${MKLML_ROOT}
 )
 
 ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
+MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+add_definitions(-DPADDLE_USE_MKLDNN)
 LIST(APPEND external_project_dependencies mkldnn)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 74f3279831..20dbc32a73 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,8 +27,8 @@ ENDIF()
 INCLUDE(ExternalProject)
 
 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
+SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
index 57d2c0a352..fc43766efa 100644
--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
@@ -1,3 +1,21 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_GPU)
+  return()
+endif()
+
 include(ExternalProject)
 
 set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 143b57a954..97857a686b 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND})
         "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
-    SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
+    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
 
     IF(CMAKE_CROSSCOMPILING)
         SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
                 SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
             ENDIF()
         ELSEIF(IOS)
-            # FIXME(liuyiqun): support multiple architectures
-            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-            SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
-            IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
-                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
-                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
-            ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+            IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+                SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+                SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
                 SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
                 SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+            ELSE()
+                MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
+                       "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
             ENDIF()
         ELSEIF(RPI)
             # use hardfp
@@ -86,7 +85,7 @@ IF(NOT ${CBLAS_FOUND})
         UPDATE_COMMAND      ""
         CONFIGURE_COMMAND   ""
     )
-
+    SET(CBLAS_PROVIDER openblas)
     IF(WITH_C_API)
         INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
         # Because libopenblas.a is a symbolic link of another library, thus need to
@@ -98,7 +97,7 @@ IF(NOT ${CBLAS_FOUND})
         ENDIF()
         INSTALL(CODE "execute_process(
             COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib
-                    destination ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
+                    ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
             )"
         )
         INSTALL(CODE "MESSAGE(STATUS \"Installing: \"
@@ -115,11 +114,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF(${CBLAS_PROVIDER} MATCHES MKL)
-    ADD_LIBRARY(cblas SHARED ${dummyfile})
-ELSE()
-    ADD_LIBRARY(cblas STATIC ${dummyfile})
-ENDIF()
+ADD_LIBRARY(cblas STATIC ${dummyfile})
 TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 
 IF(NOT ${CBLAS_FOUND})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index be7f6a9465..fab2af362b 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -15,7 +15,18 @@
 INCLUDE(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
 FIND_PACKAGE(Protobuf QUIET)
-SET(PROTOBUF_FOUND "OFF")
+macro(UNSET_VAR VAR_NAME)
+    UNSET(${VAR_NAME} CACHE)
+    UNSET(${VAR_NAME})
+endmacro()
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(PROTOBUF_FOUND)
+UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
+UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
+UNSET_VAR(PROTOBUF_LITE_LIBRARY)
+UNSET_VAR(PROTOBUF_LIBRARY)
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
 
 if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
     function(protobuf_generate_python SRCS)
@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
     # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
     # make `protobuf_generate_cpp` happy.
     SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
-
     FOREACH(dep ${protobuf_DEPS})
         ADD_DEPENDENCIES(protobuf ${dep})
         ADD_DEPENDENCIES(protobuf_lite ${dep})
@@ -128,11 +138,11 @@ endmacro()
 
 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
-    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
     if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
         message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
         SET_PROTOBUF_VERSION()
@@ -178,14 +188,26 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
         SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
     ENDIF()
 
+    SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
+    SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
+    IF(MOBILE_INFERENCE)
+        # The reason why the official version is not used is described in
+        # https://github.com/PaddlePaddle/Paddle/issues/6114
+        SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git")
+        SET(PROTOBUF_TAG "v3.2.0")
+        IF(NOT BUILD_FOR_HOST)
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF")
+        ENDIF()
+    ENDIF()
+
     ExternalProject_Add(
         ${TARGET_NAME}
         ${EXTERNAL_PROJECT_LOG_ARGS}
         PREFIX          ${PROTOBUF_SOURCES_DIR}
         UPDATE_COMMAND  ""
         DEPENDS         zlib
-        GIT_REPOSITORY  "https://github.com/google/protobuf.git"
-        GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
+        GIT_REPOSITORY  ${PROTOBUF_REPO}
+        GIT_TAG         ${PROTOBUF_TAG}
         CONFIGURE_COMMAND
         ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
             ${OPTIONAL_ARGS}
@@ -203,7 +225,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     )
 ENDFUNCTION()
 
-SET(PROTOBUF_VERSION 3.1)
+IF(NOT MOBILE_INFERENCE)
+    SET(PROTOBUF_VERSION 3.1)
+ELSE()
+    SET(PROTOBUF_VERSION 3.2)
+ENDIF()
 IF(CMAKE_CROSSCOMPILING)
     build_protobuf(protobuf_host TRUE)
     LIST(APPEND external_project_dependencies protobuf_host)
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
index 9391c285c7..4e87dc49d8 100644
--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
@@ -1,8 +1,26 @@
-INCLUDE(ExternalProject)
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+if(NOT WITH_PYTHON)
+    return()
+endif()
+
+include(ExternalProject)
 
-INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
 
 ExternalProject_Add(
         extern_pybind
@@ -17,14 +35,12 @@ ExternalProject_Add(
         TEST_COMMAND      ""
 )
 
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
     set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
     add_library(pybind STATIC ${dummyfile})
 else()
     add_library(pybind INTERFACE)
 endif()
 
 add_dependencies(pybind extern_pybind)
-
-LIST(APPEND external_project_dependencies pybind)
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
index ce088ae7ea..9db457c7b2 100644
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 8bd0582228..a8e1aca49c 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
 INCLUDE(ExternalProject)
 
 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index e2c9fe56f3..1638cd8fdf 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -50,6 +50,8 @@ ExternalProject_Add(
 )
 
 LIST(APPEND external_project_dependencies zlib)
+ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 
 IF(WITH_C_API)
   INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 4593ae6180..1120677a37 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -111,6 +111,8 @@ set(COMMON_FLAGS
     -Wno-error=sign-compare
     -Wno-error=unused-local-typedefs
     -Wno-error=parentheses-equality # Warnings in pybind11
+    -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
+    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
 )
 
 set(GPU_COMMON_FLAGS
@@ -149,58 +151,3 @@ endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
     safe_set_nvflag(${flag})
 endforeach()
-
-
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-
-# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
-# So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
-LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
-
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
-endif()
-
-function(specify_cuda_arch cuda_version cuda_arch)
-    if(${cuda_version} VERSION_GREATER "8.0")
-        foreach(capability 61 62)
-          if(${cuda_arch} STREQUAL ${capability})
-            list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-          endif()
-        endforeach()
-    elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
-        list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-    endif()
-endfunction()
-
-# Common gpu architectures: Kepler, Maxwell
-foreach(capability 30 35 50)
-      list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
-endforeach()
-
-if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
-      list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
-endif()
-
-# Modern gpu architectures: Pascal
-if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
-      list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
-      list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
-endif()
-
-# Custom gpu architecture
-set(CUDA_ARCH)
-
-if(CUDA_ARCH)
-  specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
-endif()
-
-set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
-
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c311783aa3..66c8e3ad7e 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 if(NOT APPLE AND NOT ANDROID)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
-    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)
 
 function(merge_static_libs TARGET_NAME)
@@ -227,8 +227,8 @@ function(cc_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
     add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction(cc_test)
@@ -288,8 +288,8 @@ function(nv_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
 endfunction(nv_test)
@@ -459,11 +459,58 @@ function(py_test TARGET_NAME)
   if(WITH_TESTING)
     set(options STATIC static SHARED shared)
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})  
+    set(multiValueArgs SRCS DEPS ARGS)
+    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
              COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
-             python2 ${py_test_SRCS}
+             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction()
+
+# grpc_library generate grpc code using grpc_cpp_plugin and protoc
+# then build the generated protobuf code and grpc code with your
+# implementation source codes together. Use SRCS argument for your
+# implementation source files and PROTO argument for your .proto
+# files.
+#
+# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
+
+function(grpc_library TARGET_NAME)
+  set(oneValueArgs PROTO)
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  message(STATUS "generating grpc ${grpc_library_PROTO}")
+
+  get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
+  get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
+  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+
+  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+  set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
+  set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
+  cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
+
+  add_custom_command(
+          OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
+
+  # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
+  # as compiler warnings instead of error. Should try remove the warnings also.
+  set_source_files_properties(
+    ${grpc_grpc_srcs}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
+
+  set_source_files_properties(
+    ${grpc_library_SRCS}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
+endfunction()
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 46035a908b..53c2de332e 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -1,27 +1,28 @@
 # This file is use to check all support level of AVX on your machine
 # so that PaddlePaddle can unleash the vectorization power of muticore.
 
-INCLUDE(CheckCXXSourceRuns)
-INCLUDE(CheckCXXSourceCompiles)
+include(CheckCXXSourceRuns)
+include(CheckCXXSourceCompiles)
 
-IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     set(MMX_FLAG "-mmmx")
     set(SSE2_FLAG "-msse2")
     set(SSE3_FLAG "-msse3")
-    SET(AVX_FLAG "-mavx")
-    SET(AVX2_FLAG "-mavx2")
-ELSEIF(MSVC)
+    set(AVX_FLAG "-mavx")
+    set(AVX2_FLAG "-mavx2")
+elseif(MSVC)
     set(MMX_FLAG "/arch:MMX")
     set(SSE2_FLAG "/arch:SSE2")
     set(SSE3_FLAG "/arch:SSE3")
     SET(AVX_FLAG "/arch:AVX")
     SET(AVX2_FLAG "/arch:AVX2")
-ENDIF()
+endif()
 
 set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
 
 # Check  MMX
 set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
+set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <mmintrin.h>
 int main()
@@ -32,6 +33,7 @@ int main()
 
 # Check SSE2
 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
+set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <emmintrin.h>
 int main()
@@ -42,6 +44,7 @@ int main()
 
 # Check SSE3
 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
+set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <pmmintrin.h>
 int main()
@@ -55,6 +58,7 @@ int main()
 
 # Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
@@ -67,6 +71,7 @@ int main()
 
 # Check AVX 2
 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 117ab7f49c..0dc33ce385 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
         target_link_libraries(${TARGET_NAME} log)
     endif(ANDROID)
 
-    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
-      target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+    if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
+      target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
     endif()
 
     add_dependencies(${TARGET_NAME} ${external_project_dependencies})
@@ -168,17 +168,3 @@ function(create_resources res_file output_file)
     COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
     DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
 endfunction()
-
-
-# Create a python unittest using run_python_tests.sh,
-# which takes care of making correct running environment
-function(add_python_test TEST_NAME)
-    foreach(arg ${ARGN})
-        get_filename_component(py_fn ${arg} NAME_WE)
-        set(TRG_NAME ${TEST_NAME}_${py_fn})
-        add_test(NAME ${TRG_NAME}
-                COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
-                python2 ${arg}
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-    endforeach()
-endfunction()
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
index 25c1dd00b9..e6f632e1a5 100644
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -7,3 +7,4 @@ API
     v2/model_configs.rst
     v2/data.rst
     v2/run_logic.rst
+    v2/fluid.rst
diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst
index eca3ce03bc..5317e66b64 100644
--- a/doc/api/v2/config/activation.rst
+++ b/doc/api/v2/config/activation.rst
@@ -99,3 +99,10 @@ STanh
 ..  automodule:: paddle.v2.activation
     :members: STanh
     :noindex:
+    
+SoftSign
+========
+
+..  automodule:: paddle.v2.activation
+    :members: SoftSign
+    :noindex:
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d4e9d53e5c..c3f9c18d06 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -54,7 +54,7 @@ img_conv
 
 ..  _api_v2.layer_context_projection:
 
-context_projection 
+context_projection
 ------------------
 ..  autoclass:: paddle.v2.layer.context_projection
     :noindex:
@@ -70,7 +70,7 @@ Image Pooling Layer
 img_pool
 --------
 ..  autoclass:: paddle.v2.layer.img_pool
-    :noindex:   
+    :noindex:
 
 spp
 ---
@@ -82,6 +82,11 @@ maxout
 ..  autoclass:: paddle.v2.layer.maxout
     :noindex:
 
+roi_pool
+--------
+..  autoclass:: paddle.v2.layer.roi_pool
+    :noindex:
+
 Norm Layer
 ==========
 
@@ -99,7 +104,7 @@ sum_to_one_norm
 ---------------
 ..  autoclass:: paddle.v2.layer.sum_to_one_norm
     :noindex:
-    
+
 cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
@@ -109,7 +114,7 @@ row_l2_norm
 -----------
 ..  autoclass:: paddle.v2.layer.row_l2_norm
     :noindex:
-    
+
 Recurrent Layers
 ================
 
@@ -330,6 +335,16 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
     :noindex:
 
+dot_prod
+---------
+.. autoclass:: paddle.v2.layer.dot_prod
+    :noindex:
+
+out_prod
+--------
+.. autoclass:: paddle.v2.layer.out_prod
+    :noindex:
+
 power
 -----
 ..  autoclass:: paddle.v2.layer.power
@@ -367,6 +382,11 @@ cos_sim
 ..  autoclass:: paddle.v2.layer.cos_sim
     :noindex:
 
+l2_distance
+-----------
+..  autoclass:: paddle.v2.layer.l2_distance
+    :noindex:
+
 trans
 -----
 ..  autoclass:: paddle.v2.layer.trans
@@ -395,6 +415,13 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
     :noindex:
 
+Factorization Machine Layer
+============================
+
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
 
 Slicing and Joining Layers
 ==========================
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
index fef87c4fbd..b56c7332cc 100644
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@@ -2,112 +2,9 @@
 Data Reader Interface and DataSets
 ==================================
 
+..  toctree::
+    :maxdepth: 1
 
-DataTypes
-=========
-
-..  automodule:: paddle.v2.data_type
-    :members:
-    :noindex:
-
-DataFeeder
-==========
-
-..  automodule:: paddle.v2.data_feeder
-    :members:
-    :noindex:
-
-Reader
-======
-
-..  automodule:: paddle.v2.reader
-    :members:
-    :noindex:
-
-..  automodule:: paddle.v2.reader.creator
-    :members:
-    :noindex:
-
-minibatch
-=========
-
-..  automodule:: paddle.v2.minibatch
-    :members:
-    :noindex:
-
-Dataset
-=======
-
-..  automodule:: paddle.v2.dataset
-    :members:
-    :noindex:
-
-mnist
-+++++
-
-..  automodule:: paddle.v2.dataset.mnist
-    :members:
-    :noindex:
-
-cifar
-+++++
-
-..  automodule:: paddle.v2.dataset.cifar
-    :members:
-    :noindex:
-
-conll05
-+++++++
-
-..  automodule:: paddle.v2.dataset.conll05
-    :members: get_dict,get_embedding,test
-    :noindex:
-
-imdb
-++++
-
-..  automodule:: paddle.v2.dataset.imdb
-    :members:
-    :noindex:
-
-imikolov
-++++++++
-
-..  automodule:: paddle.v2.dataset.imikolov
-    :members:
-    :noindex:
-
-movielens
-+++++++++
-
-..  automodule:: paddle.v2.dataset.movielens
-    :members:
-    :noindex:
-
-..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
-    :noindex:
-    
-..  autoclass:: paddle.v2.dataset.movielens.UserInfo
-    :noindex:
-
-sentiment
-+++++++++
-
-..  automodule:: paddle.v2.dataset.sentiment
-    :members:
-    :noindex:
-
-uci_housing
-+++++++++++
-
-..  automodule:: paddle.v2.dataset.uci_housing
-    :members:
-    :noindex:
-
-wmt14
-+++++
-
-..  automodule:: paddle.v2.dataset.wmt14
-    :members:
-    :noindex:
-
+    data/data_reader.rst
+    data/image.rst
+    data/dataset.rst
diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst
new file mode 100644
index 0000000000..2ccfec9c28
--- /dev/null
+++ b/doc/api/v2/data/data_reader.rst
@@ -0,0 +1,36 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+..  automodule:: paddle.v2.data_type
+    :members:
+    :noindex:
+
+DataFeeder
+==========
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+
+Reader
+======
+
+..  automodule:: paddle.v2.reader
+    :members:
+    :noindex:
+
+..  automodule:: paddle.v2.reader.creator
+    :members:
+    :noindex:
+
+minibatch
+=========
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
new file mode 100644
index 0000000000..6a8ecc5bb1
--- /dev/null
+++ b/doc/api/v2/data/dataset.rst
@@ -0,0 +1,75 @@
+Dataset
+=======
+
+..  automodule:: paddle.v2.dataset
+    :members:
+    :noindex:
+
+mnist
++++++
+
+..  automodule:: paddle.v2.dataset.mnist
+    :members:
+    :noindex:
+
+cifar
++++++
+
+..  automodule:: paddle.v2.dataset.cifar
+    :members:
+    :noindex:
+
+conll05
++++++++
+
+..  automodule:: paddle.v2.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+
+imdb
+++++
+
+..  automodule:: paddle.v2.dataset.imdb
+    :members:
+    :noindex:
+
+imikolov
+++++++++
+
+..  automodule:: paddle.v2.dataset.imikolov
+    :members:
+    :noindex:
+
+movielens
++++++++++
+
+..  automodule:: paddle.v2.dataset.movielens
+    :members:
+    :noindex:
+
+..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
+    :noindex:
+    
+..  autoclass:: paddle.v2.dataset.movielens.UserInfo
+    :noindex:
+
+sentiment
++++++++++
+
+..  automodule:: paddle.v2.dataset.sentiment
+    :members:
+    :noindex:
+
+uci_housing
++++++++++++
+
+..  automodule:: paddle.v2.dataset.uci_housing
+    :members:
+    :noindex:
+
+wmt14
++++++
+
+..  automodule:: paddle.v2.dataset.wmt14
+    :members:
+    :noindex:
diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst
new file mode 100644
index 0000000000..97651ffa6b
--- /dev/null
+++ b/doc/api/v2/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+..  automodule:: paddle.v2.image
+    :members:
diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst
new file mode 100644
index 0000000000..43fc19dc49
--- /dev/null
+++ b/doc/api/v2/fluid.rst
@@ -0,0 +1,18 @@
+======================
+Fluid
+======================
+
+..  toctree::
+    :maxdepth: 1
+
+    fluid/layers.rst
+    fluid/data_feeder.rst
+    fluid/executor.rst
+    fluid/initializer.rst
+    fluid/evaluator.rst
+    fluid/nets.rst
+    fluid/optimizer.rst
+    fluid/param_attr.rst
+    fluid/profiler.rst
+    fluid/regularizer.rst
+
diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst
new file mode 100644
index 0000000000..0fa78f7dfb
--- /dev/null
+++ b/doc/api/v2/fluid/data_feeder.rst
@@ -0,0 +1,9 @@
+===========
+DataFeeder
+===========
+
+DataFeeder
+-----------
+..  automodule:: paddle.v2.fluid.data_feeder
+    :members: DataFeeder
+    :noindex:
diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst
new file mode 100644
index 0000000000..a23f3301d0
--- /dev/null
+++ b/doc/api/v2/fluid/evaluator.rst
@@ -0,0 +1,9 @@
+===========
+Evaluator
+===========
+
+Evaluator
+-----------
+..  automodule:: paddle.v2.fluid.evaluator
+    :members: Evaluator
+    :noindex:
diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst
new file mode 100644
index 0000000000..3a283538c1
--- /dev/null
+++ b/doc/api/v2/fluid/executor.rst
@@ -0,0 +1,9 @@
+===========
+Executor
+===========
+
+Executor
+-----------
+..  automodule:: paddle.v2.fluid.executor
+    :members: Executor
+    :noindex:
diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst
new file mode 100644
index 0000000000..8f587837e9
--- /dev/null
+++ b/doc/api/v2/fluid/initializer.rst
@@ -0,0 +1,50 @@
+===========
+Initializer
+===========
+
+
+
+Initializer
+-----------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: Initializer
+    :noindex:
+
+
+
+ConstantInitializer
+-------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: ConstantInitializer
+    :noindex:
+
+
+
+UniformInitializer
+------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: UniformInitializer
+    :noindex:
+
+
+
+NormalInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: NormalInitializer
+    :noindex:
+
+
+XavierInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: XavierInitializer
+    :noindex:
+
+
+MSRAInitializer
+---------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: MSRAInitializer
+    :noindex:
+
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
new file mode 100644
index 0000000000..89e5fec13b
--- /dev/null
+++ b/doc/api/v2/fluid/layers.rst
@@ -0,0 +1,302 @@
+==========
+Layers
+==========
+
+
+fc
+---
+..  autofunction:: paddle.v2.fluid.layers.fc
+    :noindex:
+
+embedding
+---------
+..  autofunction:: paddle.v2.fluid.layers.embedding
+    :noindex:
+
+dynamic_lstm
+------------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+    :noindex:
+
+data
+---------
+..  autofunction:: paddle.v2.fluid.layers.data
+    :noindex:
+
+mean
+---------
+..  autofunction:: paddle.v2.fluid.layers.mean
+    :noindex:
+
+mul
+---------
+..  autofunction:: paddle.v2.fluid.layers.mul
+    :noindex:
+
+elementwise_add
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+    :noindex:
+
+elementwise_div
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+    :noindex:
+
+
+dropout
+---------
+..  autofunction:: paddle.v2.fluid.layers.dropout
+    :noindex:
+
+
+reshape
+---------
+..  autofunction:: paddle.v2.fluid.layers.reshape
+    :noindex:
+
+
+sigmoid
+---------
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
+    :noindex:
+
+
+scale
+---------
+..  autofunction:: paddle.v2.fluid.layers.scale
+    :noindex:
+
+
+reshape
+---------
+..  autofunction:: paddle.v2.fluid.layers.reshape
+    :noindex:
+
+
+transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.transpose
+    :noindex:
+
+
+sigmoid_cross_entropy_with_logits
+---------
+..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
+    :noindex:
+
+
+cast
+---------
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+
+
+concat
+---------
+..  autofunction:: paddle.v2.fluid.layers.concat
+    :noindex:
+
+
+sums
+---------
+..  autofunction:: paddle.v2.fluid.layers.sums
+    :noindex:
+
+
+linear_chain_crf
+---------
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+
+
+assign
+---------
+..  autofunction:: paddle.v2.fluid.layers.embedding
+    :noindex:
+
+
+split_lod_tensor
+---------
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+    :noindex:
+
+
+merge_lod_tensor
+---------
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+    :noindex:
+
+cos_sim
+---------
+..  autofunction:: paddle.v2.fluid.layers.cos_sim
+    :noindex:
+
+
+cross_entropy
+---------
+..  autofunction:: paddle.v2.fluid.layers.cross_entropy
+    :noindex:
+
+
+
+square_error_cost
+---------
+..  autofunction:: paddle.v2.fluid.layers.square_error_cost
+    :noindex:
+
+
+accuracy
+---------
+..  autofunction:: paddle.v2.fluid.layers.accuracy
+    :noindex:
+
+
+sequence_conv
+---------
+..  autofunction:: paddle.v2.fluid.layers.sequence_conv
+    :noindex:
+
+
+conv2d
+---------
+..  autofunction:: paddle.v2.fluid.layers.conv2d
+    :noindex:
+
+
+sequence_pool
+---------
+..  autofunction:: paddle.v2.fluid.layers.sequence_pool
+    :noindex:
+
+
+pool2d
+---------
+..  autofunction:: paddle.v2.fluid.layers.pool2d
+    :noindex:
+
+
+batch_norm
+---------
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
+
+
+beam_search_decode
+---------
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+    :noindex:
+
+
+lstm
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm
+    :noindex:
+
+
+lod_rank_table
+---------
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
+    :noindex:
+
+
+max_sequence_len
+---------
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
+    :noindex:
+
+
+topk
+---------
+..  autofunction:: paddle.v2.fluid.layers.topk
+    :noindex:
+
+
+lod_tensor_to_array
+---------
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+    :noindex:
+
+
+
+array_to_lod_tensor
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+    :noindex:
+
+
+
+
+fill_constant
+---------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
+    :noindex:
+
+
+
+fill_constant_batch_size_like
+---------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+
+
+ones
+---------
+..  autofunction:: paddle.v2.fluid.layers.ones
+    :noindex:
+
+
+zeros
+---------
+..  autofunction:: paddle.v2.fluid.layers.zeros
+    :noindex:
+
+
+increment
+---------
+..  autofunction:: paddle.v2.fluid.layers.increment
+    :noindex:
+
+
+array_write
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_write
+    :noindex:
+
+
+
+create_array
+---------
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+
+
+less_than
+---------
+..  autofunction:: paddle.v2.fluid.layers.less_than
+    :noindex:
+
+
+array_read
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_read
+    :noindex:
+
+
+shrink_memory
+---------
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+    :noindex:
+
+
+array_length
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+
+
+conv2d_transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+    :noindex:
+
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
new file mode 100644
index 0000000000..2c3d075422
--- /dev/null
+++ b/doc/api/v2/fluid/nets.rst
@@ -0,0 +1,22 @@
+===========
+Nets
+===========
+
+simple_img_conv_pool
+-----------
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
+    :noindex:
+
+
+img_conv_group
+-----------
+..  autofunction:: paddle.v2.fluid.nets.img_conv_group
+    :noindex:
+
+
+sequence_conv_pool
+-----------
+..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
+    :noindex:
+
+
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
new file mode 100644
index 0000000000..233762fcdf
--- /dev/null
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -0,0 +1,54 @@
+===========
+Optimizer
+===========
+
+Optimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: Optimizer
+    :noindex:
+
+
+SGDOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: SGDOptimizer
+    :noindex:
+
+
+
+MomentumOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: MomentumOptimizer
+    :noindex:
+
+
+
+AdagradOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdagradOptimizer
+    :noindex:
+
+
+AdamOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamOptimizer
+    :noindex:
+
+
+AdamaxOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamaxOptimizer
+    :noindex:
+
+
+DecayedAdagradOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: DecayedAdagradOptimizer
+    :noindex:
+
diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst
new file mode 100644
index 0000000000..ca0c8af9e8
--- /dev/null
+++ b/doc/api/v2/fluid/param_attr.rst
@@ -0,0 +1,11 @@
+===========
+ParamAttr
+===========
+
+
+
+ParamAttr
+-----------
+..  automodule:: paddle.v2.fluid.param_attr
+    :members: ParamAttr
+    :noindex:
diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst
new file mode 100644
index 0000000000..7d4042d1f4
--- /dev/null
+++ b/doc/api/v2/fluid/profiler.rst
@@ -0,0 +1,10 @@
+===========
+Profiler
+===========
+
+
+
+Profiler
+-----------
+..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
+    :noindex:
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
new file mode 100644
index 0000000000..3af2b07d2a
--- /dev/null
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -0,0 +1,25 @@
+===========
+Regularizer
+===========
+
+WeightDecayRegularizer
+-----------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: WeightDecayRegularizer
+    :noindex:
+
+
+L2DecayRegularizer
+-----------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L2DecayRegularizer
+    :noindex:
+
+
+
+L1DecayRegularizer
+-----------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L1DecayRegularizer
+
+
diff --git a/doc/design/evaluator.md b/doc/design/evaluator.md
new file mode 100644
index 0000000000..11cc129d56
--- /dev/null
+++ b/doc/design/evaluator.md
@@ -0,0 +1,58 @@
+## Evaluator Design
+
+### Problem Statement
+
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
+
+### Evaluator Design
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
+
+1. Initialize the metric state and add it into the block.
+
+2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once.
+
+
+3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
+
+### Implementation
+This design is shown in the Python API. 
+Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. 
+
+    
+```python
+class Evaluator(object):
+    """
+    Evaluator Base class.
+    """
+    def __init__(self, name, **kwargs):
+       """
+       Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts.
+       Auc need four variables, `true_positives`,
+         `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program
+
+       The initialization of Evaluator should be responsible for:
+       create metric states and append to the main_program
+       """ 
+       pass
+
+    def _update_ops(self, input, label, **kwargs)
+       """
+       Add mini-batch evaluator caculate operators to the main_program.
+       Add increment operator to accumulate the metric states.
+       """
+    
+
+    def reset(self, executor, reset_program=None):
+      """
+      Reset metric states at the begin of each pass/user specified batch number.
+      Execute the reset_program to reset the states.
+      """
+      
+
+    def eval(self, executor, eval_program=None):
+      """
+      Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
+      Execute the eval_program and return the result.
+      """
+      return eval_result
+```
diff --git a/doc/design/float16.md b/doc/design/float16.md
new file mode 100644
index 0000000000..1ea95ed6b5
--- /dev/null
+++ b/doc/design/float16.md
@@ -0,0 +1,105 @@
+# Design Doc: float16
+
+## Why float16
+Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. 
+
+When high precision computation is not required, using float16 data type could potentially 
+
+- reduce storage space, memory bandwidth, and power usages; 
+- increase the chance of data fitting into a smaller cache of lower latency; 
+- provide arithmetic speed up if supported by hardware. 
+
+## Survey of current float16 support
+A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
+
+The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernel. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. 
+
+### Compiler
+- nvcc supports `__half` data type after CUDA 7.5.
+- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4.
+- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9.
+
+### Hardware
+- `__half` is supported on GPU with compute capability >= 5.3.
+- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above.
+- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018).
+
+### Libraries
+- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
+- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+
+### CUDA version issue
+There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. 
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+   unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+public:
+    // constructors and conversion operators from/to 
+    // __half_raw and other built-in data types
+}
+
+typedef __half half;
+
+__device__ __forceinline__ 
+__half operator+(const __half &lh, const __half &rh) { 
+    return __hadd(lh, rh); 
+}
+
+// Other overloaded operators
+``` 
+This new design makes `c = a + b` work correctly for CUDA half data type. 
+
+## Implementation
+The float16 class holds a 16-bit `uint16_t` data internally.
+```
+struct float16 {
+  uint16_t x;
+};
+``` 
+
+float16 supports the following features:
+  - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. 
+  - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen.
+  - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. 
+  - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. 
+  
+To support the above features, two fundamental conversion functions are provided:
+```
+float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
+float half_to_float(float16 h);
+```
+which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion.
+
+## To do
+After float16 class is available, some of the future items are below:
+
+- Update pybind/tensor_py.h to bind c++ float16 with numpy float16. 
+
+- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
+
+- Create a type-casting operator that can convert the data type in tensor between float16 and other types.
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index fe8da907d9..61d453de24 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -1,64 +1,163 @@
 # Intel® MKL-DNN on PaddlePaddle: Design Doc
 
-我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle，充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn)
+(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle，
+充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
 
-我们短期内的基本目标是：
+<div align="center">
+<img src="image/overview.png"><br/>
+Figure 1. PaddlePaddle on IA
+</div>
 
-- 完成常用layer的MKL-DNN实现。
+近期目标
+
+- 完成常用Layer的MKL-DNN实现。
 - 完成常见深度神经网络VGG，GoogLeNet 和 ResNet的MKL-DNN实现。
 
+目前的优化，主要针对PaddlePaddle在重构之前的代码框架以及V1的API。
+具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。
 
 ## Contents
 
 - [Overview](#overview)
 - [Actions](#actions)
  	- [CMake](#cmake)
+ 	- [Matrix](#matrix)
 	- [Layers](#layers)
 	- [Activations](#activations)
+	- [Parameters](#parameters)
+	- [Gradients](#gradients)
 	- [Unit Tests](#unit-tests)
-	- [Protobuf Messages](#protobuf-messages)
 	- [Python API](#python-api)
-	- [Demos](#demos)
 	- [Benchmarking](#benchmarking)
 	- [Others](#others)
 - [Design Concerns](#design-concerns)
 
 ## Overview
 
-我们会把MKL-DNN作为第三方库集成进PaddlePaddle，整体框架图
+我们会把MKL-DNN会作为第三方库集成进PaddlePaddle，与其他第三方库一样，会在编译PaddlePaddle的时候下载并编译MKL-DNN。
+
+同时，为了进一步提升PaddlePaddle在基本数学运算的计算速度，我们也将MKLML即(MKL small library\[[1](#references)\])
+作为另一个第三方库集成进PaddlePaddle，它只会包括生成好的动态库和头文件。
+
+MKL，MKLML以及MKL-DNN三者关系如下表：
+
+| Name        |  Open Source     | License     | Descriptions  |
+| :---------- | :--------------- | :---------- | :------------ |
+|   MKL       |     No           | Proprietary | Accelerate math processing routines | 
+|   MKLML     |     No           | Proprietary | Small package of MKL, especially for Machine Learning |
+|   MKL-DNN   |     Yes          | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks  |
+
+MKLML可以与MKL-DNN共同使用，以此达到最好的性能。
+
 <div align="center">
-<img src="image/overview.png" width=350><br/>
-Figure 1. PaddlePaddle on IA.
+<img src="image/engine.png"><br/>
+Figure 2. PaddlePaddle with MKL Engines
 </div>
 
 ## Actions
-我们把集成方案大致分为了如下几个方面。
+
+添加的相关文件和目录结构如下：
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+├── cmake/
+│   ├── external/
+│   │   ├── ...
+│   │   ├── mkldnn.cmake
+│   │   └── mklml.cmake
+└── paddle/
+    ├── ...
+    ├── math/
+    │   ├── ...
+    │   └── MKLDNNMatrix.*
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   └── MKLDNN*Layer.*
+        ├── activations/
+        │   ├── ...
+        │   └── MKLDNNActivations.*
+        └── tests/
+            ├── ...
+            ├── MKLDNNTester.*
+            └── test_MKLDNN.cpp
+```
 
 ### CMake
-我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项，当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。
+在`CMakeLists.txt`中提供一个与MKL有关的总开关：`WITH_MKL`，它负责决定编译时是否使用MKLML和MKL-DNN
 
-同时，我们会引入`WITH_MKLML`选项，用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用，但是建议在开启MKL-DNN的同时也打开MKLML的开关，这样才能发挥最好的性能。
+- `WITH_MKLML` 控制是否使用MKLML库。 
+当打开`WITH_MKL`时，会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。
+编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
+MKLML的库目前都是动态库，主要包括`libiomp5.so`和`libmklml_intel.so`。
+- `WITH_MKLDNN` 控制是否使用MKL-DNN。
+当开启`WITH_MKL`时，会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。
+编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。
+MKL-DNN的库目前只有动态库`libmkldnn.so`。
 
-所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
+### Matrix
+目前在PaddlePaddle中数据都是以`NCHW`的格式存储，但是在MKL-DNN中的排列方式不止这一种。
+所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
 
-**备注**：当`WITH_MKLML=ON`的时候，会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库，所以会稍微改动`cmake/cblas.cmake`中的逻辑。
+<div align="center">
+<img src="image/matrix.png"><br/>
+Figure 3. MKLDNNMatrix
+</div>
 
 ### Layers
-所有MKL-DNN相关的C++ layers，都会按照PaddlePaddle的目录结构存放在
-`paddle/gserver/layers`中，并且文件名都会一以*Mkldnn*开头。
+所有MKL-DNN的Layers都会继承于`MKLDNNLayer`，该类继承于PaddlePaddle的基类`Layer`。
+在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑，
+子类只需要使用定义好的接口，实现具体的函数功能即可。
 
-所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类，该父类继承于PaddlePaddle的基类`Layer`。
+<div align="center">
+<img src="image/layers.png"><br/>
+Figure 4. MKLDNNLayer
+</div>
 
-### Activations
-由于在PaddlePaddle中，激活函数是独立于layer概念的，所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口，实现方法还是会在`ActivationFunction.cpp`文件。
+每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix：
 
-### Unit Tests
-会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。
+- 内部存储（internel memory）：`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表输入数据，输入梯度，输出数据和输出梯度。
+- 外部存储（external memory）：都是以ext开头，比如`extInVal_`和`extInGrad_`，它们主要是用于，
+当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时，转换内存的工作。
+需要注意的是，PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`，
+所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存，
+如果不需要外部存储用于转换，那么对应的内部存储也会与它们共享内存。
+- 转换函数（resetXXX）： 包括`resetInValue`，`resetInGrad`，`resetOutValue`和`resetOutGrad`，
+表示对输入数据，输入梯度，输出数据和输出梯度的转换。
+这些函数会根据输入参数重新设置内部和外部存储，当然这两者也可以相等，即表示不需要转换。
+
+注意：每个`MKLDNNlayer`的子类只需要使用内部存储就可以了，所有外部的转换工作都会在reset系列函数中都准备好。
+
+### Activations
+在重构前的PaddlePaddle中，激活函数是独立于`Layer`的概念，并且输入输出都是共用一块内存，
+所以添加了对应的`MKLDNNActivation`来实现，方式类似于`MKLDNNLayer`。
+
+### Parameters
+对于有参数的层，我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。
+如果存在数据排列格式不一样的情况时，我们会在网络训练之前把格式转换为MKL-DNN希望的格式，
+在训练结束的时候再保存为PaddlePaddle的格式，但是整个训练过程中不需要任何转换。
+这样既使得最终保存的参数格式与PaddlePaddle一致，又可以避免不必要的转换。
+
+### Gradients
+由于MKL-DNN的操作都是直接覆盖的形式，也就是说输出的结果不会在原来的数据上累加，
+这样带来的好处就是不需要一直清空memory，节省了不必要的操作。
+但是注意的是，当网络出现分支且在`backward`的时候，需要累加不同Layer传过来的梯度。
+所以在`MKLDNNlayer`中实现了一个merge的方法，此时每个小分支的`Input Gradient`
+会先临时保存在`MKLDNNMatrix`中，由分支处的Layer负责求和，并把结果放到当前层的`output_.grad`中。
+所以整体上，在实现每个子类的时候就不需要关心分支的事情了。
 
-Activation的测试，计划在PaddlePaddle原有的测试文件上直接添加新的测试type。
+<div align="center">
+<img src="image/gradients.png"><br/>
+Figure 5. Merge Gradients
+</div>
 
-### Protobuf Messages
-根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
+### Unit Tests
+我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
+测试分为每个Layer（或Activation）的单元测试和简单网络的整体测试。
+每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
 
 ### Python API
 目前只考虑**v1 API**。
@@ -73,39 +172,40 @@ if use_mkldnn
     self.layer_type = mkldnn_*
 ```
 
-所有MKL-DNN的layer type会以*mkldnn_*开头，以示区分。 
+所有MKL-DNN的`layer_type`会以*mkldnn_*开头，这些会在`MKLDNN*Layer`注册layer的时候保证，以示区分。 
 
-并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py `和`layers.py`里面添加必要的MKL-DNN的接口。
-
-### Demos
-
-会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹，里面放入一些用于MKL-DNN测试的demo脚本。
+同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
 
 ### Benchmarking
-会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`，添加使用MKL-DNN的测试。
+会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image)，用于测试和对比在使用MKL-DNN前后的CNN网络性能。
+测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md)
 
 ### Others
-1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为64。
+1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为4096，具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。
 2. 深入PaddlePaddle，寻找有没有其他可以优化的可能，进一步优化。比如可能会用OpenMP改进SGD的更新性能。
 
 ## Design Concerns
 
-为了更好的符合PaddlePaddle的代码风格\[[2](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。
+为了更好的符合PaddlePaddle的代码风格\[[3](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。
 
 我们总结出一些特别需要注意的点：
 
-1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MkldnnLayer`特有的设备ID。
-2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
-3. 创建`MkldnnMatrix`，用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
-4. 创建`MkldnnBase`，定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`，和未来可能还会用到`FPGAEngine`等。
-5. 在**Argument**里添加两个`MkldnnMatrixPtr`，取名为`mkldnnValue`和`mkldnnGrad`，用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名)，用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。
-6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑，用于判断`deviceId`，并针对device在MKL-DNN和CPU之间不统一的情况，做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。
-7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
-8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况，所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面，一直保存的是0，所以可以充分利用这个信息，定义一个枚举处理所有MKLDNN的参数格式，从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。
+1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，
+我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
+2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
+3. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。
+包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
+4. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，
+同时数据格式就是`NCHW`，这样下一个cpu device就能拿到正确的数据。
+在有普通的CPU layer时， `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。
 
 ## References
-
-1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
-2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
-3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`，所以不存在这个问题)，所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
+1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。
+主要包括了深度学习相关的数学原语与操作，一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
+2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。
+目前在PaddlePaddle中，仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。
+3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。
+但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
+4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
+所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
 
diff --git a/doc/design/mkldnn/image/engine.png b/doc/design/mkldnn/image/engine.png
new file mode 100644
index 0000000000..1f5f65c2cc
Binary files /dev/null and b/doc/design/mkldnn/image/engine.png differ
diff --git a/doc/design/mkldnn/image/gradients.png b/doc/design/mkldnn/image/gradients.png
new file mode 100644
index 0000000000..f031bcf8e4
Binary files /dev/null and b/doc/design/mkldnn/image/gradients.png differ
diff --git a/doc/design/mkldnn/image/layers.png b/doc/design/mkldnn/image/layers.png
new file mode 100644
index 0000000000..306f79b7a8
Binary files /dev/null and b/doc/design/mkldnn/image/layers.png differ
diff --git a/doc/design/mkldnn/image/matrix.png b/doc/design/mkldnn/image/matrix.png
new file mode 100644
index 0000000000..c33ce9cf03
Binary files /dev/null and b/doc/design/mkldnn/image/matrix.png differ
diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkldnn/image/overview.png
index 84b455c282..8fb7bbb9dd 100644
Binary files a/doc/design/mkldnn/image/overview.png and b/doc/design/mkldnn/image/overview.png differ
diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/design/ops/images/2_level_rnn.dot
index a498e882a3..5d77865061 100644
--- a/doc/design/ops/images/2_level_rnn.dot
+++ b/doc/design/ops/images/2_level_rnn.dot
@@ -1,6 +1,6 @@
 digraph G {
 
-  rnn [label="1-th level RNN" shape=box]
+  rnn [label="1st level RNN" shape=box]
 
   subgraph cluster0 {
     label = "time step 0"
@@ -8,7 +8,7 @@ digraph G {
     sent0 [label="sentence"]
     sent1 [label="sentence"]
 
-    rnn1 [label="2-th level RNN" shape=box]
+    rnn1 [label="2nd level RNN" shape=box]
 
     sent0 -> rnn1
     sent1 -> rnn1
@@ -20,7 +20,7 @@ digraph G {
     sent2 [label="sentence"]
     sent3 [label="sentence"]
 
-    rnn2 [label="2-th level RNN" shape=box]
+    rnn2 [label="2nd level RNN" shape=box]
 
     sent2 -> rnn2
     sent3 -> rnn2
@@ -32,7 +32,7 @@ digraph G {
     sent4 [label="sentence"]
     sent5 [label="sentence"]
 
-    rnn3 [label="2-th level RNN" shape=box]
+    rnn3 [label="2nd level RNN" shape=box]
 
     sent4 -> rnn3
     sent5 -> rnn3
diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000..8b0d90f7b9
Binary files /dev/null and b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/design/ops/rnn.md b/doc/design/ops/rnn.md
index a78eea7d45..2f4854793f 100644
--- a/doc/design/ops/rnn.md
+++ b/doc/design/ops/rnn.md
@@ -1,62 +1,62 @@
 # RNNOp design
 
-This document is about an RNN operator which requires that instances in a mini-batch have the same length.  We will have a more flexible RNN operator.
+This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
 
 ## RNN Algorithm Implementation
 
-<p aligh="center">
+<p align="center">
 <img src="./images/rnn.jpg"/>
 </p>
 
 The above diagram shows an RNN unrolled into a full network.
 
-There are several important concepts:
+There are several important concepts here:
 
-- *step-net*: the sub-graph to run at each step,
-- *memory*, $h_t$, the state of the current step,
-- *ex-memory*, $h_{t-1}$, the state of the previous step,
-- *initial memory value*, the ex-memory of the first step.
+- *step-net*: the sub-graph that runs at each step.
+- *memory*, $h_t$, the state of the current step.
+- *ex-memory*, $h_{t-1}$, the state of the previous step.
+- *initial memory value*, the memory of the first (initial) step.
 
 ### Step-scope
 
-There could be local variables defined in step-nets.  PaddlePaddle runtime realizes these variables in *step-scopes* -- scopes created for each step.
+There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
 
-<p aligh="center">
+<p align="center">
 <img src="./images/rnn.png"/><br/>
-Figure 2 the RNN's data flow
+Figure 2 illustrates the RNN's data flow
 </p>
 
-Please be aware that all steps run the same step-net.  Each step
+Please be aware that every step runs the same step-net.  Each step does the following:
 
-1. creates the step-scope,
-2. realizes local variables, including step-outputs, in the step-scope, and
-3. runs the step-net, which could use these variables.
+1. Creates the step-scope.
+2. Initializes the local variables including step-outputs, in the step-scope.
+3. Runs the step-net, which uses the above mentioned variables.
 
-The RNN operator will compose its output from step outputs in step scopes.
+The RNN operator will compose its output from step outputs in each of the step scopes.
 
 ### Memory and Ex-memory
 
-Let's give more details about memory and ex-memory via a simply example:
+Let's give more details about memory and ex-memory using a simple example:
 
 $$
 h_t = U h_{t-1} + W x_t
 $$,
 
-where $h_t$ and $h_{t-1}$ are the memory and ex-memory of step $t$'s respectively.
+where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.
 
-In the implementation, we can make an ex-memory variable either "refers to" the memory variable of the previous step,
-or copy the value of the previous memory value to the current ex-memory variable.
+In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
+or copy the memory value of the previous step to the current ex-memory variable.
 
 ### Usage in Python
 
 For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
 
-We can define an RNN's step-net using Block:
+We can define an RNN's step-net using a Block:
 
 ```python
 import paddle as pd
 
-X = some_op() # x is some operator's output, and is a LoDTensor
+X = some_op() # x is some operator's output and is a LoDTensor
 a = some_op()
 
 # declare parameters
@@ -68,7 +68,7 @@ with rnn.stepnet():
     x = rnn.add_input(X)
     # declare a memory (rnn's step)
     h = rnn.add_memory(init=a)
-    # h.pre_state() means previous memory of rnn
+    # h.pre_state(), the previous memory of rnn
     new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
     # update current memory
     h.update(new_state)
@@ -80,19 +80,19 @@ out = rnn()
 
 Python API functions in above example:
 
-- `rnn.add_input` indicates the parameter is a variable that will be segmented into step-inputs.
-- `rnn.add_memory` creates a variable used as the memory.
-- `rnn.add_outputs` mark the variables that will be concatenated across steps into the RNN output.
+- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
+- `rnn.add_memory`: creates a variable used as the memory.
+- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
 
 ### Nested RNN and LoDTensor
 
 An RNN whose step-net includes other RNN operators is known as an *nested RNN*.
 
-For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences.
+For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
 
-The following figure illustrates the feeding of text into the lower level, one sentence each step, and the feeding of step outputs to the top level. The final top level output is about the whole text.
+The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
 
-<p aligh="center">
+<p align="center">
 <img src="./images/2_level_rnn.png"/>
 </p>
 
@@ -110,7 +110,7 @@ a = some_op()
 
 # chapter_data is a set of 128-dim word vectors
 # the first level of LoD is sentence
-# the second level of LoD is chapter
+# the second level of LoD is a chapter
 chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
 
 def lower_level_rnn(paragraph):
@@ -138,14 +138,14 @@ with top_level_rnn.stepnet():
         pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
     top_level_rnn.add_outputs(h)
 
-# just output the last step
+# output the last step
 chapter_out = top_level_rnn(output_all_steps=False)
 ```
 
-in above example, the construction of the `top_level_rnn` calls  `lower_level_rnn`.  The input is a LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
+In the above example, the construction of the `top_level_rnn` calls  `lower_level_rnn`.  The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
 
-By default, the `RNNOp` will concatenate the outputs from all the time steps,
-if the `output_all_steps` set to False, it will only output the final time step.
+By default, the `RNNOp` will concatenate the outputs from all the time steps.
+If the `output_all_steps` is set to False, it will only output the final time step.
 
 
 <p align="center">
diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
new file mode 100644
index 0000000000..9db5fb8e9a
--- /dev/null
+++ b/doc/design/ops/sequence_decoder.md
@@ -0,0 +1,229 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and visual captioning,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
+
+This documentation describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
+
+In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users.
+
+There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
+
+During the refactoring of PaddlePaddle, some new concepts are proposed such as:  [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
+
+For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`;
+the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
+
+## Changing LoD's absolute offset to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.
+
+The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
+let's call this format the **absolute-offset LoD** for clarity.
+
+The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
+
+There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix.
+
+So let's introduce another format of LoD,
+it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
+
+For example, to represent the same sequences of the above data
+
+```python
+[[0, 3, 6]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences,
+their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
+
+The second level is the same with the relative offset example because the lower level is a tensor.
+It is easy to find out the second sequence in the first-level LoD has two empty sequences.
+
+The following examples are based on relative-offset LoD.
+
+## Usage in a simple machine translation model
+Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+    name='source_language_word',
+    type=pd.data.integer_value_sequence(source_dict_dim))
+src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+    encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+    decoder = pd.while_loop()
+    with decoder.step():
+        decoder_mem = decoder.memory(init=encoder_ctx)  # mark the memory
+        generated_ids = decoder.memory() # TODO init to batch_size <s>s
+        generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
+
+        target_word = pd.lookup(trg_embedding, gendrated_ids)
+        # expand encoder_ctx's batch to fit target_word's lod
+        # for example
+        # decoder_mem.lod is
+        # [[0 1 3],
+        #  [0 1 3 6]]
+        # its tensor content is [a1 a2 a3 a4 a5]
+        # which means there are 2 sentences to translate
+        #   - the first sentence has 1 translation prefixes, the offsets are [0, 1)
+        #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+        # the target_word.lod is
+        # [[0, 1, 6]
+        #  [0, 2, 4, 7, 9 12]]
+        # which means 2 sentences to translate, each has 1 and 5 prefixes
+        # the first prefix has 2 candidates
+        # the following has 2, 3, 2, 3 candidates
+        # the encoder_ctx_expanded's content will be
+        # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+        decoder_input = pd.fc(
+            act=pd.activation.Linear(),
+            input=[target_word, encoder_ctx],
+            size=3 * decoder_dim)
+        gru_out, cur_mem = pd.gru_step(
+            decoder_input, mem=decoder_mem, size=decoder_dim)
+        scores = pd.fc(
+            gru_out,
+            size=trg_dic_size,
+            bias=None,
+            act=pd.activation.Softmax())
+        # K is an config
+        topk_scores, topk_ids = pd.top_k(scores, K)
+        topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+        selected_ids, selected_generation_scores = decoder.beam_search(
+            topk_ids, topk_generated_scores)
+
+        # update the states
+        decoder_mem.update(cur_mem)  # tells how to update state
+        generated_ids.update(selected_ids)
+        generated_scores.update(selected_generation_scores)
+
+        decoder.output(selected_ids)
+        decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
+returns the result of the beam search algorithm.
+
+In this way, users can customize anything on the input or output of beam search, for example:
+
+1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
+2. Remove some specific candidate in `selected_ids`.
+3. Get the final `translation_ids`, remove the translation sequence in it.
+
+The implementation of sequence decoder can reuse the C++ class:  [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the python syntax is quite similar to that of an  [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`:
+
+- The first level represents `batch_size` of (source) sentences.
+- The second level represents the candidate ID sets for translation prefix.
+
+For example, 3 source sentences to translate, and has 2, 3, 1 candidates.
+
+Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state:
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state is stored in `encoder_ctx_expanded`:
+
+* LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
+* the content is
+  - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
+  - a2 a2
+  - b1 b1 b1
+  - b2
+  - b3 b3
+  - None (c1 has 0 candidates, so c1 is dropped)
+
+The benefit from the relative offset LoD is that the empty candidate set can be represented naturally.
+
+The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is:
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences.
+
+Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
+
+Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
+
+## LoD and shape changes during decoding
+<p align="center">
+  <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
+</p>
+
+According to the image above, the only phase that changes the LoD is beam search.
+
+## Beam search design
+The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
+
+1. `topk_ids`, the top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`
+3. `generated_scores`, the score of the prefixes.
+
+All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
+
+It will return three variables:
+
+1. `selected_ids`, the final candidate beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
+
+The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors.
+It needs some extensions to support the packing or unpacking an array of `LoDTensors`.
diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md
index 320dccec3d..2cd4b6225b 100644
--- a/doc/design/reader/README.md
+++ b/doc/design/reader/README.md
@@ -1,25 +1,25 @@
 # Python Data Reader Design Doc
 
-At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that
+During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following:
 
-- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
-- A *reader creator* is a function that returns a reader function.
-- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
-- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items.
+- A *reader creator*: A function that returns a reader function.
+- A *reader decorator*: A function, which takes in one or more readers, and returns a reader.
+- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
 
-and provide function which converts reader to batch reader, frequently used reader creators and reader decorators.
+and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators.
 
 ## Data Reader Interface
 
-Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`):
+*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows:
 
 ```
 iterable = data_reader()
 ```
 
-Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int)
+The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.)
 
-An example implementation for single item data reader creator:
+An example implementation for single item data reader creator is as follows:
 
 ```python
 def reader_creator_random_image(width, height):
@@ -29,7 +29,7 @@ def reader_creator_random_image(width, height):
     return reader
 ```
 
-An example implementation for multiple item data reader creator:
+An example implementation for multiple item data reader creator is as follows:
 ```python
 def reader_creator_random_image_and_label(width, height, label):
     def reader():
@@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label):
 
 ## Batch Reader Interface
 
-*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
+*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple.
+
+Here are some valid outputs:
 
-Here are valid outputs:
 ```python
 # a mini batch of three data items. Each data item consist three columns of data, each of which is 1.
 [(1, 1, 1),
@@ -58,20 +59,22 @@ Here are valid outputs:
 Please note that each item inside the list must be a tuple, below is an invalid output:
 ```python
  # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
- # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
- # or three column of datas, each of which is 1.
+ # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
 [[1,1,1],
 [2,2,2],
 [3,3,3]]
 ```
 
-It's easy to convert from reader to batch reader:
+It is easy to convert from a reader to a batch reader:
+
 ```python
 mnist_train = paddle.dataset.mnist.train()
 mnist_train_batch_reader = paddle.batch(mnist_train, 128)
 ```
 
-Also easy to create custom batch reader:
+It is also straight forward to create a custom batch reader:
+
 ```python
 def custom_batch_reader():
     while True:
@@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader
 
 ## Usage
 
-batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:
+Following is how we can use the reader with PaddlePaddle:
+The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows:
 
 ```python
 # two data layer is created:
@@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
 
 ## Data Reader Decorator
 
-*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
+The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax.
 
-Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples:
+Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples:
 
 ### Prefetch Data
 
-Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data.
+Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data.
 
 Use `paddle.reader.buffered` to prefetch data:
 
@@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
 
 ### Compose Multiple Data Readers
 
-For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
 
-We can do:
+We can do the following :
 
 ```python
 def reader_creator_random_image(width, height):
@@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False)
 
 reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
 # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
-# And we don't care second item at this time.
+# And we don't care about the second item at this time.
 paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
 ```
 
 ### Shuffle
 
-Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read.
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
 
 Example:
 ```python
@@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
 
 ## Q & A
 
-### Why reader return only a single entry, but not a mini batch?
+### Why does a reader return only a single entry, and not a mini batch?
 
-Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
+Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2).
 
-We provide function `paddle.batch` to turn (single entry) reader into batch reader.
+We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader.
 
-### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient?
+### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ?
 
-In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically.
+In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful.
 
-### Why use a dictionary but not a list to provide mapping?
+### Why use a dictionary instead of a list to provide mapping?
 
-We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`).
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).
 
-### How to create custom data reader creator
+### How to create a custom data reader creator ?
 
 ```python
 def image_reader_creator(image_path, label_path, n):
@@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
 
 ### How is `paddle.train` implemented
 
-An example implementation of paddle.train could be:
+An example implementation of paddle.train is:
 
 ```python
 def train(batch_reader, mapping, batch_size, total_pass):
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md
index ac7e98ccf1..d9fe7d6bbb 100644
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -2,106 +2,70 @@
 
 ## Abstract
 
-PaddlePaddle v0.10.0 uses the "trainer-parameter server"
-architecture. We run multiple replicated instances of trainers (runs
-the same code written by the user) and parameter servers for
-distributed training. This architecture served us well, but has some
-limitations:
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations:
 
-1. Need to write special code to handle tasks which should only be run
-  by a single trainer. E.g., initializing model and saving model.
+1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
 
-2. Model parallelism is hard: need to write if-else branches conditioned
-  on the trainer ID to partition model onto each trainer, and manually
-  write the inter-model-shard communication code.
+2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
 
-3. The user can not directly specify the parameter update rule: need
-   to modify the parameter server C++ code and compile a new
-   binary. This adds complication for researchers: A lot of extra
-   effort is required. Besides, the training job submission program
-   may not allow running arbitrary binaries.
+3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
 
-This design doc discusses PaddlePaddle's new distributed training
-architecture that addresses the above limitations.
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
 
 ## Analysis
 
-We will assume the user writes the trainer program by Python, the same
-analysis holds if the trainer program is written in C++.
+The assumption is that the user writes the trainer program in either Python or C++.
 
 ### Limitation 1
 
-If we look at the Python code that the user writes, there are two
-kinds of functionalities:
+There are two basic functionalities in the trainer program:
 
-- The training logic such as load / save model and print log.
-- The neural network definition such as the definition of the data
-  layer, the fully connected layer, the cost function and the
+1. The training logic such as loading / saving the model and printing out the logs.
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
   optimizer.
 
-When we training with PaddlePaddle v0.10.0 distributedly, multiple
-replicated Python instances are running on different nodes: both the
-training logic and the neural network computation is replicated.
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the
+training logic as well as the neural network computation logic, is replicated.
 
-The tasks that should only run once all belong to the training logic,
-if we only replicate the neural network computation, but do **not**
-replicate the training logic, the limitation could be solved.
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
 
 ### Limitation 2
 
-Model parallelism means running a single model on multiple nodes by
-partitioning the model onto different nodes and managing the
-inter-model-shard communications.
+Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.
 
-PaddlePaddle should be able to modify the nerual network computation
-definition to support model parallelism automatically. However, the
-computation is only specified in Python code, and PaddlePaddle can not
-modify Python code.
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
 
-Just like compiler uses a intermediate representation (IR) so that
-programmer does not need to manually optimize their code in most of
-the cases - the compiler will optimize the IR:
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
 
 <img src="src/compiler.png"/>
 
-We can have our own IR too: PaddlePaddle can support model parallel by
-converting the IR so the user no longer need to manually do it in
-Python:
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
 
 <img src="src/paddle-compile.png"/>
 
-The IR for PaddlePaddle after refactor is called `Block`, it specifies
-the computation dependency graph and the variables used in the
-computation.
+The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
 
 ### Limitation 3
 
-The user can not directly specify the parameter update rule for the
-parameter server because the parameter server does not use the same
-computation definition as the trainer. Instead, the update rule is
-baked in the parameter server. The user can not specify the update
-rule in the same way of specifying the trainer computation.
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
 
-This could be fixed by making the parameter server run the same
-computation definition as the trainer. For a detailed explanation,
-please
-see
-[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
+This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
+[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
 
 ## Distributed Training Architecture
 
-The new distributed training architecture can address the above
-limitations. Below is the illustration:
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
 
 <img src="src/distributed_architecture.png"/>
 
-The architecture includes major components: *PaddlePaddle Python*,
-*PaddlePaddle converter* and *PaddlePaddle runtime*:
+The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
 
 ### PaddlePaddle Python
 
-PaddlePaddle Python is the Python library that user's Python trainer
-invoke to build the neural network topology, start training, etc.
+PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc.
 
 ```Python
 paddle.init()
@@ -117,102 +81,60 @@ for i in range(1000):
 	print cost_val
 ```
 
-The code above is a typical Python trainer code, the neural network
-topology is built using helper functions such as
-`paddle.layer.fc`. The training is done by calling `session.eval`
-iteratively.
+The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
 
 #### session.eval
 
-As shown in the graph, `session.eval` sends the IR and the evaluation
-inputs/targets to the PaddlePaddle cluster for evaluation. The
-targets can be any variable in the computation graph. When the target
-is the `optimizer` variable, the neural network will be optimized
-once. When the target is the `cost` variable, `session.eval` returns
-the cost value.
+As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
+The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
 
-The Python `session` is a wrapper of the C++ `Session` class. For more
-information about `Session`, please
-see [Design Doc: Session](./session.md).
+The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
 
 ### PaddlePaddle Converter
 
-PaddlePaddle converter automatically converts the IR in the request
-(IR and evaluation inputs/targets) from PaddlePaddle Python to new
-partitioned IRs and dispatch the new IRs and evaluation inputs/targets
-to different PaddlePaddle runtimes. Below are the steps:
+The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
 
-1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
-   fetches the eval targets to the IR.
+1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
 
-1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
-   the boundary. The runtime does not need to run the OP that is not
-   dependent by the `fetch` OP.
+2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
 
-1. Optimizes the computation graph.
+3. Optimize the computation graph.
 
-1. Place the OPs in the graph onto different devices on different
-   PaddlePaddle runtime according to a placement algorithm and device
-   constraint specified by the user.
+4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
 
-1. Partition the graph according to runtime boundaries and add `send` /
-   `recv` OP pair on the runtime boundaries.
+5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
 
-1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+
+7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
 
-1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
-   results back to the converter, the convert reports the evaluation
-   results back to the PaddlePaddle Python.
-   
 The output IRs will be cached to optimize the conversion latency.
 
 
 #### Placement Algorithm
 
-Our first implementation will only support "trainer-parameter server"
-placement: the parameters, initializers, and optimizers are placed on
-the PaddlePaddle runtimes with the parameter server role. And
-everything else will be placed on the PaddlePaddle runtimes with the
-trainer role. This has the same functionality of our
-"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
-is more general and flexible.
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
 
-In the future, we will implement the general placement algorithm,
-which makes placements according to the input IR, and a model of
-device computation time and device communication time. Model
-parallelism requires the general placement algorithm.
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
 
 
 ### PaddlePaddle Runtime
 
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
-runs the IR. The runtime does not need to do OP placement since it's
-already done by the converter.
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
 
 
 ### Local Training Architecture
 
-The local training architecture will be the same as the distributed
-training architecture, the differences are everything runs locally,
-and there is just one PaddlePaddle runtime:
+The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
 
 <img src="src/local_architecture.png"/>
 
 
 ### Training Data
 
-In PaddlePaddle v0.10.0, training data is typically read
-with [data reader](../reader/README.md) from Python. This approach is
-no longer efficient when training distributedly since the Python
-process no longer runs on the same node with the trainer processes,
-the Python reader will need to read from the distributed filesystem
-(assuming it has the access) and send to the trainers, doubling the
-network traffic.
-
-When doing distributed training, the user can still use Python data
-reader: the training data are sent with `session.eval`. However should
-be used for debugging purpose only. The users are encouraged to use
-the read data OPs.
+In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
+
+When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
 
 
 ## References:
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
index 62ff8f3229..14c081ea84 100644
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -5,8 +5,9 @@ PaddlePaddle使用git-flow branching model做分支管理，使用[Semantic Vers
 PaddlePaddle每次发新的版本，遵循以下流程:
 
 1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
-2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
-3. 对这个版本的提交，做如下几个操作:
+1. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
+1. 对这个版本的提交，做如下几个操作:
+	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
 	* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
 	* 编译这个版本的Ubuntu Deb包。如果失败，修复Ubuntu Deb包编译问题，Patch号加一，返回第二步。
 	* 使用Regression Test List作为检查列表，测试Docker镜像/ubuntu安装包的功能正确性
@@ -20,9 +21,9 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 			pip install twine
 			twine upload dist/[package to upload]
 			```
-4. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-5. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
-6. 协同完成Release Note的书写
+1. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
+1. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
+1. 协同完成Release Note的书写
 
 
 需要注意的是:
@@ -30,7 +31,7 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 * `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试PaddlePaddle的行为。
 * 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
 
-# PaddlePaddle 分支规范
+## PaddlePaddle 分支规范
 
 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
 
@@ -47,11 +48,11 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 
 * BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
 
-# PaddlePaddle回归测试列表
+## PaddlePaddle回归测试列表
 
 本列表说明PaddlePaddle发版之前需要测试的功能点。
 
-## PaddlePaddle Book中所有章节
+### PaddlePaddle Book中所有章节
 
 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
 
diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst
index 0e939a2671..b331d9d36e 100644
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
@@ -99,7 +99,7 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 利用更多的计算资源
 ++++++++++++++++++
 
-利用更多的计算资源可以分为一下几个方式来进行\:
+利用更多的计算资源可以分为以下几个方式来进行\:
 
 * 单机CPU训练
 
diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst
deleted file mode 100644
index b473944fc7..0000000000
--- a/doc/getstarted/basic_usage/index_cn.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-经典的线性回归任务
-==================
-
-PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
-
-任务简介
---------
-
-我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
-
-一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。
-
-准备数据
------------
-
-假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
-
-.. code-block:: python
-
-    # dataprovider.py
-    from paddle.trainer.PyDataProvider2 import *
-    import random
-
-    # 定义输入数据的类型: 2个浮点数
-    @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-    def process(settings, input_file):
-        for i in xrange(2000):
-            x = random.random()
-            yield [x], [2*x+0.3]
-
-训练模型
------------
-
-为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
-
-在PaddlePaddle里，该模型的网络配置如下。
-
-.. code-block:: python
-
-    # trainer_config.py
-    from paddle.trainer_config_helpers import *
-
-    # 1. 定义数据来源，调用上面的process函数获得观测数据
-    data_file = 'empty.list'
-    with open(data_file, 'w') as f: f.writelines(' ')
-    define_py_data_sources2(train_list=data_file, test_list=None, 
-                            module='dataprovider', obj='process',args={})
-
-    # 2. 学习算法。控制如何改变模型参数 w 和 b
-    settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-    # 3. 神经网络配置
-    x = data_layer(name='x', size=1)
-    y = data_layer(name='y', size=1)
-    # 线性计算网络层: ȳ = wx + b
-    ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-    # 计算误差函数，即  ȳ 和真实 y 之间的距离
-    cost = square_error_cost(input= ȳ, label=y)
-    outputs(cost)
-
-
-这段简短的配置展示了PaddlePaddle的基本用法：
-
-- 第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的 `process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
-
-- 第二部分主要是选择学习算法，它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法，这里使用一个基于momentum的随机梯度下降(SGD)算法，该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。
-
-- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层，所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元：
-    
-    - **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
-    - **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
-    - **回归误差代价层**：回归误差代价层 `square_error_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
-
-定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
-
-.. code-block:: bash
-
-    paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
-
-PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `
-
-模型检验
------------
-
-训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
-
-PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件，所以可以利用如下方法读取模型的参数。
-
-.. code-block:: python
-
-    import numpy as np
-    import os
-
-    def load(file_name):
-        with open(file_name, 'rb') as f:
-            f.read(16) # skip header for float type.
-            return np.fromfile(f, dtype=np.float32)
-        
-    print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-    # w=1.999743, b=0.300137
-
-.. image:: ./parameters.png
-     :align: center
-     :scale: 80 %
-
-从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。
-
-这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
deleted file mode 100644
index 2cc438ebbe..0000000000
--- a/doc/getstarted/basic_usage/index_en.rst
+++ /dev/null
@@ -1,101 +0,0 @@
-Simple Linear Regression
-========================
-
-PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
-
-Problem Background
-------------------
-
-Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
-
-Prepare the Data
------------------
-
-Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
-
-    .. code-block:: python
-
-        # dataprovider.py
-        from paddle.trainer.PyDataProvider2 import *
-        import random
-
-        # define data types of input: 2 real numbers
-        @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-        def process(settings, input_file):
-            for i in xrange(2000):
-                x = random.random()
-                yield [x], [2*x+0.3]
-
-Train a NeuralNetwork
-----------------------
-
-To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
-
-    .. code-block:: python
-
-        # trainer_config.py
-        from paddle.trainer_config_helpers import *
-
-        # 1. read data. Suppose you saved above python code as dataprovider.py
-        data_file = 'empty.list'
-        with open(data_file, 'w') as f: f.writelines(' ')
-        define_py_data_sources2(train_list=data_file, test_list=None, 
-                module='dataprovider', obj='process',args={})
-
-        # 2. learning algorithm
-        settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-        # 3. Network configuration
-        x = data_layer(name='x', size=1)
-        y = data_layer(name='y', size=1)
-        y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-        cost = square_error_cost(input=y_predict, label=y)
-        outputs(cost)
-
-Some of the most fundamental usages of PaddlePaddle are demonstrated:
-
--  The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
-
--  The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
-
--  Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
-	-  **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``.
-	-  **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
-	-  **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters.
-
-Now that everything is ready, you can train the network with a simple command line call:
-
-    .. code-block:: bash
- 
-        paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
- 
-
-This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
-
-
-Evaluate the Model
--------------------
-
-Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.
-
-In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass.
-
-    .. code-block:: python
-
-        import numpy as np
-        import os
-
-        def load(file_name):
-            with open(file_name, 'rb') as f:
-                f.read(16) # skip header for float type.
-                return np.fromfile(f, dtype=np.float32)
-                
-        print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-        # w=1.999743, b=0.300137
-
-    .. image:: parameters.png
-        :align: center
-
-Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
-
-There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.
diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png
deleted file mode 100644
index 2ec6748095..0000000000
Binary files a/doc/getstarted/basic_usage/parameters.png and /dev/null differ
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
new file mode 100644
index 0000000000..c875c807b8
--- /dev/null
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -0,0 +1,141 @@
+从源码编译
+======================
+
+.. _build_step:
+
+编译方法
+----------------
+
+PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
+我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+
+编译PaddlePaddle，需要执行：
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   # 如果不使用Docker编译环境，执行下面的命令
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+
+.. _run_test:
+
+执行单元测试
+----------------
+
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+
+使用Docker的情况下，设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+
+如果不使用Docker，可以执行ctest命令即可：
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+   ctest
+   # 指定执行其中一个单元测试 test_mul_op
+   ctest -R test_mul_op
+
+.. _compile_deps:
+
+编译依赖
+----------------
+
+PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.5", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+编译选项
+----------------
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如：
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "ON"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
+    "WITH_MKL", "是否使用MKL数学库，如果为否则是用OpenBLAS", "ON"
+
+BLAS
++++++
+
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
+还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+
+如果关闭MKL，则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构，加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（** :code:`rm -rf` ）**后，再指定。**
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
deleted file mode 100644
index 2f14614894..0000000000
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ /dev/null
@@ -1,236 +0,0 @@
-Installing from Sources
-==========================
-
-* [1. Download and Setup](#download)
-* [2. Requirements](#requirements)
-* [3. Build on Ubuntu](#ubuntu)
-* [4. Build on Centos](#centos)
-
-
-## <span id="download">Download and Setup</span> 
-You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
-
-```bash
-git clone https://github.com/PaddlePaddle/Paddle paddle
-cd paddle
-```
-## <span id="requirements">Requirements</span>
-
-To compile the source code, your computer must be equipped with the following dependencies.
-
-- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
-- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
-- **BLAS**: MKL, OpenBlas or ATLAS
-- **Python**: only support Python 2.7
-- **Go**
-
-**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
-For CUDA 8.0, GCC versions later than 5.3 are not supported!
-
-### Options
-
-PaddlePaddle supports some build options. 
-
-<html>
-<table> 
-<thead>
-<tr>
-<th scope="col" class="left">Optional</th>
-<th scope="col" class="left">Description</th>
-</tr>
-</thead>
-<tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
-<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
-<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
-<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
-<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
-<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
-<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
-<tr><td class="left">WITH_COVERAGE</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
-<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
-<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
-</tbody>
-</table>
-</html>
-
-**Note:**
-  - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5.
-  - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported.
-  - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.**
-
-As a simple example, consider the following:  
-
-1. **BLAS Dependencies(optional)**
-  
-    CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically.
-    To utilize preinstalled BLAS， you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
-
-    ```bash
-    # specify MKL
-    cmake .. -DMKL_ROOT=<mkl_path>
-    # or specify OpenBLAS
-    cmake .. -DOPENBLAS_ROOT=<openblas_path>
-    ```
-
-2. **Doc Dependencies(optional)**
-
-    To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows:
-
-    ```bash
-    pip install 'sphinx>=1.4.0'
-    pip install sphinx_rtd_theme recommonmark
-
-    # install doxygen on Ubuntu
-    sudo apt-get install doxygen 
-    # install doxygen on Mac OS X
-    brew install doxygen
-
-    # active docs in cmake
-    cmake .. -DWITH_DOC=ON`
-    ```
-
-## <span id="ubuntu">Build on Ubuntu 14.04</span>
-
-### Install Dependencies
-
-- **Paddle Dependencies**
-
-    ```bash
-    # necessary
-    sudo apt-get update
-    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
-    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
-    sudo pip install 'protobuf==3.1.0.post1'
-
-    # Install Go
-    # You can follow https://golang.org/doc/install for a detailed explanation.
-    wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
-    tar -C $HOME -xzf go.tgz && \
-    mkdir $HOME/gopath && \
-    rm go.tgz
-
-    # Setup environment variables
-    export GOROOT=$HOME/go
-    export GOPATH=$HOME/gopath
-    export PATH=$PATH:$GOROOT/bin
-
-    # install cmake 3.4
-    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
-        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
-        cd .. && rm -rf cmake-3.4.1
-    ```
-
-- **GPU Dependencies (optional)**
-
-    To build GPU version, you will need the following installed:
-
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a GCC compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
-
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-        
-    After downloading cuDNN library, issue the following commands:
-
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-
-### Build and Install
-
-As usual, the best option is to create build folder under paddle project directory.
-
-```bash
-mkdir build && cd build
-``` 
-
-Finally, you can build and install PaddlePaddle:
-
-```bash
-# you can add build option here, such as:    
-cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
-
-## <span id="centos">Build on Centos 7</span>
-
-### Install Dependencies
-
-- **CPU Dependencies**
-
-    ```bash
-    # necessary
-    sudo yum update
-    sudo yum install -y epel-release
-    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
-    sudo pip install wheel numpy
-    sudo pip install 'protobuf>=3.0.0'
-    ```
-  
-- **GPU Dependencies (optional)**
-
-    To build GPU version, you will need the following installed:
-
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a GCC compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
-
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-        
-    After downloading cuDNN library, issue the following commands:
-
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-
-### Build and Install
-
-As usual, the best option is to create build folder under paddle project directory.
-
-```bash
-mkdir build && cd build
-``` 
-
-Finally, you can build and install PaddlePaddle:
-  
-```bash
-# you can add build option here, such as:    
-cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
new file mode 100644
index 0000000000..f194f84ce7
--- /dev/null
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -0,0 +1,159 @@
+Build from Sources
+==========================
+
+.. _build_step:
+
+How To Build
+----------------
+
+PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
+tools. We recommend you to use our pre-built Docker image to run the build
+to avoid installing dependencies by yourself. We have several build environment
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
+
+If you choose not to use Docker image for your build, you need to install the
+below `Compile Dependencies`_ before run the build.
+
+Then run:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # run the following command to build a CPU-Only binaries if you are using docker
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   # else run these commands
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+When the compile finishes, you can get the output whl package under
+build/python/dist, then you can choose to install the whl on local
+machine or copy it to the target machine.
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, you may follow the below steps:
+
+When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build.
+Set :code:`WITH_GPU=ON` Can also run tests on GPU.
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
+
+If you don't use Docker, just run ctest will start the tests:
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # run a single test like test_mul_op
+   ctest -R test_mul_op
+
+
+.. _compile_deps:
+
+Compile Dependencies
+----------------
+
+PaddlePaddle need the following dependencies when compiling, other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.5", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Build Options
+----------------
+
+Build options include whether build binaries for CPU or GPU, which BLAS
+library to use etc. You may pass these settings when running cmake.
+For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。
+
+.. _build_options_bool:
+
+Bool Type Options
+----------------
+
+You can add :code:`-D` argument to pass such options, like:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool Type Options
+    :header: "Option", "Description", "Default"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "Build with GPU support", "ON"
+    "WITH_C_API", "Build only CAPI", "OFF"
+    "WITH_DOUBLE", "Build with double precision", "OFF"
+    "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+    "WITH_AVX", "Build with AVX support", "ON"
+    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+    "WITH_STYLE_CHECK", "Check code style when building", "ON"
+    "WITH_TESTING", "Build unit tests", "ON"
+    "WITH_DOC", "Build documentations", "OFF"
+    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
++++++
+
+PaddlePaddle supports `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
+`OpenBlAS <http://www.openblas.net/>`_ as BLAS library。By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used, for more `details <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ .
+
+If you choose not to use MKL, then OpenBlAS will be used.
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture
+automatically in order to speed up the build.
+
+PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to
+keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN
+you built.
+
+Pass Compile Options
+++++++++++++++
+
+You can pass compile options to use intended BLAS/CUDA/Cudnn libraries.
+When running cmake command, it will search system paths like
+:code:`/usr/lib:/usr/local/lib` and then search paths that you
+passed to cmake, i.e.
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.**
diff --git a/doc/getstarted/build_and_install/cmake.png b/doc/getstarted/build_and_install/cmake.png
deleted file mode 100644
index a58cd09ad9..0000000000
Binary files a/doc/getstarted/build_and_install/cmake.png and /dev/null differ
diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
deleted file mode 100644
index be0c1ffa45..0000000000
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-PaddlePaddle的编译选项
-======================
-
-PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-
-Bool型的编译选项
-----------------
-用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-..  csv-table:: Bool型的编译选项
-    :widths: 1, 7, 2
-    :file: compile_options.csv
-
-BLAS/CUDA/Cudnn的编译选项
---------------------------
-BLAS
-+++++
-
-PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
-
-..  csv-table:: BLAS路径相关的编译选项
-    :widths: 1, 2, 7
-    :file: cblas_settings.csv
-
-CUDA/Cudnn
-+++++++++++
-
-PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
-
-..  code-block:: bash
-
-    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
-
-注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv
deleted file mode 100644
index a6356baf16..0000000000
--- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-编译选项,描述,注意
-MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h，${MKL_ROOT}/lib目录下需要包含mkl_core，mkl_sequential和mkl_intel_lp64三个库。
-ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h，${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。
-OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h，${OPENBLAS_ROOT}/lib下需要包含openblas库。
-REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h，${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv
deleted file mode 100644
index 463b825470..0000000000
--- a/doc/getstarted/build_and_install/cmake/compile_options.csv
+++ /dev/null
@@ -1,12 +0,0 @@
-选项,说明,默认值
-WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
-WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA,否
-WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
-WITH_DOC,是否编译中英文文档,否
-WITH_SWIG_PY,是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 30b144d849..f78b1fb0e1 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -1,222 +1,139 @@
-PaddlePaddle的Docker容器使用方式
+使用Docker安装运行
 ================================
 
-PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行。 请注意，您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
+使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。
 
-Docker使用入门
-------------------------------
-
-几个基础的概念帮助理解和使用Docker：
+如果您在使用Windows，可以参考
+`这篇 <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+教程，完成在Windows上安装和使用Docker。
 
-- *镜像*：一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行：
+在了解Docker的基本使用方法之后，即可开始下面的步骤：
 
-  .. code-block:: bash
+.. _docker_pull:
 
-     docker images
+获取PaddlePaddle的Docker镜像
+------------------------------
 
-  来列出当前系统中的所有镜像，同样可以执行：
+执行下面的命令获取最新的PaddlePaddle Docker镜像
 
   .. code-block:: bash
-		  
-     docker pull paddlepaddle/paddle:0.10.0
 
-  来下载Docker镜像，paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的，推荐国内用户使用docker.paddlepaddle.org/paddle下载。
+     docker pull paddlepaddle/paddle
 
-- *容器*： 如果说一个Docker镜像就是一个程序，那容器就是这个程序运行时产生的“进程”。
-  实际上，一个容器就是一个操作系统的进程，但是是运行在独立的进程空间，文件系统以及网络之上。
-  可以执行：
+对于国内用户，我们提供了加速访问的镜像源：
 
   .. code-block:: bash
 
-     docker run paddlepaddle/paddle:0.10.0
+     docker pull docker.paddlepaddle.org/paddle
 
-  来使用一个镜像启动一个容器。
-
-- 默认情况下，Docker容器会运行在独立的文件系统空间之上，我们无法在Docker容器中
-  访问到主机上的文件。可以通过*挂载Volume*的方式，将主机上的文件或目录挂载到
-  Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下，容器使用
-  debian镜像，并且启动后执行 :code:`ls /data`。
+下载GPU版本的Docker镜像：
 
   .. code-block:: bash
 
-     docker run --rm -v $(pwd):/data debian ls /data
-
-PaddlePaddle发布的Docker镜像使用说明
-------------------------------
-
-我们把PaddlePaddle的编译环境打包成一个镜像，称为开发镜像，里面涵盖了
-PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜
-像，称为生产镜像，里面涵盖了PaddlePaddle运行所需的所有环境。每次
-PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运
-行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
-`dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 
-和国内镜像`docker.paddlepaddle.org` 提供最新
-的Docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。
-
-**注意：为了方便在国内的开发者下载Docker镜像，我们提供了国内的镜像服务器供大家使用。如果您在国内，请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。**
-
-1. 开发镜像：:code:`paddlepaddle/paddle:0.10.0-dev`
-
-   这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
-   文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
-   开发镜像包含了以下工具：
-   
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-   很多开发者会使用远程的安装有GPU的服务器工作，用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作，
-   也可以在开发镜像中启动一个SSHD服务，方便开发者直接登录到镜像中进行开发:
-
-   以交互容器方式运行开发镜像：
-
-   .. code-block:: bash
-
-      docker run -it --rm -v $(pwd):/paddle  paddlepaddle/paddle:0.10.0-dev /bin/bash
-
-   或者，可以以后台进程方式运行容器：
-
-   .. code-block:: bash
-
-      docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D
-
-   然后用密码 :code:`root` SSH进入容器：
-
-   .. code-block:: bash
-
-      ssh -p 2202 root@localhost
-
-   SSH方式的一个优点是我们可以从多个终端进入容器。比如，一个终端运行vi，另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上，并在笔记本上通过SSH与其连接。
-
-2. 生产镜像：根据CPU、GPU和非AVX区分了如下4个镜像：
-
-   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
-
-   纯CPU镜像以及GPU镜像都会用到AVX指令集，但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX：
-
-   .. code-block:: bash
-
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-   如果输出是No，就需要选择使用no-AVX的镜像
-
-   **注：在0.10.0之后的版本，PaddlePaddle都可以自动判断硬件是否支持AVX，所以无需判断AVX即可使用**
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddle.org/paddle:latest-gpu
 
-   以上方法在GPU镜像里也能用，只是请不要忘记提前在物理机上安装GPU最新驱动。
-   为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。
+选择下载使用不同的BLAS库的Docker镜像：
 
-   .. code-block:: bash
-
-      nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash
+  .. code-block:: bash
 
-   注意: 如果使用nvidia-docker存在问题，你也许可以尝试更老的方法，具体如下，但是我们并不推荐这种方法。：
+     # 默认是使用MKL的镜像
+     docker pull paddlepaddle/paddle
+     # 使用OpenBLAS的镜像
+     docker pull paddlepaddle/paddle:latest-openblas
 
-   .. code-block:: bash
+下载指定版本的Docker镜像，可以从 `DockerHub网站 <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 获取可选的tag，并执行下面的命令：
 
-      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu
+  .. code-block:: bash
 
-3. 运行以及发布您的AI程序
+     docker pull paddlepaddle/paddle:[tag]
+     # 比如：
+     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
 
-   假设您已经完成了一个AI训练的python程序 :code:`a.py`，这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行：
+.. _docker_run:
 
-   .. code-block:: bash
+在Docker中执行PaddlePaddle训练程序
+------------------------------
 
-      docker run -it -v $PWD:/work paddle /work/a.py
+假设您已经在当前目录（比如在/home/work）编写了一个PaddlePaddle的程序 :code:`train.py` （可以参考
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+编写），就可以使用下面的命令开始执行训练：
 
-   如果要使用GPU，请运行：
+  .. code-block:: bash
 
-   .. code-block:: bash
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+ 
+上述命令中， :code:`-it` 参数说明容器已交互式运行； :code:`-v $PWD:/work`
+指定将当前路径（Linux中$PWD变量会展开为当前路径的绝对路径）挂载到容器内部的 :code:`/work`
+目录； :code:`paddlepaddle/paddle` 指定需要使用的容器； 最后 :code:`/work/train.py`
+为容器内执行的命令，即运行训练程序。
 
-      nvidia-docker run -it -v $PWD:/work paddle /work/a.py
+当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
 
+  .. code-block:: bash
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
 
-   这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像，可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0`
-   创建和发布自己的AI程序镜像。
+**注：PaddlePaddle Docker镜像为了减小体积，默认没有安装vim，您可以在容器中执行** :code:`apt-get install -y vim` **安装后，在容器中编辑代码。**
 
-运行PaddlePaddle Book
----------------------
+.. _docker_run_book:
 
-Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
+使用Docker启动PaddlePaddle Book教程
+------------------------------
 
-PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Nodebook。
+使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook，可以通过网页浏览。
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
 如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
+大家可以通过它阅读教程，或者制作和分享带有代码、公式、图表、文字的交互式文档。
 
 我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
 
-.. code-block:: bash
+  .. code-block:: bash
 
-    docker run -p 8888:8888 paddlepaddle/book
+     docker run -p 8888:8888 paddlepaddle/book
 
 然后在浏览器中输入以下网址：
 
-.. code-block:: text
+  .. code-block:: text
 
-    http://localhost:8888/
+     http://localhost:8888/
 
 就这么简单，享受您的旅程！
 
-通过Docker容器开发PaddlePaddle
-------------------------------
-
-开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux，Mac OS X和Windows。
+.. _docker_run_gpu:
 
-1. 制作PaddlePaddle开发镜像
-
-   PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。
-   生成Docker镜像的方式有两个，一个是直接把一个容器转换成镜像，另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷，适合自己实验，可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚，其他人很容易看懂镜像生成过程，持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行：
-
-   .. code-block:: bash
-      
-      git clone https://github.com/PaddlePaddle/Paddle.git
-      cd Paddle
-      docker build -t paddle:dev .
-
-   docker build这个命令的-t指定了生成的镜像的名字，这里我们用paddle:dev。到此，PaddlePaddle开发镜像就被构建完毕了。
+使用Docker执行GPU训练
+------------------------------
 
-2. 制作PaddlePaddle生产镜像
+为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
+请不要忘记提前在物理机上安装GPU最新驱动。
 
-   生产镜像的生成分为两步，第一步是运行：
+  .. code-block:: bash
 
-   .. code-block:: bash
-      
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
 
-   以上命令会编译PaddlePaddle，生成运行程序，以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU，“WITH_AVX”控制生成的生产镜像是否支持AVX，”WITH_TEST“控制是否生成单元测试。
+**注: 如果没有安装nvidia-docker，可以尝试以下的方法，将CUDA库和Linux设备挂载到Docker容器内：**
 
-   第二步是运行：
+  .. code-block:: bash
 
-   .. code-block:: bash
-      
-      docker build -t paddle:prod -f build/Dockerfile ./build
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
 
-   以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置，最终生成名为paddle:prod的生产镜像。
+**关于AVX：**
 
-3. 运行单元测试
+AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
+是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
+`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。
 
-   运行以下指令：
+以下指令能检查Linux电脑是否支持AVX：
 
    .. code-block:: bash
-      
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
-
-文档
-----
-
-Paddle的Docker开发镜像带有一个通过 `woboq code browser
-<https://github.com/woboq/woboq_codebrowser>`_ 生成的HTML版本的C++源代码，便于用户浏览C++源码。
 
-只要在Docker里启动PaddlePaddle的时候给它一个名字，就可以再运行另一个Nginx Docker镜像来服务HTML代码：
-
-.. code-block:: bash
-
-   docker run -d --name paddle-cpu-doc paddle:0.10.0-dev
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
-接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
+如果输出是No，就需要选择使用no-AVX的镜像
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 94860240f6..d7acc7aeb7 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -1,270 +1,146 @@
-PaddlePaddle in Docker Containers
+Run in Docker Containers
 =================================
 
-Docker container is currently the only officially-supported way to
-running PaddlePaddle.  This is reasonable as Docker now runs on all
-major operating systems including Linux, Mac OS X, and Windows.
-Please be aware that you will need to change `Dockers settings
-<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
-of your hardware resource on Mac OS X and Windows.
+Run PaddlePaddle in Docker container so that you don't need to care about
+runtime dependencies, also you can run under Windows system. You can get
+tutorials at `here <https://docs.docker.com/get-started/>`_ .
 
-Working With Docker
--------------------
+If you are using Windows, please refer to
+`this <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+tutorial to start running docker under windows.
 
-Docker is simple as long as we understand a few basic concepts:
+After you've read above tutorials you may proceed the following steps.
 
-- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type
+.. _docker_pull:
 
-  .. code-block:: bash
-
-     docker images
+Pull PaddlePaddle Docker Image
+------------------------------
 
-  to list all images in the system. We can also run
+Run the following command to download the latest Docker images:
 
   .. code-block:: bash
-		  
-     docker pull paddlepaddle/paddle:0.10.0
 
-  to download a Docker image, paddlepaddle/paddle in this example,
-  from Dockerhub.com.
+     docker pull paddlepaddle/paddle
 
-- *container*: considering a Docker image a program, a container is a
-  "process" that runs the image. Indeed, a container is exactly an
-  operating system process, but with a virtualized filesystem, network
-  port space, and other virtualized environment. We can type
+For users in China, we provide a faster mirror:
 
   .. code-block:: bash
 
-     docker run paddlepaddle/paddle:0.10.0
+     docker pull docker.paddlepaddle.org/paddle
 
-  to start a container to run a Docker image, paddlepaddle/paddle in this example.
-
-- By default docker container have an isolated file system namespace,
-  we can not see the files in the host file system. By using *volume*,
-  mounted files in host will be visible inside docker container.
-  Following command will mount current dirctory into /data inside
-  docker container, run docker container from debian image with
-  command :code:`ls /data`.
+Download GPU version images:
 
   .. code-block:: bash
 
-     docker run --rm -v $(pwd):/data debian ls /data
-
-Usage of CPU-only and GPU Images
-----------------------------------
-
-We package PaddlePaddle's compile environment into a Docker image,
-called the develop image, it contains all compiling tools that
-PaddlePaddle needs. We package compiled PaddlePaddle program into a
-Docker image as well, called the production image, it contains all
-runtime environment that running PaddlePaddle needs. For each version
-of PaddlePaddle, we release both of them. Production image includes
-CPU-only version and a CUDA GPU version and their no-AVX versions.
-
-We put the docker images on `dockerhub.com
-<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
-latest versions under "tags" tab at dockerhub.com. 
-
-** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.**
-
-
-1. development image :code:`paddlepaddle/paddle:<version>-dev`
-
-   This image has packed related develop tools and runtime
-   environment. Users and developers can use this image instead of
-   their own local computer to accomplish development, build,
-   releasing, document writing etc. While different version of paddle
-   may depends on different version of libraries and tools, if you
-   want to setup a local environment, you must pay attention to the
-   versions.  The development image contains:
-   
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-     
-   Many developers use servers with GPUs, they can use ssh to login to
-   the server and run :code:`docker exec` to enter the docker
-   container and start their work.  Also they can start a development
-   docker image with SSHD service, so they can login to the container
-   and start work.
-
-2. Production images, this image might have multiple variants:
-
-   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
-
-   Please be aware that the CPU-only and the GPU images both use the
-   AVX instruction set, but old computers produced before 2008 do not
-   support AVX.  The following command checks if your Linux computer
-   supports AVX:
-
-   .. code-block:: bash
-
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-   **NOTE：versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.**
-   To run the CPU-only image as an interactive container:
-
-   .. code-block:: bash
-
-      docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash
-
-   Above method work with the GPU image too -- the recommended way is
-   using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
-
-   Please install nvidia-docker first following this `tutorial
-   <https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
-
-   Now you can run a GPU image:
-
-   .. code-block:: bash
-
-      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash
-
-
-Train Model Using Python API
-----------------------------
-
-Our official docker image provides a runtime for PaddlePaddle
-programs. The typical workflow will be as follows:
-
-Create a directory as workspace:
-
-.. code-block:: bash
-
-   mkdir ~/workspace
-
-Edit a PaddlePaddle python program using your favourite editor
-
-.. code-block:: bash
-
-   emacs ~/workspace/example.py
-
-Run the program using docker:
-
-.. code-block:: bash
-
-   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py
-
-Or if you are using GPU for training:
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddle.org/paddle:latest-gpu
 
-.. code-block:: bash
+Choose between different BLAS version:
 
-   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py
-
-Above commands will start a docker container by running :code:`python
-/workspace/example.py`. It will stop once :code:`python
-/workspace/example.py` finishes.
-
-Another way is to tell docker to start a :code:`/bin/bash` session and
-run PaddlePaddle program interactively:
-
-.. code-block:: bash
-
-   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-
-Running with GPU is identical:
-
-.. code-block:: bash
-
-   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-
-
-Develop PaddlePaddle or Train Model Using C++ API
----------------------------------------------------
-
-We will be using PaddlePaddle development image since it contains all
-compiling tools and dependencies.
+  .. code-block:: bash
 
-1. Build PaddlePaddle develop image
+     # image using MKL by default
+     docker pull paddlepaddle/paddle
+     # image using OpenBLAS
+     docker pull paddlepaddle/paddle:latest-openblas
 
-   Use following command to build PaddlePaddle develop image:
 
-   .. code-block:: bash
+If you want to use legacy versions, choose a tag from
+`DockerHub <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
+and run:
 
-      git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
-      docker build -t paddle:dev .
-
-2. Build PaddlePaddle production image
+  .. code-block:: bash
 
-   There are two steps for building production image, the first step is to run:
+     docker pull paddlepaddle/paddle:[tag]
+     # i.e.
+     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
 
-   .. code-block:: bash
+.. _docker_run:
 
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+Launch your training program in Docker
+------------------------------
 
-   The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to 
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+for more samples), then run the following command:
 
-   The second step is to run:
+  .. code-block:: bash
 
-   .. code-block:: bash
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
 
-      docker build -t paddle:prod -f build/Dockerfile ./build
+In the above command, :code:`-it` means run the container interactively;
+:code:`-v $PWD:/work` means mount the current directory ($PWD will expand
+to current absolute path in Linux) under :code:`/work` in the container.
+:code:`paddlepaddle/paddle` to specify image to use; finnally
+:code:`/work/train.py` is the command to run inside docker.
 
-   The above command will generate the production image by copying the compiled PaddlePaddle program into the image.
+Also, you can go into the container shell, run or debug your code
+interactively:
 
-3. Run unit test
+  .. code-block:: bash
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
 
-   Following command will run unit test:
+**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
 
-   .. code-block:: bash
-      
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+.. _docker_run_book:
 
 PaddlePaddle Book
 ------------------
 
-The Jupyter Notebook is an open-source web application that allows
-you to create and share documents that contain live code, equations,
-visualizations and explanatory text in a single browser.
-
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-We already exposed port 8888 for this book. If you want to
+You can create a container serving PaddlePaddle Book using Jupyter Notebook in
+one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook
+for users and developers.If you want to
 dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
 
 We provide a packaged book image, simply issue the command:
 
-.. code-block:: bash
+  .. code-block:: bash
 
-    docker run -p 8888:8888 paddlepaddle/book
+     docker run -p 8888:8888 paddlepaddle/book
 
 Then, you would back and paste the address into the local browser:
 
-.. code-block:: text
+  .. code-block:: text
 
-    http://localhost:8888/
+     http://localhost:8888/
 
 That's all. Enjoy your journey!
 
+.. _docker_run_gpu:
 
-Documentation
--------------
+Train with Docker with GPU
+------------------------------
 
-Paddle Docker images include an HTML version of C++ source code
-generated using `woboq code browser
-<https://github.com/woboq/woboq_codebrowser>`_.  This makes it easy
-for users to browse and understand the C++ source code.
+We recommend using
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_
+to run GPU training jobs. Please ensure you have latest
+GPU driver installed before move on.
 
-As long as we give the Paddle Docker container a name, we can run an
-additional Nginx Docker container to serve the volume from the Paddle
-container:
+  .. code-block:: bash
 
-.. code-block:: bash
+     nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
 
-   docker run -d --name paddle-cpu-doc paddle:<version>
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
 
+  .. code-block:: bash
 
-Then we can direct our Web browser to the HTML version of source code
-at http://localhost:8088/paddle/
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+
+**About AVX:**
+
+AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations.
+The latest PaddlePaddle Docker image turns AVX on by default, so, if your
+computer doesn't support AVX, you'll probably need to
+`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
+
+The following command will tell you whether your computer supports AVX.
+
+   .. code-block:: bash
+
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst
index dd9923697a..c9ba84c842 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -6,22 +6,28 @@
 安装流程
 ++++++++
 
-PaddlePaddle提供Docker镜像来部署环境。
+PaddlePaddle提供pip和Docker的安装方式：
 
 .. toctree::
    :maxdepth: 1
-   
-   docker_install_cn.rst 
 
+   pip_install_cn.rst
+   docker_install_cn.rst
+   ../../howto/dev/build_cn.md
 
 编译流程
 ++++++++
 
 ..  warning::
 
-    编译流程主要推荐高级用户查看，普通用户请走安装流程。
+    建议直接使用上述安装流程，方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。
 
 ..  toctree::
     :maxdepth: 1
 
-    cmake/build_from_source_cn.rst
+    build_from_source_cn.rst
+
+常见问题解答
+++++++++++
+
+`常见问题解答 <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_cn.html>`_
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst
index 8a53588e04..32d66d63dd 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -1,22 +1,34 @@
 Install and Build
 =================
 
-Install PaddlePaddle
-----------------------
+.. _install_steps:
 
-..  toctree::
-    :maxdepth: 1
+Install Steps
+++++++++
+
+You can choose either pip or Docker to complete your install:
+
+.. toctree::
+   :maxdepth: 1
+
+   pip_install_en.rst
+   docker_install_en.rst
+   ../../howto/dev/build_en.md
 
-    docker_install_en.rst
 
 Build from Source
 -----------------
 
 ..  warning::
 
-    Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
+    We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary.
 
 ..  toctree::
     :maxdepth: 1
 
     build_from_source_en.md
+
+FAQ
+++++++++++
+
+`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/getstarted/build_and_install/paddleci.png
new file mode 100644
index 0000000000..16087ce059
Binary files /dev/null and b/doc/getstarted/build_and_install/paddleci.png differ
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst
new file mode 100644
index 0000000000..b270e2c2f0
--- /dev/null
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -0,0 +1,86 @@
+使用pip安装
+================================
+
+PaddlePaddle可以使用常用的Python包管理工具
+`pip <https://pip.pypa.io/en/stable/installing/>`_
+完成安装，并可以在大多数主流的Linux操作系统以及MacOS上执行。
+
+.. _pip_install:
+
+使用pip安装
+------------------------------
+
+
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+如果需要安装支持GPU的版本，需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
+您可以从下面的表格中找到需要的版本：
+
+如果在点击下面链接时出现如下登陆界面，点击“Log in as guest”即可开始下载：
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: 各个版本最新的whl包
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
+.. _pip_dependency:
+
+运行环境依赖
+------------------------------
+
+PaddlePaddle安装包由于不仅仅包含.py程序，而且包含了C++编写的部分，所以我们确保发布的二进制包可以支持主流的Linux操作系统，比如CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上。
+
+PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ 标准，通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上，而且CentOS 5即将停止维护，所以我们默认使用CentOS 6作为标准编译环境。
+
+.. csv-table:: PaddlePaddle环境依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "操作系统", "Linux, MacOS", "CentOS 6以上，Ubuntu 14.04以上，MacOS 10.12以上"
+   "Python", "2.7.x", "暂时不支持Python3"
+   "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
+   "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
+
+.. _pip_faq:
+
+安装常见问题和解决方法
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  如果仍然存在问题，可以执行：
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。
+
+  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
new file mode 100644
index 0000000000..70f601a11c
--- /dev/null
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -0,0 +1,104 @@
+Install Using pip
+================================
+
+You can use current widely used Python package management
+tool `pip <https://pip.pypa.io/en/stable/installing/>`_
+to install PaddlePaddle. This method can be used in
+most of current Linux systems or MacOS.
+
+.. _pip_install:
+
+Install Using pip
+------------------------------
+
+Run the following command to install PaddlePaddle on the current
+machine, it will also download requirements.
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+If you wish to install GPU version, just run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+If you wish to install the latest develop branch PaddlePaddle, 
+you can download the latest whl package from our CI system. Access
+the below links, log in as guest, then click at the "Artifact"
+tab, you'll find the download link of whl packages.
+
+If the links below shows up the login form, just click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: whl package of each version
+    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
+.. _pip_dependency:
+
+Runtime Dependency
+------------------------------
+
+PaddlePaddle installation packages (whl) does not only contain .py files,
+but also binaries built from C++ code. We ensure that PaddlePaddle can
+run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04
+and MacOS 10.12.
+
+PaddlePaddle whl packages are trying to satisfy
+`manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_
+standard, which uses CentOS 5 as default build environment. But CUDA libraries
+seems only run on CentOS 6 at least, also, CentOS 5 is about to end its lifetime,
+so we use CentOS 6 as default build environment.
+
+.. csv-table:: PaddlePaddle Runtime Deps
+   :header: "Dependency", "version", "description"
+   :widths: 10, 15, 30
+
+   "OS", "Linux, MacOS", "CentOS 6 or later，Ubuntu 14.04 or later，MacOS 10.12 or later"
+   "Python", "2.7.x", "Currently Python3 is not supported"
+   "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols"
+
+.. _pip_faq:
+
+FAQ
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  The main cause of this issue is that your current platform is
+  not supported. Please check that you are using Python 2.7 series.
+  Besides, pypi only supports manylinux1 standard, you'll need to
+  upgrade your pip to >9.0.0. Then run the below command:
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  If the problem still exists, run the following command:
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  Then you'll get supported package suffixes, then check if it matches
+  the file name of the whl package. You can find default whl package at
+  `here <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_
+
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; If your system supports
+  manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the
+  file to manylinux1_x86_64 suffix and then install.
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index aa418c657a..a9087be6f3 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -1,10 +1,61 @@
 新手入门
 ============
 
+.. _quick_install:
+
+快速安装
+++++++++
+
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本，需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考：
+
 ..  toctree::
   :maxdepth: 1
 
   build_and_install/index_cn.rst
-  concepts/use_concepts_cn.rst
 
-- `深度学习入门课程 <http://book.paddlepaddle.org/index.cn.html>`_
+.. _quick_start:
+
+快速开始
+++++++++
+
+创建一个 housing.py 并粘贴此Python代码：
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/use_concepts_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index be3253e3d4..d14e3f5c0c 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -1,9 +1,61 @@
 GET STARTED
 ============
 
+.. _quick_install:
+
+Quick Install
+----------------------
+
+You can use pip to install PaddlePaddle with a single command, supports
+CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version, run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build:
+
 ..  toctree::
   :maxdepth: 1
 
   build_and_install/index_en.rst
 
-- `Deep Learning 101 <http://book.paddlepaddle.org/index.html>`_
+
+.. _quick_start:
+
+Quick Start
+++++++++
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md
index 0b911f7b75..4a80a52451 100644
--- a/doc/howto/dev/build_cn.md
+++ b/doc/howto/dev/build_cn.md
@@ -1,4 +1,4 @@
-# 编译PaddlePaddle和运行单元测试
+# 用Docker编译和测试PaddlePaddle
 
 ## 需要的软硬件
 
diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md
index d0048e3714..91c41ef8ce 100644
--- a/doc/howto/dev/build_en.md
+++ b/doc/howto/dev/build_en.md
@@ -1,4 +1,4 @@
-# Build PaddlePaddle from Source Code and Run Unit Test
+# Build using Docker
 
 ## What Developers Need
 
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index c823d7e9fc..6cfc9536f2 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -214,7 +214,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
     ```cpp
     // if use Eigen unsupported module before include head files
-    #define EIGEN_USE_GPU
+    // #define EIGEN_USE_GPU
 
     namespace ops = paddle::operators;
     REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index 731a63f945..1bc947c260 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -3,12 +3,64 @@
 ##################
 
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
-
+也可以利用PaddlePaddle 工具来编译文档，这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下
 
 如何构建文档
 ============
 
-PaddlePaddle的文档构建有两种方式。
+PaddlePaddle的文档构建有三种方式。
+
+
+使用PaddlePaddle.org工具
+--------------
+这个是目前推荐的使用方法。除了可以自动编译文档，也可以直接在网页预览文档。
+
+文件工具是使用Docker，需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+如果不想使用 Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+
+想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
 
 使用Docker构建
 --------------
@@ -34,7 +86,7 @@ PaddlePaddle的文档构建有两种方式。
     cd TO_YOUR_PADDLE_CLONE_PATH
     mkdir -p build
     cd build
-    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
     make gen_proto_py
     make paddle_docs paddle_docs_cn
 
@@ -47,17 +99,12 @@ PaddlePaddle的文档构建有两种方式。
 
 PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
 
-如何更新文档主题
-================
-
-PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下，包含所有和前端网页设计相关的文件。
-
-如何更新doc.paddlepaddle.org
+如何更新www.paddlepaddle.org
 ============================
 
-更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和
-`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。
+更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
+`英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
 
 
 ..  _cmake: https://cmake.org/
diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst
new file mode 100644
index 0000000000..b3ef07eb1d
--- /dev/null
+++ b/doc/howto/dev/write_docs_en.rst
@@ -0,0 +1,80 @@
+##################
+Contribute Documentation
+##################
+
+PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
+Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
+When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
+
+How to Build Documentations
+============
+
+We recommend using PaddlePaddle.org tool to build documentation
+
+
+Use PaddlePaddle.org tool
+--------------
+This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+
+The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories. You may only clone the contents you need
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
+Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
+
+
+If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
+
+If you want to learn more on the PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ 。
+
+How to write Documentations
+============
+
+PaddlePaddle uses `sphinx`_ to compile documentations，Please check sphinx official website for more detail.
+
+
+How to update www.paddlepaddle.org
+============================
+
+Please create PRs and submit them to github, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
+PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
+`English Docs <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 76d3e0a009..991b9e2596 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,7 +19,7 @@
 ..  toctree::
   :maxdepth: 1
 
-  dev/build_cn.rst
+  dev/contribute_to_paddle_cn.md
   dev/write_docs_cn.rst
 
 模型配置
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 1b6034be4e..61bf25ccd1 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,9 +18,9 @@ Development
 ..  toctree::
   :maxdepth: 1
 
-  dev/build_en.rst
   dev/new_layer_en.rst
   dev/contribute_to_paddle_en.md
+  dev/write_docs_en.rst
 
 Configuration
 -------------
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md
new file mode 100644
index 0000000000..1775374cf6
--- /dev/null
+++ b/doc/howto/optimization/cpu_profiling.md
@@ -0,0 +1,197 @@
+This tutorial introduces techniques we use to profile and tune the
+CPU performance of PaddlePaddle.  We will use Python packages
+`cProfile` and `yep`, and Google's `perftools`.
+
+Profiling is the process that reveals performance bottlenecks,
+which could be very different from what's in the developers' mind.
+Performance tuning is done to fix these bottlenecks. Performance optimization
+repeats the steps of profiling and tuning alternatively.
+
+PaddlePaddle users program AI applications by calling the Python API, which calls
+into `libpaddle.so.` written in C++.  In this tutorial, we focus on
+the profiling and tuning of
+
+1. the Python code and
+1. the mixture of Python and C++ code.
+
+## Profiling the Python Code
+
+### Generate the Performance Profiling File
+
+We can use Python standard
+package, [`cProfile`](https://docs.python.org/2/library/profile.html),
+to generate Python profiling file.  For example:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+where `main.py` is the program we are going to profile, `-o` specifies
+the output file.  Without `-o`, `cProfile` would outputs to standard
+output.
+
+### Look into the Profiling File
+
+`cProfile` generates `profile.out` after `main.py` completes. We can
+use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into
+the details:
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+where `-a` specifies the HTTP IP, `-p` specifies the port, `-f`
+specifies the profiling file, and `main.py` is the source file.
+
+Open the Web browser and points to the local IP and the specifies
+port, we will see the output like the following:
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+where each line corresponds to Python function, and the meaning of
+each column is as follows:
+
+| column | meaning |
+| --- | --- |
+| ncalls | the number of calls into a function |
+| tottime | the total execution time of the function, not including the
+ execution time of other functions called by the function |
+| percall | tottime divided by ncalls |
+| cumtime | the total execution time of the function, including the execution time of other functions being called |
+| percall | cumtime divided by ncalls |
+| filename:lineno(function) | where the function is defined |
+
+### Identify Performance Bottlenecks
+
+Usually, `tottime` and the related `percall` time is what we want to
+focus on. We can sort above profiling file by tottime:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+```
+
+We can see that the most time-consuming function is the `built-in
+method run`, which is a C++ function in `libpaddle.so`.  We will
+explain how to profile C++ code in the next section.  At this 
+moment, let's look into the third function `sync_with_cpp`, which is a
+Python function.  We can click it to understand more about it:
+
+```
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+The lists of the callers of `sync_with_cpp` might help us understand
+how to improve the function definition.
+
+## Profiling Python and C++ Code
+
+### Generate the Profiling File
+
+To profile a mixture of Python and C++ code, we can use a Python
+package, `yep`, that can work with Google's `perftools`, which is a
+commonly-used profiler for C/C++ code.
+
+In Ubuntu systems, we can install `yep` and `perftools` by running the
+following commands:
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+Then we can run the following command
+
+```bash
+python -m yep -v main.py
+```
+
+to generate the profiling file.  The default filename is
+`main.py.prof`.
+
+Please be aware of the `-v` command line option, which prints the
+analysis results after generating the profiling file.  By examining the
+ the print result, we'd know that if we stripped debug
+information from `libpaddle.so` at build time.  The following hints
+help make sure that the analysis results are readable:
+
+1. Use GCC command line option `-g` when building `libpaddle.so` so to
+   include the debug information.  The standard building system of
+   PaddlePaddle is CMake, so you might want to set
+   `CMAKE_BUILD_TYPE=RelWithDebInfo`.
+
+1. Use GCC command line option `-O2` or `-O3` to generate optimized
+   binary code. It doesn't make sense to profile `libpaddle.so`
+   without optimization, because it would anyway run slowly.
+
+1. Profiling the single-threaded binary file before the
+   multi-threading version, because the latter often generates tangled
+   profiling analysis result.  You might want to set environment
+   variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically
+   starting multiple threads.
+
+### Examining the Profiling File
+
+The tool we used to examine the profiling file generated by
+`perftools` is [`pprof`](https://github.com/google/pprof), which
+provides a Web-based GUI like `cprofilev`.
+
+We can rely on the standard Go toolchain to retrieve the source code
+of `pprof` and build it:
+
+```bash
+go get github.com/google/pprof
+```
+
+Then we can use it to profile `main.py.prof` generated in the previous
+section:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+
+Where `-http` specifies the IP and port of the HTTP service.
+Directing our Web browser to the service, we would see something like
+the following:
+
+![result](./pprof_1.png)
+
+### Identifying the Performance Bottlenecks
+
+Similar to how we work with `cprofilev`, we'd focus on `tottime` and
+`cumtime`.
+
+![kernel_perf](./pprof_2.png)
+
+We can see that the execution time of multiplication and the computing
+of the gradient of multiplication takes 2% to 4% of the total running
+time, and `MomentumOp` takes about 17%. Obviously, we'd want to
+optimize `MomentumOp`.
+
+`pprof` would mark performance critical parts of the program in
+red. It's a good idea to follow the hints.
diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md
new file mode 100644
index 0000000000..14eba0e2f3
--- /dev/null
+++ b/doc/howto/optimization/cpu_profiling_cn.md
@@ -0,0 +1,155 @@
+此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优（performance tuning）。
+
+Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
+
+PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分:
+
+* Python 代码的性能分析
+* Python 与 C++ 混合代码的性能分析
+
+
+## Python代码的性能分析
+
+### 生成性能分析文件
+
+Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+其中 `main.py` 是我们要分析的程序，`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印到标准输出。
+
+### 查看性能分析文件
+
+`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来：
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+
+用Web浏览器访问对应网址，即可显示性能分析的结果：
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+每一列的含义是:
+
+| 列名 | 含义 |
+| --- | --- |
+| ncalls | 函数的调用次数 |
+| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| percall | tottime的每次调用平均时间 |
+| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | cumtime的每次调用平均时间 |
+| filename:lineno(function) | 文件名, 行号，函数名 |
+
+
+### 寻找性能瓶颈
+
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+
+将性能分析结果按照tottime排序，效果如下:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+```
+
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+
+```text
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+
+
+
+## Python与C++混合代码的性能分析
+
+### 生成性能分析文件
+
+C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
+
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+安装完毕后，我们可以通过
+
+```bash
+python -m yep -v main.py
+```
+
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+
+命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+
+1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+
+### 查看性能分析文件
+
+在运行完性能分析后，会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+
+安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+
+```bash
+go get github.com/google/pprof
+```
+
+进而我们可以使用如下命令开启一个HTTP服务:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+
+这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+
+访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+
+![result](./pprof_1.png)
+
+
+### 寻找性能瓶颈
+
+与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+
+例如下图中，
+
+![kernel_perf](./pprof_2.png)
+
+在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+
+在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
diff --git a/doc/howto/optimization/pprof_1.png b/doc/howto/optimization/pprof_1.png
new file mode 100644
index 0000000000..8e9edbf377
Binary files /dev/null and b/doc/howto/optimization/pprof_1.png differ
diff --git a/doc/howto/optimization/pprof_2.png b/doc/howto/optimization/pprof_2.png
new file mode 100644
index 0000000000..172ba20399
Binary files /dev/null and b/doc/howto/optimization/pprof_2.png differ
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
index f7aa525054..2dea231ca5 100644
--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -63,7 +63,7 @@
 </tr>
 
 <tr>
-<td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
+<td class="left" rowspan="14">训练</td><td class="left">dot_period</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index 9279bac7f4..ada51c2d73 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,3 +8,4 @@ PaddlePaddle 文档
   howto/index_cn.rst
   api/index_cn.rst
   faq/index_cn.rst
+  mobile/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 64684b8b9b..23b64b6cad 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,3 +7,4 @@ PaddlePaddle Documentation
   getstarted/index_en.rst
   howto/index_en.rst
   api/index_en.rst
+  mobile/index_en.rst
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
similarity index 94%
rename from doc/howto/cross_compiling/cross_compiling_for_android_cn.md
rename to doc/mobile/cross_compiling_for_android_cn.md
index 58e4dd9c3f..424d7718c6 100644
--- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -1,4 +1,4 @@
-# 构建Android平台上的PaddlePaddle库
+# Android平台编译指南
 
 用户可通过如下两种方式，交叉编译Android平台上适用的PaddlePaddle库：
 - 基于Docker容器的编译方式
@@ -20,10 +20,32 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
 构建好开发镜像后，即可使用开发镜像来编译Android版PaddlePaddle C-API库。
 Android的Docker开发镜像向用户提供两个可配置的参数：
 
-| Argument        | Optional Values         | Default |
-|-----------------|-------------------------|---------|
-|`ANDROID_ABI`    |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
-|`ANDROID_API`    |`>= 21` | `21` |
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 21</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>
 
 - 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
   ```bash
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android.md b/doc/mobile/cross_compiling_for_android_en.md
similarity index 93%
rename from doc/howto/cross_compiling/cross_compiling_for_android.md
rename to doc/mobile/cross_compiling_for_android_en.md
index 161863e5c0..26858581fc 100644
--- a/doc/howto/cross_compiling/cross_compiling_for_android.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -26,10 +26,32 @@ $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_A
 
 The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
 
-| Argument        | Optional Values         | Default |
-|-----------------|-------------------------|---------|
-|`ANDROID_ABI`    |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
-|`ANDROID_API`    |`>= 21` | `21` |
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 21</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>
 
 The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
 
diff --git a/doc/howto/cross_compiling/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
similarity index 86%
rename from doc/howto/cross_compiling/cross_compiling_for_ios_cn.md
rename to doc/mobile/cross_compiling_for_ios_cn.md
index 32c490d9aa..9da48e7f21 100644
--- a/doc/howto/cross_compiling/cross_compiling_for_ios_cn.md
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -1,4 +1,4 @@
-# 构建iOS平台上的PaddlePaddle库
+# iOS平台编译指南
 交叉编译iOS平台上适用的PaddlePaddle库，需要在MacOS系统上进行。本文的将介绍在MacOS上，从源码交叉编译iOS平台上适用的PaddlePaddle库。
 
 ## 准备交叉编译环境
@@ -25,12 +25,30 @@ iOS平台可选配置参数：
 - `IOS_PLATFORM`，可设置为`OS/SIMULATOR`，默认值为`OS`。
   - `OS`，构建目标为`arm`架构的iPhone或者iPad等物理设备。
   - `SIMULATOR`，构建目标为`x86`架构的模拟器平台。
-- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示：
-
-   | IOS_PLATFORM | IOS_ARCH             |
-   |--------------|----------------------|
-   |   OS         | armv7, armv7s, arm64 (默认) |
-   | SIMULATOR    | i386, x86_64 (默认)         |   
+- `IOS_ARCH`，目标架构。针对不同的`IOS_PLATFORM`，可设置的目标架构如下表所示，默认编译所有架构：
+
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 </td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 </td>
+    </tr>
+    </tbody>
+    </table>
 
 - `IOS_DEPLOYMENT_TARGET`，最小的iOS部署版本，默认值为`7.0`。
 - `IOS_ENABLE_BITCODE`，是否使能[Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3)，可设置`ON/OFF`，默认值为`ON`。
@@ -48,7 +66,7 @@ iOS平台可选配置参数：
 ```bash
 cmake -DCMAKE_SYSTEM_NAME=iOS \
       -DIOS_PLATFORM=OS \
-      -DIOS_ARCH="arm64" \
+      -DIOS_ARCH="armv7;arm64" \
       -DIOS_ENABLE_BITCODE=ON \
       -DIOS_USE_VECLIB_FOR_BLAS=ON \
       -DCMAKE_INSTALL_PREFIX=your/path/to/install \
@@ -94,6 +112,6 @@ $ make install
 - `lib`目录，其中包含PaddlePaddle的C-API静态库
 - `third_party`目录，其中包含所依赖的所有第三方库
 
-注意，不同架构的PaddlePaddle库建议安装到不同的目录下，然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。
+注意，如果PaddlePaddle库需要同时支持真机和模拟器，则需要分别编译真机和模拟器版本，然后使用`lipo`工具合并fat库。
 
 自此，PaddlePaddle库已经安装完成，用户可将合成的fat库用于深度学习相关的iOS App中，调用方法见C-API文档。
diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md
similarity index 98%
rename from doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
rename to doc/mobile/cross_compiling_for_raspberry_cn.md
index 6e983645fa..f8ef9dc803 100644
--- a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
+++ b/doc/mobile/cross_compiling_for_raspberry_cn.md
@@ -1,4 +1,4 @@
-# 构建Raspberry Pi平台上的PaddlePaddle库
+# Raspberry Pi平台编译指南
 
 通常有两个方法来构建基于 Rasspberry Pi 的版本：
 
diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md b/doc/mobile/cross_compiling_for_raspberry_en.md
similarity index 100%
rename from doc/howto/cross_compiling/cross_compiling_for_raspberry_en.md
rename to doc/mobile/cross_compiling_for_raspberry_en.md
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
new file mode 100644
index 0000000000..1d99666e58
--- /dev/null
+++ b/doc/mobile/index_cn.rst
@@ -0,0 +1,9 @@
+MOBILE
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_cn.md
+  cross_compiling_for_ios_cn.md
+  cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
new file mode 100644
index 0000000000..3c08d73671
--- /dev/null
+++ b/doc/mobile/index_en.rst
@@ -0,0 +1,8 @@
+MOBILE
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_en.md
+  cross_compiling_for_raspberry_en.md
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index e767856d50..ebb083c5a4 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -4,6 +4,16 @@ else ()
   set(PADDLE_FLOAT_TYPE float)
 endif()
 
+execute_process(
+  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_GIT_COMMIT
+  RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT PADDLE_GIT_COMMIT)
+  set(PADDLE_GIT_COMMIT "no commit information")
+endif()
+
 # config.h used for C-API. It will store Paddle building configuration as a
 # header. Make user just include PaddleCAPI.h then can get building
 # configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their
@@ -29,32 +39,32 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
 add_dependencies(paddle_capi paddle_proto)
 
 # TODO: paddle_capi_whole will be removed.
+set(PADDLE_CAPI_LAYERS_LIBS
+    paddle_function
+    paddle_gserver)
 if(MOBILE_INFERENCE)
-    set(PADDLE_CAPI_INFER_LIBS
-        paddle_utils
-        paddle_parameter
-        paddle_math
-        paddle_cuda
-        paddle_function
-        paddle_gserver
-        paddle_proto)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto)
 else()
-    set(PADDLE_CAPI_INFER_LIBS
-        paddle_utils
-        paddle_parameter
-        paddle_math
-        paddle_cuda
-        paddle_function
-        paddle_gserver
-        paddle_proto
-        paddle_pserver
-        paddle_network)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto
+      paddle_pserver
+      paddle_network)
 endif()
+set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
 cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
 
 # Link the static library for inference
-cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
-cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
+cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
+cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
 
 # Link the shared library for inference
 if(NOT IOS)
diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
index 78c43949df..bb8249a551 100644
--- a/paddle/capi/Main.cpp
+++ b/paddle/capi/Main.cpp
@@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) {
 
 extern "C" {
 paddle_error paddle_init(int argc, char** argv) {
+  static bool isInit = false;
+  if (isInit) return kPD_NO_ERROR;
+
   std::vector<char*> realArgv;
   realArgv.reserve(argc + 1);
   realArgv.push_back(strdup(""));
@@ -37,6 +40,7 @@ paddle_error paddle_init(int argc, char** argv) {
   }
   initPaddle(argc + 1, realArgv.data());
   free(realArgv[0]);
+  isInit = true;
   return kPD_NO_ERROR;
 }
 }
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 4547afaf1d..30f3a766f0 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -54,6 +54,46 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
   return kPD_NO_ERROR;
 }
 
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+                                            paddle_real* value) {
+  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* buf = ptr->mat->getRowBuf(0);
+  size_t width = ptr->mat->getWidth();
+  size_t height = ptr->mat->getHeight();
+  if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+    hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
+#else
+    return kPD_NOT_SUPPORTED;
+#endif
+  } else {
+    std::copy(value, value + width * height, buf);
+  }
+  return kPD_NO_ERROR;
+}
+
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+                                            paddle_real* result) {
+  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* buf = ptr->mat->getRowBuf(0);
+  size_t width = ptr->mat->getWidth();
+  size_t height = ptr->mat->getHeight();
+  if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+    hl_memcpy(result, buf, width * height * sizeof(paddle::real));
+#else
+    return kPD_NOT_SUPPORTED;
+#endif
+  } else {
+    std::copy(buf, buf + width * height, result);
+  }
+  return kPD_NO_ERROR;
+}
+
 paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                    uint64_t rowID,
                                    paddle_real** rawRowBuffer) {
@@ -81,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat,
 
 paddle_matrix paddle_matrix_create_sparse(
     uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+#ifndef PADDLE_MOBILE_INFERENCE
   auto ptr = new paddle::capi::CMatrix();
   ptr->mat = paddle::Matrix::createSparseMatrix(
       height,
@@ -91,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse(
       false,
       useGpu);
   return ptr;
+#else
+  return nullptr;
+#endif
 }
 
 paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
@@ -100,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                             uint64_t colSize,
                                             float* valueArray,
                                             uint64_t valueSize) {
+#ifndef PADDLE_MOBILE_INFERENCE
   if (mat == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (rowArray == nullptr || colArray == nullptr ||
@@ -120,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
   } else {
     return kPD_NOT_SUPPORTED;
   }
+#else
+  return kPD_NOT_SUPPORTED;
+#endif
 }
diff --git a/paddle/capi/config.h.in b/paddle/capi/config.h.in
index d205307588..0ddbd8c753 100644
--- a/paddle/capi/config.h.in
+++ b/paddle/capi/config.h.in
@@ -3,6 +3,9 @@
 
 typedef @PADDLE_FLOAT_TYPE@ paddle_real;
 
+#define __PADDLE_VERSION__  "@PADDLE_VERSION@"
+#define __PADDLE_COMMIT__   "@PADDLE_GIT_COMMIT@"
+
 // Since we only support linux and macos in compile, always use clang or
 // gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below.
 #define PD_API __attribute__((visibility("default")))
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 3e6bd52850..5eeaf7e31f 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -1,5 +1,6 @@
 #include <paddle/capi.h>
 #include <time.h>
+
 #include "../common/common.h"
 
 #define CONFIG_BIN "./trainer_config.bin"
@@ -31,6 +32,7 @@ int main() {
                                            /* size */ 784,
                                            /* useGPU */ false);
   srand(time(0));
+
   paddle_real* array;
 
   // Get First row.
@@ -51,11 +53,18 @@ int main() {
 
   CHECK(paddle_arguments_get_value(out_args, 0, prob));
 
+  uint64_t height;
+  uint64_t width;
+
+  CHECK(paddle_matrix_get_shape(prob, &height, &width));
   CHECK(paddle_matrix_get_row(prob, 0, &array));
 
-  printf("Prob: ");
-  for (int i = 0; i < 10; ++i) {
-    printf("%.2f ", array[i]);
+  printf("Prob: \n");
+  for (int i = 0; i < height * width; ++i) {
+    printf("%.4f ", array[i]);
+    if ((i + 1) % width == 0) {
+      printf("\n");
+    }
   }
   printf("\n");
 
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index f15f7f3bbb..8cc3e0034e 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height,
  * @param isBinary is binary (either 1 or 0 in matrix) or not.
  * @param useGpu is using GPU or not.
  * @return paddle_matrix.
+ * @note Mobile inference does not support this interface.
  */
 PD_API paddle_matrix paddle_matrix_create_sparse(
     uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
@@ -70,6 +71,16 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
                                           uint64_t rowID,
                                           paddle_real* rowArray);
 
+/**
+ * @brief paddle_matrix_set_value Set value to matrix.
+ * @param mat Target Matrix
+ * @param value Row data.
+ * @return paddle_error
+ * @note  value should contain enough element of data to init the mat
+ */
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+                                            paddle_real* value);
+
 /**
  * @brief PDMatGetRow Get raw row buffer from matrix
  * @param [in] mat Target matrix
@@ -81,6 +92,15 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                           uint64_t rowID,
                                           paddle_real** rawRowBuffer);
 
+/**
+ * @brief copy data from the matrix
+ * @param [in] mat Target matrix
+ * @param [out] result pointer to store the matrix data
+ * @return paddle_error
+ * @note the space of the result should allocated before invoke this API
+ */
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+                                            paddle_real* result);
 /**
  * @brief PDMatCreateNone Create None Matrix
  * @return
@@ -110,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
  * NULL if the matrix is binary.
  * @param [in] valueSize length of value array. Zero if the matrix is binary.
  * @return paddle_error
+ * @note Mobile inference does not support this interface.
  */
 PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                                    int* rowArray,
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
index 4bf9a9d6a9..6940c28448 100644
--- a/paddle/capi/tests/test_Matrix.cpp
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -45,3 +45,49 @@ TEST(CAPIMatrix, createNone) {
   paddle_matrix mat = paddle_matrix_create_none();
   ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
 }
+
+TEST(CAPIMatrix, cpu_get_set_value) {
+  paddle_matrix mat = paddle_matrix_create(128, 32, false);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+  sample.resize(128 * 32);
+  result.resize(128 * 32);
+  for (size_t i = 0; i < sample.size(); ++i) {
+    sample[i] = 1.0 / (i + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+  for (size_t i = 0; i < sample.size(); ++i) {
+    ASSERT_NEAR(sample[i], result[i], 1e-5);
+  }
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(128UL, height);
+  ASSERT_EQ(32UL, width);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(CAPIMatrix, gpu_get_set_value) {
+  paddle_matrix mat = paddle_matrix_create(128, 32, true);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+  sample.resize(128 * 32);
+  result.resize(128 * 32);
+  for (size_t i = 0; i < sample.size(); ++i) {
+    sample[i] = 1.0 / (i + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+  for (size_t i = 0; i < sample.size(); ++i) {
+    ASSERT_NEAR(sample[i], result[i], 1e-5);
+  }
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(128UL, height);
+  ASSERT_EQ(32UL, width);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+#endif
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index 0865b02c4f..efd1b7a73e 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -27,7 +27,9 @@ if(WITH_GPU)
     set_source_files_properties(${CUDA_CXX_SOURCES}
                                 PROPERTIES COMPILE_FLAGS "-D__NVCC__")
 else()
+    if (NOT MOBILE_INFERENCE)
     set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
+    endif()
 endif()
 
 set(CUDA_CU_SOURCES
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 6b56d9ec8d..8841806292 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "hl_base.h"
 
 /**
- * @brief   Maximum pool forward.
+ * @brief   Maximum pool forward with Mask output.
  *
  * @param[in]   frameCnt    batch size of input image.
  * @param[in]   inputData   input data.
@@ -35,7 +35,7 @@ limitations under the License. */
  * @param[in]   paddingW    padding width.
  * @param[out]  tgtData     output data.
  * @param[in]   tgtStride   stride between output data samples.
- *
+ * @param[out]  maskData    the location indices of select max data.
  */
 extern void hl_maxpool_forward(const int frameCnt,
                                const real* inputData,
@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               real* maskData = NULL);
 
 /**
  * @brief   Maximum pool backward.
@@ -115,6 +116,7 @@ extern void hl_maxpool_backward(const int frameCnt,
  * @param[in]   paddingW    padding width.
  * @param[out]  tgtData     output data.
  * @param[in]   tgtStride   stride between output data samples.
+ * @param[in]   excludeMode whether to consider paddings for size.
  *
  */
 extern void hl_avgpool_forward(const int frameCnt,
@@ -131,7 +133,8 @@ extern void hl_avgpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               bool excludeMode);
 
 /**
  * @brief   Maximum pool backward.
@@ -153,6 +156,7 @@ extern void hl_avgpool_forward(const int frameCnt,
  * @param[in]   scaleB      scale.
  * @param[out]  backGrad    output grad.
  * @param[in]   outStride   stride between output data samples.
+ * @param[in]   excludeMode whether to consider paddings for size.
  *
  */
 extern void hl_avgpool_backward(const int frameCnt,
@@ -171,7 +175,8 @@ extern void hl_avgpool_backward(const int frameCnt,
                                 real scaleA,
                                 real scaleB,
                                 real* backGrad,
-                                const int outStride);
+                                const int outStride,
+                                bool excludeMode);
 
 extern void hl_maxpool3D_forward(const int frameCnt,
                                  const real* inputData,
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index ede2670882..4ab8de80d1 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -25,7 +25,9 @@ limitations under the License. */
 #include "hl_matrix.h"
 #include "hl_sequence.h"
 #include "hl_sparse.h"
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "hl_warpctc_wrap.h"
+#endif
 
 #ifdef HPPL_STUB_FUNC
 #include "stub/hl_aggregate_stub.h"
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index a76dbf0b65..706cc59a8e 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               real* MaskData) {}
 
 inline void hl_maxpool_backward(const int frameCnt,
                                 const real* inputData,
@@ -67,7 +68,8 @@ inline void hl_avgpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               const bool excludeMode) {}
 
 inline void hl_avgpool_backward(const int frameCnt,
                                 const real* outGrad,
@@ -85,7 +87,8 @@ inline void hl_avgpool_backward(const int frameCnt,
                                 real scaleA,
                                 real scaleB,
                                 real* backGrad,
-                                const int outStride) {}
+                                const int outStride,
+                                const bool excludeMode) {}
 
 inline void hl_maxpool3D_forward(const int frameCnt,
                                  const real* inputData,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index 58674febdc..2d1bc4f6d5 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads,
                                  const int offsetH,
                                  const int offsetW,
                                  real* tgtData,
-                                 const int tgtStride) {
+                                 const int tgtStride,
+                                 real* maskData) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads,
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     real maxval = -FLT_MAX;
+    int max_index = -1;
     inputData += (frameNum * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        if (maxval < inputData[h * width + w])
-          maxval = inputData[h * width + w];
+        if (maxval < inputData[h * width + w]) {
+          max_index = h * width + w;
+          maxval = inputData[max_index];
+        }
       }
     }
     int tgtIndex =
         index % (pooledW * pooledH * channels) + frameNum * tgtStride;
     tgtData[tgtIndex] = maxval;
+    if (maskData != NULL) {
+      maskData[tgtIndex] = max_index;
+    }
   }
 }
 
@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt,
                         const int paddingH,
                         const int paddingW,
                         real* tgtData,
-                        const int tgtStride) {
+                        const int tgtStride,
+                        real* maskData) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt,
                                                          paddingH,
                                                          paddingW,
                                                          tgtData,
-                                                         tgtStride);
+                                                         tgtStride,
+                                                         maskData);
   CHECK_SYNC("hl_maxpool_forward failed");
 }
 
@@ -201,7 +210,8 @@ __global__ void KeAvgPoolForward(const int nthreads,
                                  const int padH,
                                  const int padW,
                                  real* tgtData,
-                                 const int tgtStride) {
+                                 const int tgtStride,
+                                 const bool excludeMode) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
@@ -215,7 +225,8 @@ __global__ void KeAvgPoolForward(const int nthreads,
     int wend = min(wstart + sizeX, width);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int poolSize =
+        excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
 
     real aveval = 0;
     inputData += (frameNum * channels + c) * height * width;
@@ -226,7 +237,7 @@ __global__ void KeAvgPoolForward(const int nthreads,
     }
     int tgtIndex =
         index % (pooledW * pooledH * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / pool_size;
+    tgtData[tgtIndex] = aveval / poolSize;
   }
 }
 
@@ -244,7 +255,8 @@ void hl_avgpool_forward(const int frameCnt,
                         const int paddingH,
                         const int paddingW,
                         real* tgtData,
-                        const int tgtStride) {
+                        const int tgtStride,
+                        const bool excludeMode) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
@@ -261,7 +273,8 @@ void hl_avgpool_forward(const int frameCnt,
                                                         paddingH,
                                                         paddingW,
                                                         tgtData,
-                                                        tgtStride);
+                                                        tgtStride,
+                                                        excludeMode);
   CHECK_SYNC("hl_avgpool_forward failed");
 }
 
@@ -281,7 +294,8 @@ __global__ void KeAvgPoolBackward(const int nthreads,
                                   real scaleA,
                                   real scaleB,
                                   real* tgtGrad,
-                                  const int outStride) {
+                                  const int outStride,
+                                  const bool excludeMode) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int offsetW = index % width + padW;
@@ -305,8 +319,9 @@ __global__ void KeAvgPoolBackward(const int nthreads,
         int wstart = pw * strideW - padW;
         int wend = min(wstart + sizeX, width);
         wstart = max(wstart, 0);
-        int poolsize = (hend - hstart) * (wend - wstart);
-        gradient += outGrad[ph * pooledW + pw] / poolsize;
+        int poolSize =
+            excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+        gradient += outGrad[ph * pooledW + pw] / poolSize;
       }
     }
     tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
@@ -329,7 +344,8 @@ void hl_avgpool_backward(const int frameCnt,
                          real scaleA,
                          real scaleB,
                          real* backGrad,
-                         const int outStride) {
+                         const int outStride,
+                         const bool excludeMode) {
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
 
@@ -349,7 +365,8 @@ void hl_avgpool_backward(const int frameCnt,
                                                          scaleA,
                                                          scaleB,
                                                          backGrad,
-                                                         outStride);
+                                                         outStride,
+                                                         excludeMode);
   CHECK_SYNC("hl_avgpool_backward failed");
 }
 
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 1afc524208..4b0eff3adb 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -6,7 +6,10 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
 cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
+
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
+cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
@@ -38,9 +41,9 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 add_custom_command(TARGET framework_py_proto POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto
-    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto/
-    COMMENT "Copy generated python proto into directory paddle/v2/framework/proto."
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto
+    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto/
+    COMMENT "Copy generated python proto into directory paddle/v2/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 cc_library(backward SRCS backward.cc DEPS net_op)
@@ -51,10 +54,6 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope frame
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
-
-cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
-cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
-
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
         proto_desc)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index ed94540c26..7294ba1a9c 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -22,12 +22,23 @@
 
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace framework {
 
+static std::unordered_set<std::string>* g_ctrl_flow_ops_ = nullptr;
+// Control Flow operators's backward is significantly different from
+// computational operators. Hack Code here.
+// We should design a better way to backward CtrlFlowOps.
+static std::unordered_set<std::string>& CtrlFlowOps() {
+  if (g_ctrl_flow_ops_ == nullptr) {
+    g_ctrl_flow_ops_ = new std::unordered_set<std::string>{
+        "increment", "lod_rank_table", "less_than"};
+  }
+  return *g_ctrl_flow_ops_;
+}
+
 static inline std::unique_ptr<OperatorBase> CreateGradOp(
     const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
     std::unordered_map<std::string, std::string>* grad_to_var) {
@@ -218,21 +229,6 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
                      return false;
                    });
 
-    // process recurrent gradient op as a special operator.
-    if (forwardOp.Type() == "dynamic_recurrent") {
-      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
-      // or this will result in infinite loop.
-      const auto& rnnop =
-          *static_cast<const operators::DynamicRecurrentOp*>(&forwardOp);
-      auto rnn_grad_op =
-          static_cast<operators::DynamicRecurrentGradientOp*>(grad_op.get());
-      const auto& stepnet_op =
-          *static_cast<const OperatorBase*>(&rnnop.rnn.GetStepUnit());
-      // create stepnet's gradient op
-      rnn_grad_op->rnn.SetStepUnit(
-          BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
-    }
-
     if (net->ops_.empty()) {  // Current no aux op is added to network
       return grad_op;
     }
@@ -270,6 +266,19 @@ static bool AllGradInSet(const std::vector<std::string>& names,
       return false;
     }
   }
+  if (VLOG_IS_ON(10)) {
+    std::ostringstream sout;
+    sout << "All input {";
+    for (auto& name : names) {
+      sout << name << ",";
+    }
+    sout << "} is in {";
+    for (auto& name : set) {
+      sout << name << ",";
+    }
+    sout << "}";
+    VLOG(10) << sout.str();
+  }
   return true;
 }
 
@@ -290,15 +299,25 @@ static void CreateGradVarInBlock(
   auto ops = block_desc->AllOps();
   for (size_t op_index = grad_op_start_index; op_index < ops.size();
        ++op_index) {
-    bool need_infer_shape = false;
     std::unordered_set<std::string> new_vars;
+    auto& ctrl_flow_ops = CtrlFlowOps();
     ForEachVarName(ops[op_index]->Outputs(),
                    [&](const std::string& grad_var_name) {
-                     if (block_desc->HasVar(grad_var_name)) {
+                     if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
+                         ctrl_flow_ops.end()) {
+                       if (block_desc->HasVarRecursive(grad_var_name)) {
+                         return false;
+                       }
+                     } else {
+                       if (block_desc->HasVar(grad_var_name)) {
+                         return false;
+                       }
+                     }
+                     if (grad_var_name == framework::kEmptyVarName) {
                        return false;
                      }
-                     need_infer_shape = true;
                      auto var = block_desc->Var(grad_var_name);
+                     VLOG(10) << "Creating Variable " << grad_var_name;
                      new_vars.insert(var->Name());
                      auto it = param_name_map.find(grad_var_name);
                      if (it == param_name_map.end()) {
@@ -311,25 +330,21 @@ static void CreateGradVarInBlock(
                      grad_record.op_idx_ = static_cast<int>(op_index);
                      return false; /* not break */
                    });
-    if (need_infer_shape) {
-      ops[op_index]->InferVarType(block_desc);
-      for (auto& arg : ops[op_index]->OutputArgumentNames()) {
-        if (new_vars.find(arg) == new_vars.end()) {
-          continue;
-        }
-        auto pname = FwdName(arg);
-        auto* param = block_desc->FindVarRecursive(pname);
-        auto* grad = block_desc->FindVar(arg);
-        if (param == nullptr) {
-          LOG(WARNING) << "Cannot find forward variable of " << arg
-                       << ". Set its gradient to FP32";
-          grad->SetDataType(DataType::FP32);
-        } else {
-          grad->SetDataType(param->GetDataType());
-        }
+    ops[op_index]->InferVarType(block_desc);
+    for (auto& arg : ops[op_index]->OutputArgumentNames()) {
+      if (new_vars.find(arg) == new_vars.end()) {
+        continue;
+      }
+      auto pname = FwdName(arg);
+      auto* param = block_desc->FindVarRecursive(pname);
+      auto* grad = block_desc->FindVar(arg);
+      if (param == nullptr) {
+        grad->SetDataType(DataType::FP32);
+      } else {
+        grad->SetDataType(param->GetDataType());
       }
-      ops[op_index]->InferShape(*block_desc);
     }
+    ops[op_index]->InferShape(*block_desc);
   }
 }
 
@@ -342,14 +357,25 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
   // All input gradients of forwarding operator do not need to calculate.
   const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
   if (AllGradInSet(inputs, *no_grad_vars)) {
+    VLOG(10) << "Drop operator  " << op_desc->Type();
     return grad_op_descs;  // empty vector
   }
+
   // All output gradients of forwarding operator do not need to calculate.
   const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
+
   if (AllGradInSet(outputs, *no_grad_vars)) {
-    for (const std::string& name : inputs) {
-      no_grad_vars->insert(GradVarName(name));
+    VLOG(10) << "Drop operator " << op_desc->Type();
+    // FIXME: Hack code here
+    auto& ctrl_flow_ops = CtrlFlowOps();
+    if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
+      // Only computational op need drop input's gradient.
+      for (const std::string& name : inputs) {
+        no_grad_vars->insert(GradVarName(name));
+        VLOG(10) << " Also drop " << GradVarName(name);
+      }
     }
+
     return grad_op_descs;  // empty vector
   }
 
@@ -379,10 +405,17 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
   return grad_op_descs;
 }
 
+static BlockDescBind* CreateStepBlock(
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx);
+
 std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     ProgramDescBind& program_desc, int block_idx,
     std::unordered_set<std::string>* no_grad_vars,
     std::unordered_map<std::string, std::string>* grad_to_var) {
+  VLOG(5) << "MakeBlockBackward";
   BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
   std::vector<OpDescBind*> op_descs = cur_block->AllOps();
   std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
@@ -390,24 +423,39 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
   std::vector<std::unique_ptr<OpDescBind>> backward_descs;
 
   for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
+    VLOG(5) << "Making backward " << (*it)->Type() << " op";
     std::vector<std::unique_ptr<OpDescBind>> op_grads;
 
-    if ((*it)->Type() == "recurrent") {
+    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
       int step_block_idx = (*it)->GetBlockAttr("step_block");
-      auto backward_block_op_descs = MakeBlockBackward(
-          program_desc, step_block_idx, no_grad_vars, grad_to_var);
+      BlockDescBind* backward_block = CreateStepBlock(
+          program_desc, no_grad_vars, grad_to_var, step_block_idx);
+      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
+    } else if ((*it)->Type() == "conditional_block") {
       BlockDescBind* backward_block =
-          program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
-      for (auto& ptr : backward_block_op_descs) {
-        backward_block->AppendAllocatedOp(std::move(ptr));
-      }
+          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
+                          (*it)->GetBlockAttr("block"));
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
     } else {
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
     }
 
+    if (VLOG_IS_ON(10)) {
+      std::ostringstream sout;
+      sout << "Made ";
+      for (auto& op_grad : op_grads) {
+        sout << op_grad->Type() << " ";
+      }
+      VLOG(10) << sout.str();
+    }
+
     for (const auto& desc : op_grads) {
       for (const std::string& out_name : desc->OutputArgumentNames()) {
+        if (out_name.find("@GRAD") == std::string::npos) {
+          // Not all outputs of a backward operator is a gradient. Only gradient
+          // need to be sum. Skip variables are not gradient.
+          continue;
+        }
         dup_out_ops[out_name].emplace_back(grad_desc_idx);
       }
       ++grad_desc_idx;
@@ -416,6 +464,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
         op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
         [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
   }
+
+  VLOG(5) << "Appending Sums";
   // Check whether some variables are written more than once
   std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
   for (const auto& dup : dup_out_ops) {
@@ -423,16 +473,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     const std::vector<size_t> dup_op = dup.second;
     if (out_name != kEmptyVarName && dup_op.size() > 1) {
       std::vector<std::string> sum_op_inputs;
+      std::string next_g_name = out_name;
       for (size_t i = 0; i < dup_op.size(); ++i) {
+        VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
+                 << " duplicated";
         std::string new_name = out_name + "@RENAME@" + std::to_string(i);
-        backward_descs[dup_op[i]]->Rename(out_name, new_name);
+        backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
+        backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
         sum_op_inputs.emplace_back(new_name);
+        next_g_name = sum_op_inputs.back();
       }
       std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
           "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
       pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
     }
   }
+
   pending_sum_ops.sort(
       [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
          const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
@@ -443,9 +499,26 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
                           std::move(p.second));
   }
 
+  VLOG(5) << "MakeBlockBackward Finished";
+
   return backward_descs;
 }
 
+static BlockDescBind* CreateStepBlock(
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx) {
+  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
+                                                   no_grad_vars, grad_to_var);
+  BlockDescBind* backward_block =
+      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
+  for (auto& ptr : backward_block_op_descs) {
+    backward_block->AppendAllocatedOp(move(ptr));
+  }
+  return backward_block;
+}
+
 ParamGradInfoMap AppendBackward(
     ProgramDescBind& program_desc, const VarDescBind& target,
     const std::unordered_set<std::string>& no_grad_vars) {
@@ -459,21 +532,16 @@ ParamGradInfoMap AppendBackward(
   const int root_block_idx = 0;
   auto root_block = program_desc.MutableBlock(root_block_idx);
 
-  // insert fill one op for target
-  // TODO(qiao) add some check to the target.
   std::string fill_one_op_out = GradVarName(target.Name());
-  std::vector<int64_t> target_shape_desc = target.Shape();
-  std::vector<int> target_shape;
-  std::transform(target_shape_desc.begin(), target_shape_desc.end(),
-                 std::back_inserter(target_shape),
-                 [](int64_t dim) { return static_cast<int>(dim); });
+  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+  PADDLE_ENFORCE(is_scalar, "target should be scalar");
   VLOG(3) << "backward from loss=" << target.Name()
           << " data_type=" << target.GetDataType();
   std::unique_ptr<OpDescBind> fill_one_op(
       new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                     {{"shape", target_shape},
+                     {{"shape", std::vector<int>{1}},
                       {"value", static_cast<float>(1.0)},
-                      {"data_type", target.GetDataType()}}));
+                      {"dtype", target.GetDataType()}}));
   // infer var type of fill_one_op
   fill_one_op->InferVarType(root_block);
 
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 4e8d630c26..2b858f5ea0 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -21,7 +21,7 @@
 #include "paddle/framework/var_desc.h"
 #include "paddle/operators/net_op.h"
 
-USE_OP(fill_constant);
+USE_NO_KERNEL_OP(fill_constant);
 
 namespace paddle {
 namespace framework {
@@ -508,6 +508,7 @@ TEST(Backward, simple_single_op) {
   op->SetOutput("Out", {"out"});
 
   auto target = f::VarDescBind("out");
+  target.SetShape({1});
   auto var_to_grad = AppendBackward(program, target, {});
 
   ASSERT_EQ(block->AllOps().size(), 3UL);
@@ -544,6 +545,7 @@ TEST(Backward, default_attribute) {
   op->CheckAttrs();
 
   auto target = f::VarDescBind("out");
+  target.SetShape({1});
   AppendBackward(program, target, {});
 
   ASSERT_EQ(block->AllOps().size(), 3UL);
@@ -581,6 +583,7 @@ TEST(Backward, simple_mult_op) {
   op3->SetOutput("Out", {"out3"});
 
   auto target = f::VarDescBind("out3");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {});
 
@@ -670,6 +673,7 @@ TEST(Backward, intermedia_var_no_grad) {
   op4->SetOutput("Out", {"out4"});
 
   auto target = f::VarDescBind("out4");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"out3"});
 
@@ -730,6 +734,7 @@ TEST(Backward, var_no_grad) {
   op2->SetOutput("Z", {"z2"});
 
   auto target = f::VarDescBind("z2");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"z1"});
 
@@ -810,6 +815,7 @@ TEST(Backward, shared_var) {
   op3->SetOutput("Out", {"out3"});
 
   auto target = f::VarDescBind("out3");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {});
 
@@ -888,6 +894,7 @@ TEST(Backward, half_backward) {
   op1->SetOutput("Out", {"out"});
 
   auto target = f::VarDescBind("out");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"b"});
   f::OpDescBind *fill_op = block->AllOps()[forward_len];
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 9e3d597f3a..6a7a07d5cf 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/program_desc.h"
 
 namespace paddle {
@@ -42,6 +43,8 @@ bool BlockDescBind::HasVar(const std::string &name) const {
 }
 
 VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
+  if (name == kEmptyVarName) return nullptr;
+
   auto it = vars_.find(name);
   if (it == vars_.end()) {
     return Parent() == kNoneBlockIndex ? nullptr
@@ -50,6 +53,15 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
   return it->second.get();
 }
 
+VarDescBind *BlockDescBind::FindRecursiveOrCreateVar(
+    const std::string &name_bytes) {
+  VarDescBind *res = FindVarRecursive(name_bytes);
+  if (res == nullptr) {
+    res = Var(name_bytes);
+  }
+  return res;
+}
+
 bool BlockDescBind::HasVarRecursive(const std::string &name) const {
   return FindVarRecursive(name) != nullptr;
 }
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 26adf6a20f..8e967e5378 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -58,6 +58,8 @@ class BlockDescBind {
 
   VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
 
+  VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes);
+
   bool HasVarRecursive(const std::string &var_name) const;
 
   std::set<std::string> LocalVarNames() const {
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index c5ae7b1854..c54d2d4ddf 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -29,11 +29,30 @@ inline DataType ToDataType(std::type_index type) {
     return DataType::INT32;
   } else if (typeid(int64_t).hash_code() == type.hash_code()) {
     return DataType::INT64;
+  } else if (typeid(bool).hash_code() == type.hash_code()) {
+    return DataType::BOOL;
   } else {
     PADDLE_THROW("Not supported");
   }
 }
 
+inline std::type_index ToTypeIndex(DataType type) {
+  switch (type) {
+    case DataType::FP32:
+      return typeid(float);
+    case DataType::FP64:
+      return typeid(double);
+    case DataType::INT32:
+      return typeid(int);
+    case DataType::INT64:
+      return typeid(int64_t);
+    case DataType::BOOL:
+      return typeid(bool);
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+
 template <typename Visitor>
 inline void VisitDataType(DataType type, Visitor visitor) {
   switch (type) {
@@ -49,6 +68,9 @@ inline void VisitDataType(DataType type, Visitor visitor) {
     case DataType::INT64:
       visitor.template operator()<int64_t>();
       break;
+    case DataType::BOOL:
+      visitor.template operator()<bool>();
+      break;
     default:
       PADDLE_THROW("Not supported");
   }
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 239ae5e123..8b6f42b82d 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) {
       ddim = make_dim<9>(dims);
       break;
     default:
-      throw std::invalid_argument(
-          "Dynamic dimensions must have between [1, 9] dimensions.");
+      PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
   }
 }
 
@@ -79,6 +78,13 @@ DDim make_ddim(const std::vector<int64_t>& dims) {
   return result;
 }
 
+DDim make_ddim(const std::vector<int>& dims) {
+  std::vector<int64_t> res(dims.size());
+  std::transform(dims.begin(), dims.end(), res.begin(),
+                 [](int d) { return static_cast<int64_t>(d); });
+  return make_ddim(res);
+}
+
 /// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
@@ -117,7 +123,7 @@ int64_t DDim::operator[](int idx) const {
   return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
 
-int64_t DDim::size() const { return arity(*this); }
+int DDim::size() const { return arity(*this); }
 
 bool DDim::operator==(DDim d) const {
   if (var.which() != d.getVar().which()) {
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index 2a5e2d2b69..4ca5e49566 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -71,7 +71,7 @@ struct DDim {
 
   DDim operator*(DDim d) const;
 
-  int64_t size() const;
+  int size() const;
 };
 
 /**
@@ -81,6 +81,8 @@ struct DDim {
  */
 DDim make_ddim(const std::vector<int64_t>& dims);
 
+DDim make_ddim(const std::vector<int>& dims);
+
 /**
  * \brief Make a DDim from an initializer list
  *
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index c1a009f131..83aa927c29 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 
@@ -73,6 +74,8 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
     var->GetMutable<std::vector<framework::Scope>>();
   } else if (var_type == VarDesc::LOD_RANK_TABLE) {
     var->GetMutable<LoDRankTable>();
+  } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
   } else {
     PADDLE_THROW(
         "Variable type %d is not in "
@@ -94,6 +97,10 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
   if (create_local_scope) {
     local_scope = &scope->NewScope();
     for (auto& var : block.AllVars()) {
+      if (var->Name() == framework::kEmptyVarName) {
+        continue;
+      }
+
       if (var->Persistable()) {
         auto* ptr = scope->Var(var->Name());
         CreateTensor(ptr, var->GetType());
@@ -117,6 +124,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
 
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
+    VLOG(3) << op->DebugString();
     op->Run(*local_scope, *device);
   }
   if (create_local_scope) {
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 54ce461ce8..f1fc4529e1 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -109,6 +109,11 @@ message LoDTensorDesc {
   optional int32 lod_level = 2 [ default = 0 ];
 }
 
+message LoDTensorArrayDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2 [ default = 0 ];
+}
+
 message VarDesc {
   enum VarType {
     LOD_TENSOR = 1;
@@ -117,11 +122,13 @@ message VarDesc {
     FETCH_LIST = 4;
     STEP_SCOPES = 5;
     LOD_RANK_TABLE = 6;
+    LOD_TENSOR_ARRAY = 7;
   }
   required string name = 1;
   required VarType type = 2;
   optional LoDTensorDesc lod_tensor = 3;
   optional TensorDesc selected_rows = 4;
+  optional LoDTensorArrayDesc tensor_array = 6;
   optional bool persistable = 5 [ default = false ];
 }
 
diff --git a/paddle/framework/lod_rank_table.cc b/paddle/framework/lod_rank_table.cc
index f9abf902a1..1c2fba70c8 100644
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
@@ -31,12 +31,18 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
     TableItem item;
     item.index = i;
     item.length = vec[i + 1] - vec[i];
+    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
     items_.emplace_back(item);
   }
-  std::sort(items_.begin(), items_.end(),
-            [](const TableItem& a, const TableItem& b) {
-              return a.length > b.length;
-            });
+  // NOTE(yuyang18):
+  //
+  // The time complexity of stable_sort is O(N*log(N)) if additional memory is
+  // available. It is easy to debug and unit test when using `stable_sort`
+  // instead of `sort`. Also, the items of a rank table will not be too large.
+  std::stable_sort(items_.begin(), items_.end(),
+                   [](const TableItem& a, const TableItem& b) {
+                     return a.length > b.length;
+                   });
 }
 
 }  // namespace framework
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 584308a538..fdf6de4bab 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -13,6 +13,8 @@
    limitations under the License. */
 
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
 
 #include "paddle/memory/memcpy.h"
 #include "paddle/memory/memory.h"
@@ -27,7 +29,21 @@
 namespace paddle {
 namespace framework {
 
-LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
+std::ostream &operator<<(std::ostream &os, const LoD &lod) {
+  os << "{";
+  for (auto &v : lod) {
+    os << "{";
+    for (auto &i : v) {
+      os << i << ",";
+    }
+    os << "}";
+  }
+  os << "}";
+
+  return os;
+}
+
+LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
   LoD new_lod;
   new_lod.reserve(level_end - level_begin);
   for (size_t i = level_begin; i < level_end; i++) {
@@ -39,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
   return new_lod;
 }
 
-LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
+LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                  size_t elem_end) {
   PADDLE_ENFORCE_LT(level, in.size());
   PADDLE_ENFORCE_LT(elem_end, in[level].size());
@@ -50,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
   res[0].assign(in[level].begin() + elem_begin,
                 in[level].begin() + elem_end + 1);
   for (size_t lvl = 1; lvl < res.size(); lvl++) {
-    const auto& in_level = in[level + lvl];
-    const auto& above_level = res[lvl - 1];
-    auto& out_level = res[lvl];
+    const auto &in_level = in[level + lvl];
+    const auto &above_level = res[lvl - 1];
+    auto &out_level = res[lvl];
     out_level.assign(in_level.begin() + above_level.front(),
                      in_level.begin() + above_level.back() + 1);
   }
@@ -60,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
     // to make the first offset equals 0, all the elements minus the first
     // element
     size_t front = res[lvl].front();
-    for (auto& ele : res[lvl]) {
+    for (auto &ele : res[lvl]) {
       ele -= front;
     }
   }
   return res;
 }
 
-LoD ToAbsOffset(const LoD& in) {
+LoD ToAbsOffset(const LoD &in) {
   // the lowest level stores relative offsets
   if (in.empty() || in.size() == 1) return in;
   LoD result = in;
   for (int level = result.size() - 2; level >= 0; level--) {
-    for (auto& ele : result[level]) {
+    for (auto &ele : result[level]) {
       ele = result[level + 1][ele];
     }
   }
   return result;
 }
 
-bool operator==(const LoD& a, const LoD& b) {
+bool operator==(const LoD &a, const LoD &b) {
   if (a.size() != b.size()) {
     return false;
   }
 
   for (size_t i = 0; i < a.size(); i++) {
-    const auto& a_level = a[i];
-    const auto& b_level = b[i];
+    const auto &a_level = a[i];
+    const auto &b_level = b[i];
     if (a_level.size() != b_level.size()) {
       return false;
     }
@@ -135,5 +151,168 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
   PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
   ShareDataWith(Slice(begin, end));
 }
+
+using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
+                                        size_t end_idx, size_t start_level) {
+  LoD sub_lod;
+
+  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
+    PADDLE_ENFORCE_LE(start_idx, end_idx);
+    PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
+    std::vector<size_t> level_lens;
+    for (size_t i = start_idx; i < end_idx; ++i) {
+      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
+    }
+    sub_lod.emplace_back(level_lens);
+    start_idx = lod[level_idx][start_idx];
+    end_idx = lod[level_idx][end_idx];
+  }
+
+  return LoDAndOffset{sub_lod, {start_idx, end_idx}};
+}
+
+void AppendLoD(LoD *lod, const LoD &lod_length) {
+  PADDLE_ENFORCE(
+      lod->empty() || lod->size() == lod_length.size(),
+      "The lod_length should has the same size with the appended lod.");
+  if (lod->empty()) {
+    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
+  }
+  for (size_t i = 0; i < lod->size(); ++i) {
+    auto &level = (*lod)[i];
+    for (size_t len : lod_length[i]) {
+      level.push_back(level.back() + len);
+    }
+  }
+}
+
+void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
+                       const platform::DeviceContext &dev_ctx) {
+  // TODO(typhoonzero): serialize to ostream
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+  }
+  {  // the 2nd field, tensor description
+     // int32_t  size
+     // void*    protobuf message
+    framework::TensorDesc desc;
+    desc.set_data_type(framework::ToDataType(tensor.type()));
+    auto dims = framework::vectorize(tensor.dims());
+    auto *pb_dims = desc.mutable_dims();
+    pb_dims->Resize(static_cast<int>(dims.size()), 0);
+    std::copy(dims.begin(), dims.end(), pb_dims->begin());
+    int32_t size = desc.ByteSize();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+    auto out = desc.SerializeAsString();
+    os.write(out.data(), size);
+  }
+  {  // the 3rd field, tensor data
+    uint64_t size = tensor.memory_size();
+    auto *data_ptr = tensor.data<void>();
+    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                   "Index overflow when writing tensor");
+    if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto &gpu_dev_ctx =
+          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     boost::get<platform::GPUPlace>(tensor.place()),
+                     reinterpret_cast<const void *>(data), size_to_write,
+                     gpu_dev_ctx.stream());
+        gpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      os.write(static_cast<const char *>(data_ptr),
+               static_cast<std::streamsize>(size));
+    }
+  }
+  {  // the 4th field, lod information
+     // uint64_t lod_level
+     // uint64_t lod_level_1 size in byte.
+     // int*     lod_level_1 data
+     // ...
+    auto lod = tensor.lod();
+    uint64_t size = lod.size();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+
+    for (auto &each : lod) {
+      size = each.size() * sizeof(framework::LoD::value_type::value_type);
+      os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+      os.write(reinterpret_cast<const char *>(each.data()),
+               static_cast<std::streamsize>(size));
+    }
+  }
+}
+
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+  uint32_t version;
+  is.read(reinterpret_cast<char *>(&version), sizeof(version));
+  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  framework::TensorDesc desc;
+  {  // int32_t size
+     // proto buffer
+    int32_t size;
+    is.read(reinterpret_cast<char *>(&size), sizeof(size));
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char *>(buf.get()), size);
+    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+                   "Cannot parse tensor desc");
+  }
+  {  // read tensor
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+
+    void *buf;
+    platform::Place cpu = platform::CPUPlace();
+    switch (desc.data_type()) {
+      case framework::FP32:
+        buf = tensor->mutable_data<float>(cpu);
+        break;
+      case framework::FP64:
+        buf = tensor->mutable_data<double>(cpu);
+        break;
+      case framework::INT32:
+        buf = tensor->mutable_data<int>(cpu);
+        break;
+      case framework::INT64:
+        buf = tensor->mutable_data<int64_t>(cpu);
+        break;
+      default:
+        PADDLE_THROW("DataType %d not supported", desc.data_type());
+    }
+    is.read(static_cast<char *>(buf), tensor->memory_size());
+  }
+  {  // read lod
+    uint64_t lod_level;
+    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+    auto &lod = *tensor->mutable_lod();
+    lod.resize(lod_level);
+    for (uint64_t i = 0; i < lod_level; ++i) {
+      uint64_t size;
+      is.read(reinterpret_cast<char *>(&size), sizeof(size));
+      std::vector<size_t> tmp(size / sizeof(size_t));
+      is.read(reinterpret_cast<char *>(tmp.data()),
+              static_cast<std::streamsize>(size));
+      lod[i] = tmp;
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index f4fe4cdac6..9411c96aea 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -24,6 +24,7 @@
 #include <glog/logging.h>
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 
@@ -56,6 +57,8 @@ using Vector = thrust::host_vector<
  */
 using LoD = std::vector<Vector<size_t>>;
 
+std::ostream& operator<<(std::ostream& os, const LoD& lod);
+
 /*
  * Slice levels from a LoD.
  * NOTE the lowest level should always be the absolute offsets of the underlying
@@ -173,13 +176,27 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
   PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1);
   for (size_t ins = 0; ins < num_instances; ins++) {
     for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
-      tensor.Slice(elem, elem + 1)
-          .CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(),
-                    platform::CPUDeviceContext());
+      auto slice = tensor.Slice(elem, elem + 1);
+      CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(),
+               platform::CPUDeviceContext(), &slice);
     }
   }
   return tensor;
 }
 
+std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
+    const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
+
+void AppendLoD(LoD* lod, const LoD& lod_length);
+
+/*
+ * Serialize/Desiralize LoDTensor to std::ostream
+ * You can pass ofstream or ostringstream to serilize to file
+ * or to a in memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.md b/paddle/framework/lod_tensor.md
index d147f1c425..10a8a7867f 100644
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD
 are transformed into offsets of elements/words as follows:
 
 ```
-0 9     10  15
-  =     =   =
-  3+2+4 1+9 2+3+10
-```
-
-so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
-
-The complete offset representation is as follows:
-
-```
-0           9 10       15
-0   3  5    9 10  12   15
- ||| || |||| |  ||  |||
+0 3 4   6
+  = =   =
+  3 3+1 4+2
 ```
 
 ## Slicing of LoD Tensors
diff --git a/paddle/operators/conv2d_op.cu b/paddle/framework/lod_tensor_array.h
similarity index 62%
rename from paddle/operators/conv2d_op.cu
rename to paddle/framework/lod_tensor_array.h
index c697c9466d..13f0608d24 100644
--- a/paddle/operators/conv2d_op.cu
+++ b/paddle/framework/lod_tensor_array.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
@@ -12,11 +12,12 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/conv2d_op.h"
+#pragma once
+#include <vector>
+#include "paddle/framework/lod_tensor.h"
 
-namespace ops = paddle::operators;
-
-REGISTER_OP_GPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::GPUPlace, float>);
+namespace paddle {
+namespace framework {
+using LoDTensorArray = std::vector<LoDTensor>;
+}
+}  // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index aa2f6c993d..02d84b6823 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -144,5 +144,48 @@ TEST(LodExpand, test) {
   }
 }
 
+TEST(LoD, GetFineGrainedLoDLength) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+  lod.push_back(
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}));
+
+  auto lod_and_offset =
+      paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
+  LoD lod_length = lod_and_offset.first;
+  size_t start_offset = lod_and_offset.second.first;
+  size_t end_offset = lod_and_offset.second.second;
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>{2});
+  expected.push_back(std::vector<size_t>{2, 2});
+  expected.push_back(std::vector<size_t>{2, 3, 4, 2});
+  EXPECT_EQ(lod_length, expected);
+  EXPECT_EQ(start_offset, 15UL);
+  EXPECT_EQ(end_offset, 26UL);
+}
+
+TEST(LoD, AppendLoD) {
+  LoD lod_lens;
+  lod_lens.push_back(std::vector<size_t>({2}));
+  lod_lens.push_back(std::vector<size_t>({2, 2}));
+  lod_lens.push_back(std::vector<size_t>({2, 3, 4, 2}));
+
+  LoD origin;
+  origin.push_back(std::vector<size_t>({0, 2}));
+  origin.push_back(std::vector<size_t>({0, 1, 6}));
+  origin.push_back(std::vector<size_t>({0, 2, 5, 7, 10, 12, 15}));
+
+  paddle::framework::AppendLoD(&origin, lod_lens);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({0, 2, 4}));
+  expected.push_back(std::vector<size_t>({0, 1, 6, 8, 10}));
+  expected.push_back(
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26}));
+  EXPECT_EQ(origin, expected);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index c96166f35d..2281d93df9 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -65,10 +65,13 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
                       "The %d-th output of Output(%s) must be LoDTensor.", j,
                       out);
-    in_var->SetLoDLevel(out_var->GetLodLevel());
+    out_var->SetLoDLevel(in_var->GetLodLevel());
   }
+  bool IsRuntime() const override;
+
+ protected:
+  VarDesc::VarType GetVarType(const std::string &name) const override;
 
- private:
   DDim GetDim(const std::string &name) const override;
 
   void SetDim(const std::string &name, const DDim &dim) override;
@@ -232,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name,
   need_update_ = true;
 }
 
+void OpDescBind::RenameOutput(const std::string &old_name,
+                              const std::string &new_name) {
+  for (auto &output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+  need_update_ = true;
+}
+
+void OpDescBind::RenameInput(const std::string &old_name,
+                             const std::string &new_name) {
+  for (auto &input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  need_update_ = true;
+}
+
 struct SetAttrDescVisitor : public boost::static_visitor<void> {
   explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
   mutable OpDesc::Attr *attr_;
@@ -349,9 +369,13 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
     info.infer_var_type_(*this, block);
   } else {
     // all output type is LoDTensor by default
+    VLOG(10) << this->Type()
+             << " has not registered InferVarType. Set output variables to "
+                "LOD_TENSOR";
     for (auto &out_pair : this->outputs_) {
       for (auto &out_var_name : out_pair.second) {
-        block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
+        block->FindRecursiveOrCreateVar(out_var_name)
+            ->SetType(VarDesc::LOD_TENSOR);
       }
     }
   }
@@ -441,13 +465,29 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
 DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
   PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
-  return framework::make_ddim(var->Shape());
+  try {
+    auto shape = var->Shape();
+    if (shape.empty()) {
+      return framework::make_ddim({0UL});
+    } else {
+      return framework::make_ddim(var->Shape());
+    }
+  } catch (...) {
+    VLOG(5) << "GetDim of variable " << name << " error";
+    std::rethrow_exception(std::current_exception());
+  }
 }
 
 void CompileTimeInferShapeContext::SetDim(const std::string &name,
                                           const DDim &dim) {
   block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
 }
+bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
+
+VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
+    const std::string &name) const {
+  return block_.FindVarRecursive(name)->GetType();
+}
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index e3e96441bb..da032319af 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -73,6 +73,10 @@ class OpDescBind {
 
   void Rename(const std::string &old_name, const std::string &new_name);
 
+  void RenameOutput(const std::string &old_name, const std::string &new_name);
+
+  void RenameInput(const std::string &old_name, const std::string &new_name);
+
   // Only be used in C++
   const AttributeMap &GetAttrMap() const;
 
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 2bb5e0e8ec..daade439e5 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -92,8 +92,7 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
 
   void operator()(const char* op_type) const {
     using T = typename KERNEL_TYPE::ELEMENT_TYPE;
-    OperatorWithKernel::OpKernelKey key(ToDataType(std::type_index(typeid(T))),
-                                        PlaceType());
+    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType());
     OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
 
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 9295d36c2b..f1444eeee9 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/framework/operator.h"
 #include <algorithm>
 #include <atomic>
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/shape_inference.h"
+#include "paddle/framework/var_type.h"
 
 namespace paddle {
 namespace framework {
@@ -252,8 +254,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
   return res;
 }
 
-std::ostream& operator<<(std::ostream& os,
-                         const OperatorWithKernel::OpKernelKey& kernel_key) {
+std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) {
   os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
      << "]";
   return os;
@@ -365,7 +366,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
     out_tensor->set_lod(in_tensor.lod());
   }
 
- private:
+  bool IsRuntime() const override { return true; }
+
+ protected:
   DDim GetDim(const std::string& name) const override {
     Variable* var = scope_.FindVar(name);
     if (var->IsType<LoDTensor>()) {
@@ -388,25 +391,18 @@ class RuntimeInferShapeContext : public InferShapeContext {
     }
   }
 
+  VarDesc::VarType GetVarType(const std::string& name) const override {
+    auto* var = scope_.FindVar(name);
+    return ToVarType(var->Type());
+  }
+
+ private:
   const OperatorBase& op_;
   const Scope& scope_;
 };
 
 void OperatorWithKernel::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
-  if (VLOG_IS_ON(1)) {
-    auto inputs = this->InputVars();
-    auto outputs = this->OutputVars(true);
-    std::ostringstream sout;
-    sout << "Run operator " << this->Type() << " From [";
-    std::ostream_iterator<std::string> out_it(sout, ",");
-    std::copy(inputs.begin(), inputs.end(), out_it);
-    sout << "] to [";
-    std::copy(outputs.begin(), outputs.end(), out_it);
-    sout << "]";
-    VLOG(1) << sout.str();
-  }
-
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
 
@@ -422,7 +418,7 @@ void OperatorWithKernel::Run(const Scope& scope,
 
   // check if op[type] have kernel for kernel_key
   OpKernelMap& kernels = kernels_iter->second;
-  auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
+  auto kernel_key = GetKernelType(ctx);
   auto kernel_iter = kernels.find(kernel_key);
 
   if (kernel_iter == kernels.end()) {
@@ -431,6 +427,38 @@ void OperatorWithKernel::Run(const Scope& scope,
 
   kernel_iter->second->Compute(ctx);
 }
+OpKernelType OperatorWithKernel::GetKernelType(
+    const ExecutionContext& ctx) const {
+  return OpKernelType(IndicateDataType(ctx), ctx.device_context());
+}
+DataType OperatorWithKernel::IndicateDataType(
+    const ExecutionContext& ctx) const {
+  auto& scope = ctx.scope();
+  int data_type = -1;
+  for (auto& input : this->inputs_) {
+    for (auto& ipt_name : input.second) {
+      auto* var = scope.FindVar(ipt_name);
+      if (var != nullptr) {
+        const Tensor* t = nullptr;
+        if (var->IsType<Tensor>()) {
+          t = &var->Get<Tensor>();
+        } else if (var->IsType<LoDTensor>()) {
+          t = &var->Get<LoDTensor>();
+        } else if (var->IsType<SelectedRows>()) {
+          t = &(var->Get<SelectedRows>().value());
+        }
+        if (t != nullptr) {
+          int tmp = static_cast<int>(ToDataType(t->type()));
+          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+                         "DataType of Paddle Op %s must be the same.", Type());
+          data_type = tmp;
+        }
+      }
+    }
+  }
+  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
+  return static_cast<DataType>(data_type);
+}
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 5c1989c26b..60861d9293 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -298,11 +298,10 @@ class ExecutionContext {
   }
 
 #ifdef PADDLE_WITH_CUDA
-  const platform::CUDADeviceContext& cuda_device_context() const {
+  const inline platform::CUDADeviceContext& cuda_device_context() const {
     PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
-    auto cuda_ctx =
-        reinterpret_cast<const platform::CUDADeviceContext*>(&device_context_);
-    return *cuda_ctx;
+    return *reinterpret_cast<const platform::CUDADeviceContext*>(
+        &device_context_);
   }
 #endif
 
@@ -346,27 +345,10 @@ class OpKernel : public OpKernelBase {
   using ELEMENT_TYPE = T;
 };
 
-class OperatorWithKernel : public OperatorBase {
- public:
-  struct OpKernelKey {
-    platform::Place place_;
-    DataType data_type_;
-
-    OpKernelKey(DataType data_type, platform::Place place)
-        : place_(place), data_type_(data_type) {}
-
-    OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
-        : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
-
-    bool operator==(const OpKernelKey& o) const {
-      return platform::places_are_same_class(place_, o.place_) &&
-             data_type_ == o.data_type_;
-    }
-  };
-
-  struct OpKernelHash {
+struct OpKernelType {
+  struct Hash {
     std::hash<int> hash_;
-    size_t operator()(const OpKernelKey& key) const {
+    size_t operator()(const OpKernelType& key) const {
       int place = key.place_.which();
       int data_type = static_cast<int>(key.data_type_);
       int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
@@ -375,9 +357,26 @@ class OperatorWithKernel : public OperatorBase {
     }
   };
 
+  platform::Place place_;
+  DataType data_type_;
+
+  OpKernelType(DataType data_type, platform::Place place)
+      : place_(place), data_type_(data_type) {}
+
+  OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx)
+      : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
+
+  bool operator==(const OpKernelType& o) const {
+    return platform::places_are_same_class(place_, o.place_) &&
+           data_type_ == o.data_type_;
+  }
+};
+
+class OperatorWithKernel : public OperatorBase {
+ public:
   using OpKernelMap =
-      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>,
-                         OpKernelHash>;
+      std::unordered_map<OpKernelType, std::unique_ptr<OpKernelBase>,
+                         OpKernelType::Hash>;
 
   OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
                      const VariableNameMap& outputs, const AttributeMap& attrs)
@@ -405,40 +404,15 @@ class OperatorWithKernel : public OperatorBase {
   }
 
  protected:
+  virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const;
+
+ private:
   // indicate kernel DataType by input data. Defaultly all input data must be
   // same.
-  virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
-    auto& scope = ctx.scope();
-    int data_type = -1;
-    for (auto& input : this->inputs_) {
-      for (auto& ipt_name : input.second) {
-        auto* var = scope.FindVar(ipt_name);
-        if (var != nullptr) {
-          const Tensor* t = nullptr;
-          if (var->IsType<Tensor>()) {
-            t = &var->Get<Tensor>();
-          } else if (var->IsType<LoDTensor>()) {
-            t = &var->Get<LoDTensor>();
-          } else if (var->IsType<SelectedRows>()) {
-            t = &(var->Get<SelectedRows>().value());
-          }
-          if (t != nullptr) {
-            int tmp = static_cast<int>(ToDataType(t->type()));
-            PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                           "DataType of Paddle Op %s must be the same.",
-                           Type());
-            data_type = tmp;
-          }
-        }
-      }
-    }
-    PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-    return static_cast<DataType>(data_type);
-  }
+  DataType IndicateDataType(const ExecutionContext& ctx) const;
 };
 
-std::ostream& operator<<(std::ostream& os,
-                         const OperatorWithKernel::OpKernelKey& kernel_key);
+std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);
 
 extern bool OpSupportGPU(const std::string& op_type);
 
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 42e0d52eed..1e19f82b34 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -114,8 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {}
-  DataType IndicateDataType(const ExecutionContext& ctx) const override {
-    return DataType::FP32;
+  OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
+    return OpKernelType(DataType::FP32, ctx.device_context());
   }
 };
 
diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc
index bf3066983c..da76052eb4 100644
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -26,6 +26,8 @@ namespace framework {
 
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";
+const std::string kDropOutOpType = "dropout";
+const std::string kBatchNormOpType = "batch_norm";
 
 bool HasDependentVar(const OpDesc& op_desc,
                      const std::set<std::string>& dependent_vars) {
@@ -106,5 +108,26 @@ void Prune(const ProgramDesc& input, ProgramDesc* output) {
   prune_impl(input, output, 0);
 }
 
+void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output,
+                             int block_id) {
+  *output = input;
+  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
+  for (auto& op_desc : *op_field) {
+    if (op_desc.type() == kDropOutOpType ||
+        op_desc.type() == kBatchNormOpType) {
+      for (auto& attr : *op_desc.mutable_attrs()) {
+        if (attr.name() == "is_test") {
+          attr.set_b(true);
+          break;
+        }
+      }
+    }
+  }
+}
+
+void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output) {
+  inference_optimize_impl(input, output, 0);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/prune.h b/paddle/framework/prune.h
index 8cfb16343a..23db014894 100644
--- a/paddle/framework/prune.h
+++ b/paddle/framework/prune.h
@@ -22,5 +22,7 @@ namespace framework {
 
 void Prune(const ProgramDesc& input, ProgramDesc* output);
 
+void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index fb2c691056..656736e238 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -36,13 +36,11 @@ Scope& Scope::NewScope() const {
 }
 
 Variable* Scope::Var(const std::string& name) {
-  auto iter = vars_.find(name);
-  if (iter != vars_.end()) {
-    return iter->second;
-  }
-  Variable* v = new Variable();
+  auto* v = FindVarLocally(name);
+  if (v != nullptr) return v;
+  v = new Variable();
   vars_[name] = v;
-  VLOG(3) << "Create variable " << name << " on scope";
+  VLOG(3) << "Create variable " << name;
   v->name_ = &(vars_.find(name)->first);
   return v;
 }
@@ -56,8 +54,10 @@ Variable* Scope::Var(std::string* name) {
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  auto it = vars_.find(name);
-  if (it != vars_.end()) return it->second;
+  auto var = FindVarLocally(name);
+  if (var != nullptr) {
+    return var;
+  }
   return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
 }
 
@@ -98,5 +98,28 @@ void Scope::DeleteScope(Scope* scope) {
   delete scope;
 }
 
+void Scope::Rename(const std::string& origin_name,
+                   const std::string& new_name) const {
+  auto origin_it = vars_.find(origin_name);
+  PADDLE_ENFORCE(origin_it != vars_.end(),
+                 "Cannot find original variable with name %s", origin_name);
+  auto new_it = vars_.find(new_name);
+  PADDLE_ENFORCE(new_it == vars_.end(),
+                 "The variable with name %s is already in the scope", new_name);
+  vars_[new_name] = origin_it->second;
+  vars_.erase(origin_it);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+  Rename(origin_name, var_name);
+  return var_name;
+}
+Variable* Scope::FindVarLocally(const std::string& name) const {
+  auto it = vars_.find(name);
+  if (it != vars_.end()) return it->second;
+  return nullptr;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index fb66094939..56e815db54 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -68,11 +68,20 @@ class Scope {
   // enumerate all the variables current contains.
   std::vector<std::string> GetAllNames(bool recursive = false) const;
 
+  // Rename variable to a new name
+  void Rename(const std::string& origin_name,
+              const std::string& new_name) const;
+
+  // Rename variable to a new name and return the new name
+  std::string Rename(const std::string& origin_name) const;
+
  private:
+  Variable* FindVarLocally(const std::string& name) const;
+
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
 
-  std::unordered_map<std::string, Variable*> vars_;
+  mutable std::unordered_map<std::string, Variable*> vars_;
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
 
diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index 8169df8e46..7dac1cfd5e 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -12,6 +12,8 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 #include "paddle/framework/shape_inference.h"
+#include "grad_op_desc_maker.h"
+#include "paddle/framework/operator.h"
 
 namespace paddle {
 namespace framework {
@@ -22,6 +24,12 @@ std::vector<framework::DDim> InferShapeContext::GetInputsDim(
   return GetDims(names);
 }
 
+DDim InferShapeContext::GetInputsElementDim(const std::string &name,
+                                            int idx) const {
+  const std::vector<std::string> &names = Inputs(name);
+  return this->GetDim(names[idx]);
+}
+
 void InferShapeContext::SetOutputsDim(
     const std::string &name, const std::vector<framework::DDim> &dims) {
   auto &names = Outputs(name);
@@ -43,9 +51,29 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
   size_t length = names.size();
   PADDLE_ENFORCE_EQ(length, dims.size());
   for (size_t i = 0; i < length; ++i) {
+    if (names[i] == framework::kEmptyVarName) {
+      continue;
+    }
     SetDim(names[i], dims[i]);
   }
 }
+std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Inputs(name));
+}
+std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Outputs(name));
+}
+std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
+    const std::vector<std::string> &names) const {
+  std::vector<VarDesc::VarType> retv;
+  retv.resize(names.size());
+  std::transform(names.begin(), names.end(), retv.begin(),
+                 std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
+                           std::placeholders::_1));
+  return retv;
+}
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index 6f19900ef1..46f2ea84b4 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/framework.pb.h"
 
 namespace paddle {
 namespace framework {
@@ -26,12 +27,17 @@ class InferShapeContext {
   virtual bool HasInput(const std::string &name) const = 0;
   virtual bool HasOutput(const std::string &name) const = 0;
 
+  std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
+  std::vector<VarDesc::VarType> GetOutputsVarType(
+      const std::string &name) const;
+
   virtual bool HasInputs(const std::string &name) const = 0;
   virtual bool HasOutputs(const std::string &name) const = 0;
 
   virtual framework::DDim GetInputDim(const std::string &name) const = 0;
 
   std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+  DDim GetInputsElementDim(const std::string &name, int idx) const;
 
   virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
   void SetOutputsDim(const std::string &name,
@@ -46,6 +52,12 @@ class InferShapeContext {
   virtual void ShareLoD(const std::string &in, const std::string &out,
                         size_t i = 0, size_t j = 0) const = 0;
 
+  virtual bool IsRuntime() const = 0;
+
+  // Note: In while op, we need this to be public
+  void SetDims(const std::vector<std::string> &names,
+               const std::vector<framework::DDim> &dims);
+
  protected:
   virtual framework::DDim GetDim(const std::string &name) const = 0;
   virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
@@ -53,8 +65,10 @@ class InferShapeContext {
   std::vector<framework::DDim> GetDims(
       const std::vector<std::string> &names) const;
 
-  void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims);
+  std::vector<VarDesc::VarType> GetVarTypes(
+      const std::vector<std::string> &names) const;
+
+  virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 28d0fcf94e..6a0c5133c9 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -89,34 +89,6 @@ class Tensor {
   /*! The internal of two tensors share the same memory block. */
   inline Tensor& ShareDataWith(const Tensor& src);
 
-  /**
-   * @brief   Copy the content of external tensor to a new place.
-   *
-   * @param[in] src        The external tensor.
-   * @param[in] dst_place  The dst place.
-   * @param[in] ctx        The device context contains device resources.
-   *
-   * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
-   */
-  // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
-  // Remove `CopyFrom` and `CopyFromVector` from Tensor interface
-  // and make them global functions
-  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
-                       const platform::DeviceContext& ctx);
-
-  /**
-   * @brief   Copy the content of an external vector to a tensor.
-   *
-   * @param[in] src        The external tensor.
-   * @param[in] ctx        The device context contains device resources.
-   *
-   * * @note    CopyFromVector assumes that the tensor has been resized
-   *            before invoking.
-   */
-  template <typename T>
-  inline void CopyFromVector(const std::vector<T>& src,
-                             const platform::DeviceContext& ctx);
-
   /**
    * @brief  Return a sub-tensor of the given tensor.
    *
@@ -141,7 +113,6 @@ class Tensor {
 
   size_t memory_size() const;
 
- private:
   inline void check_memory_size() const;
 
  private:
diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc
deleted file mode 100644
index 0947e33548..0000000000
--- a/paddle/framework/tensor_array.cc
+++ /dev/null
@@ -1,444 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-
-
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/framework/tensor_array.h"
-
-#include <glog/logging.h>
-#include <algorithm>
-#include <limits>
-
-#include "paddle/framework/eigen.h"
-
-namespace paddle {
-namespace framework {
-
-namespace detail {
-
-/*
- * Offer an iterator over the length-sorted lod-tensor's top level. The top
- * level of a lod-tensor stores batch-size of sequences, each top-level sequence
- * may contains several lower-level sequences, sort top-level lod by the numbers
- * of lower-level sequences in descending order, so that during RNN's running,
- * the batch-size will keep decreasing, the short sentences will end at the tail
- * of each batch.
- *
- * Let's take a simple lod-tensor for example
- *
- *   |(0)       |(1)        top-level has two instances
- *   |||        |||||    lower-level
- *
- * sort by lower-level's length
- *
- *   |(1)       |(0)
- *   |||||      |||
- *
- * when RNN runs, it get 5 batches (equals the number of elements the longest
- * sequence has)
- *
- * |||||
- * |||
- *
- * the first three batches has two elements, the last two elements just has 1
- * element each.
- */
-struct DynamicBatchUnpacker {
-  using value_type = float;
-
-  DynamicBatchUnpacker(const LoDTensor& source, size_t level,
-                       bool descend = true)
-      : source(&source), level(level) {
-    BuildLengthSortedMeta(descend);
-  }
-
-  LoDTensor GetBatch(size_t index);
-
-  std::vector<DySeqMeta> meta;
-
-  LoDTensor const* source;
-  size_t level;
-
- protected:
-  void BuildLengthSortedMeta(bool descend);
-};
-
-LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
-                           const std::vector<DySeqMeta>& meta, const LoD& lod,
-                           size_t level);
-
-std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch& meta, int batch_id) {
-  // collect indice need to copy to the batch
-  std::vector<size_t> indice;
-  for (const auto& seq : meta) {
-    size_t id = seq.begin + batch_id;
-    if (id >= seq.end) break;
-    indice.push_back(id);
-  }
-  return indice;
-}
-
-}  // namespace detail
-
-const LoDTensor& TensorArray::Read(size_t index) const {
-  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
-  if (index >= size()) {
-    values_.resize(index + 1);
-  }
-  return values_[index];
-}
-
-void TensorArray::Write(size_t index, const LoDTensor& value) {
-  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
-
-  if (index >= size()) {
-    values_.resize(index + 1);
-  }
-
-  values_[index].set_lod(value.lod());
-  values_[index].Resize(value.dims());
-  values_[index].mutable_data<value_type>(value.place());
-  values_[index].CopyFrom(value, value.place(), platform::CPUDeviceContext());
-}
-
-void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
-  PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
-  if (index >= size()) {
-    values_.resize(index + 1);
-  }
-
-  values_[index].set_lod(value.lod());
-  values_[index].ShareDataWith(value);
-}
-
-LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
-                            const LoD& lod) const {
-  return detail::PackDynamicBatch(values_, meta, lod, level);
-}
-
-DySeqMetaBatch TensorArray::Unpack(const LoDTensor& source, int level,
-                                   bool length_desend) {
-  detail::DynamicBatchUnpacker unpacker(source, level,
-                                        length_desend /*descend*/);
-
-  // find max length of all the sequences
-  size_t max_length = 0;
-  for (const auto& seq : unpacker.meta) {
-    max_length = std::max(max_length, seq.end - seq.begin);
-  }
-
-  // write batches to values
-  for (size_t batch_id = 0; batch_id < max_length; batch_id++) {
-    Write(batch_id, unpacker.GetBatch(batch_id));
-  }
-
-  PADDLE_ENFORCE(!unpacker.meta.empty());
-  return unpacker.meta;
-}
-
-LoDTensor TensorArray::LodPack(size_t level) const {
-  PADDLE_ENFORCE_GT(size(), 0UL, "no time step exists");
-  // the levels should be no less than 2
-  LoDTensor merged;
-  const LoDTensor *pre, *cur;
-  pre = &Read(0);
-
-  for (size_t step = 1; step < size(); step++) {
-    cur = &Read(step);
-    PADDLE_ENFORCE_GT(cur->NumLevels(), 0);
-    PADDLE_ENFORCE_GT(pre->NumLevels(), 0);
-    PADDLE_ENFORCE_EQ(pre->NumLevels(), cur->NumLevels());
-    PADDLE_ENFORCE_EQ(pre->NumElements(level), cur->NumElements(level));
-
-    merged = LodPackTwo(*pre, *cur, level);
-    pre = &merged;
-  }
-  return merged;
-}
-
-/*
- * NOTE currently, only the lowest level supports packing.
- * The lowest LoD will be changed, while the relative offsets in levels above
- * stay unchanged.
- *
- * previous step : [0] [1] [3]
- * current step: [0 1 2] [2 3] []
- * packed to
- *   [0 0] [0 1] [0 2] [1 2] [1 3] [3]
- */
-LoDTensor TensorArray::LodPackTwo(const LoDTensor& pre, const LoDTensor& cur,
-                                  size_t level) const {
-  PADDLE_ENFORCE_EQ(pre.NumLevels(), cur.NumLevels());
-  PADDLE_ENFORCE_EQ(pre.NumLevels(), level + 1,
-                    "Only the lowest LoD level supports pack temporarily.");
-  // calculate the result tensor's shape first
-  size_t num_instances = 0;
-  for (size_t elem = 0; elem < pre.NumElements(level); elem++) {
-    size_t prefix_size = pre.NumElements(level, elem);
-    size_t num_candidates = cur.NumElements(level, elem);
-    if (num_candidates > 0) {
-      num_instances += num_candidates * (prefix_size + 1);
-    } else {
-      num_instances += prefix_size;
-    }
-  }
-
-  auto res_dims = pre.dims();
-  res_dims[0] = num_instances;
-  LoDTensor result;
-  result.Resize(res_dims);
-  result.mutable_data<value_type>(cur.place());
-
-  Vector<size_t> last_lod_level;
-  // copy data
-  size_t index = 0;
-  last_lod_level.push_back(index);
-  for (size_t elem = 0; elem < pre.NumElements(level); elem++) {
-    size_t prefix_size = pre.NumElements(level, elem);
-    size_t num_candidates = cur.NumElements(level, elem);
-
-    // slice the prefix Tensor
-    LoDTensor prefix = pre;
-    prefix.ShrinkInLevel(level, elem, elem + 1);
-    LoDTensor candidate = cur;
-    if (num_candidates > 0) {
-      candidate.ShrinkInLevel(level, elem, elem + 1);
-    } else {  // just push prefix
-      result.Slice(index, index + prefix_size)
-          .CopyFrom(prefix, result.place(), platform::CPUDeviceContext());
-      index += prefix_size;
-      last_lod_level.push_back(index);
-    }
-    for (size_t candi = 0; candi < num_candidates; candi++) {
-      // TODO(superjom) support GPU
-      result.Slice(index, index + prefix_size)
-          .CopyFrom(prefix, result.place(), platform::CPUDeviceContext());
-      index += prefix_size;
-      // copy candidate record
-      result.Slice(index, index + 1)
-          .CopyFrom(candidate.Slice(candi, candi + 1), result.place(),
-                    platform::CPUDeviceContext());
-      index++;
-      last_lod_level.push_back(index);
-    }
-  }
-
-  // update lod
-  auto lod = cur.lod();
-  lod.back() = last_lod_level;
-  result.set_lod(lod);
-  return result;
-}
-
-/*
- * source [0 1 2] [3 4] [5 6 7] will be transformd to a list of LoDTensors such
- * as
- * [0 3 5] [1 4 6] [2 7] with 1-level LoDs:
- * - [0 1 2 3]
- * - [0 1 2 3]
- * - [0 1 1 2], the [1,1) here means the second sequence is empty
- *
- * NOTE Unpack a LoDTensor in this approach may result in a big LoD.
- */
-void TensorArray::LodUnpack(const LoDTensor& source, size_t level) {
-  PADDLE_ENFORCE_EQ(level, source.NumLevels() - 1,
-                    "only the lowest LoD level supports unpack.");
-  const size_t non_empty_instances = source.dims()[0];
-  size_t index = 0;
-  Vector<size_t> lowest_lod_level;
-  lowest_lod_level.push_back(index);
-
-  for (size_t step = 0; step < non_empty_instances; step++) {
-    size_t num_instances = 0;
-    for (size_t id = 0; id < source.NumElements(level); id++) {
-      auto instance = source;
-      instance.ShrinkInLevel(level, id, id + 1);
-      if (static_cast<size_t>(instance.dims()[0]) > step) {
-        num_instances++;
-        index++;
-      }
-      lowest_lod_level.push_back(index);
-    }
-
-    // create tensor for this time step
-    LoDTensor tensor;
-    auto dims = source.dims();
-    dims[0] = num_instances;
-    // set lod
-    auto lod = source.lod();
-    lod.back() = lowest_lod_level;
-    tensor.set_lod(lod);
-
-    index = 0;
-    for (size_t id = 0; id < source.NumElements(level); id++) {
-      auto instance = source;
-      instance.ShrinkInLevel(level, id, id + 1);
-      if (static_cast<size_t>(instance.dims()[0]) > step) {
-        // copy this instance
-        tensor.Slice(index, index + 1)
-            .CopyFrom(instance.Slice(step, step + 1), tensor.place(),
-                      platform::CPUDeviceContext());
-        index++;
-      }
-    }
-    Write(step, tensor);
-  }
-}
-
-LoDTensor TensorArray::Stack() const {
-  LoDTensor result;
-  if (size() == 0) return result;
-
-  const auto& first_dims = values_.front().dims();
-  // check all the values have the same shape
-  // TODO(superjom) check the same dtypes
-  for (size_t idx = 1; idx < size(); idx++) {
-    const auto& value_dims = values_[idx].dims();
-    PADDLE_ENFORCE_EQ(first_dims, value_dims);
-  }
-
-  // copy
-  auto result_dims = vectorize(first_dims);
-  result_dims.insert(result_dims.begin(), size());
-  result.Resize(make_ddim(result_dims));
-  result.mutable_data<value_type>(platform::CPUPlace());
-
-  for (size_t idx = 0; idx < size(); idx++) {
-    result.Slice(idx, idx + 1)
-        .CopyFrom(Read(idx), platform::CPUPlace(),
-                  platform::CPUDeviceContext());
-  }
-  return result;
-}
-
-void TensorArray::Unstack(const LoDTensor& source) const {
-  Unstack(source, false /*data_shared*/);
-}
-
-void TensorArray::UnstackShared(const LoDTensor& source) const {
-  Unstack(source, true /*data_shared*/);
-}
-
-void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
-  size_t first_dim = source.dims()[0];
-  DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size());
-  PADDLE_ENFORCE_GT(first_dim, 0,
-                    "source should have some data to be unstacked");
-
-  values_.resize(first_dim);
-
-  for (size_t elem = 0; elem < first_dim; elem++) {
-    // create a new value
-    auto& value = values_[elem];
-    if (data_shared) {
-      // share memory
-      value.ShareDataWith(source.Slice(elem, elem + 1));
-    } else {
-      // copy
-      value.Resize(value_dims);
-      value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(),
-                     platform::CPUDeviceContext());
-    }
-  }
-}
-
-size_t TensorArray::size() const { return values_.size(); }
-
-namespace detail {
-
-void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) {
-  PADDLE_ENFORCE(meta.empty(), "duplicate build meta");
-  // collect meta for each sequence in some level
-  auto lod = SliceLevels(source->lod(), level, level + 1)[0];
-
-  for (size_t seq_id = 0; seq_id < lod.size() - 1; seq_id++) {
-    DySeqMeta seq_meta({lod[seq_id], lod[seq_id + 1], seq_id});
-    meta.push_back(seq_meta);
-  }
-
-  PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty");
-
-  // sort by length
-  sort(meta.begin(), meta.end(),
-       [descend](const DySeqMeta& a, const DySeqMeta& b) {
-         bool a_ge_b = (a.end - a.begin) > (b.end - b.begin);
-         return descend ? a_ge_b : !a_ge_b;
-       });
-}
-
-LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
-  PADDLE_ENFORCE(!meta.empty(), "should build meta first");
-  LoDTensor result;
-
-  auto indice = detail::GenDyBatchIndice(meta, index);
-  PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index);
-
-  // copy the indice of records in LoDTensor
-  auto record_dims = slice_ddim(source->dims(), 1, source->dims().size());
-  auto record_dims_vec = vectorize(record_dims);
-  record_dims_vec.insert(record_dims_vec.begin(), indice.size());
-  result.Resize(make_ddim(record_dims_vec));
-  result.mutable_data<value_type>(platform::CPUPlace());
-
-  for (size_t i = 0; i < indice.size(); i++) {
-    auto index = indice[i];
-    auto target = result.Slice(i, i + 1);
-    auto slice = source->Slice(index, index + 1);
-
-    target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext());
-  }
-
-  return result;
-}
-
-// TODO(supejom) to cache lod if reasonable
-LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
-                           const std::vector<DySeqMeta>& meta, const LoD& lod,
-                           size_t level) {
-  PADDLE_ENFORCE(!source.empty());
-  PADDLE_ENFORCE(!meta.empty());
-  PADDLE_ENFORCE(!lod.empty());
-
-  LoDTensor result;
-
-  // init result space
-  auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size());
-  auto record_dims_vec = vectorize(record_dims);
-  auto height = lod[level].back();
-  record_dims_vec.insert(record_dims_vec.begin(), height);
-  result.Resize(make_ddim(record_dims_vec));
-  result.mutable_data<float>(platform::CPUPlace());
-
-  for (size_t batch_id = 0; batch_id < source.size(); batch_id++) {
-    for (size_t seq_id = 0; seq_id < meta.size(); seq_id++) {
-      const auto& seq_meta = meta[seq_id];
-      // source is source[batch_id][seq_id]
-      // target is result[index]
-      auto index = seq_meta.begin + batch_id;
-      if (index >= seq_meta.end) break;
-      auto source_ = source[batch_id].Slice(seq_id, seq_id + 1);
-      auto target = result.Slice(index, index + 1);
-      target.CopyFrom(source_, platform::CPUPlace(),
-                      platform::CPUDeviceContext());
-    }
-  }
-
-  result.set_lod(lod);
-  return result;
-}
-
-}  // namespace detail
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h
deleted file mode 100644
index 78fad8cab7..0000000000
--- a/paddle/framework/tensor_array.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include <vector>
-
-#include "paddle/framework/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-
-/*
- * DyBatchSeqPosition stores indices of the basic element in tensor. It is used
- * after lod-tensor's re-assembling, its info can be used to recover the order
- * in original lod-tensor.
- */
-struct DySeqMeta {
-  DySeqMeta(size_t begin, size_t end, size_t ori_idx)
-      : begin(begin), end(end), ori_idx(ori_idx) {}
-
-  size_t begin;
-  size_t end;  // not included
-  size_t ori_idx;
-};
-
-using DySeqMetaBatch = std::vector<DySeqMeta>;
-
-/*
- * Extract the indices of instances.
- */
-std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch &metas, int batch_id);
-
-/*
- * TensorArray is a C-array-like array of tensors, it is meant to be used with
- * dynamic iteration primitives such as while_loop. It is used to segment inputs
- * and store states in all time steps.
- *
- * By providing some methods similar to a C++ array, the difinition of some
- * state-based dynamic models such as RNN cound be more natural and highly
- * flexible.
- */
-class TensorArray {
- public:
-  using value_type = float;
-
-  // max number of values allowed to store.
-  const size_t MAX_SIZE{100000};
-
-  /*
-   * Read the value at location `index` in the `TensorArray`.
-   */
-  const LoDTensor &Read(size_t index) const;
-
-  /*
-   * Write value into the index of the TensorArray.
-   */
-  void Write(size_t index, const LoDTensor &value);
-
-  /*
-   * Write value into the index of the TensorArray, with memory shared.
-   */
-  void WriteShared(size_t index, const LoDTensor &value);
-
-  /*
-   * Recover the original LoD-arranged LoDTensor with the `values`, `level` and
-   * `indice_map`.
-   */
-  LoDTensor Pack(size_t level, const DySeqMetaBatch &meta,
-                 const LoD &lod) const;
-
-  /*
-   * Split LoDTensor in some `level` and write the generated batches to
-   * `values`, if set `desend`, will sort by length in descending order else in
-   * ascending order.
-   */
-  DySeqMetaBatch Unpack(const LoDTensor &source, int level, bool length_desend);
-
-  /*
-   * Pack an array of LoDTensors to a LoDTensor.
-   */
-  LoDTensor LodPack(size_t level) const;
-
-  /*
-   * Unpack a LoDTensor to an array of LoDTensors.
-   */
-  void LodUnpack(const LoDTensor &source, size_t level);
-
-  /*
-   * Pack the values into a tensor with rank one higher than each tensor in
-   * values.
-   */
-  LoDTensor Stack() const;
-
-  /*
-   * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors.
-   */
-  void Unstack(const LoDTensor &source) const;
-
-  /*
-   * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors,
-   * with memory of tensors shared.
-   */
-  void UnstackShared(const LoDTensor &source) const;
-
-  /*
-   * Return the number of values.
-   */
-  size_t size() const;
-
- protected:
-  void Unstack(const LoDTensor &source, bool data_shared) const;
-
-  LoDTensor LodPackTwo(const LoDTensor &pre, const LoDTensor &cur,
-                       size_t level) const;
-
- private:
-  mutable std::vector<LoDTensor> values_;
-};  // class TensorArray
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc
deleted file mode 100644
index 83b52b442d..0000000000
--- a/paddle/framework/tensor_array_test.cc
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/framework/tensor_array.h"
-
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace framework {
-
-class TensorArrayTester : public ::testing::Test {
- protected:
-  void SetUp() override {
-    LoDTensor source;
-    source.Resize(make_ddim({batch_size, dim}));
-    int* data = source.mutable_data<int>(platform::CPUPlace());
-    for (int i = 0; i < 16 * 32; i++) {
-      data[i] = i;
-    }
-    ta.Unstack(source);
-  }
-
-  TensorArray ta;
-  const int batch_size = 16;
-  const int dim = 32;
-};
-
-TEST_F(TensorArrayTester, Read) {
-  for (int i = 0; i < batch_size; i++) {
-    const auto& tensor = ta.Read(i);
-    ASSERT_EQ(tensor.dims()[0], 1);
-    ASSERT_EQ(tensor.dims()[1], dim);
-  }
-}
-
-TEST_F(TensorArrayTester, Write) {
-  LoDTensor source;
-  source.Resize(make_ddim({1, dim}));
-  for (int i = 0; i < dim; i++) {
-    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
-  }
-
-  ta.Write(2, source);
-
-  const auto& tensor = ta.Read(2);
-  for (int i = 0; i < dim; i++) {
-    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
-  }
-}
-
-TEST_F(TensorArrayTester, WriteShared) {
-  LoDTensor source;
-  source.Resize(make_ddim({1, dim}));
-  for (int i = 0; i < dim; i++) {
-    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
-  }
-
-  ta.WriteShared(2, source);
-
-  const auto& tensor = ta.Read(2);
-  for (int i = 0; i < dim; i++) {
-    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
-  }
-
-  EXPECT_EQ(source.data<int>(), tensor.data<int>());
-}
-
-class TensorArrayPackTester : public ::testing::Test {
- protected:
-  virtual void SetUp() override {
-    lod.push_back(std::vector<size_t>{0, 2, 9, 13});
-
-    source.set_lod(lod);
-    source.Resize(make_ddim({13, 128}));
-    source.mutable_data<int>(platform::CPUPlace());
-
-    // content of each setence: 0 1 2 3 4
-    const auto& level = lod.front();
-    for (size_t i = 0; i < level.size() - 1; i++) {
-      size_t begin = level[i];
-      size_t end = level[i + 1];
-      for (size_t j = begin; j < end; j++) {
-        auto record = source.Slice(j, j + 1);
-        for (int dim = 0; dim < 128; dim++) {
-          record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
-        }
-      }
-    }
-
-    // unpack
-    meta = ta.Unpack(source, 0, true);
-  }
-
-  LoD lod;
-  TensorArray ta;
-  LoDTensor source;
-  std::vector<DySeqMeta> meta;
-};
-
-TEST_F(TensorArrayPackTester, Unpack) {
-  ASSERT_EQ(ta.size(), 7UL);
-
-  const auto& t0 = ta.Read(0);
-  const auto& t1 = ta.Read(1);
-
-  ASSERT_EQ(t0.data<int>()[0], int(0));
-  ASSERT_EQ(t1.data<int>()[0], int(1));
-}
-
-TEST_F(TensorArrayPackTester, Pack) {
-  LoDTensor packed = ta.Pack(0, meta, lod);
-}
-
-TEST_F(TensorArrayTester, size) {
-  ASSERT_EQ(ta.size(), static_cast<size_t>(batch_size));
-}
-
-TEST(TensorArray, LodPack) {
-  // three time steps, each step stores a LoDTensors
-  // - [0] [1]
-  // - [2 3], [4 5]
-  // - [6 7] [] [8], [9, 10]
-  // try to get a LoDTensor with content:
-  // - [0 2 6]
-  // - [0 2 7]
-  // - [0 3]
-  // - [1 4 8]
-  // - [1 5 9]
-  // - [1 5 10]
-  std::array<LoDTensor, 3> tensors;
-  tensors[0].Resize(make_ddim({2, 1}));
-  tensors[1].Resize(make_ddim({4, 1}));
-  tensors[2].Resize(make_ddim({5, 1}));
-  int index = 0;
-  for (auto& t : tensors) {
-    t.mutable_data<int>(platform::CPUPlace());
-    for (int i = 0; i < t.dims()[0]; i++) {
-      t.data<int>()[i] = index;
-      index++;
-    }
-  }
-
-  std::array<LoD, 3> lods;
-  std::vector<std::vector<size_t>> levels{
-      {0, 1, 2}, {0, 2, 4}, {0, 2, 2, 3, 5}};
-  for (int i = 0; i < 3; i++) {
-    lods[i].emplace_back(levels[i].begin(), levels[i].end());
-  }
-
-  TensorArray ta;
-  for (int i = 0; i < 3; i++) {
-    tensors[i].set_lod(lods[i]);
-    ta.Write(i, tensors[i]);
-  }
-
-  auto merged = ta.LodPack(0);
-
-  std::vector<int> target_tensor_data{{0, 2, 6,  // 0
-                                       0, 2, 7,  // 1
-                                       0, 3,     // 2
-                                       1, 4, 8,  // 3
-                                       1, 5, 9,  // 5
-                                       1, 5, 10}};
-  EXPECT_EQ(merged.dims()[0], (int)target_tensor_data.size());
-  for (size_t i = 0; i < target_tensor_data.size(); i++) {
-    EXPECT_EQ(target_tensor_data[i], merged.data<int>()[i]);
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index d78a2c4c21..aba1f9f093 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };
 
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor;
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool> functor;
   size_t size = functor(type);
   PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
   return size;
@@ -150,84 +150,6 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
   return *this;
 }
 
-inline void Tensor::CopyFrom(const Tensor& src,
-                             const platform::Place& dst_place,
-                             const platform::DeviceContext& ctx) {
-  src.check_memory_size();
-  Resize(src.dims());
-
-  auto src_place = src.holder_->place();
-  auto src_ptr = src.data<void>();
-
-  auto dst_ptr = mutable_data(dst_place, src.type());
-
-  auto size = src.numel() * SizeOfType(src.type());
-
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(src_place) &&
-           platform::is_cpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  }
-#endif
-}
-
-template <typename T>
-inline void Tensor::CopyFromVector(const std::vector<T>& src,
-                                   const platform::DeviceContext& ctx) {
-  auto dst_place = ctx.GetPlace();
-  auto src_ptr = static_cast<const void*>(src.data());
-  platform::CPUPlace src_place;
-  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
-  auto size = src.size() * sizeof(T);
-
-  if (platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
-                 src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(dst_place)) {
-    memory::Copy(
-        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
-        size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
-  }
-#endif
-}
-
 inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
   check_memory_size();
   PADDLE_ENFORCE_GE(begin_idx, 0,
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 1bb0fb71b0..ceca64365a 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -188,178 +188,6 @@ TEST(Tensor, Slice) {
 #endif
 }
 
-TEST(Tensor, CopyFrom) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  {
-    Tensor src_tensor;
-    Tensor dst_tensor;
-    CPUDeviceContext cpu_ctx((CPUPlace()));
-
-    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
-
-    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    memcpy(src_ptr, arr, 9 * sizeof(int));
-
-    auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx);
-
-    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    Tensor slice_tensor = src_tensor.Slice(1, 2);
-    dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx);
-    const int* slice_ptr = slice_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(dst_ptr, slice_ptr);
-    for (size_t i = 0; i < 3; ++i) {
-      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
-    }
-  }
-#ifdef PADDLE_WITH_CUDA
-  {
-    Tensor src_tensor;
-    Tensor gpu_tensor;
-    Tensor dst_tensor;
-
-    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
-
-    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    memcpy(src_ptr, arr, 9 * sizeof(int));
-
-    // CPU Tensor to GPU Tensor
-    auto gpu_place = new paddle::platform::GPUPlace(0);
-    CUDADeviceContext gpu_ctx(*gpu_place);
-    gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx);
-
-    // GPU Tensor to CPU Tensor
-    auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    Tensor slice_tensor = src_tensor.Slice(1, 2);
-
-    // CPU Slice Tensor to GPU Tensor
-    gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx);
-
-    // GPU Tensor to CPU Tensor
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-
-    // Sync before Compare Slice Tensors
-    gpu_ctx.Wait();
-    const int* slice_ptr = slice_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(dst_ptr, slice_ptr);
-    for (size_t i = 0; i < 3; ++i) {
-      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
-    }
-  }
-#endif
-}
-
-TEST(Tensor, CopyFromVector) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  {
-    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
-
-    // Copy to CPU Tensor
-    cpu_tensor.Resize(make_ddim({3, 3}));
-    auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-
-    // Compare Tensors
-    const int* cpu_ptr = cpu_tensor.data<int>();
-    const int* src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-    }
-
-    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-    cpu_ptr = cpu_tensor.data<int>();
-    src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    for (size_t i = 0; i < 5; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-    }
-
-    delete cpu_place;
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  {
-    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
-    Tensor gpu_tensor;
-    Tensor dst_tensor;
-
-    // Copy to CPU Tensor
-    cpu_tensor.Resize(make_ddim({3, 3}));
-    auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-
-    // Copy to GPUTensor
-    gpu_tensor.Resize(make_ddim({3, 3}));
-    auto gpu_place = new paddle::platform::GPUPlace();
-    CUDADeviceContext gpu_ctx(*gpu_place);
-    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
-    // Copy from GPU to CPU tensor for comparison
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    const int* src_ptr = src_vec.data();
-    const int* cpu_ptr = cpu_tensor.data<int>();
-    const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 9; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
-    gpu_tensor.Resize(make_ddim({2, 2}));
-    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
-    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
-
-    // Sync before Compare Tensors
-    gpu_ctx.Wait();
-    src_ptr = src_vec.data();
-    cpu_ptr = cpu_tensor.data<int>();
-    dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
-    for (size_t i = 0; i < 5; ++i) {
-      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
-      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-    }
-
-    delete cpu_place;
-    delete gpu_place;
-  }
-#endif
-}
-
 TEST(Tensor, ReshapeToMatrix) {
   using namespace paddle::framework;
   using namespace paddle::platform;
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
new file mode 100644
index 0000000000..4e34b90d57
--- /dev/null
+++ b/paddle/framework/tensor_util.h
@@ -0,0 +1,152 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+
+/**
+ * @brief   Copy the content of external tensor to a new place.
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] dst_place  The dst place.
+ * @param[in] ctx        The device context contains device resources.
+ *
+ * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
+ */
+
+inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
+                     const platform::DeviceContext& ctx, Tensor* dst) {
+  src.check_memory_size();
+
+  dst->Resize(src.dims());
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+
+  auto size = src.numel() * SizeOfType(src.type());
+
+  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_gpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+/**
+ * @brief   Copy the content of an external vector to a tensor.
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] ctx        The device context contains device resources.
+ *
+ * * @note    CopyFromVector assumes that the tensor has been resized
+ *            before invoking.
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src,
+                           const platform::DeviceContext& ctx, Tensor* dst) {
+  auto dst_place = ctx.GetPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
+                 src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(dst_place)) {  // NOLINT
+    memory::Copy(
+        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
+        size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+/**
+ * @brief   Copy the content of a tensor to a vector
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] ctx        The device context contains device resources.
+ *
+ * * @note    CopyFromVector assumes that the tensor has been resized
+ *            before invoking.
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
+                         std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto size = src.numel() * sizeof(T);
+
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+
+  if (platform::is_cpu_place(src.place())) {
+    memory::Copy(dst_place, dst_ptr,
+                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src.place())) {  // NOLINT
+    memory::Copy(
+        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
+        src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc
new file mode 100644
index 0000000000..03a70de182
--- /dev/null
+++ b/paddle/framework/tensor_util_test.cc
@@ -0,0 +1,228 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/tensor_util.h"
+#include <gtest/gtest.h>
+#include <string>
+
+namespace paddle {
+namespace framework {
+TEST(CopyFrom, Tensor) {
+  Tensor src_tensor;
+  Tensor dst_tensor;
+  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
+
+  int* src_ptr =
+      src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+
+  auto cpu_place = new platform::CPUPlace();
+  CopyFrom(src_tensor, *cpu_place, cpu_ctx, &dst_tensor);
+
+  const int* dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
+  Tensor slice_tensor = src_tensor.Slice(1, 2);
+  CopyFrom(slice_tensor, *cpu_place, cpu_ctx, &dst_tensor);
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    Tensor src_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    int* src_ptr =
+        src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+
+    // CPU Tensor to GPU Tensor
+    auto gpu_place = new platform::GPUPlace(0);
+    platform::CUDADeviceContext gpu_ctx(*gpu_place);
+    CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    // GPU Tensor to CPU Tensor
+    auto cpu_place = new platform::CPUPlace();
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    Tensor slice_tensor = src_tensor.Slice(1, 2);
+
+    // CPU Slice Tensor to GPU Tensor
+    CopyFrom(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    // GPU Tensor to CPU Tensor
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Slice Tensors
+    gpu_ctx.Wait();
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
+  }
+#endif
+}
+
+TEST(CopyFromVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+
+    // Compare Tensors
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    cpu_ptr = cpu_tensor.data<int>();
+    src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 5; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    delete cpu_place;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+
+    // Copy to GPUTensor
+    gpu_tensor.Resize(make_ddim({3, 3}));
+    auto gpu_place = new paddle::platform::GPUPlace();
+    CUDADeviceContext gpu_ctx(*gpu_place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    // Copy from GPU to CPU tensor for comparison
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* src_ptr = src_vec.data();
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    gpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    src_ptr = src_vec.data();
+    cpu_ptr = cpu_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 5; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    delete cpu_place;
+    delete gpu_place;
+  }
+#endif
+}
+
+TEST(CopyToVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src;
+    int* src_ptr = src.mutable_data<int>({3, 3}, CPUPlace());
+    for (int i = 0; i < 3 * 3; ++i) {
+      src_ptr[i] = i;
+    }
+
+    CPUPlace place;
+    CPUDeviceContext cpu_ctx(place);
+    std::vector<int> dst;
+    CopyToVector<int>(src, cpu_ctx, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_ptr[i], dst[i]);
+    }
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor gpu_tensor;
+    GPUPlace place;
+    CUDADeviceContext gpu_ctx(place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+
+    std::vector<int> dst;
+    CopyToVector<int>(gpu_tensor, gpu_ctx, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_vec[i], dst[i]);
+    }
+  }
+#endif
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 8e92c81d11..0babec29f6 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -37,13 +37,29 @@ std::vector<int64_t> VarDescBind::Shape() const {
 DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
 
 void VarDescBind::SetLoDLevel(int32_t lod_level) {
-  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
-  desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+  switch (desc_.type()) {
+    case VarDesc::LOD_TENSOR:
+      desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+      break;
+    case VarDesc::LOD_TENSOR_ARRAY:
+      desc_.mutable_tensor_array()->set_lod_level(lod_level);
+      break;
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
+  }
 }
 
 int32_t VarDescBind::GetLodLevel() const {
-  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
-  return desc_.lod_tensor().lod_level();
+  switch (desc_.type()) {
+    case VarDesc::LOD_TENSOR:
+      return desc_.lod_tensor().lod_level();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().lod_level();
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
+  }
 }
 
 const TensorDesc &VarDescBind::tensor_desc() const {
@@ -53,6 +69,8 @@ const TensorDesc &VarDescBind::tensor_desc() const {
       return desc_.selected_rows();
     case VarDesc::LOD_TENSOR:
       return desc_.lod_tensor().tensor();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().tensor();
     default:
       PADDLE_THROW("Unexpected branch.");
   }
@@ -66,6 +84,8 @@ TensorDesc *VarDescBind::mutable_tensor_desc() {
       return desc_.mutable_selected_rows();
     case VarDesc::LOD_TENSOR:
       return desc_.mutable_lod_tensor()->mutable_tensor();
+    case VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.mutable_tensor_array()->mutable_tensor();
     default:
       PADDLE_THROW("Unexpected branch.");
   }
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
new file mode 100644
index 0000000000..0f19870bec
--- /dev/null
+++ b/paddle/framework/var_type.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
+
+namespace paddle {
+namespace framework {
+inline VarDesc::VarType ToVarType(std::type_index type) {
+  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
+    return VarDesc_VarType_LOD_TENSOR;
+  } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
+    return VarDesc_VarType_LOD_RANK_TABLE;
+  } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
+    return VarDesc_VarType_LOD_TENSOR_ARRAY;
+  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
+    return VarDesc_VarType_SELECTED_ROWS;
+  } else {
+    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
+  }
+}
+
+template <typename Visitor>
+inline void VisitVarType(const Variable& var, Visitor visitor) {
+  switch (ToVarType(var.Type())) {
+    case VarDesc_VarType_LOD_TENSOR:
+      visitor(var.Get<framework::LoDTensor>());
+      return;
+    case VarDesc_VarType_LOD_RANK_TABLE:
+      visitor(var.Get<LoDRankTable>());
+      return;
+    case VarDesc_VarType_LOD_TENSOR_ARRAY:
+      visitor(var.Get<LoDTensorArray>());
+      return;
+    case VarDesc_VarType_SELECTED_ROWS:
+      visitor(var.Get<SelectedRows>());
+      return;
+    default:
+      PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
index cde5ec2413..e5a94759f9 100644
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -48,6 +48,11 @@ class Variable {
 
   void Clear() { holder_.reset(); }
 
+  std::type_index Type() const {
+    PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
+    return holder_->Type();
+  }
+
  private:
   struct Placeholder {
     virtual ~Placeholder() {}
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 4fd72d64a9..9b2779b42c 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -45,6 +45,7 @@ if(WITH_GPU)
     add_simple_unittest(BlockExpandOpTest)
     add_simple_unittest(CropOpTest)
     add_simple_unittest(SwitchOpTest)
+    add_simple_unittest(ScaleSubRegionOpTest)
 endif()
 
 add_simple_unittest(Im2ColTest)
diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h
index baf78bc6c8..062ea25a11 100644
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -61,6 +61,7 @@ public:
     // function arguments
     strides_ = config.get<std::vector<size_t>>("strides");
     paddings_ = config.get<std::vector<size_t>>("paddings");
+    dilations_ = config.get<std::vector<size_t>>("dilations");
     groups_ = config.get<size_t>("groups");
 
     // number of inputs and outputs
@@ -118,6 +119,7 @@ protected:
 
   std::vector<size_t> strides_;
   std::vector<size_t> paddings_;
+  std::vector<size_t> dilations_;
 
   /// Group size, refer to grouped convolution in
   /// Alex Krizhevsky's paper: when group=2, the first half of the
@@ -133,6 +135,10 @@ protected:
 
   inline int paddingW() const { return paddings_[1]; }
 
+  inline int dilationH() const { return dilations_[0]; }
+
+  inline int dilationW() const { return dilations_[1]; }
+
   // A temporary memory in convolution calculation.
   MemoryHandlePtr memory_;
 
diff --git a/paddle/function/ConvOpTest.h b/paddle/function/ConvOpTest.h
index cb02a96d0d..d8d3c792df 100644
--- a/paddle/function/ConvOpTest.h
+++ b/paddle/function/ConvOpTest.h
@@ -79,45 +79,59 @@ void Convolution(const std::string& conv1,
             if (outputChannels < inputChannels) continue;
             for (size_t stride : {1, 2}) {
               for (size_t padding : {0, 1}) {
-                if (padding >= filterSize) break;
+                for (size_t dilation : {1, 3}) {
+                  if (padding >= filterSize) break;
+                  size_t filterS = (filterSize - 1) * dilation + 1;
 
-                // NNPACK only supports stride = 1 if batchSize > 1
-                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
-                    batchSize > 1 && stride > 1)
-                  break;
+                  if (inputSize + 2 * padding < filterS) break;
 
-                size_t outputSize =
-                    (inputSize - filterSize + 2 * padding + stride) / stride;
-                VLOG(3) << " batchSize=" << batchSize
-                        << " inputChannels=" << inputChannels
-                        << " inputHeight=" << inputSize
-                        << " inputWidth=" << inputSize
-                        << " outputChannels=" << outputChannels
-                        << " filterHeight=" << filterSize
-                        << " filterWidth=" << filterSize
-                        << " outputHeight=" << outputSize
-                        << " outputWidth=" << outputSize << " stride=" << stride
-                        << " padding=" << padding;
+                  if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
+                       conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
+                      dilation > 1)
+                    break;
 
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> strides = {stride, stride};
-                Compare2Function<DType1, DType2> test(
-                    conv1,
-                    conv2,
-                    FuncConfig()
-                        .set("paddings", paddings)
-                        .set("strides", strides)
-                        .set("groups", (size_t)1)
-                        .set("algo", (std::string) "auto"));
+                  // NNPACK only supports stride = 1 if batchSize > 1
+                  if ((conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
+                      batchSize > 1 && stride > 1)
+                    break;
 
-                TensorShape input{
-                    batchSize, inputChannels, inputSize, inputSize};
-                TensorShape filter{
-                    outputChannels, inputChannels, filterSize, filterSize};
-                TensorShape output{
-                    batchSize, outputChannels, outputSize, outputSize};
+                  size_t outputSize =
+                      (inputSize - filterS + 2 * padding + stride) / stride;
+                  VLOG(3) << " batchSize=" << batchSize
+                          << " inputChannels=" << inputChannels
+                          << " inputHeight=" << inputSize
+                          << " inputWidth=" << inputSize
+                          << " outputChannels=" << outputChannels
+                          << " filterHeight=" << filterSize
+                          << " filterWidth=" << filterSize
+                          << " outputHeight=" << outputSize
+                          << " outputWidth=" << outputSize
+                          << " stride=" << stride << " padding=" << padding;
 
-                function(test, input, filter, output);
+                  std::vector<size_t> paddings = {padding, padding};
+                  std::vector<size_t> strides = {stride, stride};
+                  std::vector<size_t> dilations = {dilation, dilation};
+                  Compare2Function<DType1, DType2> test(
+                      conv1,
+                      conv2,
+                      FuncConfig()
+                          .set("paddings", paddings)
+                          .set("strides", strides)
+                          .set("dilations", dilations)
+                          .set("groups", (size_t)1)
+                          .set("algo", (std::string) "auto"));
+
+                  TensorShape input{
+                      batchSize, inputChannels, inputSize, inputSize};
+                  TensorShape filter{
+                      outputChannels, inputChannels, filterSize, filterSize};
+                  TensorShape output{
+                      batchSize, outputChannels, outputSize, outputSize};
+
+                  function(test, input, filter, output);
+                }
               }
             }
           }
@@ -144,6 +158,7 @@ void Convolution2(const std::string& conv1,
               for (size_t outputChannels : {7}) {
                 size_t stride = 1;
                 size_t padding = 0;
+                size_t dilation = 1;
                 size_t outputHeight =
                     (inputHeight - filterHeight + 2 * padding + stride) /
                     stride;
@@ -162,6 +177,7 @@ void Convolution2(const std::string& conv1,
 
                 std::vector<size_t> paddings = {padding, padding};
                 std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {dilation, dilation};
                 Compare2Function<DType1, DType2> test(
                     conv1,
                     conv2,
@@ -169,6 +185,7 @@ void Convolution2(const std::string& conv1,
                         .set("paddings", paddings)
                         .set("strides", strides)
                         .set("groups", (size_t)1)
+                        .set("dilations", dilations)
                         .set("algo", (std::string) "auto"));
 
                 TensorShape input{
@@ -223,6 +240,7 @@ void DepthwiseConvolution(const std::string& conv1,
 
                 std::vector<size_t> paddings = {padding, padding};
                 std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {1, 1};
                 size_t groups = inputChannels;
                 Compare2Function<DType1, DType2> test(
                     conv1,
@@ -231,6 +249,7 @@ void DepthwiseConvolution(const std::string& conv1,
                         .set("paddings", paddings)
                         .set("strides", strides)
                         .set("groups", groups)
+                        .set("dilations", dilations)
                         .set("algo", (std::string) "auto"));
 
                 TensorShape input{
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
index b3e666e860..644098a9e7 100644
--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
@@ -21,7 +21,7 @@ template <class T>
 struct EigenBlasGemm {
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
                            Eigen::Aligned>
-      Matrix;
+      EigenMatrix;
 
   static void compute(const bool transA,
                       const bool transB,
@@ -56,14 +56,13 @@ struct EigenBlasGemm {
       sizeB[1] = N;
       CHECK_EQ(N, ldb);
     }
-    Eigen::array<int, 2> sizeC;
-    sizeC[0] = M;
-    sizeC[1] = N;
-    CHECK_EQ(N, ldc);
+    Eigen::array<int, 2> sizeC = {{M, ldc}};
+    Eigen::array<int, 2> offsetC = {{0, 0}};
+    Eigen::array<int, 2> extentC = {{M, N}};
 
-    const Matrix a(const_cast<T*>(A), sizeA);
-    const Matrix b(const_cast<T*>(B), sizeB);
-    Matrix c(C, sizeC);
+    const EigenMatrix a(const_cast<T*>(A), sizeA);
+    const EigenMatrix b(const_cast<T*>(B), sizeB);
+    EigenMatrix c(C, sizeC);
 
     typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
     Eigen::array<DimPair, 1> dims;
@@ -72,12 +71,23 @@ struct EigenBlasGemm {
     dims[0].second = transB ? 1 : 0;
 
     Eigen::DefaultDevice device;
-    if (alpha == T(1) && beta == T(0)) {
-      c.device(device) = a.contract(b, dims);
-    } else if (alpha == T(1) && beta == T(1)) {
-      c.device(device) += a.contract(b, dims);
+    if (N == ldc) {
+      if (alpha == T(1) && beta == T(0)) {
+        c.device(device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.device(device) += a.contract(b, dims);
+      } else {
+        c.device(device) = alpha * a.contract(b, dims) + beta * c;
+      }
     } else {
-      c.device(device) = alpha * a.contract(b, dims) + beta * c;
+      if (alpha == T(1) && beta == T(0)) {
+        c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
+      } else {
+        c.slice(offsetC, extentC).device(device) =
+            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
+      }
     }
   }
 };
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index ba446bf92d..370940532e 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -110,6 +110,7 @@ public:
         function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
     function1_->init(config);
     function2_->init(config);
+    initArgsCallback_ = nullptr;
   }
 
   ~Compare2Function() {}
@@ -170,6 +171,10 @@ public:
                                       *seq2_));
   }
 
+  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
+    initArgsCallback_ = callback;
+  }
+
   // output need only contains shape, do not contains data.
   void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
     size_t size =
@@ -340,6 +345,10 @@ protected:
         initArg(*func1Inputs_[i]);
       }
 
+      if (initArgsCallback_ != nullptr) {
+        initArgsCallback_(*func1Inputs_[i], i);
+      }
+
       copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
     }
   }
@@ -386,6 +395,7 @@ protected:
   std::shared_ptr<SequenceIdArg> seq1_;
   std::shared_ptr<SequenceIdArg> seq2_;
   test::CopyArgument<DType1, DType2> copyArg_;
+  std::function<void(BufferArg&, size_t)> initArgsCallback_;
 };
 
 class CpuGpuFuncCompare
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index bdb56ddac3..8d34eee886 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -100,7 +100,9 @@ public:
                  strideH(),
                  strideW(),
                  paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
         } else {
           colData = inputData + g * inputOffset;
         }
@@ -223,7 +225,9 @@ public:
                  strideH(),
                  strideW(),
                  paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
         }
       }
       inputGrad += inputChannels * inputHeight * inputWidth;
@@ -310,7 +314,9 @@ public:
                  strideH(),
                  strideW(),
                  paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
         } else {
           colData = inputData + g * inputOffset;
         }
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 1e0cff436f..0c37fc9724 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -78,7 +78,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth);
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
 };
 
 template <ColFormat Format, DeviceType Device, class T>
@@ -91,7 +93,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth);
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
 };
 
 }  // namespace paddle
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
index b7d1eb1ede..f864d42f80 100644
--- a/paddle/function/Im2ColOp.cpp
+++ b/paddle/function/Im2ColOp.cpp
@@ -31,7 +31,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -47,8 +49,8 @@ public:
       int c_im = c / filterWidth / filterHeight;
       for (int h = 0; h < outputHeight; ++h) {
         for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
           if ((imRowIdx - paddingHeight) < 0 ||
               (imRowIdx - paddingHeight) >= inputHeight ||
               (imColIdx - paddingWidth) < 0 ||
@@ -81,7 +83,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -97,8 +101,8 @@ public:
       int c_im = c / filterWidth / filterHeight;
       for (int h = 0; h < outputHeight; ++h) {
         for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
           if ((imRowIdx - paddingHeight) >= 0 &&
               (imRowIdx - paddingHeight) < inputHeight &&
               (imColIdx - paddingWidth) >= 0 &&
@@ -134,7 +138,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -147,9 +153,10 @@ public:
         for (int channel = 0; channel < inputChannels; ++channel) {
           for (int filterH = 0; filterH < filterHeight; ++filterH) {
             for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
               int colDataOffset =
                   (((outputH * outputWidth + outputW) * inputChannels +
                     channel) *
@@ -189,7 +196,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -202,9 +211,10 @@ public:
         for (int channel = 0; channel < inputChannels; ++channel) {
           for (int filterH = 0; filterH < filterHeight; ++filterH) {
             for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
               int colDataOffset =
                   (((outputH * outputWidth + outputW) * inputChannels +
                     channel) *
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
index bd98610498..71da11b955 100644
--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -28,6 +28,8 @@ __global__ void im2col(const T* data_im,
                        int strideW,
                        int paddingH,
                        int paddingW,
+                       int dilationH,
+                       int dilationW,
                        int height_col,
                        int width_col,
                        T* data_col) {
@@ -44,8 +46,8 @@ __global__ void im2col(const T* data_im,
     data_col += (channel_out * height_col + h_out) * width_col + w_out;
     for (int i = 0; i < blockH; ++i) {
       for (int j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in + i);
-        int cIdx = int(w_in + j);
+        int rIdx = int(h_in + i * dilationH);
+        int cIdx = int(w_in + j * dilationW);
         if ((rIdx - (int)paddingH) >= (int)height ||
             (rIdx - (int)paddingH) < 0 ||
             (cIdx - (int)paddingW) >= (int)width ||
@@ -77,7 +79,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -102,6 +106,8 @@ public:
                                                     strideWidth,
                                                     paddingHeight,
                                                     paddingWidth,
+                                                    dilationHeight,
+                                                    dilationWidth,
                                                     outputHeight,
                                                     outputWidth,
                                                     colData);
@@ -121,6 +127,8 @@ __global__ void col2im(size_t n,
                        size_t strideW,
                        size_t paddingH,
                        size_t paddingW,
+                       size_t dilationH,
+                       size_t dilationW,
                        size_t height_col,
                        size_t width_col,
                        T* data_im) {
@@ -131,23 +139,34 @@ __global__ void col2im(size_t n,
     int w = int(index % width);
     int h = int((index / width) % height);
     int c = int(index / (width * height));
+    int filterH = (blockH - 1) * dilationH + 1;
+    int filterW = (blockW - 1) * dilationW + 1;
+
     if ((w - (int)paddingW) >= 0 &&
         (w - (int)paddingW) < (width - 2 * paddingW) &&
         (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
       // compute the start and end of the output
       int w_col_start =
-          (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
+          (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
       int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
       int h_col_start =
-          (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+          (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
       int h_col_end = min(int(h / strideH + 1), int(height_col));
+
       for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
         for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
           // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH * blockW) +
-                      (h - h_col * (int)strideH) * (int)blockW +
-                      (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+          int h_k = (h - h_col * strideH);
+          int w_k = (w - w_col * strideW);
+          if (h_k % dilationH == 0 && w_k % dilationW == 0) {
+            h_k /= dilationH;
+            w_k /= dilationW;
+            int c_col =
+                (((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
+                    width_col +
+                w_col;
+            val += data_col[c_col];
+          }
         }
       }
       h -= paddingH;
@@ -173,7 +192,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -205,6 +226,8 @@ public:
         strideWidth,
         paddingHeight,
         paddingWidth,
+        dilationHeight,
+        dilationWidth,
         outputHeight,
         outputWidth,
         imData);
@@ -229,6 +252,8 @@ __global__ void im2colOCF(const T* imData,
                           int strideWidth,
                           int paddingHeight,
                           int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
                           int outputHeight,
                           int outputWidth) {
   int swId = blockIdx.x;
@@ -237,8 +262,10 @@ __global__ void im2colOCF(const T* imData,
        channelId += blockDim.z) {
     for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
       for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset = idx + swId * strideWidth - paddingWidth;
-        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int widthOffset =
+            idx * dilationHeight + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationWidth + shId * strideHeight - paddingHeight;
         int imOffset = widthOffset + heightOffset * inputWidth +
                        channelId * inputHeight * inputWidth;
 
@@ -273,7 +300,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -312,6 +341,8 @@ public:
                                                        strideWidth,
                                                        paddingHeight,
                                                        paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
                                                        outputHeight,
                                                        outputWidth);
     CHECK_SYNC("Im2ColFunctor GPU failed");
@@ -330,6 +361,8 @@ __global__ void col2imOCF(T* imData,
                           int strideWidth,
                           int paddingHeight,
                           int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
                           int outputHeight,
                           int outputWidth) {
   int swId = blockIdx.x;
@@ -338,8 +371,10 @@ __global__ void col2imOCF(T* imData,
        channelId += blockDim.z) {
     for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
       for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset = idx + swId * strideWidth - paddingWidth;
-        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int widthOffset =
+            idx * dilationWidth + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationHeight + shId * strideHeight - paddingHeight;
         int imOffset = widthOffset + heightOffset * inputWidth +
                        channelId * inputHeight * inputWidth;
 
@@ -372,7 +407,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -411,6 +448,8 @@ public:
                                                        strideWidth,
                                                        paddingHeight,
                                                        paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
                                                        outputHeight,
                                                        outputWidth);
     CHECK_SYNC("Col2ImFunctor GPU failed");
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index a0a01a5fc7..1f085538d8 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -29,82 +29,98 @@ void TestIm2ColFunctor() {
           for (size_t filterWidth : {3, 7}) {
             for (size_t stride : {1, 2}) {
               for (size_t padding : {0, 1}) {
-                if (inputHeight <= filterHeight || inputWidth <= filterWidth)
-                  break;
-                if (padding >= filterHeight || padding >= filterWidth) break;
-                size_t outputHeight =
-                    (inputHeight - filterHeight + 2 * padding + stride) /
-                    stride;
-                size_t outputWidth =
-                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
-
-                TensorShape imShape =
-                    TensorShape({channels, inputHeight, inputWidth});
-                TensorShape colShape1 = TensorShape({channels,
-                                                     filterHeight,
-                                                     filterWidth,
-                                                     outputHeight,
-                                                     outputWidth});
-                TensorShape colShape2 = TensorShape({outputHeight,
-                                                     outputWidth,
-                                                     channels,
-                                                     filterHeight,
-                                                     filterWidth});
-
-                size_t height = channels * filterHeight * filterWidth;
-                size_t width = outputHeight * outputWidth;
-                VectorPtr input1 = Vector::create(imShape.getElements(), false);
-                VectorPtr input2 = Vector::create(imShape.getElements(), false);
-                MatrixPtr output1 = Matrix::create(height, width, false, false);
-                MatrixPtr output2 = Matrix::create(width, height, false, false);
-                input1->uniform(0.001, 1);
-                input2->copyFrom(*input1);
-
-                Im2ColFunctor<kCFO, Device, T> im2Col1;
-                Im2ColFunctor<kOCF, Device, T> im2Col2;
-                im2Col1(input1->getData(),
-                        imShape,
-                        output1->getData(),
-                        colShape1,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-                im2Col2(input2->getData(),
-                        imShape,
-                        output2->getData(),
-                        colShape2,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-
-                // The transposition of the result of ColFormat == kCFO
-                // is equal to the result of ColFormat == kOCF.
-                MatrixPtr test;
-                output2->transpose(test, true);
-                autotest::TensorCheckErr(*output1, *test);
-
-                Col2ImFunctor<kCFO, Device, T> col2Im1;
-                Col2ImFunctor<kOCF, Device, T> col2Im2;
-                col2Im1(input1->getData(),
-                        imShape,
-                        output1->getData(),
-                        colShape1,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-                col2Im2(input2->getData(),
-                        imShape,
-                        output2->getData(),
-                        colShape2,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-
-                autotest::TensorCheckErr(*input1, *input2);
+                for (size_t dilation : {1, 3}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+                  TensorShape colShape2 = TensorShape({outputHeight,
+                                                       outputWidth,
+                                                       channels,
+                                                       filterHeight,
+                                                       filterWidth});
+
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(width, height, false, false);
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, Device, T> im2Col1;
+                  Im2ColFunctor<kOCF, Device, T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+
+                  // The transposition of the result of ColFormat == kCFO
+                  // is equal to the result of ColFormat == kOCF.
+                  MatrixPtr test;
+                  output2->transpose(test, true);
+                  autotest::TensorCheckErr(*output1, *test);
+
+                  Col2ImFunctor<kCFO, Device, T> col2Im1;
+                  Col2ImFunctor<kOCF, Device, T> col2Im2;
+
+                  col2Im1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  col2Im2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  autotest::TensorCheckErr(*input1, *input2);
+                }
               }
             }
           }
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
new file mode 100644
index 0000000000..a080505d7d
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "paddle/function/TensorShape.h"
+
+namespace paddle {
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
+                                     const real* inputs,
+                                     const real* indices,
+                                     const TensorShape shape,
+                                     const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
+
+  for (int n = 0; n < number; ++n) {
+    // indices start from 1
+    int offset = n * 6;
+    for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
+      for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
+        for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          outputs[idx] *= value;
+        }
+      }
+    }
+  }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+                                         real* outGrad,
+                                         const real* indices,
+                                         const TensorShape shape,
+                                         const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  for (int n = 0; n < number; ++n) {
+    for (int c = 0; c < channel; ++c) {
+      for (int h = 0; h < height; ++h) {
+        for (int w = 0; w < width; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          int offset = n * 6;
+          if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+              h >= (indices[offset + 2] - 1) &&
+              h <= (indices[offset + 3] - 1) &&
+              w >= (indices[offset + 4] - 1) &&
+              w <= (indices[offset + 5] - 1)) {
+            outGrad[idx] += inGrad[idx] * value;
+          } else {
+            outGrad[idx] += inGrad[idx];
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief For each instance, ScaleSubRegion can be used to multiply a value to
+ *        a specified sub continuous region. By providing start index and end
+ *        index for C/H/W, you can specify the location and shape of the region.
+ *
+ * Argument in this Function:
+ * \param inputs    A 4-D tensor with shape [N, C, H, W], only one input.
+ * \param indices   A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs   A 4-D tensor with same shape as inputs, output value.
+ */
+template <DeviceType Device>
+class ScaleSubRegionFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegion<Device>(outputs[0].data<real>(),
+                           inputs[0].data<real>(),
+                           inputs[1].data<real>(),
+                           shape,
+                           conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of ScaleSubRegion Function.
+ *
+ * Argument in this Function:
+ * \param inputs  A 4-D tensor with shape [N, C, H, W], output gradient.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
+ */
+
+template <DeviceType Device>
+class ScaleSubRegionGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
+                               outputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               shape,
+                               conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/function/ScaleSubRegionOp.h
new file mode 100644
index 0000000000..0480c8577f
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief Function to multiply a value to values in specified sub continuous
+ *        region. Indices must be provided to indcate the location and shape of
+ *        the region and the multiplied value is passed by configure variable.
+ *
+ *
+ * \param[out] outputs  Output value.
+ * \param[in]  inputs   Input data which contains NCHW information.
+ * \param[in]  indices  Indices data to indcate the sub region.
+ * \param[in]  shape    Tensor shape of input value.
+ * \param[in]  conf     Configure variable which contains the multiplied value.
+ */
+template <DeviceType Device>
+void ScaleSubRegion(real* outputs,
+                    const real* inputs,
+                    const real* indices,
+                    const TensorShape shape,
+                    const FuncConfig& conf);
+
+/**
+ * \brief Backward propagation function of ScaleSubRegion.
+ *
+ * \param[out] inGrad   Gradients of previous layer.
+ * \param[in]  outGrad  Output gradient.
+ * \param[in]  indices  Indices data.
+ * \param[in]  shape    The Shape of input tensor.
+ * \param[in]  conf     Configure variable.
+ */
+template <DeviceType Device>
+void ScaleSubRegionGrad(const real* inGrad,
+                        real* outGrad,
+                        const real* indices,
+                        const TensorShape shape,
+                        const FuncConfig& conf);
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/function/ScaleSubRegionOpGpu.cu
new file mode 100644
index 0000000000..8aae2e44c3
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpGpu.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeScaleSubRegion(real* outputs,
+                                 const real* inputs,
+                                 const real* indices,
+                                 real value,
+                                 int channel,
+                                 int height,
+                                 int width,
+                                 int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int c = (idx / width / height) % channel;
+    const int n = idx / width / height / channel;
+
+    const int offset = n * 6;
+    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+      outputs[idx] = inputs[idx] * value;
+    } else {
+      outputs[idx] = inputs[idx];
+    }
+  }
+}
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
+                                     const real* inputs,
+                                     const real* indices,
+                                     const TensorShape shape,
+                                     const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  size_t nth = number * channel * height * width;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegion<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, indices, value, channel, height, width, nth);
+  CHECK_SYNC("ScaleSubRegion");
+}
+
+__global__ void KeScaleSubRegionDiff(const real* inGrad,
+                                     real* outGrad,
+                                     const real* indices,
+                                     real value,
+                                     int channel,
+                                     int height,
+                                     int width,
+                                     int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int c = (idx / width / height) % channel;
+    const int n = idx / width / height / channel;
+
+    const int offset = n * 6;
+    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+      outGrad[idx] += inGrad[idx] * value;
+    } else {
+      outGrad[idx] += inGrad[idx];
+    }
+  }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
+                                         real* outGrad,
+                                         const real* indices,
+                                         const TensorShape shape,
+                                         const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  size_t nth = number * channel * height * width;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegionDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inGrad, outGrad, indices, value, channel, height, width, nth);
+  CHECK_SYNC("ScaleSubRegionGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/function/ScaleSubRegionOpTest.cpp
new file mode 100644
index 0000000000..43331f258d
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpTest.cpp
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(ScaleSubRegion, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {5, 32}) {
+      for (size_t imgSizeH : {5, 33}) {
+        for (size_t imgSizeW : {5, 32}) {
+          for (real value : {-0.5, 0.0, 0.5}) {
+            for (bool firstHalf : {false, true}) {
+              VLOG(3) << " numSamples=" << numSamples
+                      << " channels=" << channels << " imgSizeH=" << imgSizeH
+                      << " imgSizeW=" << imgSizeW;
+
+              for (bool testGrad : {false, true}) {
+                CpuGpuFuncCompare compare(
+                    testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
+                    FuncConfig().set<real>("value", value));
+
+                TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+                TensorShape indicesShape{numSamples, 6};
+
+                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
+
+                compare.registerInitCallback([=](BufferArg& arg, size_t index) {
+                  if (index == 1) {
+                    real* data = (real*)arg.data();
+
+                    for (size_t i = 0; i < numSamples; ++i) {
+                      size_t offset = i * 6;
+                      data[offset] = firstHalf ? 1 : channels / 2;
+                      data[offset + 1] = firstHalf ? channels / 2 : channels;
+                      data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
+                      data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
+                      data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
+                      data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
+                    }
+                  }
+                });
+
+                compare.addOutputs(
+                    BufferArg(
+                        VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
+                    testGrad ? ADD_TO : ASSIGN_TO);
+                compare.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 5f39167afc..41ead3c5ec 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
     list(REMOVE_ITEM GSERVER_SOURCES
          dataproviders/DataProvider.cpp
          dataproviders/MultiDataProvider.cpp
-         dataproviders/ProtoDataProvider.cpp
          dataproviders/PyDataProvider2.cpp
          dataproviders/PyDataProvider.cpp)
 
@@ -85,9 +84,49 @@ if(MOBILE_INFERENCE)
          gradientmachines/GradientMachineMode.cpp
          gradientmachines/MultiGradientMachine.cpp)
 
-    # Remove useless layers
+    # Remove layers that used in training
     list(REMOVE_ITEM GSERVER_SOURCES
-    	 layers/RecurrentLayerGroup.cpp)
+    	 layers/RecurrentLayerGroup.cpp
+         layers/CostLayer.cpp
+         layers/MultiBoxLossLayer.cpp
+         layers/WarpCTCLayer.cpp
+         layers/CTCLayer.cpp
+         layers/LinearChainCTC.cpp
+         layers/PrintLayer.cpp)
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/OuterProdLayer.cpp
+         layers/SumToOneNormLayer.cpp
+         layers/ConvShiftLayer.cpp
+         layers/InterpolationLayer.cpp
+         layers/AgentLayer.cpp
+         layers/DotMulOperator.cpp
+         layers/GruStepLayer.cpp
+         layers/LstmStepLayer.cpp
+         layers/ConvexCombinationLayer.cpp
+         layers/Conv3DLayer.cpp
+         layers/DeConv3DLayer.cpp
+         layers/CropLayer.cpp
+         layers/CrossEntropyOverBeam.cpp
+         layers/DataNormLayer.cpp
+         layers/FeatureMapExpandLayer.cpp
+         layers/HierarchicalSigmoidLayer.cpp
+         layers/MultinomialSampler.cpp
+         layers/NCELayer.cpp
+         layers/KmaxSeqScoreLayer.cpp
+         layers/MDLstmLayer.cpp
+         layers/MultiplexLayer.cpp
+         layers/PadLayer.cpp
+         layers/Pool3DLayer.cpp
+         layers/ResizeLayer.cpp
+         layers/RotateLayer.cpp
+         layers/RowConvLayer.cpp
+         layers/RowL2NormLayer.cpp
+         layers/SamplingIdLayer.cpp
+         layers/ScaleShiftLayer.cpp
+         layers/SelectiveFullyConnectedLayer.cpp
+         layers/SpatialPyramidPoolLayer.cpp
+         layers/BilinearInterpLayer.cpp
+         layers/ClipLayer.cpp)
 endif()
 
 if(WITH_GPU)
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 8b7b2e9b65..f5a41b66bf 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -212,6 +212,37 @@ Error __must_check backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
 
+/*
+ * @brief SoftSign Activation.
+ * \f[
+ * f(z) = \frac{z}{1 + |z|}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softsign)
+private:
+MatrixPtr denominator_;
+
+Error __must_check forward(Argument& act) {
+  size_t height = act.value->getHeight();
+  size_t width = act.value->getWidth();
+  Matrix::resizeOrCreate(
+      denominator_, height, width, false, useGpu(act.deviceId));
+  denominator_->assign(*act.value);
+  denominator_->abs2();
+  denominator_->add(1.);
+
+  act.value->dotDiv(*act.value, *denominator_);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  denominator_->square2();
+  denominator_->scalarDiv(*denominator_, 1.);
+  act.grad->dotMul(*act.grad, *denominator_);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softsign)
+
 /**
  * @brief Relu Activation.
  * forward. y = max(0, z)
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 0478256f9c..106cf5b622 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <unistd.h>
 #include <algorithm>
-#include "ProtoDataProvider.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
 
 REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
 REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
-REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
 
 int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
   int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
deleted file mode 100644
index c6f5cab191..0000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ /dev/null
@@ -1,932 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ProtoDataProvider.h"
-#include <algorithm>
-#include <fstream>
-#include <istream>
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-#include "DataProviderGroup.h"
-#include "paddle/utils/Logging.h"
-
-DEFINE_double(memory_threshold_on_load_data,
-              1.0,
-              "stop loading data when memory is not sufficient");
-
-namespace paddle {
-
-REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
-REGISTER_DATA_PROVIDER(proto_sequence_group,
-                       DataProviderGroup<ProtoSequenceDataProvider>);
-
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
-                                     bool useGpu,
-                                     bool loadDataAll)
-    : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
-  if (loadDataAll) {
-    loadData(config_.files());
-  }
-}
-
-void ProtoDataProvider::loadData(const std::vector<std::string>& fileList) {
-  for (auto& file : fileList) {
-    if (FLAGS_memory_threshold_on_load_data < 1.0) {
-      double memUsage = getMemoryUsage();
-      if (memUsage > FLAGS_memory_threshold_on_load_data) {
-        LOG(INFO) << "memUsage is " << memUsage << ", > "
-                  << FLAGS_memory_threshold_on_load_data
-                  << " therefore SKIP ALL REMAINING file.";
-        break;
-      }
-    }
-    LOG(INFO) << "load data file " << file;
-    loadDataFile(file);
-  }
-
-  if (sequenceStartPositions_.size() == sampleNums_) {
-    // This means that each sample is one sequence
-    shuffledSequenceIds_.swap(sequenceStartPositions_);
-  } else {
-    sequenceStartPositions_.push_back(sampleNums_);
-    shuffledSequenceIds_.reserve(sequenceStartPositions_.size() - 1);
-    for (size_t i = 0; i < sequenceStartPositions_.size() - 1; ++i) {
-      shuffledSequenceIds_.push_back(i);
-    }
-  }
-
-  LOG(INFO) << "read done, num of instance=" << sampleNums_;
-  showDataStats();
-}
-
-void ProtoDataProvider::loadData(const std::string& fileName) {
-  std::vector<std::string> fileList;
-  loadFileList(fileName, fileList);
-  loadData(fileList);
-}
-
-void ProtoDataProvider::checkDataHeader(const DataHeader& header) {
-  if (header_.slot_defs_size()) {
-    // header_ is already set. Need to check consistency.
-    CHECK_EQ(header_.slot_defs_size(), header.slot_defs_size())
-        << "Different header";
-    for (int i = 0; i < header.slot_defs_size(); ++i) {
-      CHECK_EQ(header_.slot_defs(i).type(), header.slot_defs(i).type());
-      CHECK_EQ(header_.slot_defs(i).dim(), header.slot_defs(i).dim());
-    }
-    return;
-  }
-
-  // header_ is not set before
-  CHECK(header.slot_defs_size()) << "Invalid header: no slot is defined";
-  int i;
-  for (i = 0; i < header.slot_defs_size(); ++i) {
-    if (header.slot_defs(i).type() == SlotDef::INDEX ||
-        header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX) {
-      break;
-    }
-    constexpr int kBufLen = 100;
-    char buf[kBufLen];
-    snprintf(buf, kBufLen, "slot%d_nnz", i);
-    nnzStats_.push_back(getStat(buf));
-  }
-  numVecSlots_ = i;
-
-  // Check that INDEX slots are after VECTOR slots
-  for (int i = numVecSlots_; i < header.slot_defs_size(); ++i) {
-    CHECK(header.slot_defs(i).type() == SlotDef::INDEX ||
-          header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX);
-  }
-
-  slots_.clear();
-  slots_.reserve(header.slot_defs_size());
-  for (int i = 0; i < header.slot_defs_size(); ++i) {
-    slots_.emplace_back();
-    slots_.back().type = header.slot_defs(i).type();
-    slots_.back().dim = header.slot_defs(i).dim();
-    if (SlotDef::VECTOR_SPARSE_NON_VALUE == header.slot_defs(i).type() ||
-        SlotDef::VECTOR_SPARSE_VALUE == header.slot_defs(i).type()) {
-      slots_.back().indices.push_back(0);
-    }
-  }
-
-  header_ = header;
-}
-
-void ProtoDataProvider::checkSample(const DataSample& sample) {
-  CHECK_EQ(numVecSlots_, sample.vector_slots_size());
-  CHECK(header_.slot_defs_size() == numVecSlots_ + sample.id_slots_size() ||
-        header_.slot_defs_size() == numVecSlots_ + sample.var_id_slots_size());
-  for (int i = 0; i < numVecSlots_; ++i) {
-    uint32_t dim = header_.slot_defs(i).dim();
-    switch (header_.slot_defs(i).type()) {
-      case SlotDef::VECTOR_DENSE: {
-        CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
-        CHECK_EQ(0, sample.vector_slots(i).ids_size());
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          break;
-        }
-        CHECK_LT(0, sample.vector_slots(i).ids_size());
-        CHECK_EQ(0, sample.vector_slots(i).values_size());
-        auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
-                                       sample.vector_slots(i).ids().end());
-        CHECK_GT(dim, maxId);
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          CHECK_EQ(0, sample.vector_slots(i).values_size());
-          break;
-        }
-        CHECK_LT(0, sample.vector_slots(i).values_size());
-        CHECK_GE(static_cast<int>(dim), sample.vector_slots(i).values_size());
-        CHECK_EQ(sample.vector_slots(i).values_size(),
-                 sample.vector_slots(i).ids_size());
-        auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
-                                       sample.vector_slots(i).ids().end());
-        CHECK_GT(dim, maxId);
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        if (static_cast<int>(dim) != 0) {
-          CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
-          if (sample.vector_slots(i).dims_size() != 0) {
-            int totalDim = sample.vector_slots(i).dims(0);
-            for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
-              totalDim *= sample.vector_slots(i).dims(j);
-            }
-            CHECK_EQ(static_cast<int>(dim), totalDim);
-          }
-        } else {
-          CHECK_NE(sample.vector_slots(i).dims_size(), 0);
-          int totalDim = sample.vector_slots(i).dims(0);
-          for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
-            totalDim *= sample.vector_slots(i).dims(j);
-          }
-          CHECK_EQ(totalDim, sample.vector_slots(i).values_size());
-        }
-        break;
-      }
-      case SlotDef::STRING: {
-        CHECK_EQ(static_cast<int>(1), sample.vector_slots(i).strs_size());
-        CHECK_EQ(0, sample.vector_slots(i).ids_size());
-        CHECK_EQ(0, sample.vector_slots(i).values_size());
-        break;
-      }
-      default:
-        LOG(FATAL) << "BUG: Should not reach here";
-    }
-  }
-  for (int i = numVecSlots_; i < header_.slot_defs_size(); ++i) {
-    if (header_.slot_defs(i).type() != SlotDef::VAR_MDIM_INDEX) {
-      uint32_t id = sample.id_slots(i - numVecSlots_);
-      if (id == -1U) continue;
-      CHECK_LT(id, header_.slot_defs(i).dim());
-    } else {
-      for (int j = 0; j < sample.var_id_slots(i - numVecSlots_).ids_size();
-           ++j) {
-        uint32_t id = sample.var_id_slots(i - numVecSlots_).ids(j);
-        CHECK_LT(id, header_.slot_defs(i).dim());
-      }
-    }
-  }
-}
-
-void ProtoDataProvider::loadDataFile(const std::string& fileName) {
-  std::ifstream is(fileName);
-  CHECK(is) << "Fail to open " << fileName;
-  bool dataCompression = str::endsWith(fileName, ".gz");
-  std::unique_ptr<ProtoReader> reader(new ProtoReader(&is, dataCompression));
-  CHECK(reader) << "Fail to create proto data input stream";
-
-  DataHeader header;
-  CHECK(reader->read(&header));
-  checkDataHeader(header);
-
-  DataSample sample;
-  do {
-    if (!reader->read(&sample)) {
-      break;
-    }
-    checkSample(sample);
-    if (sample.is_beginning()) {
-      sequenceStartPositions_.push_back(sampleNums_);
-    }
-    fillSlots(sample);
-    ++sampleNums_;
-  } while (true);
-
-  CHECK(is.eof()) << "Fail to read file";
-  reader.reset(nullptr);
-  is.close();
-}
-
-// checkSample has done before, no check here
-void ProtoDataProvider::fillSlots(const DataSample& sample) {
-  for (size_t i = 0; i < slots_.size(); ++i) {
-    auto& slot = slots_[i];
-    int dim = slot.dim;
-    switch (slot.type) {
-      case SlotDef::VECTOR_DENSE: {
-        size_t oldSize = slot.denseData.size();
-        slot.denseData.resize(oldSize + dim);
-        const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
-        std::copy(values, values + dim, slot.denseData.begin() + oldSize);
-#else
-        memcpy(slot.denseData.data() + oldSize, values, sizeof(real) * dim);
-#endif
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        int slotSize = sample.vector_slots(i).ids_size();
-        int subSlotSize = 0;
-        int id = 0;  // the slot id
-        // find whether this vector_slots has subseq. If not has subseq,
-        // subSlotSize = 0.
-        for (id = 0; id < sample.subseq_slots_size(); id++) {
-          if (sample.subseq_slots(id).slot_id() == i) {
-            subSlotSize = sample.subseq_slots(id).lens_size();
-            break;
-          }
-        }
-        if (subSlotSize && slot.subIndices.size() == 0UL) {
-          // If has subSeq, the first element of subIndices = 0.
-          slot.subIndices.push_back(0);
-        }
-        if (slotSize == 0UL) {
-          // if has no id, new indices = old indices.
-          slot.indices.push_back(slot.indices.back());
-          // if has subSeq, new subIndices = old subIndices.
-          if (slot.subIndices.size()) {
-            slot.subIndices.push_back(slot.subIndices.back());
-          }
-          break;
-        }
-        slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
-        const unsigned int* ids = sample.vector_slots(i).ids().data();
-        memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
-               ids,
-               sizeof(*ids) * slotSize);
-        slot.indices.push_back(slot.indices.back() + slotSize);
-        if (subSlotSize) {
-          for (int ii = 0; ii < subSlotSize; ++ii) {
-            slot.subIndices.push_back(slot.subIndices.back() +
-                                      sample.subseq_slots(id).lens(ii));
-          }
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          slot.indices.push_back(slot.indices.back());
-          break;
-        }
-        int slotSize = sample.vector_slots(i).ids_size();
-        slot.sparseFloatValueData.resize(slot.indices.back() + slotSize);
-        const unsigned int* ids = sample.vector_slots(i).ids().data();
-        const float* values = sample.vector_slots(i).values().data();
-        for (int ii = 0; ii < slotSize; ++ii) {
-          slot.sparseFloatValueData[slot.indices.back() + ii].col = ids[ii];
-          slot.sparseFloatValueData[slot.indices.back() + ii].value =
-              values[ii];
-        }
-        slot.indices.push_back(slot.indices.back() + slotSize);
-        break;
-      }
-      case SlotDef::INDEX: {
-        slot.indexData.push_back(sample.id_slots(i - numVecSlots_));
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        size_t oldSize = slot.varDenseData.size();
-        slot.varDenseData.resize(oldSize + 1);
-        size_t varDim = sample.vector_slots(i).values_size();
-        slot.varDenseData[oldSize].data.resize(varDim);
-        const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
-        std::copy(
-            values, values + varDim, slot.varDenseData[oldSize].data.data());
-#else
-        memcpy(slot.varDenseData[oldSize].data.data(),
-               values,
-               sizeof(real) * varDim);
-#endif
-        slot.varDenseData[oldSize].dims.resize(
-            sample.vector_slots(i).dims_size());
-        memcpy(slot.varDenseData[oldSize].dims.data(),
-               sample.vector_slots(i).dims().data(),
-               sizeof(uint32_t) * sample.vector_slots(i).dims_size());
-        break;
-      }
-      case SlotDef::VAR_MDIM_INDEX: {
-        size_t oldSize = slot.varIndices.size();
-        slot.varIndices.resize(oldSize + 1);
-        size_t varDim = sample.var_id_slots(i - numVecSlots_).ids_size();
-        slot.varIndices[oldSize].resize(varDim);
-        memcpy(slot.varIndices[oldSize].data(),
-               sample.var_id_slots(i - numVecSlots_).ids().data(),
-               sizeof(uint32_t) * varDim);
-        break;
-      }
-      case SlotDef::STRING: {
-        slot.strData.push_back(sample.vector_slots(i).strs(0));
-        break;
-      }
-    }
-  }
-}
-
-void ProtoDataProvider::showDataStats() {
-  std::ostringstream oss;
-  for (size_t i = 0; i < slots_.size(); ++i) {
-    auto& slot = slots_[i];
-    if (slot.type == SlotDef::VECTOR_SPARSE_NON_VALUE) {
-      size_t nnz = slot.sparseNonValueData.size();
-      oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
-    } else if (slot.type == SlotDef::VECTOR_SPARSE_VALUE) {
-      size_t nnz = slot.sparseFloatValueData.size();
-      oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
-    }
-  }
-  LOG(INFO) << oss.str();
-}
-
-void ProtoDataProvider::reset() {
-  currentSequenceIndex_ = 0;
-  if (!skipShuffle_) {
-    shuffle();
-  }
-
-  DataProvider::reset();
-}
-
-void ProtoDataProvider::shuffle() {
-  std::shuffle(shuffledSequenceIds_.begin(),
-               shuffledSequenceIds_.end(),
-               ThreadLocalRandomEngine::get());
-}
-
-/*
-  Loop through sequences starting from currentSequenceIndex_
-  for at most size samples. For each sequence ranging from [begin, end),
-  op(begin, end) will be called.
-
-  return the number of sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::sequenceLoop(Op op, int64_t size) {
-  int64_t sz = 0;
-  size_t i;
-  size_t sequenceCount = shuffledSequenceIds_.size();
-  if (usageRatio_ < 1.0f) {
-    sequenceCount = static_cast<int64_t>(sequenceCount * usageRatio_);
-  }
-  for (i = currentSequenceIndex_; i < sequenceCount; ++i) {
-    size_t id = shuffledSequenceIds_[i];
-    int64_t begin = sequenceStartPositions_[id];
-    int64_t end = sequenceStartPositions_[id + 1];
-    int64_t len = end - begin;
-    if (sz + len > size && sz > 0) break;
-    sz += len;
-    op(begin, end);
-  }
-  return i - currentSequenceIndex_;
-}
-
-/*
-  Loop through sequences starting from currentSequenceIndex_
-  for at most size samples. For each sample of each sequence at position
-  pos, op(pos) will be called.
-
-  return the number of sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::sampleLoop(Op op, int64_t size) {
-  if (iidData()) {
-    size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
-    for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
-         ++i) {
-      size_t pos = shuffledSequenceIds_[i];
-      op(pos);
-    }
-    return size;
-  } else {
-    auto f = [op](int64_t begin, int64_t end) {
-      for (int64_t pos = begin; pos < end; ++pos) {
-        op(pos);
-      }
-    };
-    return sequenceLoop(f, size);
-  }
-}
-
-/*
-  Loop through sub-sequences starting from currentSequenceIndex_
-  for at most size samples. For each sample of each sub-sequence at position
-  pos, op(pos) will be called.
-
-  return the number of sub-sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::subSampleLoop(Op op, int64_t size, int slot) {
-  CHECK(iidData()) << "subSampleLoop only accepts iid data";
-  size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
-  int subSize = 0;
-  for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
-       ++i) {
-    size_t pos = shuffledSequenceIds_[i];
-    int64_t* indexs = slots_[slot].indices.data();
-    int64_t* subIndexs = slots_[slot].subIndices.data();
-    int64_t subSeqStart = 0;
-    int64_t subSeqEnd = 0;
-    for (int j = 0; j < (int)slots_[slot].subIndices.size(); j++) {
-      if (subIndexs[j] == indexs[pos]) {
-        subSeqStart = j;
-        if (subIndexs[pos] == subIndexs[pos + 1]) {
-          subSeqEnd = j + 1;
-          break;
-        }
-      } else if (subIndexs[j] == indexs[pos + 1]) {
-        subSeqEnd = j;
-        break;
-      }
-    }
-    for (int j = subSeqStart; j < subSeqEnd; j++) {
-      op(j);
-    }
-    subSize += subSeqEnd - subSeqStart;
-  }
-  return subSize;
-}
-
-int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
-                                                DataBatch* batch) {
-  int64_t numSequences = 0;  // actual number of sequences in the batch
-
-  // the number of sequences scanned, including those skipped because too long
-  int64_t numScannedSeqs = 0;
-  std::lock_guard<RWLock> guard(lock_);
-  if (iidData()) {
-    size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
-    numScannedSeqs = numSequences = size;
-  } else {
-    int64_t sz = 0;
-    auto op = [&sz, &numSequences](int64_t begin, int64_t end) {
-      ++numSequences;
-      sz += end - begin;
-    };
-    numScannedSeqs = sequenceLoop(op, size);
-    VLOG_IF(1, numScannedSeqs > numSequences)
-        << numScannedSeqs - numSequences
-        << " sequences are skipped because longer than " << size;
-    size = sz;
-  }
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(header_.slot_defs_size());
-
-  if (!iidData()) {
-    ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
-                                  numSequences + 1,
-                                  /* useGpu= */ false);
-    int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
-    int pos = 0;
-    int i = 0;
-    auto op = [buf, &pos, &i](int64_t begin, int64_t end) {
-      buf[i] = pos;
-      pos += end - begin;
-      ++i;
-    };
-    sequenceLoop(op, size);
-    buf[i] = size;
-    for (size_t slot = 1; slot < cpuArguments.size(); ++slot) {
-      cpuArguments[slot].sequenceStartPositions =
-          cpuArguments[0].sequenceStartPositions;
-    }
-  }
-
-  for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
-    size_t dim = header_.slot_defs(slot).dim();
-    SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
-    std::vector<int64_t> dataPos;
-    dataPos.reserve(size);
-    auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
-    sampleLoop(op, size);
-
-    switch (slotType) {
-      case SlotDef::VECTOR_DENSE: {
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               dim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        for (int i = 0; i < size; ++i) {
-          memcpy(buf + i * dim,
-                 slots_[slot].denseData.data() + dataPos[i] * dim,
-                 sizeof(real) * dim);
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value =
-              Matrix::createSparseMatrix(size,
-                                         dim,
-                                         size /*DEFAULT_AVG_WIDTH = 1*/,
-                                         NO_VALUE,
-                                         SPARSE_CSR,
-                                         false,
-                                         useGpu_);
-        }
-        auto mat = cpuArguments[slot].value;
-        mat->resize(size, dim);
-        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseNonValueData.data(),
-              HPPL_STREAM_1);
-        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseNonValueData.data());
-        } else {
-          LOG(FATAL) << "Not Supported";
-        }
-        size_t numElements = 0;
-        for (auto pos : dataPos) {
-          numElements +=
-              slots_[slot].indices[pos + 1] - slots_[slot].indices[pos];
-        }
-        nnzStats_[slot]->addSample(numElements);
-
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value =
-              Matrix::createSparseMatrix(size,
-                                         dim,
-                                         size /*DEFAULT_AVG_WIDTH = 1*/,
-                                         FLOAT_VALUE,
-                                         SPARSE_CSR,
-                                         false,
-                                         useGpu_);
-        }
-        auto mat = cpuArguments[slot].value;
-        mat->resize(size, dim);
-        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseFloatValueData.data(),
-              HPPL_STREAM_1);
-        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseFloatValueData.data());
-        } else {
-          LOG(FATAL) << "Not Supported";
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                size,
-                                /*  useGpu= */ false);
-        int* buf = cpuArguments[slot].ids->getData();
-        for (int i = 0; i < size; ++i) {
-          buf[i] = slots_[slot].indexData[dataPos[i]];
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        CHECK_EQ(size, 1);
-        auto mat = cpuArguments[slot].value;
-        size_t totalDim = slots_[slot].varDenseData[dataPos[0]].data.size();
-
-        CHECK_EQ(slots_[slot].varDenseData[dataPos[0]].dims.size(), size_t(3));
-        size_t height, width, depth, oldWidth;
-        /* dims[2] is depth, will be changed to dims[0] in future */
-        depth = slots_[slot].varDenseData[dataPos[0]].dims[2];
-        height = slots_[slot].varDenseData[dataPos[0]].dims[1];
-        width = slots_[slot].varDenseData[dataPos[0]].dims[0];
-        oldWidth = width;
-        /* process the undesirable sample */
-        if (oldWidth < height) {
-          width = height;
-        }
-        cpuArguments[slot].setFrameHeight(height);
-        cpuArguments[slot].setFrameWidth(width);
-
-        if (oldWidth < height) {
-          totalDim = width * height * depth;
-        }
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               totalDim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        cpuArguments[slot].value->zeroMem();
-        if (oldWidth < height) {
-          real* srcBuf = slots_[slot].varDenseData[dataPos[0]].data.data();
-          for (size_t i = 0; i < depth; i++) {
-            for (size_t j = 0; j < height; j++) {
-              for (size_t k = 0; k < oldWidth; k++) {
-                buf[i * height * width + j * width + k] =
-                    srcBuf[i * height * oldWidth + j * oldWidth + k];
-              }
-            }
-          }
-        } else {
-          memcpy(buf,
-                 slots_[slot].varDenseData[dataPos[0]].data.data(),
-                 sizeof(real) * totalDim);
-        }
-        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                      size + 1, /* size == 1 currently */
-                                      /* useGpu= */ false);
-        int* bufStarts =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        bufStarts[0] = 0;
-        bufStarts[1] = 1;
-        break;
-      }
-      case SlotDef::VAR_MDIM_INDEX: {
-        CHECK_EQ(size, 1);
-        size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                totalDim,
-                                /*  useGpu= */ false);
-        int* buf = cpuArguments[slot].ids->getData();
-        memcpy(buf,
-               slots_[slot].varIndices[dataPos[0]].data(),
-               sizeof(int) * totalDim);
-
-        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                      size + 1, /* size == 1 currently */
-                                      /* useGpu= */ false);
-        int* bufStarts =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        bufStarts[0] = 0;
-        /* we expand the convolutinal feature map to a sequence data,
-         * so there should be a corresponding sequence labels */
-        bufStarts[1] = totalDim;
-        break;
-      }
-      case SlotDef::STRING: {
-        if (cpuArguments[slot].strs) {
-          cpuArguments[slot].strs->resize(size);
-        } else {
-          cpuArguments[slot].strs =
-              std::make_shared<std::vector<std::string>>(size);
-        }
-        for (int i = 0; i < size; ++i) {
-          (*cpuArguments[slot].strs)[i] = slots_[slot].strData[dataPos[i]];
-        }
-        break;
-      }
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (int i = 0; i < header_.slot_defs_size(); ++i) {
-      SlotDef::SlotType slotType = header_.slot_defs(i).type();
-      if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
-          SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
-        gpuArguments[i] = cpuArguments[i];
-        gpuArguments[i].sequenceStartPositions =
-            cpuArguments[i].sequenceStartPositions;
-      } else {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  currentSequenceIndex_ += numScannedSeqs;
-
-  return batch->getSize();
-}
-
-ProtoSequenceDataProvider::ProtoSequenceDataProvider(const DataConfig& config,
-                                                     bool useGpu,
-                                                     bool loadDataAll)
-    : ProtoDataProvider(config, useGpu, loadDataAll) {}
-
-int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
-                                                        DataBatch* batch) {
-  CHECK(iidData()) << "ProtoSequenceDataProvider only accepts iid data";
-  int64_t numSequences = 0;  // actual number of sequences in the batch
-
-  // the number of sequences scanned, including those skipped because too long
-  int64_t numScannedSeqs = 0;
-  std::lock_guard<RWLock> guard(lock_);
-  size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
-  numScannedSeqs = numSequences = size;
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(header_.slot_defs_size());
-
-  for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
-    SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
-    std::vector<int64_t> dataPos;
-    dataPos.reserve(size);
-    auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
-    sampleLoop(op, size);
-
-    // current slot: sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                  size + 1,
-                                  /* useGpu= */ false);
-
-    switch (slotType) {
-      case SlotDef::VECTOR_SPARSE_VALUE:
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "ProtoSequenceDataProvider only support"
-                   << " VECTOR_DENSE, VECTOR_SPARSE_NON_VALUE and INDEX slots";
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        // copy to IDS, not value
-        // pointers used in current slot
-        sparse_non_value_t* data = slots_[slot].sparseNonValueData.data();
-        int64_t* indexs = slots_[slot].indices.data();
-        int64_t* seqs = dataPos.data();
-
-        // current slot: i need size instances. what is the total length?
-        int totalFeatureInCurrentSlot = 0;
-        for (int ins = 0; ins < size; ins++) {
-          int64_t currInsId = seqs[ins];
-          totalFeatureInCurrentSlot +=
-              indexs[currInsId + 1] - indexs[currInsId];
-          // special: if current instance has NO feature in current slot
-          if (indexs[currInsId + 1] == indexs[currInsId]) {
-            totalFeatureInCurrentSlot++;
-          }
-        }
-        // done
-
-        // current slot: ids
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                totalFeatureInCurrentSlot,
-                                /* useGpu= */ false);
-
-        // where to write
-        int* currPosOfArgumentId = cpuArguments[slot].ids->getData();
-        int* currPosOfArgumentSeqStart =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        int allSequenceLength = 0;
-        currPosOfArgumentSeqStart[0] = 0;
-        // for each instance, copy data and fill sequence positions
-        for (int instance = 0; instance < size; instance++) {
-          int64_t currInstanceId = seqs[instance];
-          int64_t currInstanceLength =
-              indexs[currInstanceId + 1] - indexs[currInstanceId];
-          sparse_non_value_t* currInstanceData = data + indexs[currInstanceId];
-          // write sequenceStartPositions
-          allSequenceLength += currInstanceLength;
-          currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
-          // copy features
-          for (int featCopier = 0; featCopier < currInstanceLength;
-               featCopier++) {
-            currPosOfArgumentId[featCopier] = currInstanceData[featCopier].col;
-          }
-          currPosOfArgumentId += currInstanceLength;
-          // special: if current instance has NO feature in current slot
-          if (currInstanceLength == 0) {
-            allSequenceLength++;
-            currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
-            currPosOfArgumentId[0] = -1;
-            currPosOfArgumentId++;
-          }
-          // done
-        }
-        if (slots_[slot].subIndices.size()) {
-          std::vector<int64_t> dataSubPos;
-          auto op = [this, &dataSubPos](int64_t pos) {
-            dataSubPos.push_back(pos);
-          };
-          int subSize = subSampleLoop(op, size, slot);
-          ICpuGpuVector::resizeOrCreate(
-              cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
-          int* currPosOfArgumentSubSeqStart =
-              cpuArguments[slot].subSequenceStartPositions->getMutableData(
-                  false);
-          int64_t* subSeqs = dataSubPos.data();
-          int64_t* subIndexs = slots_[slot].subIndices.data();
-          int allSubSequenceLength = 0;
-          currPosOfArgumentSubSeqStart[0] = 0;
-          // for each instance, compute sub-sequence number
-          for (int instance = 0; instance < subSize; instance++) {
-            int64_t currSubInstanceId = subSeqs[instance];
-            int64_t currSubInstanceLength =
-                subIndexs[currSubInstanceId + 1] - subIndexs[currSubInstanceId];
-            // write subSequenceStartPositions
-            allSubSequenceLength += currSubInstanceLength;
-            currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
-            // special: if current instance has NO feature in current slot
-            if (currSubInstanceLength == 0) {
-              allSubSequenceLength++;
-              currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
-            }
-          }
-          cpuArguments[slot].checkSubset();
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        // label slot
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                size,
-                                /* useGpu= */ false);
-        // fill labels
-        int* buf = cpuArguments[slot].ids->getData();
-        for (int i = 0; i < size; ++i) {
-          buf[i] = slots_[slot].indexData[dataPos[i]];
-        }
-        // label HAS sequence structure
-        cpuArguments[slot].sequenceStartPositions->fillSequence(false);
-        break;
-      }
-      case SlotDef::VECTOR_DENSE: {
-        // copy values
-        size_t dim = header_.slot_defs(slot).dim();
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               dim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        for (int i = 0; i < size; ++i) {
-          memcpy(buf + i * dim,
-                 slots_[slot].denseData.data() + dataPos[i] * dim,
-                 sizeof(real) * dim);
-        }
-        // sequence structure
-        cpuArguments[slot].sequenceStartPositions->fillSequence(false);
-        break;
-      }
-      default: { LOG(FATAL) << "should not reach here"; }
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (size_t i = 0; i < cpuArguments.size(); ++i) {
-      gpuArguments[i].resizeAndCopyFrom(
-          cpuArguments[i], useGpu_, HPPL_STREAM_1);
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  currentSequenceIndex_ += numScannedSeqs;
-  return batch->getSize();
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
deleted file mode 100644
index 7dd45e0622..0000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-
-#include "DataFormat.pb.h"
-#include "paddle/utils/Stat.h"
-
-#include "DataProvider.h"
-#include "ProtoReader.h"
-
-namespace paddle {
-
-/**
- * @brief Provider data from protobuf data file with each sample
- * specified by proto message
- *
- * DataSample defined in DataFormat.proto.
- *
- * The file format is
- *
- *    header
- *
- *    sample1
- *
- *    sample2
- *
- *    ...
- *
- *    sampleN
- *
- * @note: In the data file, each message is prefixed with its length.
- * The read/write of the protbuf are implemented in ProtoReader.h
- */
-class ProtoDataProvider : public DataProvider {
-public:
-  ProtoDataProvider(const DataConfig& config,
-                    bool useGpu,
-                    bool loadDataAll = true);
-  virtual void reset();
-
-  /**
-   * @note this size includes the sequences which are skipped because they
-   * are longer than the batch size.
-   */
-  virtual int64_t getSize() {
-    int64_t size = sampleNums_;
-    if (usageRatio_ < 1.0f) {
-      size = static_cast<int64_t>(size * usageRatio_);
-    }
-    return size;
-  }
-  virtual void shuffle();
-
-  void loadData(const std::vector<std::string>& fileList);
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-protected:
-  /**
-   * @brief load protobuf data from a list of file
-   * @param[in]  fileName  file name of a file which contains
-   * a list of file names
-   */
-  void loadData(const std::string& fileName);
-
-  /**
-   * @brief load protobuf data from file
-   * @param[in]  fileName   data file name
-   */
-  void loadDataFile(const std::string& fileName);
-  /** @brief check data header of each data sample
-   *  @param[in] header     data header read from protobuf data
-   */
-  void checkDataHeader(const DataHeader& header);
-  /**
-   * @brief fill protobuf data into slot_,
-   * slot_ is a vector of ProtoSlot in memory.
-   * @param[in]  sample     data sample read from protobuf data
-   */
-  void fillSlots(const DataSample& sample);
-
-  /**
-   * @brief return true if each sample is one sequence, i.e., independent
-   * of other samples.
-   */
-  inline bool iidData() const { return sequenceStartPositions_.empty(); }
-
-  /**
-   * @brief check that sample is consistent with header_
-   */
-  void checkSample(const DataSample& sample);
-
-  template <class Op>
-  int64_t sequenceLoop(Op op, int64_t size);
-
-  template <class Op>
-  int64_t sampleLoop(Op op, int64_t size);
-
-  template <class Op>
-  int64_t subSampleLoop(Op op, int64_t size, int slot);
-
-  void showDataStats();
-
-protected:
-  struct ProtoVarSlot {
-    std::vector<real> data;
-    std::vector<int> dims;
-  };
-
-  struct ProtoSlot {
-    SlotDef::SlotType type;
-    int dim;
-    std::vector<int> indexData;
-    std::vector<real> denseData;
-    std::vector<sparse_non_value_t> sparseNonValueData;
-    std::vector<sparse_float_value_t> sparseFloatValueData;
-    std::vector<int64_t> indices;
-    std::vector<int64_t> subIndices;
-
-    std::vector<ProtoVarSlot> varDenseData;
-    std::vector<std::vector<int>> varIndices;
-    std::vector<std::string> strData;
-  };
-  DataHeader header_;
-  int numVecSlots_;
-
-  std::vector<ProtoSlot> slots_;
-  size_t sampleNums_;
-
-  /**
-   * The starting position of each sequence in samples.
-   * The last element should be num of samples.
-   * If empty, each sample is one sequence.
-   */
-  std::vector<size_t> sequenceStartPositions_;
-
-  int64_t currentSequenceIndex_;
-
-  // The size should be the number of sequences.
-  std::vector<size_t> shuffledSequenceIds_;
-
-  ThreadLocalD<DataBatch> cpuBatch_;
-  ThreadLocalD<DataBatch> gpuBatch_;
-
-  RWLock lock_;
-  std::vector<StatPtr> nnzStats_;  // stats for number of none-zeros entries
-};
-
-/**
- * @brief Special use for Proto data: instances should contain sparse-non-value
- * slots
- * and label.
- *
- * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
- */
-class ProtoSequenceDataProvider : public ProtoDataProvider {
-public:
-  ProtoSequenceDataProvider(const DataConfig& config,
-                            bool useGpu,
-                            bool loadDataAll = true);
-  ~ProtoSequenceDataProvider() {}
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index dbadc352a4..be112b4123 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include "NeuralNetwork.h"
 #include "hl_gpu.h"
-#include "paddle/gserver/layers/AgentLayer.h"
 #include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -28,6 +27,7 @@ limitations under the License. */
 #ifndef PADDLE_MOBILE_INFERENCE
 #include "MultiNetwork.h"
 #include "RecurrentGradientMachine.h"
+#include "paddle/gserver/layers/AgentLayer.h"
 #endif
 
 namespace paddle {
@@ -192,9 +192,11 @@ void NeuralNetwork::init(const ModelConfig& config,
 void NeuralNetwork::connect(LayerPtr agentLayer,
                             LayerPtr realLayer,
                             int height) {
+#ifndef PADDLE_MOBILE_INFERENCE
   AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
   CHECK_NOTNULL(agent);
   agent->setRealLayer(realLayer, height);
+#endif
 }
 
 void NeuralNetwork::connect(std::string agentLayerName,
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index bc7d1c83a4..925af31289 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
     useGlobalStats_ = config_.use_global_stats();
   }
   movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
 
   weight_.reset(new Weight(1, channels_, parameters_[0]));
   movingMean_.reset(new Weight(1, channels_, parameters_[1]));
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index e721d2d267..2ac3cd9d67 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -94,6 +94,8 @@ protected:
   bool useGlobalStats_;
   // use to compute moving mean and variance.
   real movingAvgFraction_;
+  // Epsilon is a small random noise used in batch normalization for stability.
+  real epsilon_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index dacff25e59..25ab5cd927 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -22,8 +22,6 @@ namespace paddle {
 
 REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
 
-const real BatchNormalizationLayer::EPS = 1E-5;
-
 bool BatchNormalizationLayer::init(const LayerMap& layerMap,
                                    const ParameterMap& parameterMap) {
   /* Initialize the basic parent class */
@@ -53,7 +51,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
 
   calMovingMeanAndVar();
 
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
 
@@ -74,7 +72,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
   savedInvVar_->copyFrom(*(movingVar_->getW()));
   savedInvVar_->downClip(real(0.0));
 
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
 
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index f6115801fc..1fdb5e2070 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -39,9 +39,6 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  /// Epsilon value used in the batch normalization formula.
-  static const real EPS;
-
   /// Load pre-calculated mean and std.
   void setMeanAndStd();
 
diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp
index 08f36c516c..19efed7b52 100644
--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 
-ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_;
+ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
 
 ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
                                        ParameterPtr parameter,
@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) {
 }
 
 void *ConvBaseProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandle *> &convMem = *convMem_;
+  std::vector<MemoryHandlePtr> &convMem = *convMem_;
   if (convMem.empty()) {
     int numDevices = hl_get_device_count();
     convMem.resize(numDevices);
   }
 
   int devId = hl_get_device();
-  MemoryHandle **localMem = &(convMem[devId]);
-  if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
-    *localMem = new GpuMemoryHandle(size);
+  MemoryHandlePtr localMem = convMem[devId];
+  if (NULL == localMem || size > localMem->getAllocSize()) {
+    localMem = std::make_shared<GpuMemoryHandle>(size);
   }
-  return (*localMem)->getBuf();
+  return localMem->getBuf();
 }
 
 ConvBaseProjection::~ConvBaseProjection() {
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
index ebdb57845b..bb7ffa627b 100644
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -105,7 +105,7 @@ protected:
   bool bias_;
 
   std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
+  static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp
index 48132a3ce4..e7f081c023 100644
--- a/paddle/gserver/layers/ConvTransProjection.cpp
+++ b/paddle/gserver/layers/ConvTransProjection.cpp
@@ -24,13 +24,13 @@ size_t ConvTransProjection::calOutputSize() {
   if (outputH_ == 0) outputH_ = configOutH_;
   if (outputW_ == 0) outputW_ = configOutW_;
   imageH_ = imageSize(outputH_,
-                      filterH_,
+                      (filterH_ - 1) * dilationH_ + 1,
                       paddingH_,
                       strideH_,
                       /* caffeMode */ true);
 
   imageW_ = imageSize(outputW_,
-                      filterW_,
+                      (filterW_ - 1) * dilationW_ + 1,
                       paddingW_,
                       strideW_,
                       /* caffeMode */ true);
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
index 49a9540c0b..8390b55026 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -21,8 +21,6 @@ namespace paddle {
 
 REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
 
-const double CudnnBatchNormLayer::EPS = 1E-5;
-
 bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
                                const ParameterMap& parameterMap) {
   /* Initialize the basic parent class */
@@ -61,6 +59,9 @@ void CudnnBatchNormLayer::forward(PassType passType) {
   real* movingMean = movingMean_->getW()->getData();
   real* movingVar = movingVar_->getW()->getData();
 
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
   if (!useGlobalStats_) {
     REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
     real* savedMean = savedMean_->getData();
@@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                    1.0 - movingAvgFraction_,
                                    movingMean,
                                    movingVar,
-                                   EPS,
+                                   eps_,
                                    savedMean,
                                    savedInvVar);
   } else {
@@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                       beta,
                                       movingMean,
                                       movingVar,
-                                      EPS);
+                                      eps_);
     } else {
       // There is a limitation in cudnn library.
       // When the batch size is larger than 1024 in cuDNN v5.1,
@@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                    beta,
                                    movingMean,
                                    movingVar,
-                                   EPS,
+                                   eps_,
                                    batchSize,
                                    channels_,
                                    imageH_ * imageD_,
@@ -128,6 +129,9 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
   real* savedMean = savedMean_->getData();
   real* savedInvVar = savedInvVar_->getData();
 
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
   auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
     Matrix::resizeOrCreate(m, h, w, false, true);
     m->zeroMem();
@@ -157,7 +161,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
                          gamma,
                          gammaGrad,
                          betaGrad,
-                         EPS,
+                         eps_,
                          savedMean,
                          savedInvVar);
 
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index 413efd4d3e..1a3f0c0cbf 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <cudnn.h>
 #include "BatchNormBaseLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
@@ -46,12 +47,9 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  /**
-   * Epsilon value used in the batch normalization formula.
-   * Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
-   * Same epsilon value should be used in forward and backward functions.
-   */
-  static const double EPS;
+  /// Epsilon value used in the batch normalization formula.
+  /// Same epsilon value should be used in forward and backward functions.
+  double eps_;
 
   /// Input/output tensor descriptor desc
   hl_tensor_descriptor ioDesc_;
diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp
new file mode 100644
index 0000000000..9e2dbe3c3c
--- /dev/null
+++ b/paddle/gserver/layers/DotProdLayer.cpp
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for computing the dot product of two vectors.
+ * Input1: vector (batchSize * dim)
+ * Input2: vector (batchSize * dim)
+ * Output: a matrix: (batchSize * 1)
+ */
+
+class DotProdLayer : public Layer {
+public:
+  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~DotProdLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(dot_prod, DotProdLayer);
+
+bool DotProdLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+  CHECK_EQ(1UL, getSize())
+      << "The output dimensionality of this layer should be fixed to 1.";
+
+  return true;
+}
+
+void DotProdLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  CHECK_EQ(inV1->getHeight(), batchSize);
+  CHECK_EQ(inV0->getWidth(), inV1->getWidth());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, 1);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
+    outV->sumOfProducts(*inV0, *inV1, 1, 0);
+  }
+}
+
+void DotProdLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+
+  {
+    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
+
+    if (inG0) {
+      inG0->addRowScale(0, *inV1, *outG);
+    }
+
+    if (inG1) {
+      inG1->addRowScale(0, *inV0, *outG);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index 48dfcb49a4..7ff0c73721 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -79,6 +79,10 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
   for (int i = 0; i < config_.inputs_size(); i++) {
     std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
     std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
+    std::vector<size_t> dilations = {(size_t)dilationY_[i],
+                                     (size_t)dilation_[i]};
+
+    bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
 
     // Convolution Layer uses the GemmConv function by default.
     convType = "GemmConv";
@@ -97,13 +101,14 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
       if ((filterSize_[i] == filterSizeY_[i]) &&
           (filterSize_[i] == 3 || filterSize_[i] == 4) &&
-          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
+          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
+          !useDilation) {
         convType = "NeonDepthwiseConv";
       }
 #endif
     }
 
-    if (FLAGS_use_nnpack && !isDeconv_) {
+    if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
       createFunction(forward_,
                      "NNPACKConv",
                      FuncConfig()
@@ -117,6 +122,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
+                         .set("dilations", dilations)
                          .set("groups", (size_t)groups_[i]));
 
       createFunction(backward_,
@@ -124,6 +130,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
+                         .set("dilations", dilations)
                          .set("groups", (size_t)groups_[i]));
 
       createFunction(backward_,
@@ -131,6 +138,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
+                         .set("dilations", dilations)
                          .set("groups", (size_t)groups_[i]));
     }
   }
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
new file mode 100644
index 0000000000..be26b9ba88
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  factorSize_ = config_.factor_size();
+
+  /* initialize the latentVectors_ */
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
+
+  return true;
+}
+
+void FactorizationMachineLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const MatrixPtr& inputV = getInputValue(0);
+
+  size_t batchSize = inputV->getHeight();
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
+
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
+
+  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
+  outV->sumRows(*tmpOut_, 0.5, 0);
+
+  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
+                                       inputV->getHeight(),
+                                       inputV->getWidth(),
+                                       inputV->getElementCnt(),
+                                       inputV->getValueType());
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
+  } else {
+    Matrix::resizeOrCreate(
+        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+    inputV->square2(*inputSquare_);
+  }
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
+  outV->sumRows(*tmpOut_, -0.5, 1.0);
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+
+  const MatrixPtr& inputV = getInputValue(0);
+  const MatrixPtr& oGrad = getOutputGrad();
+
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
+
+  /* Calculate the gradients of the latentVectors_ matrix */
+  if (latentVectors_->getWGrad()) {
+    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
+                                         inputV->getHeight(),
+                                         inputV->getWidth(),
+                                         inputV->getElementCnt());
+
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
+    } else {
+      Matrix::resizeOrCreate(
+          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+
+      tmpInput_->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
+
+      tmpSum_->sumCols(*tmpInput_, -1, 0);
+    }
+
+    latentVectors_->getWGrad()->addRowScale(
+        0, *latentVectors_->getW(), *tmpSumTrans);
+
+    /* Increasing the number of gradient */
+    latentVectors_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers gradient */
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad != NULL) {
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
+    inGrad->rowScale(0, *inGrad, *oGrad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
new file mode 100644
index 0000000000..df20a49934
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as inner product of the learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interaction, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interations with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculation for forward and backward can be found at this paper:
+ *
+ *     Factorization machines.
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+protected:
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
+  std::unique_ptr<Weight> latentVectors_;
+  // The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+private:
+  // Store the square values of the letent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Store temporary calculation result
+  MatrixPtr tmpOut_;
+  MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
+  // Negative identity matrix
+  MatrixPtr negOnes_;
+
+public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index d62a8d846e..236f8096bd 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
   Matrix::resizeOrCreate(preOutput_.grad,
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
-
+                         false);
   IVectorPtr label = getInput(*getLabelLayer()).ids;
-
   preOutput_.value->zeroMem();
 
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
   /* add the bias-vector */
   if (biases_.get() != NULL) {
-    preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
   }
   for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
     MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                             weights_[i]->getW()->getHeight(),
+                             weights_[i]->getW()->getWidth(),
+                             /* trans */ false,
+                             false);
+      cpuInput_->copyFrom(*input);
+      cpuWeight_->copyFrom(*weights_[i]->getW());
+    } else {
+      cpuInput_ = input;
+      cpuWeight_ = weights_[i]->getW();
+    }
     preOutput_.value->mulByBitCode(
-        numClasses_, *label, *weights_[i]->getW(), *input);
+        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
   }
   // keep consistent with the clipping in the following softrelu
   preOutput_.value->clip(-40.0, 40.0);
   preOutput_.value->sumByBitCode(numClasses_,
-                                 *label,
-                                 *output_.value,
+                                 *cpuLabel_,
+                                 *cpuOutput_,
                                  -1);  // scaleSum
   preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum =
-      Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
+  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
   preOutput_.value->rowSum(*sum);
-  output_.value->add(*sum);
+  cpuOutput_->add(*sum);
+  if (useGpu_) {
+    output_.value->copyFrom(*cpuOutput_);
+  } else {
+    output_.value = cpuOutput_;
+  }
 }
 
 void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
   IVectorPtr label = getInput(*getLabelLayer()).ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+  } else {
+    cpuLabel_ = label;
+  }
   preOutput_.grad->one();
   preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *label);
+  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
 
   if (biases_ && biases_->getWGrad()) {
-    preOutput_.grad->addByBitCodeBackward(
-        numClasses_, *label, *biases_->getWGrad());
-
+    MatrixPtr biases_grad = biases_->getWGrad();
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_grad);
+    } else {
+      cpuBias_ = biases_grad;
+    }
+    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
+    if (useGpu_) {
+      biases_grad->copyFrom(*cpuBias_);
+    } else {
+      biases_grad = cpuBias_;
+    }
     /* Increasing the number of gradient */
     biases_->getParameterPtr()->incUpdate(callback);
   }
@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the W-gradient for the current layer */
     MatrixPtr input = getInputValue(i);
     if (weights_[i]->getWGrad()) {
+      MatrixPtr weights_grad = weights_[i]->getWGrad();
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInput_,
+                               input->getHeight(),
+                               input->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeightGrad_,
+                               weights_grad->getHeight(),
+                               weights_grad->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInput_->copyFrom(*input);
+        cpuWeightGrad_->copyFrom(*weights_grad);
+      } else {
+        cpuInput_ = input;
+        cpuWeightGrad_ = weights_grad;
+      }
       preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *label, *weights_[i]->getWGrad(), *input);
-
+          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+      if (useGpu_) {
+        weights_grad->copyFrom(*cpuWeightGrad_);
+      } else {
+        weights_grad = cpuWeightGrad_;
+      }
       /* Increasing the number of gradient */
       weights_[i]->getParameterPtr()->incUpdate(callback);
     }
@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the input layers error */
     MatrixPtr inputGrad = getInputGrad(i);
     if (inputGrad) {
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInputGrad_,
+                               inputGrad->getHeight(),
+                               inputGrad->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeight_,
+                               weights_[i]->getW()->getHeight(),
+                               weights_[i]->getW()->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInputGrad_->copyFrom(*inputGrad);
+        cpuWeight_->copyFrom(*weights_[i]->getW());
+      } else {
+        cpuInputGrad_ = inputGrad;
+        cpuWeight_ = weights_[i]->getW();
+      }
       preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *label, *weights_[i]->getW(), *inputGrad);
+          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+      if (useGpu_) {
+        inputGrad->copyFrom(*cpuInputGrad_);
+      } else {
+        inputGrad = cpuInputGrad_;
+      }
     }
   }
 }
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 9afd40b167..7f896e61ca 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -80,6 +80,15 @@ protected:
   int codeLength_;
   /// temporary result of output_
   Argument preOutput_;
+
+  /// The temporary variables in CPU memory.
+  MatrixPtr cpuWeight_;
+  MatrixPtr cpuWeightGrad_;
+  MatrixPtr cpuInput_;
+  MatrixPtr cpuInputGrad_;
+  MatrixPtr cpuBias_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/L2DistanceLayer.cpp b/paddle/gserver/layers/L2DistanceLayer.cpp
new file mode 100644
index 0000000000..c71df1b92c
--- /dev/null
+++ b/paddle/gserver/layers/L2DistanceLayer.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "L2DistanceLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(l2_distance, L2DistanceLayer);
+
+bool L2DistanceLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
+                                     << "only two inputs.";
+  CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
+                           << "is fixed to be 1.";
+
+  return true;
+}
+
+void L2DistanceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const auto inV1 = getInputValue(0);
+  const auto inV2 = getInputValue(1);
+
+  CHECK(inV1 && inV2);
+  CHECK_EQ(inV1->getHeight(), inV2->getHeight())
+      << "The height of two inputs of this layer must be the same.";
+  CHECK_EQ(inV1->getWidth(), inV2->getWidth())
+      << "The width of two inputs of this layer must be the same.";
+
+  int batchSize = inV1->getHeight();
+  int output_dim = getSize();
+  {
+    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
+    reserveOutput(batchSize, output_dim);
+    auto outV = getOutputValue();
+    CHECK(outV) << "The output matrix should not be null.";
+
+    Matrix::resizeOrCreate(
+        inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
+
+    inputSub_->assign(*inV1);
+    inputSub_->sub(*inV2);
+    outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
+    outV->sqrt2(*outV);
+  }
+}
+
+void L2DistanceLayer::backward(const UpdateCallback& callback) {
+  const auto outG = getOutputGrad();
+  const auto outV = getOutputValue();
+  CHECK(outG && outV);
+
+  auto inGrad1 = getInputGrad(0);
+  auto inGrad2 = getInputGrad(1);
+
+  {
+    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
+
+    if (inGrad1 || inGrad2) {
+      outV->scalarDiv(*outV, 1.);
+      outV->dotMul(*outG, *outV);
+    }
+
+    if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
+
+    if (inGrad2) {
+      inputSub_->mulScalar(-1.);
+      inGrad2->addRowScale(0, *inputSub_, *outV);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/L2DistanceLayer.h b/paddle/gserver/layers/L2DistanceLayer.h
new file mode 100644
index 0000000000..9b12847a10
--- /dev/null
+++ b/paddle/gserver/layers/L2DistanceLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief The layer calculates the l2 distance between two input vectors.
+ * \f[
+ * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)}
+ * \f]
+ *
+ * - Input1: A vector (batchSize * dataDim)
+ * - Input2: A vector (batchSize * dataDim)
+ * - Output: A vector (batchSize * 1)
+ *
+ * The configuration api is: l2_distance_layer.
+ */
+
+class L2DistanceLayer : public Layer {
+public:
+  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
+  ~L2DistanceLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  // Store the result of subtracting Input2 from Input1 in forward computation,
+  // which will be reused in backward computation.
+  MatrixPtr inputSub_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 01f2aae6cf..b55b86221c 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -98,6 +98,7 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
 LayerPtr Layer::create(const LayerConfig& config) {
   std::string type = config.type();
 
+#ifndef PADDLE_MOBILE_INFERENCE
   // NOTE: As following types have illegal character '-',
   // they can not use REGISTER_LAYER to registrar.
   // Besides, to fit with old training models,
@@ -106,7 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) {
     return LayerPtr(new MultiClassCrossEntropy(config));
   else if (type == "rank-cost")
     return LayerPtr(new RankingCost(config));
-#ifndef PADDLE_MOBILE_INFERENCE
   else if (type == "auc-validation")
     return LayerPtr(new AucValidation(config));
   else if (type == "pnpair-validation")
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
new file mode 100644
index 0000000000..39bffc26f7
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -0,0 +1,219 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNAddtoLayer.h"
+
+using namespace mkldnn;  // NOLINT
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
+
+bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  layerSize_ = getSize();
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input size must equal";
+  }
+  if (biasParameter_.get() != NULL) {
+    biases_ =
+        std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
+  }
+  return true;
+}
+
+void MKLDNNAddtoLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
+  reshapeInput(bs, ih, iw);
+  ic = inputLayers_[0]->getSize() / ih / iw;
+  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
+    CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
+  }
+
+  oc = ic;
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
+                                MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs, biasVal_, out);
+
+  std::shared_ptr<sum::primitive_desc> fwdPD;
+  std::shared_ptr<sum::primitive_desc> biasPD;
+  resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);
+
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
+}
+
+void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
+                                MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(inputs, biasGrad_, out);
+
+  // backward only need share output grad to input grad
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i] != nullptr) {
+      inputs[i] = out;
+      inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
+    }
+  }
+
+  // backward bias
+  bwdBias_ = nullptr;
+  if (biasGrad_) {
+    std::vector<float> scales(bs_, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(bs_,
+                                               biasGrad_->getPrimitiveDesc());
+    auto biasPD =
+        sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
+    std::vector<primitive::at> srcs;
+    for (size_t i = 0; i < grads_.size(); ++i) {
+      srcs.push_back(*(grads_[i]));
+    }
+    bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
+    pipeline.push_back(*bwdBias_);
+  }
+}
+
+void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
+                                   const MatrixPtr& biasMat,
+                                   const MKLDNNMatrixPtr& out,
+                                   std::vector<MKLDNNMatrixPtr>& outs) {
+  auto pd = MKLDNNMatrix::createPrimitiveDesc(
+      {(int)layerSize_}, memory::format::x, engine_);
+  bias = MKLDNNMatrix::create(pd, biasMat);
+  outs.clear();
+  real* data = out->getData();
+  CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
+  for (int i = 0; i < bs_; ++i) {
+    MatrixPtr tmp =
+        Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
+    outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
+  }
+}
+
+void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
+                                       MKLDNNMatrixPtr& out) {
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    resetInValue(inputs[i], nullptr, i);
+    CHECK(inputs[i]);
+    inputs[i]->downSpatial();
+  }
+  for (size_t i = 1; i < inputs.size(); i++) {
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
+  }
+
+  resetOutValue(out, inputs[0]->getPrimitiveDesc());
+
+  if (biases_ && biases_->getW()) {
+    prepareBias(bias, biases_->getW(), out, vals_);
+  } else {
+    bias = nullptr;
+  }
+}
+
+void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
+                                  std::shared_ptr<sum::primitive_desc>& biasPD,
+                                  std::vector<MKLDNNMatrixPtr>& inputs,
+                                  MKLDNNMatrixPtr bias,
+                                  MKLDNNMatrixPtr out) {
+  std::vector<float> scales(inputs.size(), 1.0);
+  std::vector<memory::primitive_desc> srcPDs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
+  }
+  CHECK(out);
+  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+
+  biasPD = nullptr;
+  if (bias) {
+    std::vector<float> scales(2, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
+    biasPD.reset(
+        new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
+    CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
+  }
+}
+
+void MKLDNNAddtoLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<sum::primitive_desc>& pd,
+    std::shared_ptr<sum::primitive_desc>& biasPD,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  std::vector<primitive::at> srcs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcs.push_back(*(inputs[i]));
+  }
+  fwd_.reset(new sum(*pd, srcs, *out));
+  pipeline.push_back(*fwd_);
+
+  fwdBias_.clear();
+  if (biasPD == nullptr || bias == nullptr) {
+    return;
+  }
+  fwdBias_.resize(vals_.size());
+  for (size_t i = 0; i < vals_.size(); ++i) {
+    std::vector<primitive::at> srcs;
+    srcs.push_back(*(vals_[i]));
+    srcs.push_back(*bias);
+    fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
+    pipeline.push_back(*fwdBias_[i]);
+  }
+}
+
+void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
+                                       MKLDNNMatrixPtr& out) {
+  CHECK(outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  CHECK(out);
+
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
+  }
+
+  if (biases_ && biases_->getWGrad()) {
+    prepareBias(bias, biases_->getWGrad(), out, grads_);
+  } else {
+    bias = nullptr;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
new file mode 100644
index 0000000000..0ea3e208e5
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer Addto layer.
+ *
+ * The config file api is mkldnn_addto
+ */
+class MKLDNNAddtoLayer : public MKLDNNLayer {
+protected:
+  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
+  size_t layerSize_;
+
+  std::unique_ptr<Weight> biases_;
+
+  // buffers for adding bias
+  std::vector<MKLDNNMatrixPtr> vals_;
+  std::vector<MKLDNNMatrixPtr> grads_;
+  // primitives for adding bias
+  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
+  std::shared_ptr<mkldnn::primitive> bwdBias_;
+
+public:
+  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNAddtoLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+protected:
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr bias,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+
+  void prepareBias(MKLDNNMatrixPtr& bias,
+                   const MatrixPtr& biasMat,
+                   const MKLDNNMatrixPtr& out,
+                   std::vector<MKLDNNMatrixPtr>& outs);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
index 9b0ae20f08..7faca0f8b7 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -21,8 +21,6 @@ namespace paddle {
 
 REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);
 
-const real MKLDNNBatchNormLayer::EPS = 1E-5;
-
 bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
                                 const ParameterMap& parameterMap) {
   if (!MKLDNNLayer::init(layerMap, parameterMap)) {
@@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
     useGlobalStats_ = config_.use_global_stats();
   }
   movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
+
   VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
                     << " --- global stats";
   VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
@@ -116,22 +116,20 @@ void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
 }
 
 void MKLDNNBatchNormLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
   oh = ih;
-  ow = ow;
+  ow = iw;
   // ic_ and oc can not be changed
-  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
       << "Input channel can not be changed";
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
-  printSizeInfo();
 }
 
 void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                     MKLDNNMatrixPtr& out) {
   // In training phase, it will always calculate mean and var,
   // so useGlobalStats must be false.
@@ -141,25 +139,23 @@ void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
     useGlobalStats_ = false;
   }
 
-  resetFwdBuffers(in, wgt, out);
+  resetFwdBuffers(inputs[0], wgtVal_, out);
 
-  resetFwdPD(fwdPD_, in, wgt, out);
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);
 
-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
 }
 
 void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                     MKLDNNMatrixPtr& out) {
   std::shared_ptr<bn_bwd::primitive_desc> pd;
 
-  resetBwdBuffers(in, wgt, out);
+  resetBwdBuffers(inputs[0], wgtGrad_, out);
 
-  resetBwdPD(pd, in, wgt, out);
+  resetBwdPD(pd, inputs[0], wgtGrad_, out);
 
-  resetBwdPipeline(pipeline, pd, in, wgt, out);
+  resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
 }
 
 void MKLDNNBatchNormLayer::forward(PassType passType) {
@@ -214,7 +210,7 @@ void MKLDNNBatchNormLayer::resetFwdPD(
   if (wgt) {
     flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
   }
-  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_);
+  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
   pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
   CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
   if (wgt) {
@@ -261,9 +257,9 @@ void MKLDNNBatchNormLayer::resetFwdPipeline(
 void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                            MKLDNNMatrixPtr& wgt,
                                            MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
   resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
   if (gradScaleShift_) {
     CHECK(wgtVal_);
     resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
@@ -281,7 +277,7 @@ void MKLDNNBatchNormLayer::resetBwdPD(
   }
   CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
   auto md = in->getMemoryDesc();
-  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
+  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
   pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
   CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
   CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
@@ -298,11 +294,12 @@ void MKLDNNBatchNormLayer::resetBwdPipeline(
   if (pd == nullptr) {
     return;
   }
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
   bwdData_.reset(
       wgt && wgtVal_
-          ? new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *wgtVal_, *in, *wgt)
-          : new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *in));
+          ? new bn_bwd(
+                *pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
+          : new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
   pipeline.push_back(*bwdData_);
 }
 
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
index 456c0424ec..1cf33cb34f 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
@@ -32,7 +32,8 @@ protected:
   std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
 
   // Epsilon value used in the batch normalization formula.
-  static const real EPS;
+  real epsilon_;
+
   // weight and bias in paddle
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;
@@ -73,18 +74,14 @@ public:
   void forward(PassType passType) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void updateWeights(const UpdateCallback& callback) override;
@@ -98,11 +95,7 @@ protected:
    * moving = moving * AvgFraction + local * (1 - AvgFraction)
    */
   void calMovingMeanAndVar();
-  /**
-   * Forward functions: reset buffers(input, weight, output),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
+
   void resetFwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& out);
@@ -115,12 +108,6 @@ protected:
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, weight, output),
-   *                     reset primitive descriptor,
-   *                     reset pipeline.
-   */
   void resetBwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& out);
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
new file mode 100644
index 0000000000..44bb0883b8
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
@@ -0,0 +1,185 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNConcatLayer.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
+
+bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  CHECK_GT(inputLayers_.size(), 1UL);
+  CHECK(!biasParameter_);
+  return true;
+}
+
+void MKLDNNConcatLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  ic = inputLayers_[0]->getSize() / ih / iw;
+  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
+  CHECK_GT(inputLayers_.size(), 1UL);
+  channels_.resize(inputLayers_.size());
+  channels_[0] = ic;
+  oc = ic;
+  for (size_t i = 1; i < inputLayers_.size(); i++) {
+    int batchsize, height, witdh;
+    reshapeInput(batchsize, height, witdh, i);
+    CHECK_EQ(bs, batchsize);
+    CHECK_EQ(ih, height);
+    CHECK_EQ(iw, witdh);
+
+    channels_[i] = inputLayers_[i]->getSize() / height / witdh;
+    CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize());
+    oc += channels_[i];
+  }
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
+                                 std::vector<MKLDNNMatrixPtr>& inputs,
+                                 MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs, out);
+
+  std::shared_ptr<concat::primitive_desc> fwdPD;
+  resetFwdPD(fwdPD, inputs, out);
+
+  resetFwdPipeline(pipeline, fwdPD, inputs, out);
+}
+
+void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
+                                 std::vector<MKLDNNMatrixPtr>& inputs,
+                                 MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(inputs, out);
+
+  resetBwdPipeline(pipeline, bwds_, inputs, out);
+}
+
+void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  inputs.resize(inputLayers_.size());
+  bool has8c = false, has16c = false, hasnc = false;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    resetInValue(inputs[i], nullptr, i, channels_[i]);
+    CHECK(inputs[i]);
+    auto dm = inputs[i]->getDims();
+    // inputs format can be different, but ndims must equal
+    CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
+    CHECK_EQ(bs_, dm[0]);
+    CHECK_EQ(channels_[i], dm[1]);
+    if (dm.size() > 2) {
+      CHECK_EQ(ih_, dm[2]);
+      CHECK_EQ(iw_, dm[3]);
+    }
+    if (inputs[i]->getFormat() == format::nc) {
+      hasnc = true;
+    }
+    if (inputs[i]->getFormat() == format::nChw8c) {
+      has8c = true;
+    }
+    if (inputs[i]->getFormat() == format::nChw16c) {
+      has16c = true;
+    }
+  }
+
+  format outFmt;
+  if (has16c && oc_ % 16 == 0) {
+    outFmt = format::nChw16c;
+  } else if (has8c && oc_ % 8 == 0) {
+    outFmt = format::nChw8c;
+  } else if (hasnc) {
+    CHECK(oh_ == 1 && ow_ == 1);
+    outFmt = format::nc;
+  } else {
+    outFmt = format::nchw;
+  }
+  memory::dims outDims =
+      hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
+  auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
+  resetOutValue(out, outPD);
+}
+
+void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
+                                   std::vector<MKLDNNMatrixPtr>& inputs,
+                                   MKLDNNMatrixPtr out) {
+  std::vector<memory::primitive_desc> srcPDs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
+  }
+  CHECK(out);
+  pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+}
+
+void MKLDNNConcatLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<concat::primitive_desc>& pd,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  std::vector<primitive::at> srcs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcs.push_back(*(inputs[i]));
+  }
+  fwd_.reset(new concat(*pd, srcs, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  CHECK(outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  CHECK(out);
+
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    CHECK(inVals_[i]);
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
+  }
+}
+
+void MKLDNNConcatLayer::resetBwdPipeline(
+    std::vector<mkldnn::primitive>& pipeline,
+    std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  // reset the backward primitives
+  memory::dims offsets = {0, 0, 0, 0};
+  prims.resize(inputs.size());
+  CHECK_EQ(inputs.size(), channels_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto viewPD = view::primitive_desc(
+        out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
+    auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
+                                         inputs[i]->getPrimitiveDesc());
+    prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
+    offsets[axis_] += channels_[i];
+    // push to pipeline
+    pipeline.push_back(*prims[i]);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h
new file mode 100644
index 0000000000..37f3a26c5e
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer Concatenate layer.
+ *
+ * The config file api is mkldnn_concat
+ */
+class MKLDNNConcatLayer : public MKLDNNLayer {
+protected:
+  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
+  // input channel numbers
+  std::vector<int> channels_;
+
+  // concat_dimension in MKLDNN
+  // if axis_ == 0, concat batchsize
+  // if axis_ == 1, concat channel (default)
+  int axis_;
+
+public:
+  explicit MKLDNNConcatLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), axis_(1) {}
+
+  ~MKLDNNConcatLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void printSizeInfo() override {
+    CHECK_EQ(channels_.size(), inputLayers_.size());
+    for (size_t i = 0; i < channels_.size(); ++i) {
+      VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
+                         << ": " << bs_ << ", " << channels_[i] << ", " << ih_
+                         << ", " << iw_;
+    }
+    VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
+                       << ", " << ow_;
+  }
+
+  size_t keepCondition() {
+    // reset when the total element size of all inputs changed
+    size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
+    for (size_t i = 1; i < inputLayers_.size(); ++i) {
+      totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
+    }
+    return totalSize;
+  }
+
+protected:
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index b8120eda1e..ab1d0f7b04 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -90,7 +90,7 @@ void MKLDNNConvLayer::convertWeightsToPaddle() {
 }
 
 void MKLDNNConvLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
 
   // cal output sizes
@@ -102,26 +102,20 @@ void MKLDNNConvLayer::reshape(
 
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
-
-  printSizeInfo();
 }
 
 void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
   resetFwdPD(fwdPD_);
 
-  resetFwdBuffers(fwdPD_, in, wgt, bias, out);
+  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 
-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 }
 
 void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
   std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
   std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
@@ -130,9 +124,10 @@ void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
 
   resetBwdDataPD(bwdDataPD);
 
-  resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
 
-  resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
 }
 
 void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
@@ -238,14 +233,14 @@ void MKLDNNConvLayer::resetBwdWgtPD(
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
 
   // create backward weight using input, output and weight value memory desc
-  CHECK(inVal_) << "Should have internal input value";
+  CHECK(inVals_[0]) << "Should have internal input value";
   CHECK(outVal_) << "Should have internal output value";
   CHECK(wgtVal_) << "Should have weight value";
   algorithm algo = algorithm::convolution_direct;
   padding_kind padKind = padding_kind::zero;
   auto bwdWgtDesc = biasVal_ != nullptr
                         ? conv_bwdWgt::desc(algo,
-                                            inVal_->getMemoryDesc(),
+                                            inVals_[0]->getMemoryDesc(),
                                             wgtVal_->getMemoryDesc(),
                                             biasVal_->getMemoryDesc(),
                                             outVal_->getMemoryDesc(),
@@ -254,7 +249,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
                                             padR,
                                             padKind)
                         : conv_bwdWgt::desc(algo,
-                                            inVal_->getMemoryDesc(),
+                                            inVals_[0]->getMemoryDesc(),
                                             wgtVal_->getMemoryDesc(),
                                             outVal_->getMemoryDesc(),
                                             strides,
@@ -262,7 +257,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
                                             padR,
                                             padKind);
   pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, pd->src_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
   CHECK_PRIMITIVE_DESC_EQ(
       outVal_,
       pd->diff_dst_primitive_desc(),
@@ -282,12 +277,12 @@ void MKLDNNConvLayer::resetBwdDataPD(
 
   memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-  CHECK(inVal_) << "Should have internal input value";
+  CHECK(inVals_[0]) << "Should have internal input value";
   CHECK(outVal_) << "Should have internal output value";
   // create backward data using input and output value memory desc
   // but using weight memory desc with any format
   auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
-                                        inVal_->getMemoryDesc(),
+                                        inVals_[0]->getMemoryDesc(),
                                         MKLDNNMatrix::createMemoryDesc(wgtDims),
                                         outVal_->getMemoryDesc(),
                                         strides,
@@ -296,7 +291,7 @@ void MKLDNNConvLayer::resetBwdDataPD(
                                         padding_kind::zero);
   pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
   CHECK_PRIMITIVE_DESC_EQ(
-      inVal_,
+      inVals_[0],
       pd->diff_src_primitive_desc(),
       "primitive desc of in value and grad should be equal");
   CHECK_PRIMITIVE_DESC_EQ(
@@ -348,12 +343,12 @@ void MKLDNNConvLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
   // add bwdWgt handle
   if (bias) {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias));
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
   } else {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt));
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
   }
   pipeline.push_back(*bwdWgt_);
 
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
index 1fed0e1c65..3e754a0e65 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -69,18 +69,14 @@ public:
             const ParameterMap& parameterMap) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void updateWeights(const UpdateCallback& callback) override;
@@ -92,7 +88,7 @@ public:
   void printSizeInfo() override {
     MKLDNNLayer::printSizeInfo();
     VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
-                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
                        << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
   }
 
@@ -107,48 +103,26 @@ protected:
                         mkldnn::memory::dims& padL,
                         mkldnn::memory::dims& padR);
 
-  /**
-   * reset the forward primitive descriptor.
-   */
   void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
-  /**
-   * reset the MKLDNNMatrix buffers used in forward.
-   */
   void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
                        MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  /**
-   * reset the forward pipeline.
-   */
   void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                         std::shared_ptr<conv_fwd::primitive_desc>& pd,
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
-
-  /**
-   * reset the backward weight primitive descriptor.
-   */
   void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
-  /**
-   * reset the backward data primitive descriptor.
-   */
   void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
-  /**
-   * reset the MKLDNNMatrix buffers used in backward.
-   */
   void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
                        MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  /**
-   * reset the backward pipeline.
-   */
   void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                         std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
                         std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index d82063a713..c8778bdd07 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -60,23 +60,21 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
   }
 
   CHECK(wgtVal_) << "should have been initialized";
-  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
+  auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
   wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
   hasInitedWgt_ = true;
 }
 
 void MKLDNNFcLayer::convertWeightsToPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
-  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
+  auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
   wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
 
 void MKLDNNFcLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
 
   CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
@@ -86,37 +84,32 @@ void MKLDNNFcLayer::reshape(
 
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc);
-
-  printSizeInfo();
 }
 
 void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
-                             MKLDNNMatrixPtr& in,
-                             MKLDNNMatrixPtr& wgt,
-                             MKLDNNMatrixPtr& bias,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
                              MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(in, wgt, bias, out);
+  resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);
 
-  resetFwdPD(fwdPD_, in, wgt, bias, out);
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 
-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 }
 
 void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
-                             MKLDNNMatrixPtr& in,
-                             MKLDNNMatrixPtr& wgt,
-                             MKLDNNMatrixPtr& bias,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
                              MKLDNNMatrixPtr& out) {
   std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
   std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
 
-  resetBwdBuffers(in, wgt, bias, out);
+  resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);
 
-  resetBwdWgtPD(bwdWgtPD, wgt, bias, out);
+  resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);
 
-  resetBwdDataPD(bwdDataPD, in, out);
+  resetBwdDataPD(bwdDataPD, inputs[0], out);
 
-  resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
 }
 
 void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
@@ -197,9 +190,9 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& wgt,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
   resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
 
   CHECK(wgtVal_);
   resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
@@ -216,14 +209,15 @@ void MKLDNNFcLayer::resetBwdWgtPD(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
-  fc_bwdWgt::desc bwdWgtDesc = bias ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                      wgt->getMemoryDesc(),
-                                                      bias->getMemoryDesc(),
-                                                      out->getMemoryDesc())
-                                    : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
-                                                      wgt->getMemoryDesc(),
-                                                      out->getMemoryDesc());
+  CHECK(inVals_[0]);
+  fc_bwdWgt::desc bwdWgtDesc =
+      bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             bias->getMemoryDesc(),
+                             out->getMemoryDesc())
+           : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             out->getMemoryDesc());
   pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
 }
 
@@ -249,11 +243,11 @@ void MKLDNNFcLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
   if (bias) {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
   } else {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt));
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
   }
   pipeline.push_back(*bwdWgt_);
 
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index ee861763ff..283dc9b540 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -52,18 +52,14 @@ public:
             const ParameterMap& parameterMap) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void updateWeights(const UpdateCallback& callback) override;
@@ -73,11 +69,6 @@ public:
   void convertWeightsToPaddle() override;
 
 protected:
-  /**
-   * Forward functions: reset buffers(input, output, weight and bias),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
   void resetFwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
@@ -93,13 +84,6 @@ protected:
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, output, weight and bias),
-   *                     reset primitive descriptor for backward weight,
-   *                     reset primitive descriptor for backward data,
-   *                     reset pipeline.
-   */
   void resetBwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
new file mode 100644
index 0000000000..741984bb68
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
@@ -0,0 +1,163 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNLRNLayer.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer);
+
+bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  /* the size of inputs for norm-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1UL);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  localSize_ = conf.size();
+  alpha_ = conf.scale();
+  beta_ = conf.pow();
+
+  ic_ = conf.channels();
+  oc_ = ic_;
+  iw_ = conf.img_size();
+  ow_ = conf.output_x();
+  ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  CHECK_EQ(iw_, ow_);
+  CHECK_EQ(ih_, oh_);
+  return true;
+}
+
+void MKLDNNLRNLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  reshapeInput(bs, ih, iw);
+  // ic_ and oc can not be changed
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
+      << "Input channel can not be changed";
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNLRNLayer::resetFwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs[0], out);
+
+  resetFwdPD(fwdPD_, inputs[0], out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetBwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  std::shared_ptr<lrn_bwd::primitive_desc> pd;
+
+  resetBwdBuffers(inputs[0], out);
+
+  resetBwdPD(pd, inputs[0], out);
+
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  CHECK(in);
+  resetOutValue(out, in->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr in,
+                                MKLDNNMatrixPtr out) {
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = lrn_fwd::desc(pk,
+                               algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_));
+  // prepare workspace if necessary
+  workspace_ =
+      passType_ != PASS_TEST
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
+
+void MKLDNNLRNLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  fwd_ = workspace_
+             ? std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *workspace_, *out))
+             : std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr& in,
+                                MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(out);
+  auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               out->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNLRNLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  if (pd == nullptr) {
+    return;
+  }
+  CHECK(inVals_[0]);
+  CHECK(workspace_);
+  bwdData_ = std::make_shared<lrn_bwd>(
+      lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h
new file mode 100644
index 0000000000..cfe5621252
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::lrn_forward lrn_fwd;
+typedef mkldnn::lrn_backward lrn_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer.
+ *
+ * The config file api is mkldnn_lrn
+ */
+class MKLDNNLRNLayer : public MKLDNNLayer {
+protected:
+  // save forward primitive_desc, which can be used in backward
+  std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_lrn_backward.cpp, lrn need workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+
+  int localSize_;
+  float alpha_, beta_;  // scale and pow in paddle
+
+public:
+  explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNLRNLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index 663a105098..6fbf3c7fde 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -21,8 +21,8 @@ namespace paddle {
 
 bool MKLDNNLayer::init(const LayerMap& layerMap,
                        const ParameterMap& parameterMap) {
-  CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                          << "Please set WITH_MKLDNN=ON "
+  CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
+                          << "Please set WITH_MKL=ON "
                           << "and set use_mkldnn=True";
   CHECK(!useGpu_) << "Do not support GPU yet";
 
@@ -48,40 +48,29 @@ void MKLDNNLayer::forward(PassType passType) {
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
     CHECK(!inputLayers_.empty());
     copySeqInfoToOutputs();
-    size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
-    if (inputElemenCnt_ != elemenCnt) {
+    if (condition_ != keepCondition()) {
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-      // reset when input total sizes changed, not only the batchsize
-      inputElemenCnt_ = elemenCnt;
-      pipelineFwd_.clear();
+      condition_ = keepCondition();
       reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-      // all cpu device output grad or value share output's
+      printSizeInfo();
+      // the output_.value and output_.grad are shared with CPU device
       shareCPUDevice();
-      resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-      // MKLDNNLayer output value should be MKLDNNMatrix
-      // so external output value is necessary.
-      // Then external input value is not necessary,
-      // since input may be mkldnn internal buffer.
-      CHECK(extOutVal_) << "external output value is necessary";
-      output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
-      CHECK(inVal_ && outVal_) << "internal memories are necessary";
-      if (cvtInVal_) {
-        pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
-      }
-      if (cvtOutVal_) {
-        pipelineFwd_.push_back(*cvtOutVal_);
-      }
+      pipelineFwd_.clear();
+      inVals_.resize(inputLayers_.size(), nullptr);
+      extInVals_.resize(inputLayers_.size(), nullptr);
+      cvtInVals_.resize(inputLayers_.size(), nullptr);
+      resetFwd(pipelineFwd_, inVals_, outVal_);
+      prepareValueConversions(pipelineFwd_);
       convertWeightsFromPaddle();
-      printSizeInfo();
       printValueFormat();
       needResetBwd_ = true;
     }
 
-    if (inputLayers_[0]->getType() == "data") {
+    if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
       // Update input value data when input layer is "data" type,
       // since the input value data address might be changed.
-      CHECK(extInVal_);
-      extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+      CHECK(extInVals_[0]);
+      extInVals_[0]->setData(getInputValue(0, CPU_DEVICE)->getData());
     }
 
     if (!outputOnlyMKLDNN_) {
@@ -99,22 +88,13 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
   if (needResetBwd_) {
     VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
     pipelineBwd_.clear();
+    inGrads_.resize(inputLayers_.size(), nullptr);
+    extInGrads_.resize(inputLayers_.size(), nullptr);
+    cvtInGrads_.resize(inputLayers_.size(), nullptr);
     pipelineMergeGrad_.clear();
     mergeGrad_ = nullptr;
-    resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
-    // external output grad is not necessary
-    // since output may be mkldnn internal buffer or merge them directly.
-    CHECK(outGrad_) << "internal output grad is necessary";
-    if (extOutGrad_) {
-      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
-          << "the external buffer should share the same data with output_.grad";
-    }
-    if (cvtOutGrad_) {
-      pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
-    }
-    if (cvtInGrad_) {
-      pipelineBwd_.push_back(*cvtInGrad_);
-    }
+    resetBwd(pipelineBwd_, inGrads_, outGrad_);
+    prepareGradConversions(pipelineBwd_);
     printGradFormat();
     needResetBwd_ = false;
   }
@@ -138,8 +118,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
   }
 }
 
-void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
-  const Argument& input = inputLayers_[0]->getOutput();
+void MKLDNNLayer::reshapeInput(int& batchsize,
+                               int& height,
+                               int& width,
+                               size_t idx) {
+  const Argument& input = inputLayers_[idx]->getOutput();
   batchsize = input.getBatchSize();
   int h = input.getFrameHeight();
   int w = input.getFrameWidth();
@@ -171,31 +154,32 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
 }
 
 void MKLDNNLayer::resetInValue(
-    MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
-  cvtInVal_ = nullptr;
-  extInVal_ = nullptr;
+    MKLDNNMatrixPtr& in,
+    const std::shared_ptr<memory::primitive_desc>& intPD,
+    size_t idx,
+    int inputChannel) {
+  cvtInVals_[idx] = nullptr;
+  extInVals_[idx] = nullptr;
   in = nullptr;
-  CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
+  inputChannel = inputChannel == 0 ? ic_ : inputChannel;
+  CHECK_GT(bs_ * inputChannel * ih_ * iw_, 0);
   auto extPD = MKLDNNMatrix::createPrimitiveDesc(
-      {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
-  in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-  CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
-  if (in == nullptr || in->getFormat() == format::nc) {
-    in = MKLDNNMatrix::create(extPD, inMat);
-  }
-  extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
-  if (in->getFormat() == format::nc) {
-    CHECK(ih_ == 1 && iw_ == 1);
+      {bs_, inputChannel, ih_, iw_}, format::nchw, engine_);
+  const MatrixPtr& inMat = inputLayers_[idx]->getOutputValue();
+  extInVals_[idx] = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), extInVals_[idx] != nullptr);
+  if (extInVals_[idx] == nullptr ||
+      extInVals_[idx]->getFormat() == format::nc) {
+    extInVals_[idx] = MKLDNNMatrix::create(extPD, inMat);
   }
+  in = extInVals_[idx];
   if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
     return;
   }
   // need create reorder
   in = MKLDNNMatrix::create(*intPD);
-  extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
-  cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
-  CHECK(cvtInVal_) << "should not be emptry";
+  cvtInVals_[idx] = MKLDNNMatrix::createReorder(extInVals_[idx], in);
+  CHECK(cvtInVals_[idx]) << "should not be emptry";
 }
 
 void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
@@ -216,11 +200,12 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
 }
 
 void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
-                              memory::primitive_desc intPD) {
-  cvtInGrad_ = nullptr;
-  extInGrad_ = nullptr;
+                              memory::primitive_desc intPD,
+                              size_t idx) {
+  cvtInGrads_[idx] = nullptr;
+  extInGrads_[idx] = nullptr;
   in = nullptr;
-  LayerPtr& input = inputLayers_[0];
+  LayerPtr& input = inputLayers_[idx];
   if (input->getOutputGrad() == nullptr) {
     // no need input grad
     return;
@@ -235,24 +220,25 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
   in = MKLDNNMatrix::create(intPD, inMat);
   Argument& arg = input->getOutput(this->getName());
   arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
   if (inputIsOnlyMKLDNN()) {
     return;
   }
 
-  extInGrad_ = in;
-  if (isPaddleFormat(extInGrad_->getFormat())) {
+  extInGrads_[idx] = in;
+  if (isPaddleFormat(extInGrads_[idx]->getFormat())) {
     return;
   }
   // need create reorder
-  // TODO(TJ): add macro definition to simplify it
-  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+  CHECK(extInVals_[idx] != nullptr &&
+        isPaddleFormat(extInVals_[idx]->getFormat()))
       << "should have external input value and the format must be nchw(nc)";
-  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
+  extInGrads_[idx] =
+      MKLDNNMatrix::create(extInVals_[idx]->getPrimitiveDesc(), inMat);
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
   in = MKLDNNMatrix::create(intPD);
-  cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
-  CHECK(cvtInGrad_);
+  cvtInGrads_[idx] = MKLDNNMatrix::createReorder(in, extInGrads_[idx]);
+  CHECK(cvtInGrads_[idx]);
 }
 
 void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
@@ -289,7 +275,7 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
     return;
   }
   CHECK(out) << "should have reset internal ouput grad";
-  std::vector<double> scales(outputMap_.size(), 1.0);
+  std::vector<float> scales(outputMap_.size(), 1.0);
   std::vector<memory::primitive_desc> srcPDs;
   std::vector<primitive::at> srcs;
   for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
@@ -308,22 +294,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
     srcs.push_back(*src);
   }
 
-  // TODO(TJ): remove me when mkldnn sum support different formats
-  for (size_t i = 1; i < srcPDs.size(); ++i) {
-    CHECK(srcPDs[0] == srcPDs[i]);
-  }
-  tmpOutGrad_ = out;
-  tmpCvt_ = nullptr;
-  if (out->getPrimitiveDesc() != srcPDs[0]) {
-    tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
-    tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
-    CHECK(tmpCvt_);
-    pipelineMergeGrad_.push_back(*tmpCvt_);
-  }
-
-  auto sumPD =
-      sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
-  mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
+  auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs);
+  mergeGrad_.reset(new sum(sumPD, srcs, *out));
   pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
 }
 
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 2c21a5b2aa..e48b9b5a91 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -34,15 +34,16 @@ typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
  */
 class MKLDNNLayer : public Layer {
 protected:
-  // input value element count
-  size_t inputElemenCnt_;
   // batch size
   int bs_;
+  // their sizes are always from the first input layer
   // input image channel, height and width
   int ic_, ih_, iw_;
   // output image channel, height and width
   int oc_, oh_, ow_;
 
+  // the condition that forward need be reset
+  size_t condition_;
   // backward also need reset after reset forward handle
   bool needResetBwd_;
 
@@ -67,18 +68,18 @@ protected:
    * When all layers are mkldnn layers, they could save internal data.
    */
   // below MKLDNNMatrix buffers are all internal buffers
-  MKLDNNMatrixPtr inVal_;
-  MKLDNNMatrixPtr inGrad_;
+  std::vector<MKLDNNMatrixPtr> inVals_;
+  std::vector<MKLDNNMatrixPtr> inGrads_;
   MKLDNNMatrixPtr outVal_;
   MKLDNNMatrixPtr outGrad_;
   // below are external value and grad
-  MKLDNNMatrixPtr extInVal_;
-  MKLDNNMatrixPtr extInGrad_;
+  std::vector<MKLDNNMatrixPtr> extInVals_;
+  std::vector<MKLDNNMatrixPtr> extInGrads_;
   MKLDNNMatrixPtr extOutVal_;
   MKLDNNMatrixPtr extOutGrad_;
   // convert handle between external and internal buffers
-  std::shared_ptr<mkldnn::reorder> cvtInVal_;
-  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
   std::shared_ptr<mkldnn::reorder> cvtOutVal_;
   std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
 
@@ -93,23 +94,11 @@ protected:
   std::vector<mkldnn::primitive> pipelineMergeGrad_;
   // tmp input argument to save input grad, only used to merge grad
   Argument tmpInArg_;
-  // since mkldnn sum do not support different formats:
-  // can refer to https://github.com/01org/mkl-dnn/issues/134
-  // so need create reorder manually and save tmp MKLDNNMatrix
-  MKLDNNMatrixPtr tmpOutGrad_;
-  std::shared_ptr<mkldnn::primitive> tmpCvt_;
 
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
-        inputElemenCnt_(0),
-        bs_(0),
-        ic_(0),
-        ih_(0),
-        iw_(0),
-        oc_(0),
-        oh_(0),
-        ow_(0),
+        condition_(0),
         needResetBwd_(true),
         outputOnlyMKLDNN_(false),
         engine_(mkldnn::engine::cpu, 0),
@@ -125,31 +114,28 @@ public:
   virtual void backward(const UpdateCallback& callback);
 
   /**
-   * reshape the input image sizes
-   * and reset output image and buffer size
-   * output channel can not be changed
+   * reshape the input and output channels and image sizes
+   * and reset output buffer size
    */
   virtual void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
 
   /**
    * reset the mkldnn forward primitve and memories
    * only would be called when input size changes
+   * weight and bias buffers should be coverd by child class itself
    */
   virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
                         MKLDNNMatrixPtr& out) = 0;
 
   /**
    * reset the mkldnn backward primitve and memories
    * only would be called when needed
+   * weight and bias buffers should be coverd by child class itself
    */
   virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
                         MKLDNNMatrixPtr& out) = 0;
 
   /**
@@ -175,10 +161,19 @@ public:
   void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
 
 protected:
+  /**
+   * Some layers may have different condition to reset the forward.
+   * The function returns the condition that do not need reset forward.
+   */
+  inline virtual size_t keepCondition() {
+    // reset when the first input element size changed, not only the batchsize
+    return inputLayers_[0]->getOutputValue()->getElementCnt();
+  }
+
   /**
    * reshape the input image sizes and input batchsize
    */
-  void reshapeInput(int& batchsize, int& height, int& width);
+  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
 
   /**
    * reshape output image sizes
@@ -196,10 +191,13 @@ protected:
   /**
    * reset input value from input MKLDNNMatrix and internal primitive desc.
    * reset both internal and external buffer and create reorder if necessary.
+   * input channel may be different in concat.
    */
   void resetInValue(
       MKLDNNMatrixPtr& in,
-      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
+      size_t idx = 0,
+      int inputChannel = 0);
 
   /**
    * reset output value from internal primitive desc.
@@ -212,7 +210,9 @@ protected:
    * reset input grad from internal primitive desc.
    * reset both internal and external buffer and create reorder if necessary.
    */
-  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
+  void resetInGrad(MKLDNNMatrixPtr& in,
+                   mkldnn::memory::primitive_desc intPD,
+                   size_t idx = 0);
 
   /**
    * reset output grad from internal primitive desc.
@@ -290,17 +290,19 @@ protected:
    * print the mkldnn memory format of value
    */
   virtual void printValueFormat() {
-    if (extInVal_) {
-      VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> ";
-    }
-    if (inVal_) {
-      VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>";
+    for (size_t i = 0; i < inVals_.size(); ++i) {
+      if (!inVals_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
+                                                  : inVals_[i]->getFormat())
+                        << " >>> " << inVals_[i]->getFormat() << " >>>";
     }
     if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
-    }
-    if (extOutVal_) {
-      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
+                        << (extOutVal_ ? extOutVal_->getFormat()
+                                       : outVal_->getFormat());
     }
     if (wgtVal_) {
       VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
@@ -314,17 +316,19 @@ protected:
    * print the mkldnn memory format of grad
    */
   virtual void printGradFormat() {
-    if (extOutGrad_) {
-      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
-    }
     if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
+                        << (extOutGrad_ ? extOutGrad_->getFormat()
+                                        : outGrad_->getFormat());
     }
-    if (inGrad_) {
-      VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<";
-    }
-    if (extInGrad_) {
-      VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< ";
+    for (size_t i = 0; i < inGrads_.size(); ++i) {
+      if (!inGrads_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
+                                                   : inGrads_[i]->getFormat())
+                        << " <<< " << inGrads_[i]->getFormat() << " <<<";
     }
     if (wgtGrad_) {
       VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
@@ -431,6 +435,41 @@ private:
       outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
     }
   }
+
+  void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // MKLDNNLayer output value should be MKLDNNMatrix
+    // so external output value is necessary.
+    // Then external input value is not necessary,
+    // since input may be mkldnn internal buffer.
+    CHECK(extOutVal_) << "external output value is necessary";
+    output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+    CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
+    for (size_t i = 0; i < cvtInVals_.size(); ++i) {
+      if (cvtInVals_[i]) {
+        pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
+      }
+    }
+    if (cvtOutVal_) {
+      pipeline.push_back(*cvtOutVal_);
+    }
+  }
+  void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // external output grad is not necessary
+    // since output may be mkldnn internal buffer or merge them directly.
+    CHECK(outGrad_) << "internal output grad is necessary";
+    if (extOutGrad_) {
+      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
+          << "the external buffer should share the same data with output_.grad";
+    }
+    if (cvtOutGrad_) {
+      pipeline.insert(pipeline.begin(), *cvtOutGrad_);
+    }
+    for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
+      if (cvtInGrads_[i]) {
+        pipeline.push_back(*cvtInGrads_[i]);
+      }
+    }
+  }
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index 6e89260f49..a8252593c8 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -58,10 +58,11 @@ bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
 }
 
 void MKLDNNPoolLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
   // ic_ and oc can not be changed
-  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
       << "Input channel can not be changed";
 
   // cal output sizes
@@ -71,34 +72,28 @@ void MKLDNNPoolLayer::reshape(
   reshapeOutput(oh, ow);
 
   resizeOutput(bs, oc * oh * ow);
-
-  printSizeInfo();
 }
 
 void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(in, out);
+  resetFwdBuffers(inputs[0], out);
 
-  resetFwdPD(fwdPD_, in, out);
+  resetFwdPD(fwdPD_, inputs[0], out);
 
-  resetFwdPipeline(pipeline, fwdPD_, in, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
 }
 
 void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
   std::shared_ptr<pool_bwd::primitive_desc> pd;
 
-  resetBwdBuffers(in, out);
+  resetBwdBuffers(inputs[0], out);
 
-  resetBwdPD(pd, in, out);
+  resetBwdPD(pd, inputs[0], out);
 
-  resetBwdPipeline(pipeline, pd, in, out);
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
 }
 
 void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
@@ -153,9 +148,9 @@ void MKLDNNPoolLayer::resetFwdPipeline(
 
 void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
   resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
 }
 
 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
index c5ec87828b..dad60156f0 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -53,18 +53,14 @@ public:
             const ParameterMap& parameterMap) override;
 
   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
 
   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
   void printSizeInfo() override {
@@ -75,11 +71,6 @@ public:
   }
 
 protected:
-  /**
-   * Forward functions: reset buffers(input, output),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
@@ -88,12 +79,6 @@ protected:
                         std::shared_ptr<pool_fwd::primitive_desc>& pd,
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& out);
-
-  /**
-   * Backward functions: reset buffers(input, output),
-   *                     reset primitive descriptor,
-   *                     reset pipeline.
-   */
   void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
   void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr& in,
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
new file mode 100644
index 0000000000..d810a58d9a
--- /dev/null
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MaxPoolWithMaskLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  PoolLayer::init(layerMap, parameterMap);
+  setOutput("mask", &mask_);
+  return true;
+}
+
+size_t MaxPoolWithMaskLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t layerSize = 0;
+
+  outputY_ = outputSize(imgSizeY_,
+                        sizeY_,
+                        confPaddingY_,
+                        strideY_,
+                        /* caffeMode */ false);
+  outputX_ = outputSize(imgSize_,
+                        sizeX_,
+                        confPadding_,
+                        stride_,
+                        /* caffeMode */ false);
+
+  layerSize = outputX_ * outputY_ * channels_;
+  getOutput().setFrameHeight(outputY_);
+  getOutput().setFrameWidth(outputX_);
+
+  return layerSize;
+}
+
+void MaxPoolWithMaskLayer::forward(PassType passType) {
+  size_t size = getSize();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  int batchSize = inputV->getHeight();
+  resetOutput(batchSize, size);
+
+  MatrixPtr outV = getOutputValue();
+  CHECK_EQ(size, outV->getWidth());
+
+  resetSpecifyOutput(mask_,
+                     batchSize,
+                     size,
+                     /* isValueClean */ false,
+                     /* isGradClean */ true);
+
+  MatrixPtr maskV = mask_.value;
+  outV->maxPoolForward(*inputV,
+                       imgSizeY_,
+                       imgSize_,
+                       channels_,
+                       sizeX_,
+                       sizeY_,
+                       strideY_,
+                       stride_,
+                       outputY_,
+                       outputX_,
+                       confPaddingY_,
+                       confPadding_,
+                       maskV);
+}
+
+void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  if (NULL == getInputGrad(0)) {
+    return;
+  }
+
+  MatrixPtr outGrad = getOutputGrad();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
+
+  inputGrad->maxPoolBackward(*inputV,
+                             imgSizeY_,
+                             imgSize_,
+                             *outGrad,
+                             *outV,
+                             sizeX_,
+                             sizeY_,
+                             strideY_,
+                             stride_,
+                             outputY_,
+                             outputX_,
+                             1,
+                             1,
+                             confPaddingY_,
+                             confPadding_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
new file mode 100644
index 0000000000..e0174add9d
--- /dev/null
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "PoolLayer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief Basic parent layer of different kinds of pooling
+ */
+class MaxPoolWithMaskLayer : public PoolLayer {
+protected:
+  Argument mask_;
+
+public:
+  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
+      : PoolLayer(config) {}
+
+  size_t getSize();
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 7b932d5a76..fceb389d06 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PoolLayer.h"
+#include "MaxPoolWithMaskLayer.h"
 #include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
 #ifdef PADDLE_WITH_CUDA
@@ -45,6 +46,7 @@ bool PoolLayer::init(const LayerMap& layerMap,
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
 
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
   return true;
 }
 
@@ -57,6 +59,8 @@ Layer* PoolLayer::create(const LayerConfig& config) {
   } else if (CudnnPoolLayer::typeCheck(pool)) {
     return new CudnnPoolLayer(config);
 #endif
+  } else if (pool == "max-pool-with-mask") {
+    return new MaxPoolWithMaskLayer(config);
   } else {
     LOG(FATAL) << "Unknown pool type: " << pool;
     return nullptr;
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index d43292ad2d..9df672a935 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -38,6 +38,8 @@ protected:
 
   std::string poolType_;
 
+  bool excludeMode_;
+
 public:
   explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
 
diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp
index d90b438448..6a9de394ce 100644
--- a/paddle/gserver/layers/PoolProjection.cpp
+++ b/paddle/gserver/layers/PoolProjection.cpp
@@ -36,6 +36,8 @@ PoolProjection::PoolProjection(const ProjectionConfig& config,
   strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
 }
 
 size_t PoolProjection::getSize() {
@@ -141,7 +143,8 @@ void AvgPoolProjection::forward() {
                        outputY_,
                        outputX_,
                        confPaddingY_,
-                       confPadding_);
+                       confPadding_,
+                       excludeMode_);
 }
 
 void AvgPoolProjection::backward(const UpdateCallback& callback) {
@@ -166,6 +169,7 @@ void AvgPoolProjection::backward(const UpdateCallback& callback) {
                              1,
                              1,
                              confPaddingY_,
-                             confPadding_);
+                             confPadding_,
+                             excludeMode_);
 }
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
index 9a75f465f6..a0412714bc 100644
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
@@ -28,6 +28,7 @@ protected:
   int confPaddingY_, confPadding_;
   size_t channels_;
   std::string poolType_;
+  bool excludeMode_;
 
 public:
   PoolProjection(const ProjectionConfig& config,
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
new file mode 100644
index 0000000000..2c8256b91c
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ROIPoolLayer.h"
+#include <cfloat>
+
+namespace paddle {
+
+REGISTER_LAYER(roi_pool, ROIPoolLayer);
+
+bool ROIPoolLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+  pooledWidth_ = layerConf.pooled_width();
+  pooledHeight_ = layerConf.pooled_height();
+  spatialScale_ = layerConf.spatial_scale();
+
+  return true;
+}
+
+void ROIPoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+  height_ = getInput(0).getFrameHeight();
+  if (!height_) height_ = layerConf.height();
+  width_ = getInput(0).getFrameWidth();
+  if (!width_) width_ = layerConf.width();
+  channels_ = getInputValue(0)->getWidth() / width_ / height_;
+
+  size_t batchSize = getInput(0).getBatchSize();
+  size_t numROIs = getInput(1).getBatchSize();
+
+  MatrixPtr dataValue = getInputValue(0);
+  MatrixPtr roiValue = getInputValue(1);
+  resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
+  MatrixPtr outputValue = getOutputValue();
+
+  if (useGpu_) {  // TODO(guosheng): implement on GPU later
+    MatrixPtr dataCpuBuffer;
+    Matrix::resizeOrCreate(dataCpuBuffer,
+                           dataValue->getHeight(),
+                           dataValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr roiCpuBuffer;
+    Matrix::resizeOrCreate(roiCpuBuffer,
+                           roiValue->getHeight(),
+                           roiValue->getWidth(),
+                           false,
+                           false);
+    dataCpuBuffer->copyFrom(*dataValue);
+    roiCpuBuffer->copyFrom(*roiValue);
+    dataValue = dataCpuBuffer;
+    roiValue = roiCpuBuffer;
+    MatrixPtr outputCpuBuffer;
+    Matrix::resizeOrCreate(outputCpuBuffer,
+                           outputValue->getHeight(),
+                           outputValue->getWidth(),
+                           false,
+                           false);
+    outputCpuBuffer->copyFrom(*outputValue);
+    outputValue = outputCpuBuffer;
+  }
+
+  real* bottomData = dataValue->getData();
+  size_t batchOffset = dataValue->getWidth();
+  size_t channelOffset = height_ * width_;
+  real* bottomROIs = roiValue->getData();
+  size_t roiOffset = roiValue->getWidth();
+  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+
+  real* outputData = outputValue->getData();
+  Matrix::resizeOrCreate(maxIdxs_,
+                         numROIs,
+                         channels_ * pooledHeight_ * pooledWidth_,
+                         false,
+                         false);
+  real* argmaxData = maxIdxs_->getData();
+
+  for (size_t n = 0; n < numROIs; ++n) {
+    // the first five elememts of each RoI should be:
+    // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end
+    size_t roiBatchIdx = bottomROIs[0];
+    size_t roiStartW = round(bottomROIs[1] * spatialScale_);
+    size_t roiStartH = round(bottomROIs[2] * spatialScale_);
+    size_t roiEndW = round(bottomROIs[3] * spatialScale_);
+    size_t roiEndH = round(bottomROIs[4] * spatialScale_);
+    CHECK_GE(roiBatchIdx, 0UL);
+    CHECK_LT(roiBatchIdx, batchSize);
+    size_t roiHeight =
+        std::max(roiEndH - roiStartH + 1, static_cast<size_t>(1));
+    size_t roiWidth = std::max(roiEndW - roiStartW + 1, static_cast<size_t>(1));
+    real binSizeH =
+        static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
+    real binSizeW =
+        static_cast<real>(roiWidth) / static_cast<real>(pooledWidth_);
+    real* batchData = bottomData + batchOffset * roiBatchIdx;
+    for (size_t c = 0; c < channels_; ++c) {
+      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+          size_t hstart = static_cast<size_t>(std::floor(ph * binSizeH));
+          size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
+          size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
+          size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
+          hstart = std::min(
+              std::max(hstart + roiStartH, static_cast<size_t>(0)), height_);
+          wstart = std::min(
+              std::max(wstart + roiStartW, static_cast<size_t>(0)), width_);
+          hend = std::min(std::max(hend + roiStartH, static_cast<size_t>(0)),
+                          height_);
+          wend = std::min(std::max(wend + roiStartW, static_cast<size_t>(0)),
+                          width_);
+
+          bool isEmpty = (hend <= hstart) || (wend <= wstart);
+          size_t poolIndex = ph * pooledWidth_ + pw;
+          outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
+          argmaxData[poolIndex] = -1;
+
+          for (size_t h = hstart; h < hend; ++h) {
+            for (size_t w = wstart; w < wend; ++w) {
+              size_t index = h * width_ + w;
+              if (batchData[index] > outputData[poolIndex]) {
+                outputData[poolIndex] = batchData[index];
+                argmaxData[poolIndex] = index;
+              }
+            }
+          }
+        }
+      }
+      batchData += channelOffset;
+      outputData += poolChannelOffset;
+      argmaxData += poolChannelOffset;
+    }
+    bottomROIs += roiOffset;
+  }
+  if (useGpu_) {
+    getOutputValue()->copyFrom(*outputValue);
+  }
+}
+
+void ROIPoolLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inGradValue = getInputGrad(0);
+  MatrixPtr outGradValue = getOutputGrad();
+  MatrixPtr roiValue = getInputValue(1);
+
+  if (useGpu_) {
+    MatrixPtr inGradCpuBuffer;
+    Matrix::resizeOrCreate(inGradCpuBuffer,
+                           inGradValue->getHeight(),
+                           inGradValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr outGradCpuBuffer;
+    Matrix::resizeOrCreate(outGradCpuBuffer,
+                           outGradValue->getHeight(),
+                           outGradValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr roiCpuBuffer;
+    Matrix::resizeOrCreate(roiCpuBuffer,
+                           roiValue->getHeight(),
+                           roiValue->getWidth(),
+                           false,
+                           false);
+    inGradCpuBuffer->copyFrom(*inGradValue);
+    outGradCpuBuffer->copyFrom(*outGradValue);
+    roiCpuBuffer->copyFrom(*roiValue);
+    inGradValue = inGradCpuBuffer;
+    outGradValue = outGradCpuBuffer;
+    roiValue = roiCpuBuffer;
+  }
+
+  real* bottomROIs = roiValue->getData();
+  size_t numROIs = getInput(1).getBatchSize();
+  size_t roiOffset = getInputValue(1)->getWidth();
+
+  real* inDiffData = inGradValue->getData();
+  size_t batchOffset = getInputValue(0)->getWidth();
+  size_t channelOffset = height_ * width_;
+
+  real* outDiffData = outGradValue->getData();
+  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+  real* argmaxData = maxIdxs_->getData();
+
+  for (size_t n = 0; n < numROIs; ++n) {
+    size_t roiBatchIdx = bottomROIs[0];
+    real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
+    for (size_t c = 0; c < channels_; ++c) {
+      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+          size_t poolIndex = ph * pooledWidth_ + pw;
+          if (argmaxData[poolIndex] > 0) {
+            size_t index = static_cast<size_t>(argmaxData[poolIndex]);
+            batchDiffData[index] += outDiffData[poolIndex];
+          }
+        }
+      }
+      batchDiffData += channelOffset;
+      outDiffData += poolChannelOffset;
+      argmaxData += poolChannelOffset;
+    }
+    bottomROIs += roiOffset;
+  }
+
+  if (useGpu_) {
+    getInputGrad(0)->copyFrom(*inGradValue);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
new file mode 100644
index 0000000000..4f07e49d6f
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+ * feature map.
+ * - Input: This layer needs two input layers: The first input layer is a
+ *          convolution layer; The second input layer contains the ROI data
+ *          which is the output of ProposalLayer in Faster R-CNN. layers for
+ *          generating bbox location offset and the classification confidence.
+ * - Output: The ROIs' feature map.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ * Networks
+ */
+
+class ROIPoolLayer : public Layer {
+protected:
+  size_t channels_;
+  size_t width_;
+  size_t height_;
+  size_t pooledWidth_;
+  size_t pooledHeight_;
+  real spatialScale_;
+
+  // Since there is no int matrix, use real maxtrix instead.
+  MatrixPtr maxIdxs_;
+
+public:
+  explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
new file mode 100644
index 0000000000..aa6778aef4
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
+
+bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(static_cast<int>(inputLayers_.size()), 2);
+  auto& conf = config_.inputs(0).scale_sub_region_conf();
+  value_ = conf.value();
+
+  createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
+  createFunction(
+      backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
+
+  return true;
+}
+
+void ScaleSubRegionLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto in0 = getInput(0);
+  imgH_ = in0.getFrameHeight();
+  imgW_ = in0.getFrameWidth();
+  if (imgH_ == 0 || imgW_ == 0) {
+    auto& conf = config_.inputs(0).scale_sub_region_conf();
+    imgH_ = conf.image_conf().img_size_y();
+    imgW_ = conf.image_conf().img_size();
+  }
+  MatrixPtr imgV = in0.value;
+  size_t batchSize = imgV->getHeight();
+  size_t spatialSize = imgH_ * imgW_;
+  channelsNum_ = imgV->getWidth() / spatialSize;
+  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
+
+  resetOutput(batchSize, imgV->getWidth());
+  auto& out = getOutput();
+  out.setFrameHeight(imgH_);
+  out.setFrameWidth(imgW_);
+
+  MatrixPtr indicesV = getInputValue(1);
+  indicesShape_ = TensorShape({batchSize, 6});
+
+  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
+  BufferArgs inArgs;
+  BufferArgs outArgs;
+  inArgs.addArg(*imgV, shape_);
+  inArgs.addArg(*indicesV, indicesShape_);
+  outArgs.addArg(*out.value, shape_, ASSIGN_TO);
+  forward_[0]->calc(inArgs, outArgs);
+}
+
+void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
+  REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
+  BufferArgs inArgs;
+  BufferArgs outArgs;
+  inArgs.addArg(*getOutputGrad(), shape_);
+  inArgs.addArg(*getInputValue(1), indicesShape_);
+  outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
+  backward_[0]->calc(inArgs, outArgs);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
new file mode 100644
index 0000000000..a27c56de93
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  For each instance, this layer can be used to multiply a value to a
+ *         specified sub continuous region. By providing start index and end
+ *         index for C/H/W, you can specify the location and shape of the
+ *         region.
+ *
+ *         input_0: Input value.
+ *         input_1: Indices value to specify the location an shape of the
+ *                  region.
+ */
+class ScaleSubRegionLayer : public Layer {
+public:
+  explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ScaleSubRegionLayer() {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr);
+
+protected:
+  TensorShape shape_;
+  TensorShape indicesShape_;
+  size_t imgH_;
+  size_t imgW_;
+  size_t channelsNum_;
+  real value_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index 19b7ad1869..00d8ce017a 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) {
   CHECK_EQ(numSequences2, numSequences3);
 
   MatrixPtr inputValue = input.value;
-  IVectorPtr offsetValue = offsetSeq.ids;
-  IVectorPtr sizeValue = sizeSeq.ids;
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
 
   CHECK_EQ(offsetValue->getSize(), numSequences1);
   CHECK_EQ(sizeValue->getSize(), numSequences1);
@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) {
   size_t numSequences1 = startPositions1->getSize() - 1;
   const int* starts1 = startPositions1->getData();
 
-  IVectorPtr offsetValue = getInput(1).ids;
-  IVectorPtr sizeValue = getInput(2).ids;
+  const Argument& offsetSeq = getInput(1);
+  const Argument& sizeSeq = getInput(2);
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
 
   int* offsets = offsetValue->getData();
   int* sizes = sizeValue->getData();
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index aa94ee406e..b578a906c2 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -1,9 +1,11 @@
 # gserver pacakge unittests
-
 add_simple_unittest(test_LinearChainCRF)
-add_simple_unittest(test_MultinomialSampler)
 add_simple_unittest(test_RecurrentLayer)
 
+if(NOT MOBILE_INFERENCE)
+  add_simple_unittest(test_MultinomialSampler)
+endif()
+
 function(gserver_test TARGET)
   add_unittest_without_exec(${TARGET}
       ${TARGET}.cpp
@@ -24,85 +26,72 @@ gserver_test(test_ConvUnify)
 gserver_test(test_BatchNorm)
 gserver_test(test_KmaxSeqScore)
 gserver_test(test_Expand)
+gserver_test(test_MaxPoolingWithMaskOutput)
+
+set(PYTHON_PATH 
+   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
+   ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
+function(gserver_test_with_python TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
+
+gserver_test_with_python(test_PyDataProvider2)
+if(WITH_PYTHON)
+    gserver_test_with_python(test_PyDataProvider)
+endif()
+if(NOT MOBILE_INFERENCE)
+    gserver_test_with_python(test_CompareTwoNets)
+    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
+    gserver_test_with_python(test_RecurrentGradientMachine)
+endif()
 
-########## test_Mkldnn layers and activations ##########
+########## test_MKLDNN layers and activations ##########
 if(WITH_MKLDNN)
     add_unittest_without_exec(test_MKLDNN
         test_MKLDNN.cpp
         MKLDNNTester.cpp
         LayerGradUtil.cpp)
     add_test(NAME test_MKLDNN
-        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
-            ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
             WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
-############## test_PyDataProvider ########################
-if(WITH_PYTHON)
-    add_unittest_without_exec(test_PyDataProvider
-        test_PyDataProvider.cpp)
-
-    add_test(NAME test_PyDataProvider
-        COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-endif()
-
 ############### test_WarpCTCLayer #######################
-if(NOT WITH_DOUBLE)
+if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
     add_unittest_without_exec(test_WarpCTCLayer
         test_WarpCTCLayer.cpp)
-
     add_test(NAME test_WarpCTCLayer
         COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
         WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
 if(NOT MOBILE_INFERENCE)
-################### test_ProtoDataProvider ############
-    add_unittest_without_exec(test_ProtoDataProvider
-        test_ProtoDataProvider.cpp)
-
-    # test_ProtoDataProvider will mkdir as same name,
-    # so if WORKING_DIRECTORY is default directory, then
-    # mkdir will get error.
-    add_test(NAME test_ProtoDataProvider
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-
-################## test_Evaluator #######################
+    ################## test_Evaluator #############
     add_unittest(test_Evaluator
         test_Evaluator.cpp)
       
-############### test_RecurrentGradientMachine ###############
-    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
-    # I will fix it.
-    add_unittest_without_exec(test_RecurrentGradientMachine
-        test_RecurrentGradientMachine.cpp)
-    add_test(NAME test_RecurrentGradientMachine
-        COMMAND .set_python_path.sh -d
-                ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
-                ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-      
-############### test_NetworkCompare ###############
+    ########### test_NetworkCompare ###############
     add_unittest_without_exec(test_NetworkCompare
         test_NetworkCompare.cpp)
     if(WITH_GPU)
-        add_test(NAME test_NetworkCompare
-            COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+        set(use_gpu true)
     else()
-        add_test(NAME test_NetworkCompare
-            COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+        set(use_gpu false)
     endif()
-endif()
-
-
-add_unittest_without_exec(test_PyDataProvider2
-        test_PyDataProvider2.cpp)
+    add_test(NAME test_NetworkCompare
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 
-add_test(NAME test_PyDataProvider2
-   COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
-)
+    ############ test_CompareSparse ################
+    add_unittest_without_exec(test_CompareSparse
+        test_CompareSparse.cpp)
+    if(NOT ON_TRAVIS)
+      add_test(NAME test_CompareSparse
+        COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
+                ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+    endif()
+endif()
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index 7670cb88fb..afe1608eab 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -132,7 +132,7 @@ void MKLDNNTester::checkForward() {
   VLOG(MKLDNN_TESTS) << "Check Forward";
   printTopDatas();
   double delta =
-      compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
+      compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
   EXPECT_LE(fabs(delta), eps_);
 }
 
@@ -147,7 +147,7 @@ void MKLDNNTester::checkBackwardData() {
     VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
     printMatrix(refDiff);
 
-    double delta = compareMatrix(dnnDiff, refDiff);
+    double delta = compareMatrix(refDiff, dnnDiff);
     EXPECT_LE(fabs(delta), eps_);
     if (isBN) {
       // the other two inputs in batch norm are for moving mean and var
@@ -177,7 +177,7 @@ void MKLDNNTester::checkBackwardWgts() {
                      << parameters_[REF][i]->getName();
     printVector(ref);
 
-    double delta = compareVector(dnn, ref);
+    double delta = compareVector(ref, dnn);
     EXPECT_LE(fabs(delta), eps_);
   }
 
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index ca55a45bc7..9d61533c0b 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 
 /**
- * @brief test the functionality of Mkldnnlayers
+ * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
  * refer to paddle original function
  */
 class MKLDNNTester {
diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/gserver/tests/mkldnn_simple_net.conf
index 8bbe91e56d..0e9d6b31fa 100644
--- a/paddle/gserver/tests/mkldnn_simple_net.conf
+++ b/paddle/gserver/tests/mkldnn_simple_net.conf
@@ -51,6 +51,8 @@ tmp = img_pool_layer(input=tmp,
             padding=1,
             pool_type=MaxPooling())
 
+tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75)
+
 tmp = fc_layer(input=tmp,
             size=channels,
             bias_attr=False,
diff --git a/paddle/gserver/tests/proto_files.txt b/paddle/gserver/tests/proto_files.txt
deleted file mode 100644
index 691b38c794..0000000000
--- a/paddle/gserver/tests/proto_files.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin
-./test_ProtoDataProvider/data2.bin
diff --git a/paddle/gserver/tests/proto_files_compressed.txt b/paddle/gserver/tests/proto_files_compressed.txt
deleted file mode 100644
index 7413c81e18..0000000000
--- a/paddle/gserver/tests/proto_files_compressed.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin.gz
-./test_ProtoDataProvider/data2.bin.gz
diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf
new file mode 100644
index 0000000000..f49a827f22
--- /dev/null
+++ b/paddle/gserver/tests/sequence_lstm.conf
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 256
+label_dim = 3
+sparse_update = get_config_arg("sparse_update", bool, False)
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data,
+    size=word_dim,
+    param_attr=ParamAttr(sparse_update=sparse_update))
+
+with mixed_layer(size=hidden_dim * 4) as lstm_input:
+    lstm_input += full_matrix_projection(input=emb)
+
+lstm = lstmemory(
+    input=lstm_input,
+    act=TanhActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=TanhActivation())
+
+lstm_last = last_seq(input=lstm)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py
new file mode 100644
index 0000000000..4895df186b
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent with sequence_recurrent_group.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py
new file mode 100644
index 0000000000..a1d54542e3
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent_group.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent with sequence_recurrent.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    with mixed_layer(
+            name="rnn_state",
+            size=hidden_dim,
+            bias_attr=False,
+            act=SoftmaxActivation()) as out:
+        out += identity_projection(input=y)
+        out += full_matrix_projection(
+            input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
+    return out
+
+
+recurrent = recurrent_group(name="rnn", step=step, input=emb)
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
index e2635b4400..59e8c91733 100644
--- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
@@ -41,7 +41,7 @@ nonseq = embedding_layer(input=label, size=word_dim)
 
 
 # This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
+# sequence_rnn_mixed_inputs.conf
 def outer_step(subseq, seq, nonseq, encoding):
     outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
 
diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
index 84a66e2944..6fe9dca6e2 100644
--- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
@@ -37,7 +37,7 @@ encoding = embedding_layer(input=data2, size=word_dim)
 
 
 # This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
+# sequence_rnn_matched_inputs.conf
 def outer_step(subseq, seq, nonseq, encoding):
     outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
 
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp
similarity index 98%
rename from paddle/trainer/tests/test_CompareSparse.cpp
rename to paddle/gserver/tests/test_CompareSparse.cpp
index 5f1834bd73..c6e07650fc 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
@@ -22,8 +22,7 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-static const string& configFile1 =
-    "trainer/tests/sample_trainer_config_compare_sparse.conf";
+static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
 
 DECLARE_bool(use_gpu);
 DECLARE_string(config);
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp
similarity index 95%
rename from paddle/trainer/tests/test_CompareTwoNets.cpp
rename to paddle/gserver/tests/test_CompareTwoNets.cpp
index 94f65e545d..801d960756 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/gserver/tests/test_CompareTwoNets.cpp
@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
 DECLARE_string(config);
 DECLARE_string(nics);
 
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
 DEFINE_bool(need_high_accuracy,
             false,
             "whether need to run in double accuracy");
@@ -42,6 +40,10 @@ DEFINE_double(
 DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_int32(seed);
 
+static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
+static const string& config_file_b =
+    "gserver/tests/sequence_recurrent_group.py";
+
 struct ComData {
   vector<Argument> outArgs;
   vector<ParameterPtr> parameters;
@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
   DataBatch dataBatch;
   int32_t batchSize = trainer.getConfig().opt_config().batch_size();
 
+  trainer.getDataProvider()->reset();
   trainer.getDataProvider()->setSkipShuffle();
   trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
 
@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
 
 TEST(Trainer, create) {
   ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
+  calcGradient(dataA, config_file_a);
   LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
 
   ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
+  calcGradient(dataB, config_file_b);
   LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
 
   compareGradient(dataA, dataB);
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 1a46fb4915..a2f07937b8 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -53,7 +53,7 @@ TEST(Operator, dot_mul) {
 TEST(Projection, context) {
   for (auto contextStart : {-5, -3, -1, 0, 3}) {
     for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto batchSize : {1, 2, 5, 20, 50}) {
+      for (auto batchSize : {1, 2, 5, 20}) {
         for (auto trainablePadding : {false, true}) {
           LOG(INFO) << " contextStart=" << contextStart
                     << " contextLength=" << contextLength
@@ -238,9 +238,24 @@ void testProjectionConv(size_t groups, bool isDeconv) {
                             /* caffeMode */ true);
   conv->set_output_x(output_x);
   conv->set_output_y(output_y);
+  LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x
+            << "; output_y: " << output_y;
   if (isDeconv) {
+    int deconv_image_x = imageSize(output_x,
+                                   (conv->filter_size() - 1) * DILATION + 1,
+                                   conv->padding(),
+                                   conv->stride(),
+                                   /* caffeMode */ true);
+    int deconv_image_y = imageSize(output_y,
+                                   (conv->filter_size_y() - 1) * DILATION + 1,
+                                   conv->padding_y(),
+                                   conv->stride_y(),
+                                   /* caffeMode */ true);
+
+    LOG(INFO) << " deconv_image_x: " << deconv_image_x
+              << "; deconv_image_y: " << deconv_image_y;
     conf.set_input_size(output_x * output_y * CHANNELS);
-    conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS);
+    conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS);
   } else {
     conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
     conf.set_output_size(output_x * output_y * NUM_FILTERS);
@@ -434,7 +449,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
   config.layerConfig.set_partial_sum(1);
   config.layerConfig.set_shared_biases(true);
 
-  int dilation = 1;
+  int dilation = 2;
   if (type == "cudnn_conv") {
 #if CUDNN_VERSION >= 6000
     dilation = 2;
@@ -583,16 +598,17 @@ TEST(Layer, maxoutLayer) {
     testLayerGrad(config, "maxout", 10, false, useGpu);
   }
 }
+
 void testFcLayer(string format, size_t nnz) {
   TestConfig config;
-  config.biasSize = 4096;
+  config.biasSize = 1024;
   config.layerConfig.set_type("fc");
-  config.layerConfig.set_size(4096);
+  config.layerConfig.set_size(1024);
   config.layerConfig.set_active_type("sigmoid");
   config.layerConfig.set_drop_rate(0.1);
 
   config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+      {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
   config.layerConfig.add_inputs();
 
   LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
@@ -609,9 +625,9 @@ void testFcLayer(string format, size_t nnz) {
 }
 
 TEST(Layer, fcLayer) {
-  testFcLayer("", 4096 * 4096 * 2);
-  testFcLayer("csc", 4096 * 40);
-  testFcLayer("csr", 4096 * 40);
+  testFcLayer("", 1024 * 1024 * 2);
+  testFcLayer("csc", 1024 * 10);
+  testFcLayer("csr", 1024 * 10);
 }
 
 TEST(Layer, SelectiveFullyConnectedLayer) {
@@ -680,12 +696,13 @@ TEST(Layer, hsigmoidLayer) {
   config.layerConfig.add_inputs();
   config.layerConfig.add_inputs();
 
-  // Not support GPU now
-  testLayerGrad(config,
-                "hsigmoid",
-                100,
-                /* trans */ false, /* useGpu */
-                false);
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "hsigmoid",
+                  100,
+                  /* trans */ false,
+                  /* useGpu */ useGpu);
+  }
 }
 
 TEST(Layer, multi_cross) {
@@ -1081,6 +1098,21 @@ TEST(Layer, InterpolationLayer) {
   }
 }
 
+TEST(Layer, DotProdLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("dot_prod");
+  config.layerConfig.set_size(1);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "dot_prod", 10, false, useGpu);
+  }
+}
+
 TEST(Layer, OuterProdLayer) {
   TestConfig config;
   config.layerConfig.set_type("out_prod");
@@ -1194,7 +1226,10 @@ void setPoolConfig(TestConfig* config,
   pool->set_output_y(oh);
 }
 
-void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
+void testPoolLayer(const string& poolType,
+                   bool trans,
+                   bool useGpu,
+                   bool excludeMode = true) {
   TestConfig config;
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
   LayerInputConfig* input = config.layerConfig.add_inputs();
@@ -1202,6 +1237,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
 
   pool->set_img_size(14);
   pool->set_img_size_y(14);
+  pool->set_exclude_mode(excludeMode);
   setPoolConfig(&config, pool, poolType);
   config.layerConfig.set_size(pool->output_x() * pool->output_y() *
                               pool->channels());
@@ -1233,15 +1269,27 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
 
 TEST(Layer, PoolLayer) {
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ false,
+                /* excludeMode= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
 
 #ifdef PADDLE_WITH_CUDA
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ true,
+                /* excludeMode= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2(
+      "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
 
@@ -1995,7 +2043,7 @@ TEST(Layer, multibox_loss) {
 TEST(Layer, TransLayer) {
   TestConfig config;
   const int height = 128;
-  const int width = 1028;
+  const int width = 256;
   config.layerConfig.set_type("trans");
   config.layerConfig.set_size(width);
 
@@ -2056,6 +2104,43 @@ TEST(Layer, CropLayer) {
   }
 }
 
+TEST(Layer, roi_pool) {
+  TestConfig config;
+  config.layerConfig.set_type("roi_pool");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
+  roiPoolConf->set_pooled_width(7);
+  roiPoolConf->set_pooled_height(7);
+  roiPoolConf->set_spatial_scale(1. / 16);
+  roiPoolConf->set_width(14);
+  roiPoolConf->set_height(14);
+
+  const size_t roiNum = 10;
+  const size_t roiDim = 10;
+  const size_t batchSize = 5;
+  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+  roiValue->zeroMem();
+  real* roiData = roiValue->getData();
+  for (size_t i = 0; i < roiNum; ++i) {
+    roiData[i * roiDim + 0] = std::rand() % batchSize;
+    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
+    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
+    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
+    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
+    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
+    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
+  }
+
+  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+  }
+}
+
 TEST(Layer, SwitchOrderLayer) {
   TestConfig config;
   // config input_0
@@ -2358,6 +2443,76 @@ TEST(Layer, ScaleShiftLayer) {
   }
 }
 
+TEST(Layer, ScaleSubRegionLayer) {
+  const size_t batchSize = 64;
+  const size_t size = 4096;
+  TestConfig config;
+  config.layerConfig.set_type("scale_sub_region");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
+  auto* data = indicesV->getData();
+  for (size_t i = 0; i < batchSize; ++i) {
+    data[i * 2] = 2;
+    data[i * 2 + 1] = 4;
+    data[i * 2 + 2] = 16;
+    data[i * 2 + 3] = 32;
+    data[i * 2 + 4] = 16;
+    data[i * 2 + 5] = 32;
+  }
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ScaleSubRegionConfig* scaleSubRegionConf =
+      input->mutable_scale_sub_region_conf();
+  ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
+  imgConf->set_img_size(32);
+  imgConf->set_img_size_y(32);
+  imgConf->set_channels(4);
+  scaleSubRegionConf->set_value(2.0);
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, L2DistanceLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("l2_distance");
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+
+  const size_t input_dim = 27;
+  const size_t batch_size = 11;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
+  }
+}
+
+void testFactorizationMachineLayer(InputType type, bool useGpu) {
+  const int FACTOR_SIZE = 10;
+  TestConfig config;
+  config.layerConfig.set_type("factorization_machine");
+  config.layerConfig.set_factor_size(FACTOR_SIZE);
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+  config.inputDefs.push_back({type, "layer_0", 128, 1280});
+  config.layerConfig.add_inputs();
+  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
+}
+
+TEST(Layer, FactorizationMachineLayer) {
+  for (auto useGpu : {false, true}) {
+    testFactorizationMachineLayer(INPUT_DATA, useGpu);
+  }
+  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index d60b0f04a1..ad1dbc3ee2 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -269,22 +269,137 @@ void testBatchNormLayer(const testBatchNormDesc& pm) {
 TEST(MKLDNNLayer, BatchNormLayer) {
   testBatchNormLayer({4, 10, 6, 6});
   testBatchNormLayer({16, 32, 16, 16});
+  testBatchNormLayer({4, 16, 8, 10});
 }
 
-struct testActDesc {
+struct testLRNDesc {
   int bs, ic, ih, iw;
+  float scale, pow;
+  int localSize;
 };
 
-static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
+void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_lrn");
+  cfg.layerConfig.set_active_type("relu");
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_channels(pm.ic);
+  norm->set_size(pm.localSize);
+  norm->set_scale(pm.scale);
+  norm->set_pow(pm.pow);
+  norm->set_blocked(0);
+  norm->set_img_size(pm.iw);
+  norm->set_img_size_y(pm.ih);
+  norm->set_output_x(norm->img_size());
+  norm->set_output_y(norm->img_size_y());
+  cfg.layerConfig.set_size(layerSize);
+  cfg.biasSize = 0;
+}
+
+void testLRNLayer(const testLRNDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNLRNConfig(dnnConfig, pm);
+  // mkldnn_lrn <==> norm with cmrnorm-projection type
+  TestConfig refConfig = dnnConfig;
+  refConfig.layerConfig.set_type("norm");
+  LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0);
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cmrnorm-projection");
+  norm->set_scale(norm->scale() / norm->size());
+  RUN_MKLDNN_TEST(dnnConfig, refConfig, pm)
+}
+
+TEST(MKLDNNLayer, LRNLayer) {
+  testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5});
+  testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5});
+  testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5});
+}
+
+struct testImageDesc {
+  int bs, ic, ih, iw;
+};
+
+static void getAddtoConfig(TestConfig& cfg,
+                           const testImageDesc& pm,
+                           const size_t nInputs = 1) {
   cfg.biasSize = 0;
   cfg.layerConfig.set_type("addto");
   size_t layerSize = pm.ic * pm.ih * pm.iw;
   cfg.layerConfig.set_size(layerSize);
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
-  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < nInputs; ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(pm.ic);
+    img_conf->set_img_size_y(pm.ih);
+    img_conf->set_img_size(pm.iw);
+  }
+}
+
+void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
+  CHECK_GE(nInputs, 1UL);
+  TestConfig dnnConfig;
+  getAddtoConfig(dnnConfig, pm, nInputs);
+  dnnConfig.layerConfig.set_type("mkldnn_addto");
+  for (auto withBias : {false, true}) {
+    dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
+  }
+}
+
+TEST(MKLDNNLayer, AddtoLayer) {
+  testAddtoLayer({16, 5, 14, 14}, 1);
+  testAddtoLayer({8, 10, 8, 8}, 2);
+  testAddtoLayer({4, 12, 1, 1}, 3);
+}
+
+static void getMKLDNNConcatConfig(TestConfig& cfg,
+                                  const std::vector<testImageDesc>& inputs) {
+  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
+  int oc = inputs[0].ic;
+  for (size_t i = 1; i < inputs.size(); ++i) {
+    CHECK_EQ(inputs[i].bs, inputs[0].bs);
+    CHECK_EQ(inputs[i].ih, inputs[0].ih);
+    CHECK_EQ(inputs[i].iw, inputs[0].iw);
+    oc += inputs[i].ic;
+  }
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("mkldnn_concat");
+  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back(
+        {INPUT_DATA,
+         ss.str(),
+         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
+         0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(inputs[i].ic);
+    img_conf->set_img_size_y(inputs[i].ih);
+    img_conf->set_img_size(inputs[i].iw);
+  }
+}
+
+void testConcatLayer(const std::vector<testImageDesc>& inputs) {
+  TestConfig dnnConfig;
+  getMKLDNNConcatConfig(dnnConfig, inputs);
+  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
+}
+
+TEST(MKLDNNLayer, ConcatLayer) {
+  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
+  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
 }
 
-void testActivation(std::string actType, const testActDesc& pm) {
+void testActivation(std::string actType, const testImageDesc& pm) {
   // TODO(TJ): remove me when paddle support elu activation
   if (actType == "mkldnn_elu") {
     return;
diff --git a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
new file mode 100644
index 0000000000..16438886df
--- /dev/null
+++ b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;
+
+void setPoolConfig(TestConfig* config,
+                   PoolConfig* pool,
+                   const string& poolType) {
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool");
+  (*config).layerConfig.set_num_filters(1);
+
+  int kw = 3, kh = 3;
+  int pw = 0, ph = 0;
+  int sw = 2, sh = 2;
+  pool->set_pool_type(poolType);
+  pool->set_channels(1);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_start(0);
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+}
+
+void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
+                                       const string& poolType,
+                                       bool use_gpu,
+                                       MatrixPtr& maskMat) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_img_size(5);
+  pool->set_img_size_y(5);
+  setPoolConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  config.layerConfig.set_name("MaxPoolWithMask");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+
+  initDataLayer(config,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "MaxPoolWithMask",
+                1,
+                false,
+                use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
+
+  FLAGS_use_gpu = use_gpu;
+  std::vector<ParameterPtr> parameters;
+  LayerPtr maxPoolingWithMaskOutputLayer;
+  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
+  maxPoolingWithMaskOutputLayer->forward(PASS_GC);
+
+  checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
+                   maskMat);
+}
+
+TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
+  bool useGpu = false;
+  MatrixPtr inputMat;
+  MatrixPtr maskMat;
+  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
+                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
+                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
+  real maskData[] = {12, 4, 22, 24};
+
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->setData(inputData);
+  maskMat->setData(maskData);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#ifdef PADDLE_WITH_CUDA
+  useGpu = true;
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->copyFrom(inputData, 25);
+  maskMat->copyFrom(maskData, 4);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#endif
+}
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
deleted file mode 100644
index af6472619d..0000000000
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ /dev/null
@@ -1,732 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include <gtest/gtest.h>
-
-#include "paddle/gserver/dataproviders/ProtoDataProvider.h"
-#include "paddle/utils/Util.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace std;  // NOLINT
-
-std::vector<string> protoFiles{
-    "./test_ProtoDataProvider/data1.bin", "./test_ProtoDataProvider/data2.bin",
-};
-std::vector<string> protoFilesCompressed{
-    "./test_ProtoDataProvider/data1.bin.gz",
-    "./test_ProtoDataProvider/data2.bin.gz",
-};
-
-const char* kTestDir = "./test_ProtoDataProvider";
-const char kProtoFileList[] = "gserver/tests/proto_files.txt";
-const char kProtoFileListCompressed[] =
-    "gserver/tests/proto_files_compressed.txt";
-const int kSpraseMatrixDim = 1024;
-
-using namespace paddle;  // NOLINT
-
-void prepareData(DataBatch* batch,
-                 const int* numPerSlotType,
-                 bool iid,
-                 bool useGpu) {
-  batch->clear();
-  int64_t size = uniformRandom(100) + 10;
-  batch->setSize(size);
-
-  ICpuGpuVectorPtr sequenceStartPositions;
-  ICpuGpuVectorPtr subSequenceStartPositions;
-  if (!iid) {
-    int numSeqs = uniformRandom(10) + 1;
-    sequenceStartPositions =
-        ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-    int* buf = sequenceStartPositions->getMutableData(false);
-    subSequenceStartPositions =
-        ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-    int* subBuf = subSequenceStartPositions->getMutableData(false);
-    int64_t pos = 0;
-    int maxLen = 2 * size / numSeqs;
-    for (int i = 0; i < numSeqs; ++i) {
-      int len =
-          uniformRandom(min<int64_t>(maxLen, size - pos - numSeqs + i)) + 1;
-      buf[i] = pos;
-      subBuf[i] = pos;
-      pos += len;
-      VLOG(1) << " len=" << len;
-    }
-    buf[numSeqs] = size;
-    subBuf[numSeqs] = size;
-  }
-
-  vector<Argument>& arguments = batch->getStreams();
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_DENSE]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    MatrixPtr mat = Matrix::create(size, dim, /* trans= */ false, false);
-    mat->randomizeUniform();
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE]; ++i) {
-    MatrixPtr mat =
-        makeRandomSparseMatrix(size, kSpraseMatrixDim, false, useGpu);
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arg.subSequenceStartPositions = subSequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE]; ++i) {
-    MatrixPtr mat =
-        makeRandomSparseMatrix(size, kSpraseMatrixDim, true, useGpu);
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::STRING]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    SVectorPtr vec = std::make_shared<std::vector<std::string>>();
-    for (int j = 0; j < size; ++j) {
-      vec->push_back(randStr(dim));
-    }
-    Argument arg;
-    arg.strs = vec;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::INDEX]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    IVectorPtr vec = IVector::create(size, /* useGpu= */ false);
-    int* buf = vec->getData();
-    for (int j = 0; j < size; ++j) {
-      buf[j] = uniformRandom(dim);
-    }
-    Argument arg;
-    arg.ids = vec;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-}
-
-inline int getSlotDim(const Argument& arg) {
-  if (arg.value) {
-    return arg.value->getWidth();
-  } else if (arg.ids) {
-    return arg.ids->getMax() + 1;
-  } else if (arg.strs) {
-    return 1;
-  }
-  LOG(FATAL) << "Invalid argument";
-  return 0;
-}
-
-inline SlotDef::SlotType getSlotType(const Argument& arg) {
-  if (arg.value) {
-    auto& m = *arg.value;
-    auto& type = typeid(m);
-    if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) {
-      return SlotDef::VECTOR_DENSE;
-    }
-    if (type == typeid(CpuSparseMatrix)) {
-      auto valueType =
-          std::dynamic_pointer_cast<CpuSparseMatrix>(arg.value)->getValueType();
-      if (NO_VALUE == valueType) {
-        return SlotDef::VECTOR_SPARSE_NON_VALUE;
-      } else {
-        return SlotDef::VECTOR_SPARSE_VALUE;
-      }
-    }
-    if (type == typeid(GpuSparseMatrix)) {
-      auto valueType =
-          std::dynamic_pointer_cast<GpuSparseMatrix>(arg.value)->getValueType();
-      if (NO_VALUE == valueType) {
-        return SlotDef::VECTOR_SPARSE_NON_VALUE;
-      } else {
-        return SlotDef::VECTOR_SPARSE_VALUE;
-      }
-    }
-
-    LOG(FATAL) << "Unknown matrix type";
-  }
-  if (arg.ids) return SlotDef::INDEX;
-  if (arg.strs) return SlotDef::STRING;
-  LOG(FATAL) << "Invalid argument";
-  return SlotDef::VECTOR_DENSE;
-}
-
-void getColRow(const Argument& arg,
-               int64_t pos,
-               bool useGpu,
-               int* colNum,
-               const int** rowCols,
-               const real** rowValues) {
-  SlotDef::SlotType type = getSlotType(arg);
-  GpuSparseMatrixPtr matGpu;
-  CpuSparseMatrixPtr matCpu;
-  if (useGpu) {
-    matGpu = dynamic_pointer_cast<GpuSparseMatrix>(arg.value);
-    ASSERT_TRUE(matGpu != NULL);
-  } else {
-    matCpu = dynamic_pointer_cast<CpuSparseMatrix>(arg.value);
-    ASSERT_TRUE(matCpu != NULL);
-  }
-  *colNum = useGpu ? matGpu->getColNum(pos) : matCpu->getColNum(pos);
-  *rowCols = useGpu ? matGpu->getRowCols(pos) : matCpu->getRowCols(pos);
-  if (type == SlotDef::VECTOR_SPARSE_VALUE) {
-    *rowValues = useGpu ? matGpu->getRowValues(pos) : matCpu->getRowValues(pos);
-  } else {
-    *rowValues = NULL;
-  }
-}
-
-void makeSample(const vector<Argument>& arguments,
-                int64_t pos,
-                bool isBeginning,
-                DataSample* sample,
-                bool useGpu) {
-  sample->set_is_beginning(isBeginning);
-  int slotid = 0;
-  for (auto& arg : arguments) {
-    SlotDef::SlotType type = getSlotType(arg);
-    int64_t dim = getSlotDim(arg);
-    switch (type) {
-      case SlotDef::VECTOR_DENSE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto values = vecSlot->mutable_values();
-        values->Reserve(dim);
-        for (int i = 0; i < dim; ++i) {
-          values->AddAlreadyReserved(
-              static_cast<float>(arg.value->getElement(pos, i)));
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        sample->add_id_slots(arg.ids->get(pos));
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto ids = vecSlot->mutable_ids();
-        int colNum;
-        const int* rowCols;
-        const real* rowValues;  // nullptr
-        getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
-        ids->Reserve(colNum);
-        for (int i = 0; i < colNum; ++i) {
-          ids->AddAlreadyReserved(rowCols[i]);
-        }
-        SubseqSlot* subseqSlot = sample->add_subseq_slots();  // subseq
-        subseqSlot->set_slot_id(slotid);
-        auto lens = subseqSlot->mutable_lens();
-        lens->Add(colNum);
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto values = vecSlot->mutable_values();
-        auto ids = vecSlot->mutable_ids();
-        int colNum;
-        const int* rowCols;
-        const real* rowValues;
-        getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
-        ids->Reserve(colNum);
-        values->Reserve(colNum);
-        for (int i = 0; i < colNum; ++i) {
-          ids->AddAlreadyReserved(rowCols[i]);
-          values->AddAlreadyReserved(rowValues[i]);
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "Not implemented";
-        break;
-      }
-      case SlotDef::STRING: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        vecSlot->add_strs((*arg.strs)[pos]);
-        break;
-      }
-    }
-    slotid++;
-  }
-}
-
-void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
-  DataHeader header;
-  const vector<Argument>& arguments = batch.getStreams();
-  for (auto& argument : arguments) {
-    SlotDef* slotDef = header.add_slot_defs();
-    slotDef->set_type(getSlotType(argument));
-    slotDef->set_dim(getSlotDim(argument));
-  }
-  VLOG(1) << "header=" << header.DebugString();
-
-  int64_t totalSeqs = batch.getNumSequences();
-  int64_t seq = 0;
-  ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions;
-  int64_t numWritten = 0;
-  vector<string> curProtoFiles =
-      dataCompression ? protoFilesCompressed : protoFiles;
-  for (size_t i = 0; i < curProtoFiles.size(); ++i) {
-    int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() -
-                      totalSeqs * i / curProtoFiles.size();
-    ofstream os(curProtoFiles[i]);
-    CHECK(os) << "Fail to open " << curProtoFiles[i];
-    unique_ptr<ProtoWriter> writer(new ProtoWriter(&os, dataCompression));
-    CHECK(writer->write(header));
-    for (int j = 0; j < numSeqs; ++j, ++seq) {
-      int64_t begin = seq;
-      int64_t end = seq + 1;
-      if (sequenceStartPositions) {
-        begin = sequenceStartPositions->getElement(seq);
-        end = sequenceStartPositions->getElement(seq + 1);
-      }
-      for (int pos = begin; pos < end; ++pos) {
-        DataSample sample;
-        makeSample(arguments, pos, pos == begin, &sample, useGpu);
-        CHECK(writer->write(sample));
-        ++numWritten;
-      }
-    }
-
-    writer.reset(nullptr);
-    os.close();
-  }
-  CHECK_EQ(arguments[0].getBatchSize(), numWritten);
-}
-
-// check that the sample at pos1 in args1 is same as the sample at pos2 in args2
-void checkSample(const vector<Argument>& args1,
-                 int64_t pos1,
-                 const vector<Argument>& args2,
-                 int64_t pos2,
-                 bool useGpu) {
-  EXPECT_EQ(args1.size(), args2.size());
-  VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2;
-
-  for (size_t i = 0; i < args1.size(); ++i) {
-    auto type = getSlotType(args1[i]);
-    int dim = getSlotDim(args1[i]);
-    EXPECT_EQ(type, getSlotType(args2[i]));
-    if (type == SlotDef::INDEX) {
-      EXPECT_GE(dim, getSlotDim(args2[i]));
-    } else {
-      EXPECT_EQ(dim, getSlotDim(args2[i]));
-    }
-    switch (type) {
-      case SlotDef::VECTOR_DENSE: {
-        for (int j = 0; j < dim; ++j) {
-          EXPECT_EQ(static_cast<float>(args1[i].value->getElement(pos1, j)),
-                    static_cast<float>(args2[i].value->getElement(pos2, j)));
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        EXPECT_EQ(args1[i].ids->get(pos1), args2[i].ids->get(pos2));
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        int colNum1, colNum2;
-        const int *rowCols1, *rowCols2;
-        const real *rowValues1, *rowValues2;
-        getColRow(args1[i], pos1, useGpu, &colNum1, &rowCols1, &rowValues1);
-        getColRow(args2[i], pos2, useGpu, &colNum2, &rowCols2, &rowValues2);
-        EXPECT_EQ(colNum1, colNum2);
-        for (int j = 0; j < colNum1; ++j) {
-          EXPECT_EQ(rowCols1[j], rowCols2[j]);
-          if (type == SlotDef::VECTOR_SPARSE_VALUE) {
-            EXPECT_EQ(rowValues1[j], rowValues2[j]);
-          }
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "Not implemented";
-        break;
-      }
-      case SlotDef::STRING: {
-        EXPECT_EQ((*args1[i].strs)[pos1], (*args2[i].strs)[pos2]);
-        break;
-      }
-    }
-  }
-}
-
-void testProtoDataProvider(int* numPerSlotType,
-                           bool iid,
-                           bool async,
-                           bool useGpu,
-                           bool dataCompression,
-                           int numConstantSlots = 0) {
-  mkDir(kTestDir);
-  DataBatch data;
-
-  prepareData(&data, numPerSlotType, iid, useGpu);
-  writeData(data, useGpu, dataCompression);
-
-  DataConfig config;
-  config.set_type("proto");
-  config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList);
-  config.set_async_load_data(async);
-
-  for (int i = 0; i < numConstantSlots; ++i) {
-    config.add_constant_slots(i + 11);
-    MatrixPtr w = Matrix::create(data.getSize(),
-                                 1,
-                                 /* trans= */ false,
-                                 /* useGpu= */ false);
-    w->assign(config.constant_slots(i));
-    data.appendData(w);
-  }
-
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  dataProvider->setSkipShuffle();
-
-  EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
-  int64_t batchSize = 10;
-  DataBatch batch;
-
-  size_t seq1 = 0;
-  vector<Argument>& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
-  dataProvider->reset();
-
-  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
-    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
-    vector<Argument>& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
-    for (auto& arg : args2) {
-      EXPECT_EQ(iid, !arg.sequenceStartPositions);
-    }
-    size_t numSeqs = batch.getNumSequences();
-    VLOG(1) << "numSeqs=" << numSeqs;
-    for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) {
-      int64_t begin1 = seq1;
-      int64_t end1 = seq1 + 1;
-      if (sequenceStartPositions1) {
-        begin1 = sequenceStartPositions1->getElement(seq1);
-        end1 = sequenceStartPositions1->getElement(seq1 + 1);
-        EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1);
-      }
-
-      int64_t begin2 = seq2;
-      int64_t end2 = seq2 + 1;
-      if (sequenceStartPositions2) {
-        begin2 = sequenceStartPositions2->getElement(seq2);
-        end2 = sequenceStartPositions2->getElement(seq2 + 1);
-      }
-      VLOG(1) << " begin1=" << begin1 << " end1=" << end1
-              << " begin2=" << begin2 << " end2=" << end2;
-      EXPECT_EQ(end1 - begin1, end2 - begin2);
-      for (int i = 0; i < end1 - begin1; ++i) {
-        checkSample(args1, begin1 + i, args2, begin2 + i, useGpu);
-      }
-    }
-  }
-
-  EXPECT_EQ(seq1, (size_t)data.getNumSequences());
-  rmDir(kTestDir);
-}
-
-TEST(ProtoDataProvider, test) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  int numSlotsArraySize = sizeof(numSlotsArray) / sizeof(numSlotsArray[0]);
-  const int numSlot = 5;
-  int combination[numSlot] = {0};
-  int k = numSlot - 1;
-  while (k >= 0) {
-    int numDenseVecSlots = numSlotsArray[combination[0]];
-    int numSparseNonValueVecSlots = numSlotsArray[combination[1]];
-    int numSparseValueVectorSlots = numSlotsArray[combination[2]];
-    int numStrSlots = numSlotsArray[combination[3]];
-    int numIdSlots = numSlotsArray[combination[4]];
-    // while loop : traverse all cases
-    k = numSlot - 1;
-    while (k >= 0) {
-      if (combination[k] < (numSlotsArraySize - 1)) {
-        ++combination[k];
-        break;
-      } else {
-        combination[k] = 0;
-        --k;
-      }
-    }
-    if (numDenseVecSlots + numSparseNonValueVecSlots +
-            numSparseValueVectorSlots + numStrSlots + numIdSlots <
-        1)
-      continue;
-    for (int iid : numTwoArray) {
-      for (int async : numTwoArray) {
-        for (int useGpu : numTwoArray) {
-          for (int dataCompression : numTwoArray) {
-            if (async && useGpu) {
-              // Currently in async mode, useGpu is not supported
-              continue;
-            }
-#ifndef PADDLE_WITH_CUDA
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numSparseValueVectorSlots="
-                      << numSparseValueVectorSlots
-                      << " numStrSlots=" << numStrSlots
-                      << " numIdSlots=" << numIdSlots << " iid=" << iid
-                      << " async=" << async << " useGpu=" << useGpu
-                      << " dataCompression=" << dataCompression;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] =
-                numSparseValueVectorSlots;
-            numPerSlotType[SlotDef::INDEX] = numIdSlots;
-            numPerSlotType[SlotDef::STRING] = numStrSlots;
-            testProtoDataProvider(
-                numPerSlotType, iid, async, useGpu, dataCompression);
-          }  // end for (int dataCompression : numTwoArray)
-        }    // end for (int useGpu : numTwoArray)
-      }      // end for (int async : numTwoArray)
-    }        // end for (int iid : numTwoArray)
-  }          // end for (while, traverse all slots)
-}
-
-TEST(ProtoDataProvider, constant_slots) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  for (int numDenseVecSlots : numSlotsArray) {
-    for (int numSparseNonValueVecSlots : numSlotsArray) {
-      if (numDenseVecSlots + numSparseNonValueVecSlots < 1) continue;
-      for (int numConstantSlots : {1, 2}) {
-        for (int useGpu : numTwoArray) {
-          for (int dataCompression : numTwoArray) {
-#ifndef PADDLE_WITH_CUDA
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numConstantSlogs=" << numConstantSlots
-                      << " useGpu=" << useGpu
-                      << " dataCompression=" << dataCompression;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = 1;
-            numPerSlotType[SlotDef::INDEX] = 1;
-            testProtoDataProvider(numPerSlotType,
-                                  /* iid= */ true,
-                                  /* async= */ false,
-                                  useGpu,
-                                  dataCompression,
-                                  numConstantSlots);
-          }  // end for (int dataCompression : numTwoArray)
-        }    // end for (int useGpu : numTwoArray)
-      }      // end for (int numConstantSlots : {1, 2})
-    }        // end for (int numSparseNonValueVecSlots : numSlotsArray)
-  }          // end for (int numDenseVecSlots : numSlotsArray)
-}
-
-void checkSampleSequence(const vector<Argument>& args1,
-                         const vector<Argument>& args2,
-                         int64_t offset,
-                         int64_t numSeqs,
-                         bool useGpu) {
-  // check slot num are equal
-  EXPECT_EQ(args1.size(), args2.size());
-  for (size_t i = 0; i < args1.size(); i++) {
-    auto type = getSlotType(args1[i]);
-    // check for args2: sequenceStartPositions vs numSeqs
-    // (1) size
-    EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1);
-    // (2) content
-    auto checkArgContent = [&](const Argument& args, int numSeqs) {
-      for (int j = 0; j <= numSeqs; j++) {
-        int start_pos = args.sequenceStartPositions->getElement(j);
-        EXPECT_EQ(start_pos, j);
-      }
-    };
-    switch (type) {
-      case SlotDef::INDEX: {
-        // args1: for label
-        checkArgContent(args2[i], numSeqs);
-        // check for args2: ids are equal to args1[offset]
-        // (1) size
-        EXPECT_EQ(args2[i].ids->getSize(), (size_t)numSeqs);
-        // (2) content
-        for (int j = 0; j < numSeqs; j++) {
-          EXPECT_EQ(args2[i].ids->get(j), args1[i].ids->get(offset + j));
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        // args1: for sparse_non_value
-        // args2 should put sparse indexes in ids
-        int colNum1;
-        const int* rowCols1;
-        const real* rowValues1;  // nullptr
-        int totalLength = 0;
-        for (int j = 0; j < numSeqs; j++) {
-          getColRow(
-              args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1);
-          // (1) lengths
-          EXPECT_EQ(totalLength,
-                    args2[i].sequenceStartPositions->getElement(j));
-          EXPECT_EQ(totalLength,
-                    args2[i].subSequenceStartPositions->getElement(j));
-          // (2) content
-          for (int k = 0; k < colNum1; k++) {
-            EXPECT_EQ(rowCols1[k], args2[i].ids->get(totalLength + k));
-          }
-          totalLength += colNum1;
-          if (colNum1 == 0) {
-            // special case here: we will put a "-1" into ids when column num is
-            // zero. see ProtoSequenceDataProvider::getNextBatchInternal.
-            EXPECT_EQ(-1, args2[i].ids->get(totalLength));
-            totalLength++;
-          }
-        }
-        EXPECT_EQ(totalLength,
-                  args2[i].sequenceStartPositions->getElement(numSeqs));
-        EXPECT_EQ(totalLength,
-                  args2[i].subSequenceStartPositions->getElement(numSeqs));
-        break;
-      }
-      case SlotDef::VECTOR_DENSE: {
-        // args1: for dense vector
-        checkArgContent(args2[i], numSeqs);
-        // check for args2: values are equal to args1[offset]
-        // (1) size
-        EXPECT_EQ(args2[i].value->getHeight(), (size_t)numSeqs);
-        EXPECT_EQ(args2[i].value->getWidth(), (size_t)getSlotDim(args1[i]));
-        // (2) content
-        for (int j = 0; j < numSeqs; j++) {
-          for (size_t k = 0; k < args2[i].value->getWidth(); k++) {
-            EXPECT_EQ(
-                static_cast<float>(args1[i].value->getElement(j + offset, k)),
-                static_cast<float>(args2[i].value->getElement(j, k)));
-          }
-        }
-        break;
-      }
-      default: { EXPECT_EQ(true, false) << "should not reach here"; }
-    }
-  }
-}
-
-void testProtoSequenceDataProvider(int* numPerSlotType,
-                                   bool async,
-                                   bool useGpu) {
-  mkDir(kTestDir);
-  DataBatch data;
-
-  prepareData(&data,
-              numPerSlotType,
-              /* iid */ true,
-              useGpu);
-  writeData(data, useGpu, /* dataCompression */ false);
-
-  DataConfig config;
-  config.set_type("proto_sequence");
-  config.set_files(kProtoFileList);
-  config.set_async_load_data(async);
-
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  dataProvider->setSkipShuffle();
-
-  EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
-  int64_t batchSize = 10;
-  DataBatch batch;
-
-  vector<Argument>& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
-  dataProvider->reset();
-
-  size_t args1Offset = 0;
-  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
-    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
-    vector<Argument>& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
-    for (auto& arg : args1) {
-      // args1 should not has sequence
-      EXPECT_EQ(true, !arg.sequenceStartPositions);
-    }
-    for (auto& arg : args2) {
-      // args2 should has sequence
-      EXPECT_NE(true, !arg.sequenceStartPositions);
-    }
-    size_t numSeqs = batch.getNumSequences();
-    checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu);
-    args1Offset += numSeqs;
-  }
-
-  EXPECT_EQ(args1Offset, (size_t)data.getNumSequences());
-  rmDir(kTestDir);
-}
-
-TEST(ProtoSequenceDataProvider, test) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  for (int numSparseNonValueVecSlots : numSlotsArray) {
-    for (int numIdSlots : numSlotsArray) {
-      for (int numDenseVecSlots : numSlotsArray) {
-        if (numDenseVecSlots + numSparseNonValueVecSlots + numIdSlots < 1)
-          continue;
-        for (int async : numTwoArray) {
-          for (int useGpu : numTwoArray) {
-            if (async && useGpu) {
-              // Currently in async mode, useGpu is not supported
-              continue;
-            }
-#ifndef PADDLE_WITH_CUDA
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numIdSlots=" << numIdSlots << " async=" << async
-                      << " useGpu=" << useGpu;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::INDEX] = numIdSlots;
-            testProtoSequenceDataProvider(numPerSlotType, async, useGpu);
-          }  // end for (int useGpu : numTwoArray)
-        }    // end for (int async : numTwoArray)
-      }      // end for (int numDenseVecSlots : numSlotsArray)
-    }        // end for (int numIdSlots : numSlotsArray)
-  }          // end for (int numSparseNonValueVecSlots : numSlotsArray)
-}
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 53dd538360..e3eff59dc5 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1902,5 +1902,52 @@ void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
 }
 
 template class BaseMatrixT<real>;
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 template class BaseMatrixT<int>;
+
+#else
+
+template <>
+void BaseMatrixT<int>::zero() {
+  applyUnary(unary::Zero<int>());
+}
+
+template <>
+void BaseMatrixT<int>::assign(int p) {
+  applyUnary(unary::Assign<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
+  applyBinary(binary::IsEqual<int>(value), b);
+}
+
+template <>
+void BaseMatrixT<int>::neg() {
+  applyUnary(unary::Neg<int>());
+}
+
+template <>
+void BaseMatrixT<int>::abs2() {
+  applyUnary(unary::Abs<int>());
+}
+
+template <>
+void BaseMatrixT<int>::add(int p) {
+  applyUnary(unary::Add<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::add(int p1, int p2) {
+  applyUnary(unary::Add2<int>(p1, p2));
+}
+
+template <>
+void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
+  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
+}
+
+#endif
 }  // namespace paddle
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index 68b5296228..922fb51722 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -25,6 +25,17 @@ else()
     message(STATUS "Compile with MKLDNNMatrix")
 endif()
 
+if(MOBILE_INFERENCE)
+    # Remove sparse
+    list(REMOVE_ITEM MATH_HEADERS
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
+    list(REMOVE_ITEM MATH_SOURCES
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
+endif()
 set(MATH_SOURCES
     "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
     "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index bf62229c03..dc6979cf5a 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
   os << ";";
 }
 
+void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
+  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
+  real* A = getValue();
+  real* B = b.getValue();
+  if (b.getValueType() == FLOAT_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = B[j] * c.getElement(i, cCol);
+      }
+    }
+  } else if (b.getValueType() == NO_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = c.getElement(i, cCol);
+      }
+    }
+  }
+}
+
 void CpuSparseMatrix::randomizeUniform() {
   CHECK_LE(elementCnt_, height_ * width_);
   if (valueType_ == FLOAT_VALUE) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index 36d57bbb65..522b436a2a 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "Matrix.h"
 
@@ -236,6 +239,15 @@ public:
               const unsigned int* cols,
               const real* values);
 
+  /**
+   * @brief this_row = b_row * c_row[cCol]
+   *
+   * @param[in]  cCol   the column of matrix c used to scale each row of b
+   * @param[in]  b      CpuSparseMatrix
+   * @param[in]  c      Matrix
+   */
+  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
+
   void randomizeUniform();
 
   void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
@@ -309,3 +321,57 @@ private:
   using Matrix::subMatrix;
 };
 }  // namespace paddle
+
+#else
+
+#include "Matrix.h"
+
+namespace paddle {
+
+class CpuSparseMatrix : public Matrix {
+public:
+  CpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  CpuSparseMatrix(real* data,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  real* getValue() const { return nullptr; }
+  size_t getColStartIdx(size_t i) const { return 0; }
+  size_t getRowStartIdx(size_t i) const { return 0; }
+  size_t getColNum(size_t i) const { return 0; }
+  int* getRowCols(size_t i) const { return nullptr; }
+
+  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
+    return nullptr;
+  }
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
+};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 21a8f73c3e..a710479bab 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -152,12 +152,7 @@ void MKLDNNMatrix::downSpatial() {
   }
   memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
   memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
-  mkldnn_primitive_t result;
-  mkldnn::error::wrap_c_api(
-      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
-      "could not create a memory primitive");
-  reset(result);
-  set_data_handle(data_);
+  resetMKLDNNMemory(pd, data_);
 }
 
 }  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 54cfefe23b..39d40a1f61 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -145,6 +145,27 @@ public:
     m_.reset();
   }
 
+  /**
+   * override the CpuMatrix::resize
+   */
+  void resize(size_t newHeight, size_t newWidth) override {
+    m_->resize(newHeight, newWidth);
+    if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
+      return;
+    }
+    CpuMatrix::setData(data_);
+    height_ = newHeight;
+    width_ = newWidth;
+    elementCnt_ = newHeight * newWidth;
+    stride_ = width_;
+    auto pd = mkldnn::memory::primitive_desc(
+        mkldnn::memory::desc({(int)newHeight, (int)newWidth},
+                             getDtype(),
+                             mkldnn::memory::format::nc),
+        getEngine());
+    resetMKLDNNMemory(pd, data_);
+  }
+
   /**
    * override Matrix::getData
    * check data before return
@@ -215,6 +236,17 @@ protected:
                    memory::format srcFmt,
                    memory::format dstFmt,
                    memory::dims dm);
+  /**
+   * reset this MKLDNN Memory from primitve desc
+   */
+  void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
+    mkldnn_primitive_t result;
+    mkldnn::error::wrap_c_api(
+        mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+        "could not create a memory primitive");
+    reset(result);
+    set_data_handle(data);
+  }
 
 private:
   // save the CpuMatrixPtr in case the buffer released outside
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index c2f17beeb8..ba86eacbb5 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -206,7 +206,7 @@ double dotProduct<double>(const int n, const double* x, const double* y) {
 }
 #endif
 
-#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
+#if defined(PADDLE_USE_MKLML)
 
 template <>
 void vExp<float>(const int n, const float* a, float* r) {
@@ -295,38 +295,6 @@ template void vAdd(const int n, const double* a, const double* b, double* r);
 
 #endif
 
-#ifdef PADDLE_USE_MKL
-template <>
-void vInvSqrt<float>(const int n, const float* a, float* r) {
-  vsInvSqrt(n, a, r);
-}
-
-template <>
-void vInvSqrt<double>(const int n, const double* a, double* r) {
-  vdInvSqrt(n, a, r);
-}
-
-template <>
-void vLog1p<float>(const int n, const float* a, float* r) {
-  vsLog1p(n, a, r);
-}
-
-template <>
-void vLog1p<double>(const int n, const double* a, double* r) {
-  vdLog1p(n, a, r);
-}
-
-template <>
-void vTanh<float>(const int n, const float* a, float* r) {
-  vsTanh(n, a, r);
-}
-
-template <>
-void vTanh<double>(const int n, const double* a, double* r) {
-  vdTanh(n, a, r);
-}
-#else
-
 DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
 template <class T>
 void vInvSqrt(const int n, const T* a, T* r) {
@@ -357,6 +325,4 @@ template void vLog1p(const int n, const double* a, double* r);
 template void vTanh(const int n, const float* a, float* r);
 template void vTanh(const int n, const double* a, double* r);
 
-#endif
-
 }  // namespace paddle
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 8193aa4adf..f6e77029bd 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -21,11 +21,6 @@ limitations under the License. */
 #include <mkl_vml_functions.h>
 #endif
 
-#ifdef PADDLE_USE_MKL
-#include <mkl.h>
-#include <mkl_lapacke.h>
-#endif
-
 #if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB)
 extern "C" {
 #include <cblas.h>
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index c3e34d5309..ebbbdfab1d 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -451,6 +451,7 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) {
 }
 
 void GpuMatrix::collectBias(Matrix& a, real scale) {
+#ifdef PADDLE_WITH_CUDA
   CHECK_EQ(getHeight(), (size_t)1);
   CHECK_EQ(width_, a.getWidth());
   GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
@@ -461,6 +462,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
     hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
     hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
   }
+#endif
 }
 
 void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
@@ -552,6 +554,7 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                     const GpuMatrix& b,
                     real scaleAB,
                     real scaleT) {
+#ifdef PADDLE_WITH_CUDA
   CHECK(isContiguous());
   CHECK(b.isContiguous());
   CHECK(b.useGpu_ == true) << "Matrix type are not equal";
@@ -578,12 +581,14 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                           b.height_,
                           scaleAB,
                           scaleT);
+#endif
 }
 
 void GpuMatrix::mul(const GpuMatrix& a,
                     const GpuSparseMatrix& b,
                     real scaleAB,
                     real scaleT) {
+#ifdef PADDLE_WITH_CUDA
   CHECK(isContiguous());
   CHECK(a.isContiguous());
   CHECK(a.useGpu_ == true) << "Matrix type are not equal";
@@ -622,6 +627,7 @@ void GpuMatrix::mul(const GpuMatrix& a,
                             scaleAB,
                             scaleT);
   }
+#endif
 }
 
 /* this = a*b */
@@ -1028,15 +1034,23 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
   CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
 
   real* inputData = inputMat.getData();
+  real* maskData = NULL;
   size_t frameNum = inputMat.getHeight();
   CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
   CHECK(height_ == inputMat.getHeight());
   CHECK(width_ == outputH * outputW * channels);
 
+  if (maskMatP != NULL) {
+    CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
+    CHECK(outputH * outputW * channels == maskMatP->getWidth());
+    maskData = maskMatP->getData();
+  }
+
   hl_maxpool_forward(frameNum,
                      inputData,
                      channels,
@@ -1051,7 +1065,8 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                      paddingH,
                      paddingW,
                      data_,
-                     getStride());
+                     getStride(),
+                     maskData);
 }
 
 void GpuMatrix::maxPoolBackward(Matrix& inputMat,
@@ -1115,7 +1130,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
   CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
 
   real* inputData = inputMat.getData();
@@ -1138,7 +1154,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                      paddingH,
                      paddingW,
                      data_,
-                     getStride());
+                     getStride(),
+                     excludeMode);
 }
 
 void GpuMatrix::avgPoolBackward(Matrix& outGrad,
@@ -1153,7 +1170,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                                 real scaleTargets,
                                 real scaleOutput,
                                 size_t paddingH,
-                                size_t paddingW) {
+                                size_t paddingW,
+                                bool excludeMode) {
   CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
 
   real* outDiff = outGrad.getData();
@@ -1179,7 +1197,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                       scaleTargets,
                       scaleOutput,
                       data_,
-                      outGrad.getStride());
+                      outGrad.getStride(),
+                      excludeMode);
 }
 
 void GpuMatrix::maxPool3DForward(Matrix& inputMat,
@@ -1548,6 +1567,7 @@ void GpuMatrix::bilinearBackward(const Matrix& out,
 }
 
 void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
   GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
   auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
 
@@ -1563,9 +1583,11 @@ void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
   hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
   hl_matrix_multi_binary_cross_entropy(
       output_d, entropy_d, mat_d, height_, outputPtr->width_);
+#endif
 }
 
 void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
   GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
   auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
 
@@ -1581,6 +1603,7 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
   hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
   hl_matrix_multi_binary_cross_entropy_bp(
       output_d, grad_d, mat_d, height_, width_);
+#endif
 }
 
 void GpuMatrix::vol2Col(real* dataSrc,
@@ -1973,9 +1996,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
   real* inputData = inputMat.getData();
   real* outData = data_;
+  real* maskData = NULL;
   size_t num = inputMat.getHeight();
   size_t inLength = imgSizeH * imgSizeW;
   size_t outLength = outputH * outputW;
@@ -1984,6 +2009,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
   CHECK_EQ(channels * outLength, this->getWidth());
   size_t outStride = getStride();
 
+  if (maskMatP != NULL) {
+    maskData = maskMatP->getData();
+    CHECK_EQ(channels * outLength, maskMatP->getWidth());
+  }
+
   /* initialize the data_ */
   for (size_t i = 0; i < height_; i++) {
     for (size_t j = 0; j < width_; j++) {
@@ -2005,10 +2035,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
           int wstart = pw * strideW - paddingW;
           int wend = std::min(wstart + sizeX, imgSizeW);
           wstart = std::max(wstart, 0);
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              outData[ph * outputW + pw] = std::max(
-                  outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+          if (maskData == NULL) {
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                outData[ph * outputW + pw] = std::max(
+                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+              }
+            }
+          } else {
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) {
+                  outData[ph * outputW + pw] = inputData[h * imgSizeW + w];
+                  maskData[ph * outputW + pw] = h * imgSizeW + w;
+                }
+              }
             }
           }
         }
@@ -2016,6 +2057,8 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
       // compute offset
       inputData += inLength;
       outData += outLength;
+
+      if (maskData != NULL) maskData += outLength;
     }
   }
 }
@@ -2097,7 +2140,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
   // The main loop
   size_t num = input.getHeight();
   size_t inLength = imgSizeH * imgSizeW;
@@ -2126,7 +2170,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
               tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
             }
           }
-          int poolSize = (hend - hstart) * (wend - wstart);
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
           CHECK(poolSize);
           tgtData[ph * outputW + pw] /= poolSize;
         }
@@ -2150,7 +2195,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
                                 real scaleTargets,
                                 real scaleOutput,
                                 size_t paddingH,
-                                size_t paddingW) {
+                                size_t paddingW,
+                                bool excludeMode) {
   size_t num = input.getHeight();
   size_t channels = input.getWidth() / outputH / outputW;
   size_t inLength = imgSizeH * imgSizeW;
@@ -2172,7 +2218,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
           int wstart = pw * strideW - paddingW;
           int wend = std::min(wstart + sizeX, imgSizeW);
           wstart = std::max(wstart, 0);
-          int poolSize = (hend - hstart) * (wend - wstart);
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
           CHECK(poolSize);
 
           for (int h = hstart; h < hend; ++h) {
@@ -3226,6 +3273,7 @@ template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
                                                            real scaleAB,
                                                            real scaleT);
 
+#ifndef PADDLE_MOBILE_INFERENCE
 void SharedCpuMatrix::mul(CpuSparseMatrix* a,
                           CpuMatrix* b,
                           real scaleAB,
@@ -3354,6 +3402,7 @@ void SharedCpuMatrix::initBlock(int blockNum) {
   }
 }
 
+#endif
 /* Add a (column) vector b to matrix a, column by column */
 void CpuMatrix::addColumnVector(const Matrix& b) {
   BaseMatrix::addColVector(const_cast<Matrix&>(b));
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 44180bca8b..c8e690e642 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -861,7 +861,8 @@ public:
 
   /**
    * Pooling forward operation, pick out the largest element
-   * in the sizeX of value
+   * in the sizeX of value, if the maskMatP is not NULL, it will
+   * also caculate the location indices.
    */
   virtual void maxPoolForward(Matrix& inputMat,
                               size_t imgSizeH,
@@ -874,7 +875,8 @@ public:
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                              size_t paddingW) {
+                              size_t paddingW,
+                              MatrixPtr maskMatP = NULL) {
     LOG(FATAL) << "Not implemeted";
   }
 
@@ -909,7 +911,8 @@ public:
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                              size_t paddingW) {
+                              size_t paddingW,
+                              bool excludeMode = true) {
     LOG(FATAL) << "Not implemeted";
   }
 
@@ -925,9 +928,11 @@ public:
                                real scaleTargets,
                                real scaleOutput,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode = true) {
     LOG(FATAL) << "Not implemeted";
   }
+
   /**
    * Pooling 3D forward operation, pick out the largest element
    * in the sizeX of value
@@ -1426,7 +1431,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);
 
   void maxPoolBackward(Matrix& image,
                        size_t imgSizeH,
@@ -1455,7 +1461,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      bool excludeMode = true);
 
   void avgPoolBackward(Matrix& input,
                        size_t imgSizeH,
@@ -1469,7 +1476,8 @@ public:
                        real scaleTargets,
                        real scaleOutput,
                        size_t paddingH,
-                       size_t paddingW);
+                       size_t paddingW,
+                       bool excludeMode = true);
 
   void maxPool3DForward(Matrix& inputMat,
                         Matrix& maxPoolIdx,
@@ -1697,7 +1705,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);
 
   void maxPoolBackward(Matrix& image,
                        size_t imgSizeH,
@@ -1726,7 +1735,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      bool excludeMode = true);
 
   void avgPoolBackward(Matrix& input,
                        size_t imgSizeH,
@@ -1740,7 +1750,8 @@ public:
                        real scaleTargets,
                        real scaleOutput,
                        size_t paddingH,
-                       size_t paddingW);
+                       size_t paddingW,
+                       bool excludeMode = true);
 
   void maxPool3DForward(Matrix& inputMat,
                         Matrix& maxPoolIdx,
@@ -2066,6 +2077,7 @@ public:
 
 class SharedCpuMatrix : public CpuMatrix {
 public:
+#ifndef PADDLE_MOBILE_INFERENCE
   /* blockNum is number of partitions of the matrix  */
   SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
       : CpuMatrix(height, width, trans) {
@@ -2111,6 +2123,7 @@ private:
   ThreadLocal<CpuMatrixPtr> localBuf_;
   ThreadLocal<std::vector<int>> localBufRows_;
   ThreadLocal<std::vector<int>> blockSeq_;
+#endif
 };
 
 typedef struct { unsigned int col; } sparse_non_value_t;
diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h
index 439f11b79d..76909720f6 100644
--- a/paddle/math/SIMDFunctions.h
+++ b/paddle/math/SIMDFunctions.h
@@ -116,9 +116,11 @@ inline bool vec_check(size_t len) {
 }
 
 namespace internal {
+#ifdef __SSE3__
 void addToImpl(float* a, const float* b, size_t len);
 void batchAddToImpl(float* a, const float* b[], int batch, size_t len);
 void colMaxImpl(float* result, const float* data, int dim, int numSamples);
+#endif
 #ifdef __AVX__
 void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len);
 void decayL1AvxImpl(
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index 16300db081..e0a3c6d228 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "CpuSparseMatrix.h"
 #include "Matrix.h"
@@ -237,3 +240,47 @@ private:
 };
 
 }  // namespace paddle
+
+#else
+
+#include "CpuSparseMatrix.h"
+
+namespace paddle {
+
+class GpuSparseMatrix : public Matrix {
+public:
+  GpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format_ = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  GpuSparseMatrix(real* value,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, true) {}
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
+};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 8704eb038d..ca7a6806da 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <gflags/gflags.h>
 #include <string.h>
 #include <algorithm>
@@ -313,3 +315,27 @@ private:
 };
 
 }  // namespace paddle
+
+#else
+namespace paddle {
+
+class SparseRowCpuMatrix : public CpuMatrix {
+public:
+  void reserveStore() {}
+  void clearIndices() {}
+};
+
+class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
+public:
+  void setupIndices() {}
+  void addRows(MatrixPtr input) {}
+  void addRows(IVectorPtr ids) {}
+};
+
+class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
+class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
+class SparseRowIdsCpuMatrix : public CpuMatrix {};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index 4adaaef983..a2ef731ecb 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -17,9 +17,13 @@ limitations under the License. */
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
+#ifndef PADDLE_MOBILE_INFERENCE
 DEFINE_int32(pool_limit_size,
              536870912,
              "maximum memory size managed by a memory pool, default is 512M");
+#else
+DEFINE_int32(pool_limit_size, 0, "default is 0");
+#endif
 
 namespace paddle {
 
diff --git a/paddle/math/float16.h b/paddle/math/float16.h
new file mode 100644
index 0000000000..f805cad08b
--- /dev/null
+++ b/paddle/math/float16.h
@@ -0,0 +1,739 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#endif  // PADDLE_WITH_CUDA
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+#include "paddle/platform/hostdevice.h"
+
+#ifdef __GNUC__
+#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
+#else
+#define PADDLE_GNUC_VER 0
+#endif  // __GNUC__
+
+#ifdef __clang__
+#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__)
+#else
+#define PADDLE_CLANG_VER 0
+#endif  // __clang__
+
+#if defined(__CUDACC__) && CUDA_VERSION >= 7050
+#define PADDLE_CUDA_FP16
+#include <cuda_fp16.h>
+#endif
+
+#if defined(__arm__) || defined(__aarch64__)
+#define PADDLE_ARM
+#endif
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#define PADDLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
+    (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37)
+#define PADDLE_WITH_NATIVE_FP16
+#endif
+
+#ifndef PADDLE_ARM
+#include <immintrin.h>
+#endif  // PADDLE_ARM
+
+#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
+
+namespace paddle {
+
+// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
+// and aligned at least on a 2-byte boundary, which leads to efficient
+// memory access of float16 struct and also makes float16 compatible
+// with CUDA half, ARM float16_t, and Eigen::half data types.
+struct PADDLE_ALIGN(2) float16 {
+public:
+  uint16_t x;
+
+  // Constructors
+  HOSTDEVICE inline float16() : x(0) {}
+
+  HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
+
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline explicit float16(const half& h) {
+#if CUDA_VERSION >= 9000
+    x = reinterpret_cast<__half_raw*>(&h)->x;
+#else
+    x = h.x;
+#endif  // CUDA_VERSION >= 9000
+  }
+#endif  // PADDLE_CUDA_FP16
+
+  HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {}
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  // __fp16 is a native half precision data type for arm cpu,
+  // float16_t is an alias for __fp16
+  HOSTDEVICE inline explicit float16(const float16_t& h) {
+    x = *reinterpret_cast<const uint16_t*>(&h);
+  }
+#endif
+
+  HOSTDEVICE inline explicit float16(float val) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = __float2half(val);
+    x = *reinterpret_cast<uint16_t*>(&tmp);
+
+#elif defined(PADDLE_NEON)
+    float32x4_t tmp = vld1q_dup_f32(&val);
+    float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
+    x = *reinterpret_cast<uint16_t*>(&res);
+
+#elif defined(__F16C__)
+    x = _cvtss_sh(val, 0);
+
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v, s;
+    v.f = val;
+    uint32_t sign = v.si & sigN;
+    v.si ^= sign;
+    sign >>= shiftSign;  // logical shift
+    s.si = mulN;
+    s.si = s.f * v.f;  // correct subnormals
+    v.si ^= (s.si ^ v.si) & -(minN > v.si);
+    v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
+    v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
+    v.ui >>= shift;  // logical shift
+    v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
+    v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
+    x = v.ui | sign;
+
+#endif
+  }
+
+  HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
+
+  template <class T>
+  HOSTDEVICE inline explicit float16(const T& val)
+      : x(float16(static_cast<float>(val)).x) {}
+
+  HOSTDEVICE inline float16& operator=(const float16& rhs) {
+    x = rhs.x;
+    return *this;
+  }
+
+// Assignment operators
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline float16& operator=(const half& rhs) {
+#if CUDA_VERSION >= 9000
+    x = reinterpret_cast<__half_raw*>(&rhs)->x;
+#else
+    x = rhs.x;
+#endif
+    return *this;
+  }
+#endif
+
+  HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
+    x = rhs.x;
+    return *this;
+  }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
+    x = *reinterpret_cast<const uint16_t*>(&rhs);
+    return *this;
+  }
+#endif
+
+  HOSTDEVICE inline float16& operator=(bool b) {
+    x = b ? 0x3c00 : 0;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int8_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint8_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int16_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint16_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int32_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint32_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int64_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint64_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(float val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(double val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+// Conversion opertors
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline explicit operator half() const {
+#if CUDA_VERSION >= 9000
+    __half_raw h;
+    h.x = x;
+    return half(h);
+#else
+    half h;
+    h.x = x;
+    return h;
+#endif  // CUDA_VERSION >= 9000
+  }
+#endif  // PADDLE_CUDA_FP16
+
+  HOSTDEVICE inline explicit operator Eigen::half() const {
+    Eigen::half h;
+    h.x = x;
+    return h;
+  }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline explicit operator float16_t() const {
+    return *reinterpret_cast<const float16_t*>(this);
+  }
+#endif
+
+  HOSTDEVICE inline explicit operator float() const {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = *reinterpret_cast<const half*>(this);
+    return __half2float(tmp);
+
+#elif defined(PADDLE_NEON)
+    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
+    return vgetq_lane_f32(vcvt_f32_f16(res), 0);
+
+#elif defined(__F16C__)
+    return _cvtsh_ss(this->x);
+
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v;
+    v.ui = this->x;
+    int32_t sign = v.si & sigC;
+    v.si ^= sign;
+    sign <<= shiftSign;
+    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
+    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
+    Bits s;
+    s.si = mulC;
+    s.f *= v.si;
+    int32_t mask = -(norC > v.si);
+    v.si <<= shift;
+    v.si ^= (s.si ^ v.si) & mask;
+    v.si |= sign;
+    return v.f;
+
+#endif
+  }
+
+  HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
+
+  HOSTDEVICE inline explicit operator int8_t() const {
+    return static_cast<int8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint8_t() const {
+    return static_cast<uint8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int16_t() const {
+    return static_cast<int16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint16_t() const {
+    return static_cast<uint16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int32_t() const {
+    return static_cast<int32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint32_t() const {
+    return static_cast<uint32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int64_t() const {
+    return static_cast<int64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint64_t() const {
+    return static_cast<uint64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator double() const {
+    return static_cast<double>(float(*this));
+  }
+
+private:
+  union Bits {
+    float f;
+    int32_t si;
+    uint32_t ui;
+  };
+
+  static const int shift = 13;
+  static const int shiftSign = 16;
+
+  static const int32_t infN = 0x7F800000;
+  static const int32_t maxN = 0x477FE000;  // max flt16 as flt32
+  static const int32_t minN = 0x38800000;  // min flt16 normal as flt32
+  static const int32_t sigN = 0x80000000;  // sign bit
+
+  static constexpr int32_t infC = infN >> shift;
+  static constexpr int32_t nanN = (infC + 1)
+                                  << shift;  // minimum flt16 nan as float32
+  static constexpr int32_t maxC = maxN >> shift;
+  static constexpr int32_t minC = minN >> shift;
+  static constexpr int32_t sigC = sigN >> shiftSign;
+
+  static const int32_t mulN = 0x52000000;  // (1 << 23) / minN
+  static const int32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
+  static const int32_t subC = 0x003FF;     // max flt32 subnormal downshifted
+  static const int32_t norC = 0x00400;     // min flt32 normal downshifted
+
+  static constexpr int32_t maxD = infC - maxC - 1;
+  static constexpr int32_t minD = minC - subC - 1;
+};
+
+// Arithmetic operators on GPU
+// CUDA 9.0 provides built-in arithmetic operators for half while
+// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
+// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
+// CUDA 9.0 regarding the half data type.
+#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
+
+DEVICE inline half operator+(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hadd(a, b);
+#else
+  float res = float(float16(a)) + float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator-(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hsub(a, b);
+#else
+  float res = float(float16(a)) - float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator*(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hmul(a, b);
+#else
+  float res = float(float16(a)) * float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator/(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+#else
+  float res = float(float16(a)) / float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator-(const half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hneg(a);
+#else
+  float res = -float(float16(a));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half& operator+=(half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+
+DEVICE inline half& operator-=(half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+
+DEVICE inline half& operator*=(half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+
+DEVICE inline half& operator/=(half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+
+DEVICE inline bool operator==(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __heq(a, b);
+#else
+  return float(float16(a)) == float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator!=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hne(a, b);
+#else
+  return float(float16(a)) != float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator<(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(a, b);
+#else
+  return float(float16(a)) < float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator<=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hle(a, b);
+#else
+  return float(float16(a)) <= float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator>(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hgt(a, b);
+#else
+  return float(float16(a)) > float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator>=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hge(a, b);
+#else
+  return float(float16(a)) >= float(float16(b));
+#endif
+}
+
+#endif  // PADDLE_CUDA_FP16
+
+// Arithmetic operators on ARMv8.2-A CPU
+#if defined(PADDLE_WITH_NATIVE_FP16)
+HOST inline float16 operator+(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fadd h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator-(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fsub h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator*(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fmul h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator/(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fdiv h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator-(const float16& a) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "fneg h0, h0\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0");
+  return res;
+}
+
+HOST inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+HOST inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+HOST inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+HOST inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+HOST inline bool operator==(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmeq h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator!=(const float16& a, const float16& b) {
+  return !(a == b);
+}
+
+HOST inline bool operator<(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v1.h}[0], [%[a_ptr]]\n"
+      "ld1 {v0.h}[0], [%[b_ptr]]\n"
+      "fcmgt h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator<=(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v1.h}[0], [%[a_ptr]]\n"
+      "ld1 {v0.h}[0], [%[b_ptr]]\n"
+      "fcmge h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmgt h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>=(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmge h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+// Arithmetic operators, software emulated on other CPU
+#else
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+  return float16(float(a) + float(b));
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+  return float16(float(a) - float(b));
+}
+
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+  return float16(float(a) * float(b));
+}
+
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+  return float16(float(a) / float(b));
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a) {
+  float16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+}
+
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+  a = float16(float(a) + float(b));
+  return a;
+}
+
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+  a = float16(float(a) - float(b));
+  return a;
+}
+
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+  a = float16(float(a) * float(b));
+  return a;
+}
+
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+  a = float16(float(a) / float(b));
+  return a;
+}
+
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+  return float(a) == float(b);
+}
+
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+  return float(a) != float(b);
+}
+
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+  return float(a) < float(b);
+}
+
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+  return float(a) <= float(b);
+}
+
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+  return float(a) > float(b);
+}
+
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+  return float(a) >= float(b);
+}
+#endif
+}  // namespace paddle
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index ceb96b2e25..215bac1271 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -3,8 +3,10 @@
 add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
-add_simple_unittest(test_SparseMatrix)
 add_simple_unittest(test_RowBuffer)
+if(NOT MOBILE_INFERENCE)
+    add_simple_unittest(test_SparseMatrix)
+endif()
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
@@ -20,6 +22,7 @@ if(WITH_GPU)
     link_paddle_test(test_Tensor)
     CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
     link_paddle_test(test_lazyAssign)
+    nv_test(test_float16_gpu SRCS test_float16.cu)
 else()
     compile_cu_as_cpp(test_Tensor.cu)
     add_unittest(test_Tensor test_Tensor.cu)
@@ -31,3 +34,4 @@ add_simple_unittest(test_FPException)
 add_simple_unittest(test_GpuProfiler)
 add_simple_unittest(test_BaseMatrix)
 add_simple_unittest(test_Matrix)
+cc_test(test_float16 SRCS test_float16.cpp)
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index 5bc4a03067..b998e5772e 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -169,7 +169,7 @@ void TensorCheck(AssertEq compare,
       count++;
     }
   }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
 }
 
 template <typename AssertEq, typename Tensor1, typename Tensor2>
diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp
new file mode 100644
index 0000000000..74cc55aa37
--- /dev/null
+++ b/paddle/math/tests/test_float16.cpp
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/math/float16.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(float16, conversion_cpu) {
+  // Explicit conversion from Eigen::half
+  EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00);
+  EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800);
+  EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555);
+  EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000);
+  EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000);
+  EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff);
+  EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00);
+
+  // Conversion from float
+  EXPECT_EQ(float16(1.0f).x, 0x3c00);
+  EXPECT_EQ(float16(0.5f).x, 0x3800);
+  EXPECT_EQ(float16(0.33333f).x, 0x3555);
+  EXPECT_EQ(float16(0.0f).x, 0x0000);
+  EXPECT_EQ(float16(-0.0f).x, 0x8000);
+  EXPECT_EQ(float16(65504.0f).x, 0x7bff);
+  EXPECT_EQ(float16(65536.0f).x, 0x7c00);
+
+  // Conversion from double
+  EXPECT_EQ(float16(1.0).x, 0x3c00);
+  EXPECT_EQ(float16(0.5).x, 0x3800);
+  EXPECT_EQ(float16(0.33333).x, 0x3555);
+  EXPECT_EQ(float16(0.0).x, 0x0000);
+  EXPECT_EQ(float16(-0.0).x, 0x8000);
+  EXPECT_EQ(float16(65504.0).x, 0x7bff);
+  EXPECT_EQ(float16(65536.0).x, 0x7c00);
+
+  // Conversion from int
+  EXPECT_EQ(float16(-1).x, 0xbc00);
+  EXPECT_EQ(float16(0).x, 0x0000);
+  EXPECT_EQ(float16(1).x, 0x3c00);
+  EXPECT_EQ(float16(2).x, 0x4000);
+  EXPECT_EQ(float16(3).x, 0x4200);
+
+  // Conversion from bool
+  EXPECT_EQ(float16(true).x, 0x3c00);
+  EXPECT_EQ(float16(false).x, 0x0000);
+
+  // Default constructor
+  float16 v_def;
+  EXPECT_EQ(v_def.x, 0x0000);
+
+  // Assignment operator
+  float16 v_assign;
+  v_assign = v_def;
+  EXPECT_EQ(v_assign.x, 0x0000);
+  v_assign = Eigen::half(1.0f);
+  EXPECT_EQ(v_assign.x, 0x3c00);
+  v_assign = 0.5f;
+  EXPECT_EQ(v_assign.x, 0x3800);
+  v_assign = 0.33333;
+  EXPECT_EQ(v_assign.x, 0x3555);
+  v_assign = -1;
+  EXPECT_EQ(v_assign.x, 0xbc00);
+  v_assign = true;
+  EXPECT_EQ(v_assign.x, 0x3c00);
+
+  // Conversion operator
+  EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
+  EXPECT_EQ(float(float16(0.5f)), 0.5f);
+  EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001);
+  EXPECT_EQ(int(float16(-1)), -1);
+  EXPECT_EQ(bool(float16(true)), true);
+}
+
+TEST(float16, arithmetic_cpu) {
+  EXPECT_EQ(float(float16(1) + float16(1)), 2);
+  EXPECT_EQ(float(float16(5) + float16(-5)), 0);
+  EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001);
+  EXPECT_EQ(float(float16(3) - float16(5)), -2);
+  EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001);
+  EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
+  EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
+  EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001);
+  EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f);
+  EXPECT_EQ(float(-float16(512.0f)), -512.0f);
+  EXPECT_EQ(float(-float16(-512.0f)), 512.0f);
+}
+
+TEST(float16, comparison_cpu) {
+  EXPECT_TRUE(float16(1.0f) == float16(1.0f));
+  EXPECT_FALSE(float16(-1.0f) == float16(-0.5f));
+  EXPECT_TRUE(float16(1.0f) != float16(0.5f));
+  EXPECT_FALSE(float16(-1.0f) != float16(-1.0f));
+  EXPECT_TRUE(float16(1.0f) < float16(2.0f));
+  EXPECT_FALSE(float16(-1.0f) < float16(-1.0f));
+  EXPECT_TRUE(float16(1.0f) <= float16(1.0f));
+  EXPECT_TRUE(float16(2.0f) > float16(1.0f));
+  EXPECT_FALSE(float16(-2.0f) > float16(-2.0f));
+  EXPECT_TRUE(float16(2.0f) >= float16(2.0f));
+
+  EXPECT_TRUE(float16(0.0f) == float16(-0.0f));
+  EXPECT_TRUE(float16(0.0f) <= float16(-0.0f));
+  EXPECT_TRUE(float16(0.0f) >= float16(-0.0f));
+  EXPECT_FALSE(float16(0.0f) < float16(-0.0f));
+  EXPECT_FALSE(float16(-0.0f) < float16(0.0f));
+  EXPECT_FALSE(float16(0.0f) > float16(-0.0f));
+  EXPECT_FALSE(float16(-0.0f) > float16(0.0f));
+}
+
+}  // namespace paddle
diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu
new file mode 100644
index 0000000000..4b520feaaf
--- /dev/null
+++ b/paddle/math/tests/test_float16.cu
@@ -0,0 +1,213 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/math/float16.h"
+
+#include <gtest/gtest.h>
+
+#include "paddle/utils/Logging.h"
+
+#define ARITHMETIC_KERNEL(op_type, sign)                                 \
+  __global__ void op_type(const half* in1, const half* in2, half* out) { \
+    out[0] = in1[0] sign in2[0];                                         \
+  }
+
+#define COMPOUND_KERNEL(op_type, sign) \
+  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
+
+#define COMPARISON_KERNEL(op_type, sign)                                 \
+  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
+    out[0] = in1[0] sign in2[0];                                         \
+  }
+
+#define ARITHMETIC_KERNEL_LAUNCH(op_type)                     \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
+    half *in1, *in2, *out;                                    \
+    half *d_in1, *d_in2, *d_out;                              \
+    int size = sizeof(half);                                  \
+    cudaMalloc((void**)&d_in1, size);                         \
+    cudaMalloc((void**)&d_in2, size);                         \
+    cudaMalloc((void**)&d_out, size);                         \
+    in1 = (half*)malloc(size);                                \
+    in2 = (half*)malloc(size);                                \
+    out = (half*)malloc(size);                                \
+    in1[0] = half(float16(v_in1));                            \
+    in2[0] = half(float16(v_in2));                            \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
+    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                   \
+    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);     \
+    EXPECT_EQ(float(float16(out[0])), v_out);                 \
+    free(in1);                                                \
+    free(in2);                                                \
+    free(out);                                                \
+    cudaFree(d_in1);                                          \
+    cudaFree(d_in2);                                          \
+    cudaFree(d_out);                                          \
+  }
+
+#define COMPOUND_KERNEL_LAUNCH(op_type)                       \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
+    half *in1, *in2;                                          \
+    half *d_in1, *d_in2;                                      \
+    int size = sizeof(half);                                  \
+    cudaMalloc((void**)&d_in1, size);                         \
+    cudaMalloc((void**)&d_in2, size);                         \
+    in1 = (half*)malloc(size);                                \
+    in2 = (half*)malloc(size);                                \
+    in1[0] = half(float16(v_in1));                            \
+    in2[0] = half(float16(v_in2));                            \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
+    op_type<<<1, 1>>>(d_in1, d_in2);                          \
+    cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost);     \
+    EXPECT_EQ(float(float16(in1[0])), v_out);                 \
+    free(in1);                                                \
+    free(in2);                                                \
+    cudaFree(d_in1);                                          \
+    cudaFree(d_in2);                                          \
+  }
+
+#define COMPARISON_KERNEL_LAUNCH(op_type)                    \
+  void Test##op_type(float v_in1, float v_in2, bool v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!";          \
+    half *in1, *in2;                                         \
+    half *d_in1, *d_in2;                                     \
+    bool *out, *d_out;                                       \
+    int size = sizeof(half);                                 \
+    cudaMalloc((void**)&d_in1, size);                        \
+    cudaMalloc((void**)&d_in2, size);                        \
+    cudaMalloc((void**)&d_out, 1);                           \
+    in1 = (half*)malloc(size);                               \
+    in2 = (half*)malloc(size);                               \
+    out = (bool*)malloc(1);                                  \
+    in1[0] = half(float16(v_in1));                           \
+    in2[0] = half(float16(v_in2));                           \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);    \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);    \
+    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                  \
+    cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost);       \
+    EXPECT_EQ(out[0], v_out);                                \
+    free(in1);                                               \
+    free(in2);                                               \
+    free(out);                                               \
+    cudaFree(d_in1);                                         \
+    cudaFree(d_in2);                                         \
+    cudaFree(d_out);                                         \
+  }
+
+#ifdef PADDLE_CUDA_FP16
+namespace paddle {
+
+#if CUDA_VERSION < 9000
+ARITHMETIC_KERNEL(Add, +)
+ARITHMETIC_KERNEL(Sub, -)
+ARITHMETIC_KERNEL(Mul, *)
+ARITHMETIC_KERNEL(Div, /)
+
+ARITHMETIC_KERNEL_LAUNCH(Add)
+ARITHMETIC_KERNEL_LAUNCH(Sub)
+ARITHMETIC_KERNEL_LAUNCH(Mul)
+ARITHMETIC_KERNEL_LAUNCH(Div)
+
+// Negative sign kernel
+__global__ void Neg(half* in) { in[0] = -in[0]; }
+
+void TestNeg(float v_in, float v_out) {
+  LOG(INFO) << "Test Neg on GPU!";
+  half *in, *d_in;
+  int size = sizeof(half);
+  cudaMalloc((void**)&d_in, size);
+  in = (half*)malloc(size);
+  in[0] = half(float16(v_in));
+  cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
+  Neg<<<1, 1>>>(d_in);
+  cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
+  EXPECT_EQ(float(float16(in[0])), v_out);
+  free(in);
+  cudaFree(d_in);
+}
+
+COMPOUND_KERNEL(AddAssign, +=)
+COMPOUND_KERNEL(SubAssign, -=)
+COMPOUND_KERNEL(MulAssign, *=)
+COMPOUND_KERNEL(DivAssign, /=)
+
+COMPOUND_KERNEL_LAUNCH(AddAssign)
+COMPOUND_KERNEL_LAUNCH(SubAssign)
+COMPOUND_KERNEL_LAUNCH(MulAssign)
+COMPOUND_KERNEL_LAUNCH(DivAssign)
+
+COMPARISON_KERNEL(Equal, ==)
+COMPARISON_KERNEL(NotEqual, !=)
+COMPARISON_KERNEL(Less, <)
+COMPARISON_KERNEL(LessEqual, <=)
+COMPARISON_KERNEL(Greater, >)
+COMPARISON_KERNEL(GreaterEqual, >=)
+
+COMPARISON_KERNEL_LAUNCH(Equal)
+COMPARISON_KERNEL_LAUNCH(NotEqual)
+COMPARISON_KERNEL_LAUNCH(Less)
+COMPARISON_KERNEL_LAUNCH(LessEqual)
+COMPARISON_KERNEL_LAUNCH(Greater)
+COMPARISON_KERNEL_LAUNCH(GreaterEqual)
+
+TEST(float16, arithmetic_on_gpu) {
+  TestAdd(1, 2, 3);
+  TestSub(2, 1, 1);
+  TestMul(2, 3, 6);
+  TestDiv(6, 2, 3);
+  TestNeg(1, -1);
+}
+
+TEST(float16, compound_on_gpu) {
+  TestAddAssign(1, 2, 3);
+  TestSubAssign(2, 1, 1);
+  TestMulAssign(2, 3, 6);
+  TestDivAssign(6, 2, 3);
+}
+
+TEST(float16, comparision_on_gpu) {
+  TestEqual(1, 1, true);
+  TestEqual(1, 2, false);
+  TestNotEqual(2, 3, true);
+  TestNotEqual(2, 2, false);
+  TestLess(3, 4, true);
+  TestLess(3, 3, false);
+  TestLessEqual(3, 3, true);
+  TestLessEqual(3, 2, false);
+  TestGreater(4, 3, true);
+  TestGreater(4, 4, false);
+  TestGreaterEqual(4, 4, true);
+  TestGreaterEqual(4, 5, false);
+}
+#endif  // CUDA_VERSION
+
+TEST(float16, conversion_on_gpu) {
+  // Explicit conversion to and from cuda half
+  EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00);
+  EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800);
+  EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555);
+  EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000);
+  EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000);
+  EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
+  EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
+
+  // Assignment operator
+  float16 v_assign;
+  v_assign = half(float16(1.0f));
+  EXPECT_EQ(v_assign.x, 0x3c00);
+}
+
+}  // namespace paddle
+#endif  // PADDLE_CUDA_FP16
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index aed5275dbf..8841c14ee0 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_subdirectory(detail)
 
-cc_library(memory SRCS memory.cc DEPS place)
+cc_library(memory SRCS memory.cc DEPS place enforce)
 cc_library(memcpy SRCS memcpy.cc)
 
 cc_library(paddle_memory
diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index 7f95e80f98..6cb003c50b 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -1,4 +1,141 @@
 # Region-based Heterogeneous Memory Management
+## Design
 
-Please check out the [design documentation](http://gangliao.me) to find out more details about
-buddy memory allocator for both CPU and GPU.
+### Usage
+
+To allocate 4KB CPU memory:
+
+```cpp
+p = memory::Alloc(platform::CPUPlace(), 4*1024);
+```
+
+To allocate 4KB memory on the 3rd GPU:
+
+```cpp
+p = memory::Alloc(platform::GPUPlace(2), 4*1024);
+```
+
+To free memory and check the so-far used amount of memory on a place:
+
+```cpp
+auto pl = platform::GPUPlace(0);
+p = memory::Alloc(pl, 4*1024);
+cout << memory::Used(pl);
+memory::Free(pl, p);
+```
+
+### API
+
+In `paddle/memory/memory.h` we have:
+
+```cpp
+namespace memory {
+template <typename Place> void* Alloc(Place, size_t);
+template <typename Place> void Free(Place, void*);
+template <typename Place> size_t Used(Place);
+}  // namespace memory
+```
+
+These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
+
+```cpp
+template<>
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+```
+
+and 
+
+```cpp
+template<>
+void Alloc<GPUPlace>(GPUPlace p, size_t size) {
+  return GetGPUBuddyAllocator(p.id)->Alloc(size);
+}
+```
+
+Similar specializations exist for `Free` and `Used`.
+
+### Implementation
+
+`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletions.
+
+```cpp
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static BuddyAllocator* a = NULL;
+  if (a == NULL) {
+    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
+  }
+  return a;
+}
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator* as = NULL;
+  if (as == NULL) {
+    as = new BuddyAllocator*[platform::NumGPUs()];
+    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
+      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
+    }
+  }
+  return as[gpu_id);
+```
+
+#### `BuddyAllocator`
+
+`BuddyAllocator` implements the buddy allocation algorithm.  Its constructor takes parameters only related with the algorithm:
+
+```cpp
+BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
+  ...
+}
+```
+
+Please be aware that **`BuddyAllocator` always allocate aligned memory**, aligned on 32-bytes, which can hold a `BuddyAllocator::Block` object:
+
+```cpp
+class BuddyAllocator {
+ private:
+  struct Block {
+    size_t size;
+    Block* left, right;
+    size_t index; // allocator id
+  };
+  ...
+};
+```
+
+Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` freed in `Free`.  Instead, `CPUAllocator` and `GPUAllocator` doesn't know the size of freed memory block and cannot do the trace.
+
+#### System Allocators
+
+The `GPUAllocator` and `CPUAllocator` are calls *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.
+
+## Justification
+
+I got inspiration from Majel and Caffe2, though above design look different from both.
+
+### Caffe2
+
+In Caffe2, `Tensor<Context>::mutable_data()` allocates the memroy.  In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
+
+There are two implementations of `Context`:
+
+1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
+
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+
+### Majel
+
+In Majel, there are basically two allocator types:
+
+1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
+1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
+
+However, memory allocation is not via these two allocators.  Instead, these two allocators are defined in hidden namespaces.
+
+In Majel there are hidden global variables like:
+
+1. `cpu::SystemAllocator g_cpu_allocator`, and
+1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
+
+Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 6b4e46f56a..b543b767e8 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -83,7 +83,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
   paddle::platform::GpuMemoryUsage(available, capacity);
 
   // Reserve memory for page tables, etc.
-  size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
+  size_t reserving = 0.05 * capacity + paddle::platform::GpuMinChunkSize();
   size_t usable = available > reserving ? available - reserving : 0;
 
   // If remaining size no less than expected size, using general
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 5eb1c44eb6..9cafdfda75 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -64,35 +64,52 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
     int gpu_num = platform::GetCUDADeviceCount();
     as = new BuddyAllocator*[gpu_num];
     for (int gpu = 0; gpu < gpu_num; gpu++) {
-      platform::SetDeviceId(gpu);
-      as[gpu] = new BuddyAllocator(new detail::GPUAllocator,
-                                   platform::GpuMinChunkSize(),
-                                   platform::GpuMaxChunkSize());
+      as[gpu] = nullptr;
     }
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator,
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
     VLOG(10) << "\n\nNOTE: each GPU device use "
              << FLAGS_fraction_of_gpu_memory_to_use * 100
              << "% of GPU memory.\n"
-             << "You can set environment variable '"
-             << platform::kEnvFractionGpuMemoryToUse
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
              << "' to change the fraction of GPU usage.\n\n";
   }
-  platform::SetDeviceId(gpu_id);
   return as[gpu_id];
 }
 
 template <>
-void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
-  return GetGPUBuddyAllocator(place.device)->Alloc(size);
+size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
 }
 
 template <>
-void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
+void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  auto* ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    int cur_dev = platform::GetCurrentDeviceId();
+    platform::SetDeviceId(place.device);
+    size_t avail, total;
+    platform::GpuMemoryUsage(avail, total);
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
+                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "total " << total;
+    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
+    LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
+    platform::SetDeviceId(cur_dev);
+  }
+  return ptr;
 }
 
 template <>
-size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
-  return GetGPUBuddyAllocator(place.device)->Used();
+void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
 }
 
 #endif
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index d39f7bf452..38b89b9eb1 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -9,6 +9,7 @@ function(op_library TARGET)
     set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
     set(cc_srcs)
     set(cu_srcs)
+    set(cu_cc_srcs)
     set(op_common_deps operator op_registry math_function)
     set(options "")
     set(oneValueArgs "")
@@ -22,6 +23,9 @@ function(op_library TARGET)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
             list(APPEND cc_srcs ${TARGET}.cc)
         endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+            list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
+        endif()
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
             list(APPEND cu_srcs ${TARGET}.cu)
         endif()
@@ -29,6 +33,8 @@ function(op_library TARGET)
         foreach(src ${op_library_SRCS})
             if (${src} MATCHES ".*\\.cu$")
                 list(APPEND cu_srcs ${src})
+            elseif(${src} MATCHES ".*\\.cu.cc$")
+                list(APPEND cu_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cc$")
                 list(APPEND cc_srcs ${src})
             else()
@@ -43,7 +49,7 @@ function(op_library TARGET)
     endif()
 
     if (WITH_GPU)
-        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
     else()
         cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
@@ -55,6 +61,25 @@ function(op_library TARGET)
         set(pybind_flag 1)
     endif()
 
+    if ("${TARGET}" STREQUAL "compare_op")
+        set(pybind_flag 1)
+        file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
+    endif()
+
+    # conv_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
+    endif()
+
+    # conv_cudnn_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_cudnn_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
+    endif()
+
     # pool_op contains several operators
     if ("${TARGET}" STREQUAL "pool_op")
         set(pybind_flag 1)
@@ -62,6 +87,18 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
     endif()
 
+    # pool_cudnn_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_cudnn_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
+    endif()
+
+    if ("${TARGET}" STREQUAL "logical_op")
+        set(pybind_flag 1)
+        file(APPEND ${pybind_file} "USE_OP(logical_and);\n")
+    endif()
+
     # pool_with_index_op contains several operators
     if ("${TARGET}" STREQUAL "pool_with_index_op")
         set(pybind_flag 1)
@@ -69,11 +106,18 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
     endif()
 
-    # pool_cudnn_op contains several operators
-    if ("${TARGET}" STREQUAL "pool_cudnn_op")
+    # conv_transpose_op contains several operators
+    if ("${TARGET}" STREQUAL "conv_transpose_op")
         set(pybind_flag 1)
         # It's enough to just adding one operator to pybind
-        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
+        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
+    endif()
+
+    # conv_transpose_cudnn_op contains two operators
+    if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
     endif()
 
     # save_restore_op contains several operators
@@ -104,6 +148,11 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
     endif()
 
+    if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
+        set(pybind_flag 1)
+        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
+    endif()
+
     # pybind USE_NO_KERNEL_OP
     # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
     file(READ ${TARGET}.cc TARGET_CONTENT)
@@ -116,7 +165,9 @@ function(op_library TARGET)
 
     # pybind USE_CPU_ONLY_OP
     list(LENGTH cu_srcs cu_srcs_len)
-    if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0)
+    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
+
+    if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
     endif()
@@ -134,33 +185,82 @@ set(DEPS_OPS
     cond_op
     cross_entropy_op
     recurrent_op
-    dynamic_recurrent_op
     softmax_with_cross_entropy_op
+    softmax_op
+    sequence_softmax_op
     sum_op
     pool_op
     maxout_op
+    unpool_op
     pool_with_index_op
+    conv_op
+    conv_transpose_op
     nccl_op
     sequence_conv_op
+    sequence_pool_op
     lod_rank_table_op
-    lstm_op)
+    lod_tensor_to_array_op
+    array_to_lod_tensor_op
+    max_sequence_len_op
+    lstm_op
+    tensor_array_read_write_op
+    gru_op
+    adagrad_op
+    sgd_op
+    save_op
+    load_op
+    send_op
+    recv_op)
+
+if(WITH_DISTRIBUTE)
+add_subdirectory(detail)
+op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+    send_op.cc
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
+op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+set_source_files_properties(
+    recv_op.cc
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
+cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+endif()
 
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
-op_library(sum_op DEPS net_op selected_rows_functor)
+op_library(softmax_op DEPS softmax)
+op_library(sequence_softmax_op DEPS softmax)
+op_library(sum_op DEPS selected_rows_functor)
+op_library(sgd_op DEPS selected_rows_functor)
+op_library(adagrad_op DEPS selected_rows_functor)
+op_library(conv_op DEPS vol2col)
 op_library(pool_op DEPS pooling)
 op_library(maxout_op DEPS maxouting)
+op_library(unpool_op DEPS unpooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
+op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
+op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
+op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table)
+op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
 if(WITH_GPU)
 op_library(nccl_op DEPS nccl_common)
 endif()
 op_library(sequence_conv_op DEPS context_project)
+op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
-op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
-        DEPS net_op tensor_array)
+op_library(conv_transpose_op DEPS vol2col)
+op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
+
+# FIXME(typhoonzero): save/load depends lodtensor serialization functions
+op_library(save_op DEPS lod_tensor)
+op_library(load_op DEPS lod_tensor)
+
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
     op_library(${src})
@@ -168,14 +268,14 @@ endforeach()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
+
+
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
+cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
-cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
-        rnn/recurrent_op_utils.cc
-        DEPS dynamic_recurrent_op)
 if(WITH_GPU)
-  nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context)
+  cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index eaafb9ad54..2785a8c6fb 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -30,6 +30,10 @@ class AccuracyOp : public framework::OperatorWithKernel {
                    "Input (Label) of accuracy op should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Accuracy"),
                    "Output (Accuracy) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Correct"),
+                   "Output (Correct) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Total"),
+                   "Output (Total) of AccuracyOp should not be null.");
 
     auto inference_dim = ctx->GetInputDim("Out");
     auto label_dim = ctx->GetInputDim("Label");
@@ -43,14 +47,17 @@ class AccuracyOp : public framework::OperatorWithKernel {
                       " the same as label.");
 
     ctx->SetOutputDim("Accuracy", {1});
+    ctx->SetOutputDim("Correct", {1});
+    ctx->SetOutputDim("Total", {1});
     ctx->ShareLoD("Out", /*->*/ "Accuracy");
   }
 
  protected:
-  // IndicateDataType
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Out")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
+        ctx.device_context());
   }
 };
 
@@ -65,6 +72,8 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Label", "Label of the training data");
     // TODO(typhoonzero): AddInput("Weight", ...
     AddOutput("Accuracy", "The accuracy of current batch");
+    AddOutput("Correct", "The correct samples count of current batch");
+    AddOutput("Total", "The samples count of current batch");
 
     AddComment(R"DOC(
 Accuracy Operator. 
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index a0483f367e..d2dcab4e54 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <thrust/reduce.h>
 #include "paddle/operators/accuracy_op.h"
 #include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/gpu_info.h"
 
 namespace paddle {
 namespace operators {
@@ -24,7 +25,8 @@ using platform::PADDLE_CUDA_NUM_THREADS;
 template <int BlockSize>
 __global__ void AccuracyCudaKernel(const int N, const int D,
                                    const int64_t* Xdata,
-                                   const int64_t* labeldata, float* accuracy) {
+                                   const int64_t* labeldata, int* correct_data,
+                                   float* accuracy) {
   int count = 0;
   __shared__ int total[BlockSize];
 
@@ -43,6 +45,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
   // reduce the count with init value 0, and output accuracy.
   int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
   if (threadIdx.x == 0) {
+    *correct_data = result;
     *accuracy = static_cast<float>(result) / static_cast<float>(N);
   }
 }
@@ -56,34 +59,50 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
     auto* inference = ctx.Input<Tensor>("Out");
     auto* indices = ctx.Input<Tensor>("Indices");
     auto* label = ctx.Input<Tensor>("Label");
+
     auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    auto* correct = ctx.Output<Tensor>("Correct");
+    auto* total = ctx.Output<Tensor>("Total");
     // FIXME(typhoonzero): only support indices currently
     // if add support for output values, how to detect the data type?
     const int64_t* indices_data = indices->data<int64_t>();
     const int64_t* label_data = label->data<int64_t>();
+
+    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
+    int* total_data = total->mutable_data<int>(ctx.GetPlace());
     float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
 
-    size_t num_samples = inference->dims()[0];
+    int num_samples = static_cast<int>(inference->dims()[0]);
     size_t infer_width = inference->dims()[1];
-    cudaMemset((void**)&accuracy_data, 0, sizeof(float));
+    auto stream = ctx.cuda_device_context().stream();
+    platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream);
 
     if (num_samples == 0) {
       return;
     }
+    platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int),
+                             cudaMemcpyHostToDevice, stream);
+
+    AccuracyCudaKernel<
+        PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        num_samples, infer_width, indices_data, label_data, correct_data,
+        accuracy_data);
 
-    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
-        1, PADDLE_CUDA_NUM_THREADS, 0,
-        reinterpret_cast<const platform::CUDADeviceContext&>(
-            ctx.device_context())
-            .stream()>>>(num_samples, infer_width, indices_data, label_data,
-                         accuracy_data);
+    int d_num_samples, d_num_correct;
+    float d_accuracy;
+    platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int),
+                             cudaMemcpyDeviceToHost, stream);
+    platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int),
+                             cudaMemcpyDeviceToHost, stream);
+    platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float),
+                             cudaMemcpyDeviceToHost, stream);
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-// FIXME(typhoonzero): types of T is for infernece data.
-// label data is always int
+// FIXME(typhoonzero): types of T is for inference data.
+// label data is always int64
 REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
                        paddle::operators::AccuracyOpCUDAKernel<double>);
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
index 1968b53d19..d060e6eddd 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 
 namespace paddle {
@@ -22,18 +21,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-
 template <typename Place, typename T>
 class AccuracyKernel : public framework::OpKernel<T> {
  public:
@@ -42,7 +29,11 @@ class AccuracyKernel : public framework::OpKernel<T> {
     auto* indices = ctx.Input<Tensor>("Indices");
     auto* label = ctx.Input<Tensor>("Label");
     auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    auto* correct = ctx.Output<Tensor>("Correct");
+    auto* total = ctx.Output<Tensor>("Total");
 
+    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
+    int* total_data = total->mutable_data<int>(ctx.GetPlace());
     float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
 
     const int64_t* indices_data = indices->data<int64_t>();
@@ -68,7 +59,8 @@ class AccuracyKernel : public framework::OpKernel<T> {
       }
     }
 
-    // FIXME(typhoonzero): we don't accumulate the accuracy for now.
+    *correct_data = num_correct;
+    *total_data = num_samples;
     *accuracy_data =
         static_cast<float>(num_correct) / static_cast<float>(num_samples);
   }
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 483f988897..7f3118f176 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -44,9 +44,9 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Sigmoid operator");
     AddOutput("Y", "Output of Sigmoid operator");
     AddComment(R"DOC(
-Sigmoid activation operator.
+Sigmoid Activation Operator
 
-$y = 1 / (1 + e^{-x})$
+$$y = \frac{1}{1 + e^{-x}}$$
 
 )DOC");
   }
@@ -60,9 +60,9 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of LogSigmoid operator");
     AddOutput("Y", "Output of LogSigmoid operator");
     AddComment(R"DOC(
-Logsigmoid activation operator.
+Logsigmoid Activation Operator
 
-$y = \log(1 / (1 + e^{-x}))$
+$$y = \log \frac{1}{1 + e^{-x}}$$
 
 )DOC");
   }
@@ -75,7 +75,7 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Exp operator");
     AddOutput("Y", "Output of Exp operator");
     AddComment(R"DOC(
-Exp activation operator.
+Exp Activation Operator.
 
 $y = e^x$
 
@@ -90,7 +90,7 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Relu operator");
     AddOutput("Y", "Output of Relu operator");
     AddComment(R"DOC(
-Relu activation operator.
+Relu Activation Operator.
 
 $y = \max(x, 0)$
 
@@ -98,7 +98,6 @@ $y = \max(x, 0)$
   }
 };
 
-template <typename AttrType>
 class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   LeakyReluOpMaker(framework::OpProto *proto,
@@ -106,10 +105,9 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of LeakyRelu operator");
     AddOutput("Y", "Output of LeakyRelu operator");
-    AddAttr<AttrType>("alpha", "The small negative slope")
-        .SetDefault(static_cast<AttrType>(0.02f));
+    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
     AddComment(R"DOC(
-LeakyRelu activation operator.
+LeakyRelu Activation Operator.
 
 $y = \max(x, \alpha * x)$
 
@@ -117,7 +115,6 @@ $y = \max(x, \alpha * x)$
   }
 };
 
-template <typename AttrType>
 class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SoftShrinkOpMaker(framework::OpProto *proto,
@@ -125,10 +122,9 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Softshrink operator");
     AddOutput("Y", "Output of Softshrink operator");
-    AddAttr<AttrType>("lambda", "non-negative offset")
-        .SetDefault(static_cast<AttrType>(0.5f));
+    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
     AddComment(R"DOC(
-Softshrink activation operator.
+Softshrink Activation Operator.
 
 $$
 y = \begin{cases} 
@@ -149,7 +145,7 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Tanh operator");
     AddOutput("Y", "Output of Tanh operator");
     AddComment(R"DOC(
-Tanh activation operator.
+Tanh Activation Operator.
 
 $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
@@ -165,7 +161,7 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of TanhShrink operator");
     AddOutput("Y", "Output of TanhShrink operator");
     AddComment(R"DOC(
-TanhShrink activation operator.
+TanhShrink Activation Operator.
 
 $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
@@ -173,7 +169,6 @@ $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
   }
 };
 
-template <typename AttrType>
 class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   HardShrinkOpMaker(framework::OpProto *proto,
@@ -181,10 +176,10 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of HardShrink operator");
     AddOutput("Y", "Output of HardShrink operator");
-    AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
-        .SetDefault(static_cast<AttrType>(0.5));
+    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+        .SetDefault(0.5f);
     AddComment(R"DOC(
-HardShrink activation operator.
+HardShrink Activation Operator.
 
 $$
 y = \begin{cases} 
@@ -205,7 +200,7 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Sqrt operator");
     AddOutput("Y", "Output of Sqrt operator");
     AddComment(R"DOC(
-Sqrt activation operator.
+Sqrt Activation Operator.
 
 $y = \sqrt{x}$
 
@@ -220,7 +215,7 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Abs operator");
     AddOutput("Y", "Output of Abs operator");
     AddComment(R"DOC(
-Abs activation operator.
+Abs Activation Operator.
 
 $y = |x|$
 
@@ -228,6 +223,51 @@ $y = |x|$
   }
 };
 
+class CeilOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CeilOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Ceil operator");
+    AddOutput("Y", "Output of Ceil operator");
+    AddComment(R"DOC(
+Ceil Activation Operator.
+
+$y = ceil(x)$
+
+)DOC");
+  }
+};
+
+class FloorOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FloorOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Floor operator");
+    AddOutput("Y", "Output of Floor operator");
+    AddComment(R"DOC(
+Floor Activation Operator.
+
+$y = floor(x)$
+
+)DOC");
+  }
+};
+
+class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RoundOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Round operator");
+    AddOutput("Y", "Output of Round operator");
+    AddComment(R"DOC(
+Round Activation Operator.
+
+$y = [x]$
+
+)DOC");
+  }
+};
+
 class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ReciprocalOpMaker(framework::OpProto *proto,
@@ -236,7 +276,7 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Reciprocal operator");
     AddOutput("Y", "Output of Reciprocal operator");
     AddComment(R"DOC(
-Reciprocal activation operator.
+Reciprocal Activation Operator.
 
 $$y = \frac{1}{x}$$
 
@@ -251,7 +291,7 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Log operator");
     AddOutput("Y", "Output of Log operator");
     AddComment(R"DOC(
-Log activation operator.
+Log Activation Operator.
 
 $y = \ln(x)$
 
@@ -268,7 +308,7 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Square operator");
     AddOutput("Y", "Output of Square operator");
     AddComment(R"DOC(
-Square activation operator.
+Square Activation Operator.
 
 $y = x^2$
 
@@ -284,7 +324,7 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Softplus operator");
     AddOutput("Y", "Output of Softplus operator");
     AddComment(R"DOC(
-Softplus activation operator.
+Softplus Activation Operator.
 
 $y = \ln(1 + e^{x})$
 
@@ -300,7 +340,7 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "Input of Softsign operator");
     AddOutput("Y", "Output of Softsign operator");
     AddComment(R"DOC(
-Softsign activation operator.
+Softsign Activation Operator.
 
 $$y = \frac{x}{1 + |x|}$$
 
@@ -308,19 +348,18 @@ $$y = \frac{x}{1 + |x|}$$
   }
 };
 
-template <typename AttrType>
 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of BRelu operator");
     AddOutput("Y", "Output of BRelu operator");
-    AddAttr<AttrType>("t_min", "The min marginal value of BRelu")
-        .SetDefault(static_cast<AttrType>(0));
-    AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
-        .SetDefault(static_cast<AttrType>(24));
+    AddAttr<float>("t_min", "The min marginal value of BRelu")
+        .SetDefault(static_cast<float>(0));
+    AddAttr<float>("t_max", "The max marginal value of BRelu")
+        .SetDefault(static_cast<float>(24));
     AddComment(R"DOC(
-BRelu activation operator.
+BRelu Activation Operator.
 
 $y = \max(\min(x, t_{min}), t_{max})$
 
@@ -328,7 +367,6 @@ $y = \max(\min(x, t_{min}), t_{max})$
   }
 };
 
-template <typename AttrType>
 class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SoftReluOpMaker(framework::OpProto *proto,
@@ -336,10 +374,10 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of SoftRelu operator");
     AddOutput("Y", "Output of SoftRelu operator");
-    AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
-        .SetDefault(static_cast<AttrType>(40));
+    AddAttr<float>("threshold", "The threshold value of SoftRelu")
+        .SetDefault(40.0f);
     AddComment(R"DOC(
-SoftRelu activation operator.
+SoftRelu Activation Operator.
 
 $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
 
@@ -347,17 +385,15 @@ $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
   }
 };
 
-template <typename AttrType>
 class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of ELU operator");
     AddOutput("Y", "Output of ELU operator");
-    AddAttr<AttrType>("alpha", "The alpha value of ELU")
-        .SetDefault(static_cast<AttrType>(1.0f));
+    AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
     AddComment(R"DOC(
-ELU activation operator.
+ELU Activation Operator.
 
 Applies the following element-wise computation on the input according to
 https://arxiv.org/abs/1511.07289.
@@ -368,17 +404,16 @@ $y = \max(0, x) + \min(0, \alpha * (e^x - 1))$
   }
 };
 
-template <typename AttrType>
 class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Relu6 operator");
     AddOutput("Y", "Output of Relu6 operator");
-    AddAttr<AttrType>("threshold", "The threshold value of Relu6")
-        .SetDefault(static_cast<AttrType>(6));
+    AddAttr<float>("threshold", "The threshold value of Relu6")
+        .SetDefault(6.0f);
     AddComment(R"DOC(
-Relu6 activation operator.
+Relu6 Activation Operator.
 
 $y = \min(\max(0, x), 6)$
 
@@ -386,17 +421,15 @@ $y = \min(\max(0, x), 6)$
   }
 };
 
-template <typename AttrType>
 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of Pow operator");
     AddOutput("Y", "Output of Pow operator");
-    AddAttr<AttrType>("factor", "The exponential factor of Pow")
-        .SetDefault(static_cast<AttrType>(1));
+    AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
     AddComment(R"DOC(
-Pow activation operator.
+Pow Activation Operator.
 
 $y = x^{factor}$
 
@@ -404,19 +437,18 @@ $y = x^{factor}$
   }
 };
 
-template <typename AttrType>
 class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of STanh operator");
     AddOutput("Y", "Output of STanh operator");
-    AddAttr<AttrType>("scale_a", "The scale parameter of a for the input")
-        .SetDefault(static_cast<AttrType>(2 / 3));
-    AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
-        .SetDefault(static_cast<AttrType>(1.7159));
+    AddAttr<float>("scale_a", "The scale parameter of a for the input")
+        .SetDefault(2.0f / 3.0f);
+    AddAttr<float>("scale_b", "The scale parameter of b for the input")
+        .SetDefault(1.7159f);
     AddComment(R"DOC(
-STanh activation operator.
+STanh Activation Operator.
 
 $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
 
@@ -424,7 +456,6 @@ $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
   }
 };
 
-template <typename AttrType>
 class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ThresholdedReluOpMaker(framework::OpProto *proto,
@@ -432,10 +463,10 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of ThresholdedRelu operator");
     AddOutput("Y", "Output of ThresholdedRelu operator");
-    AddAttr<AttrType>("threshold", "The threshold location of activation")
-        .SetDefault(static_cast<AttrType>(1.0));
+    AddAttr<float>("threshold", "The threshold location of activation")
+        .SetDefault(1.0f);
     AddComment(R"DOC(
-ThresholdedRelu activation operator.
+ThresholdedRelu Activation Operator.
 
 $$
 y = \begin{cases} 
@@ -448,7 +479,6 @@ $$
   }
 };
 
-template <typename AttrType>
 class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   HardSigmoidOpMaker(framework::OpProto *proto,
@@ -456,12 +486,12 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "Input of HardSigmoid operator");
     AddOutput("Y", "Output of HardSigmoid operator");
-    AddAttr<AttrType>("slope", "Slope for linear approximation of sigmoid")
-        .SetDefault(static_cast<AttrType>(0.2));
-    AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
-        .SetDefault(static_cast<AttrType>(0.5));
+    AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
+        .SetDefault(0.2f);
+    AddAttr<float>("offset", "Offset for linear approximation of sigmoid")
+        .SetDefault(0.5f);
     AddComment(R"DOC(
-HardSigmoid activation operator.
+HardSigmoid Activation Operator.
 
 Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), 
 which is much faster than sigmoid.
@@ -476,6 +506,22 @@ It is recommended to use the defaults for this activation.
   }
 };
 
+class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SwishOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Swish operator");
+    AddOutput("Y", "Output of Swish operator");
+    AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);
+    AddComment(R"DOC(
+Swish Activation Operator.
+
+$$y = \frac{x}{1 + e^{- \beta x}}$$
+
+)DOC");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -499,7 +545,7 @@ REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
 REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
             tanh_shrink_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker<float>,
+REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
             softshrink_grad, ops::ActivationOpGrad);
 
 REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
@@ -508,6 +554,15 @@ REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
 REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
             ops::ActivationOpGrad);
 
+REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad,
+            ops::ActivationOpGrad);
+
 REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
             reciprocal_grad, ops::ActivationOpGrad);
 
@@ -523,37 +578,39 @@ REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad,
 REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker<float>, brelu_grad,
+REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker<float>,
+REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker,
             leaky_relu_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>,
-            soft_relu_grad, ops::ActivationOpGrad);
+REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad,
+            ops::ActivationOpGrad);
 
-REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker<float>, elu_grad,
+REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker<float>, relu6_grad,
+REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
+REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
+REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad,
             ops::ActivationOpGrad);
 
-REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker<float>,
+REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker,
             hard_shrink_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(thresholded_relu, ops::ActivationOp,
-            ops::ThresholdedReluOpMaker<float>, thresholded_relu_grad,
-            ops::ActivationOpGrad);
+REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker,
+            thresholded_relu_grad, ops::ActivationOpGrad);
 
-REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker<float>,
+REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
             hard_sigmoid_grad, ops::ActivationOpGrad);
 
+REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
+            ops::ActivationOpGrad);
+
 #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)       \
   REGISTER_OP_CPU_KERNEL(                                                     \
       act_type,                                                               \
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index ceb4b4e40b..ac0e0a3b01 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -283,6 +283,41 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+// ceil(x) = ceiling(x)
+template <typename T>
+struct CeilFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.ceil();
+  }
+};
+
+template <typename T>
+struct ZeroGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = static_cast<T>(0) / x;
+  }
+};
+
+// floor(x) = flooring(x)
+template <typename T>
+struct FloorFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.ceil();
+  }
+};
+
+// round(x) = [x]
+template <typename T>
+struct RoundFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.round();
+  }
+};
+
 // abs(x) = |x|
 template <typename T>
 struct AbsFunctor : public BaseActivationFunctor<T> {
@@ -665,6 +700,35 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct SwishFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};
+  }
+
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x / (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+  }
+};
+
+template <typename T>
+struct SwishGradFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};
+  }
+
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    auto temp1 = static_cast<T>(1) /
+                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+    auto temp2 = temp1 * (static_cast<T>(1) - (beta * y));
+    dx.device(d) = dy * ((beta * y) + temp2);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -677,6 +741,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
   __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
   __macro(abs, AbsFunctor, AbsGradFunctor);                          \
+  __macro(ceil, CeilFunctor, ZeroGradFunctor);                       \
+  __macro(floor, FloorFunctor, ZeroGradFunctor);                     \
+  __macro(round, RoundFunctor, ZeroGradFunctor);                     \
   __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
   __macro(log, LogFunctor, LogGradFunctor);                          \
   __macro(square, SquareFunctor, SquareGradFunctor);                 \
@@ -692,4 +759,5 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   __macro(elu, ELUFunctor, ELUGradFunctor);                          \
   __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor);    \
   __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \
+  __macro(swish, SwishFunctor, SwishGradFunctor);                    \
   __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor);
diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
index 24e419b532..16a7794d5b 100644
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -64,16 +64,15 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("AvgSquaredGrad",
-             "(Tensor) Input expectation of squared gradient");
+    AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
     AddInput("AvgSquaredUpdate",
-             "(Tensor) Input expectation of squared parameter updates");
+             "(Tensor) Input average of squared parameter updates");
 
     AddOutput("ParamOut", "(Tensor) Output parameter");
     AddOutput("AvgSquaredGradOut",
-              "(Tensor) Output expectation of squared gradient");
+              "(Tensor) Output average of squared gradient");
     AddOutput("AvgSquaredUpdateOut",
-              "(Tensor) Output expectation of squared parameter updates");
+              "(Tensor) Output average of squared parameter updates");
 
     AddAttr<float>("rho",
                    "(float, default 0.95) Exponential decay rate "
@@ -84,22 +83,21 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
                    "numerical stability")
         .SetDefault(1.0e-6f);
     AddComment(R"DOC(
-Adadelta Updates Operator.
+Adadelta Optimizer.
 
-This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
-adaptive learning rate method for gradient descent.
+Adadelta optimizer is implemented as explained in:
+https://arxiv.org/abs/1212.5701
+Adadelta is a per-dimension adaptive learning rate method used
+for gradient descent.
 
-Adadelta updates:
+Adadelta updates are as follows:
 
-avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
-param_update =  - sqrt((avg_squared_update + epsilon) /
-                       (avg_squared_grad_out + epsilon)) * grad
-avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
-param_out = param + param_update
-
-References:
-  [1] ADADELTA: An Adaptive Learning Rate Method
-      https://arxiv.org/abs/1212.5701
+$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
+paramUpdate =  - $\sqrt{((avgSquaredUpdate + \epsilon) /
+                       (avgSquaredGrad_out + \epsilon))}$ * grad \break
+avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) *
+                                  {(paramUpdate)}^2 \break
+paramOut = param + paramUpdate$$
 
 )DOC");
   }
@@ -111,4 +109,5 @@ References:
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>);
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
+    ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu
index 3af1c8c8e9..9fb6185207 100644
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
@@ -17,4 +17,5 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>);
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
+    ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h
index d29e15c435..a8c5f0c8aa 100644
--- a/paddle/operators/adadelta_op.h
+++ b/paddle/operators/adadelta_op.h
@@ -33,8 +33,8 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
     avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
     avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());
 
-    float rho = ctx.Attr<float>("rho");
-    float epsilon = ctx.Attr<float>("epsilon");
+    T rho = static_cast<T>(ctx.Attr<float>("rho"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
 
     auto param = framework::EigenVector<T>::Flatten(
         *ctx.Input<framework::Tensor>("Param"));
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
index bc081f87dc..d6686e3ef3 100644
--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -14,6 +14,11 @@ limitations under the License. */
 
 #include "paddle/operators/adagrad_op.h"
 
+#include <cmath>
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+
 namespace paddle {
 namespace operators {
 
@@ -21,7 +26,7 @@ class AdagradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Param"),
                    "Input(Param) of AdagradOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Grad"),
@@ -54,8 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel {
 
 class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdagradOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  AdagradOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
@@ -73,20 +78,99 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 
 Adaptive Gradient Algorithm (Adagrad).
 
-moment_out = moment + grad * grad
-param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
+The update is done as follows:
+
+$$momentOut = moment + grad * grad \break
+paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break
+$$
 
 The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-does not have the epsilon attribute. It is added here for numerical stability 
-by avoiding division by zero.
+does not have the epsilon attribute. It is added here in our implementation
+as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
+for numerical stability to avoid the division by zero error.
 
 )DOC");
   }
 };
+
+namespace {
+size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
+  return std::find(rows.begin(), rows.end(), value) - rows.begin();
+}
+}  // namespace
+
+template <typename T>
+struct SparseAdagradFunctor<platform::CPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& grad,
+                  const framework::Tensor& learning_rate, T epsilon,
+                  framework::Tensor* moment, framework::Tensor* param) {
+    // 1. g_m.rows = set(g.rows)
+    auto grad_rows = grad.rows();
+    std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+
+    auto grad_width = grad.value().dims()[1];
+    std::unique_ptr<framework::SelectedRows> grad_merge{
+        new framework::SelectedRows()};
+    grad_merge->set_rows(merge_rows);
+    grad_merge->set_height(grad.height());
+    grad_merge->mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), grad_width}),
+        context.GetPlace());
+
+    math::SetConstant<platform::CPUPlace, T> constant_functor;
+    constant_functor(context, grad_merge->mutable_value(), 0.0);
+
+    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
+    auto* grad_data = grad.value().data<T>();
+
+    for (size_t i = 0; i < grad_rows.size(); i++) {
+      size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]);
+      for (int64_t j = 0; j < grad_width; j++) {
+        grad_merge_data[grad_merge_i * grad_width + j] +=
+            grad_data[i * grad_width + j];
+      }
+    }
+
+    // 2. m += g_m * g_m
+    std::unique_ptr<framework::SelectedRows> grad_square{
+        new framework::SelectedRows()};
+    grad_square->set_rows(grad_merge->rows());
+    grad_square->set_height(grad_merge->height());
+    grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
+                                                  context.GetPlace());
+    auto gs =
+        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
+    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
+    gs.device(*context.GetEigenDevice<platform::CPUPlace>()) = gm * gm;
+
+    math::SelectedRowsAddToTensor<platform::CPUPlace, T> functor;
+    functor(context, *grad_square, moment);
+
+    // 3. update parameter
+    auto* lr = learning_rate.data<T>();
+    auto* param_data = param->data<T>();
+    auto* moment_data = moment->data<T>();
+
+    for (size_t i = 0; i < merge_rows.size(); i++) {
+      for (int64_t j = 0; j < grad_width; j++) {
+        param_data[merge_rows[i] * grad_width + j] -=
+            lr[0] * grad_merge_data[i * grad_width + j] /
+            (std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon);
+      }
+    }
+  }
+};
+
+template struct SparseAdagradFunctor<platform::CPUPlace, float>;
+template struct SparseAdagradFunctor<platform::CPUPlace, double>;
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
-REGISTER_OP_CPU_KERNEL(adagrad,
-                       ops::AdagradOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    adagrad, ops::AdagradOpKernel<paddle::platform::CPUPlace, float>,
+    ops::AdagradOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index a5b7951121..1c870214b2 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -14,7 +14,138 @@
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/adagrad_op.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+namespace {
+
+template <typename T, int block_size>
+__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows,
+                                T* grad_merge, const int64_t* grad_merge_rows,
+                                size_t grad_merge_rows_size,
+                                int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+  __shared__ size_t grad_merge_idx;
+
+  if (tid == 0) {
+    for (size_t i = 0; i < grad_merge_rows_size; i++) {
+      if (grad_rows[ty] == grad_merge_rows[i]) {
+        grad_merge_idx = i;
+      }
+    }
+  }
+
+  __syncthreads();
+
+  grad += ty * row_numel;
+  grad_merge += grad_merge_idx * row_numel;
+  for (int index = tid; index < row_numel; index += block_size) {
+    paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]);
+  }
+}
+
+template <typename T, int block_size>
+__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
+                                           const T* learning_rate, T* param,
+                                           T* moment, int64_t row_numel,
+                                           T epsilon) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  grad += ty * row_numel;
+  param += rows[ty] * row_numel;
+  moment += rows[ty] * row_numel;
+
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Since index in rows of SelectedRows can be duplicate, we have to use
+    // Atomic Operation to avoid concurrent write error.
+    paddle::platform::CudaAtomicAdd(param + index,
+                                    -1.0 * learning_rate[0] * grad[index] /
+                                        (sqrt(moment[index]) + epsilon));
+  }
+}
+}  // namespace
+
+template <typename T>
+struct SparseAdagradFunctor<platform::GPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& grad,
+                  const framework::Tensor& learning_rate, T epsilon,
+                  framework::Tensor* moment, framework::Tensor* param) {
+    // 1. g_m.rows = set(g.rows)
+    auto grad_rows = grad.rows();
+    std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+
+    auto grad_width = grad.value().dims()[1];
+    std::unique_ptr<framework::SelectedRows> grad_merge{
+        new framework::SelectedRows()};
+    grad_merge->set_rows(merge_rows);
+    grad_merge->set_height(grad.height());
+    grad_merge->mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), grad_width}),
+        context.GetPlace());
+
+    math::SetConstant<platform::GPUPlace, T> constant_functor;
+    constant_functor(context, grad_merge->mutable_value(), 0.0);
+
+    auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
+    auto* grad_data = grad.value().data<T>();
+
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid1(1, grad_rows.size());
+
+    MergeGradKernel<
+        T, 256><<<grid1, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(grad_data, grad.rows().data(),
+                                   grad_merge_data, grad_merge->rows().data(),
+                                   grad_merge->rows().size(), grad_width);
+
+    // 2. m += g_m * g_m
+    std::unique_ptr<framework::SelectedRows> grad_square{
+        new framework::SelectedRows()};
+    grad_square->set_rows(grad_merge->rows());
+    grad_square->set_height(grad_merge->height());
+    grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
+                                                  context.GetPlace());
+    auto gs =
+        framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
+    auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
+    gs.device(*context.GetEigenDevice<platform::GPUPlace>()) = gm * gm;
+
+    math::SelectedRowsAddToTensor<platform::GPUPlace, T> functor;
+    functor(context, *grad_square, moment);
+
+    // 3. update parameter
+    auto* lr = learning_rate.data<T>();
+    auto* param_data = param->data<T>();
+    auto* moment_data = moment->data<T>();
+
+    dim3 grid2(1, merge_rows.size());
+    SparseAdagradFunctorKernel<
+        T, 256><<<grid2, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(grad_merge_data, grad_merge->rows().data(),
+                                   lr, param_data, moment_data, grad_width,
+                                   epsilon);
+  }
+};
+
+template struct SparseAdagradFunctor<platform::GPUPlace, float>;
+template struct SparseAdagradFunctor<platform::GPUPlace, double>;
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(adagrad,
-                       ops::AdagradOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    adagrad, ops::AdagradOpKernel<paddle::platform::GPUPlace, float>,
+    ops::AdagradOpKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h
index c5d8f751d3..4d4a6434c7 100644
--- a/paddle/operators/adagrad_op.h
+++ b/paddle/operators/adagrad_op.h
@@ -19,35 +19,59 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+template <typename Place, typename T>
+struct SparseAdagradFunctor {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& grad,
+                  const framework::Tensor& learning_rate, T epsilon,
+                  framework::Tensor* moment, framework::Tensor* param);
+};
+
 template <typename Place, typename T>
 class AdagradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
-    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+    auto* param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto* moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
 
     param_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment_out_tensor->mutable_data<T>(ctx.GetPlace());
 
-    float epsilon = ctx.Attr<float>("epsilon");
-
-    auto param = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Param"));
-    auto grad = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Grad"));
-    auto moment = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("Moment"));
-    auto lr = framework::EigenVector<T>::Flatten(
-        *ctx.Input<framework::Tensor>("LearningRate"));
-
-    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
-    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
-    auto place = ctx.GetEigenDevice<Place>();
-
-    moment_out.device(place) = moment + grad * grad;
-    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-    param_out.device(place) =
-        param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+
+    auto* grad_var = ctx.InputVar("Grad");
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      auto param = framework::EigenVector<T>::Flatten(
+          *ctx.Input<framework::Tensor>("Param"));
+      auto grad = framework::EigenVector<T>::Flatten(
+          *ctx.Input<framework::Tensor>("Grad"));
+      auto moment = framework::EigenVector<T>::Flatten(
+          *ctx.Input<framework::Tensor>("Moment"));
+      auto lr = framework::EigenVector<T>::Flatten(
+          *ctx.Input<framework::Tensor>("LearningRate"));
+
+      auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+      auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
+      auto place = ctx.GetEigenDevice<Place>();
+
+      moment_out.device(place) = moment + grad * grad;
+      Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
+      param_out.device(place) =
+          param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto* param_tensor = ctx.Input<framework::Tensor>("Param");
+      PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
+
+      auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
+      PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
+
+      SparseAdagradFunctor<Place, T> functor;
+      functor(ctx.device_context(), *ctx.Input<framework::SelectedRows>("Grad"),
+              *ctx.Input<framework::Tensor>("LearningRate"), epsilon,
+              moment_out_tensor, param_out_tensor);
+    } else {
+      PADDLE_THROW("Unsupported Variable Type of Grad");
+    }
   }
 };
 
diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc
index 3572de06bd..03faa2a7c5 100644
--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -51,8 +51,8 @@ class AdamOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
                       "Beta1 power accumulator should have 1 dimension");
     auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
+    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+                      "Beta2 power accumulator should have 1 dimension");
 
     auto param_dims = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
@@ -60,10 +60,10 @@ class AdamOp : public framework::OperatorWithKernel {
         "Param and Grad input of AdamOp should have same dimension");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment1"),
-        "Param and Moment input of AdamOp should have same dimension");
+        "Param and Moment1 input of AdamOp should have same dimension");
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment2"),
-        "Param and InfNorm input of AdamOp should have same dimension");
+        "Param and Moment2 input of AdamOp should have same dimension");
 
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
@@ -103,23 +103,20 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(1.0e-8f);
 
     AddComment(R"DOC(
-Adam Updates Operator.
+Adam Optimizer.
 
 This implements the Adam optimizer from Section 2 of the Adam
-paper[1]. Adam is a first-order gradient-based optimization
-method based on adaptive estimates of lower-order moments.
+paper : https://arxiv.org/abs/1412.6980.
+Adam is a first-order gradient-based optimization method based on
+adaptive estimates of lower-order moments.
 
 Adam updates:
 
-moment1_out = beta1 * moment1 + (1 − beta1) * grad
-moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad
-learning_rate_t = learning_rate_t *
-                  sqrt(1 - beta2_pow) / (1 - beta1_pow)
-param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)
-
-References:
-  [1] Adam: A Method for Stochastic Optimization
-      (https://arxiv.org/abs/1412.6980)
+$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break
+moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break
+learningRate = learningRate *
+                  $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break
+paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
 
 )DOC");
   }
@@ -130,4 +127,5 @@ References:
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
 REGISTER_OP_CPU_KERNEL(adam,
-                       ops::AdamOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::AdamOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::AdamOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu
index a3def912e5..6e34f7818c 100644
--- a/paddle/operators/adam_op.cu
+++ b/paddle/operators/adam_op.cu
@@ -17,4 +17,5 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(adam,
-                       ops::AdamOpKernel<paddle::platform::GPUPlace, float>);
+                       ops::AdamOpKernel<paddle::platform::GPUPlace, float>,
+                       ops::AdamOpKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index 45938006db..7f7fa1da1c 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -31,9 +31,9 @@ class AdamOpKernel : public framework::OpKernel<T> {
     moment1_out_tensor->mutable_data<T>(ctx.GetPlace());
     moment2_out_tensor->mutable_data<T>(ctx.GetPlace());
 
-    float beta1 = ctx.Attr<float>("beta1");
-    float beta2 = ctx.Attr<float>("beta2");
-    float epsilon = ctx.Attr<float>("epsilon");
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
 
     auto param = framework::EigenVector<T>::Flatten(
         *ctx.Input<framework::Tensor>("Param"));
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
index ff25657741..867ddd9790 100644
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -99,26 +99,24 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
                    "Constant for numerical stability")
         .SetDefault(1.0e-8f);
     AddComment(R"DOC(
-Adamax Updates Operator.
+Adamax Optimizer.
 
-This implements the Adamax optimizer from Section 7 of the Adam
-paper[1]. Adamax is a variant of the
+We implement the Adamax optimizer from Section 7 of the Adam
+paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
 Adam algorithm based on the infinity norm.
 
 Adamax updates:
 
-moment_out = beta1 * moment + (1 - beta1) * grad
-inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
-learning_rate_t = learning_rate/(1 - beta1_pow)
-param_out = param - learning_rate_t * moment_out/inf_norm_out
+$$
+  momentOut = \beta_{1} * moment + (1 - \beta_{1}) * grad \\
+  infNormOut = max(\beta_{2} * infNorm + \epsilon, |grad|) \\
+  learningRate = \frac{learningRate}{1 - \beta_{1}^{Beta1Pow}} \\
+  paramOut = param - learningRate * \frac{momentOut}{infNormOut}
+$$
 
 The original paper does not have an epsilon attribute.
-However, it is added here for numerical stability
-by preventing divide by 0.
-
-References:
-  [1] Adam: A Method for Stochastic Optimization
-      (https://arxiv.org/abs/1412.6980)
+However, it is added here for numerical stability to prevent the
+division by 0 error.
 
 )DOC");
   }
@@ -130,4 +128,5 @@ References:
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
 REGISTER_OP_CPU_KERNEL(adamax,
-                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu
index fee3b6fc6b..057ef39025 100644
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/operators/adamax_op.cu
@@ -17,4 +17,5 @@
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(adamax,
-                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>);
+                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>,
+                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h
index 2c99832ec0..bf36ed7860 100644
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
@@ -31,9 +31,9 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
     moment_out_tensor->mutable_data<T>(ctx.GetPlace());
     inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
 
-    float beta1 = ctx.Attr<float>("beta1");
-    float beta2 = ctx.Attr<float>("beta2");
-    float epsilon = ctx.Attr<float>("epsilon");
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
 
     auto param = framework::EigenVector<T>::Flatten(
         *ctx.Input<framework::Tensor>("Param"));
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
new file mode 100644
index 0000000000..1f2b4fdb4b
--- /dev/null
+++ b/paddle/operators/array_operator.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+class ArrayOp : public framework::OperatorBase {
+ public:
+  ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  size_t GetOffset(const framework::Scope &scope,
+                   const platform::DeviceContext &dev_ctx) const {
+    auto *i = scope.FindVar(Input("I"));
+    PADDLE_ENFORCE(i != nullptr, "I must be set");
+    auto &i_tensor = i->Get<framework::LoDTensor>();
+    PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+    size_t offset;
+    if (platform::is_gpu_place(i_tensor.place())) {
+      // FIXME: Avoid copy from GPU to CPU
+      framework::Tensor t;
+      framework::CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx, &t);
+      dev_ctx.Wait();
+      offset = static_cast<size_t>(*t.data<int64_t>());
+    } else {
+      offset = static_cast<size_t>(*i_tensor.data<int64_t>());
+    }
+    VLOG(10) << " Offset = " << offset;
+    return offset;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
new file mode 100644
index 0000000000..faeba7f3ed
--- /dev/null
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -0,0 +1,171 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <numeric>
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+class ArrayToLoDTensorOp : public framework::OperatorBase {
+ public:
+  ArrayToLoDTensorOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+
+    // Check dims, place and data type of input's elements and infer output's
+    // dim
+    PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
+    int rank = x[0].dims().size();
+    platform::Place place = x[0].place();
+    std::type_index data_type = x[0].type();
+    framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
+    int64_t batch_size = x[0].dims()[0];
+    for (size_t i = 1; i < x.size(); ++i) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
+                        "The dimension of the %zu'th element in LoDTensorArray "
+                        "differs from previous ones.",
+                        i);
+      PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
+                     "The place class of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      PADDLE_ENFORCE(x[i].type() == data_type,
+                     "The date type of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      batch_size += x[i].dims()[0];
+    }
+    auto ins_dim_vec = framework::vectorize(ins_dims);
+    ins_dim_vec.insert(ins_dim_vec.begin(), batch_size);
+    framework::DDim out_dims = framework::make_ddim(ins_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto &table_items = rank_table.items();
+    std::vector<size_t> table_item_idx(table_items.size());
+    // table_item_idx = range(table_items_idx.size())
+    std::iota(table_item_idx.begin(), table_item_idx.end(), 0);
+    std::sort(table_item_idx.begin(), table_item_idx.end(),
+              [&](size_t a, size_t b) {
+                return table_items[a].index < table_items[b].index;
+              });
+
+    // Build LoDTensor `out`
+    framework::LoD *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+    auto prefix_lod = rank_table.coarse_lod();
+    prefix_lod.emplace_back();
+    auto &cur_level_lod = prefix_lod.back();
+    cur_level_lod.push_back(0);
+    for (size_t idx : table_item_idx) {
+      cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
+      for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
+        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x[x_idx].lod(), idx, idx + 1, 0);
+
+        auto &lod_length = lod_and_offset.first;
+        framework::AppendLoD(out_lod, lod_length);
+
+        size_t start_offset = lod_and_offset.second.first;
+        size_t end_offset = lod_and_offset.second.second;
+        VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
+                 << ", " << end_offset << "]";
+        // Copy data
+        PADDLE_ENFORCE_GE(end_offset, start_offset);
+        size_t len = end_offset - start_offset;
+        if (len == 0) {
+          continue;
+        }
+        auto slice = out->Slice(out_offset, out_offset + len);
+        framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
+                            dev_ctx, &slice);
+        out_offset += len;
+      }
+    }
+    out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
+  }
+};
+
+class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto,
+                               framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(std::vector<LodTensor>) A vector of tensors that is going to "
+             "be casted to a big LoDTensor.");
+    AddInput("RankTable",
+             "(LoDRankTable) RankTable provides the coarse lod infomation to "
+             "build the output LoDTensor. See "
+             "'paddle/framework/lod_rank_table.h' for more details.");
+    AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array.");
+    AddComment(
+        R"DOC(This Op build a big LoDTensor from a std::vector<LoDTensor> 
+          and a LoDRankTable. It is supposed to be used in getting dynamic RNN's
+          outputs back to a normal LoDTensor. The std::vector<LoDTensor> 
+          would be the output of RNN Op and the LoDRankTable would be build 
+          with RNN's input.)DOC");
+  }
+};
+
+class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "ArrayToLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("RankTable"),
+                   "ArrayToLoDTensorOp must has input RankTable.");
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+};
+
+class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("lod_tensor_to_array");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("RankTable", Input("RankTable"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
+                  ops::ArrayToLoDTensorOpProtoMaker,
+                  ops::ArrayToLoDTensorInferShape,
+                  ops::ArrayToLoDTensorGradMaker);
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
new file mode 100644
index 0000000000..0a37f18729
--- /dev/null
+++ b/paddle/operators/assign_op.cc
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/var_type.h"
+
+namespace paddle {
+namespace operators {
+class AssignFunctor {
+ public:
+  AssignFunctor(framework::Variable *out,
+                const platform::DeviceContext &dev_ctx)
+      : out_(out), dev_ctx_(dev_ctx) {}
+
+  void operator()(const framework::LoDTensor &lod_tensor) const {
+    auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
+    copy_tensor(lod_tensor, &out_tensor);
+  }
+
+  void operator()(const framework::LoDTensorArray &array) const {
+    auto &out_array = *out_->GetMutable<framework::LoDTensorArray>();
+    out_array.resize(array.size());
+    for (size_t i = 0; i < array.size(); ++i) {
+      copy_tensor(array[i], &out_array[i]);
+    }
+  }
+
+  void operator()(const framework::SelectedRows &rows) const {
+    framework::SelectedRows &out_rows =
+        *out_->GetMutable<framework::SelectedRows>();
+    out_rows.set_rows(rows.rows());
+    out_rows.set_height(rows.height());
+    auto &t = rows.value();
+    auto *m = out_rows.mutable_value();
+    framework::CopyFrom(t, t.place(), dev_ctx_, m);
+  }
+
+  template <typename T>
+  void operator()(const T &v) const {
+    PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
+  }
+
+ private:
+  void copy_tensor(const framework::LoDTensor &lod_tensor,
+                   framework::LoDTensor *out) const {
+    auto &out_tensor = *out;
+    CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
+    out_tensor.set_lod(lod_tensor.lod());
+  }
+
+  framework::Variable *out_;
+  const platform::DeviceContext &dev_ctx_;
+};
+
+class AssignOp : public framework::OperatorBase {
+ public:
+  AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
+           const framework::VariableNameMap &outputs,
+           const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *x = scope.FindVar(Input("X"));
+    if (x == nullptr) {
+      return;
+    }
+    auto *out = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(
+        out != nullptr,
+        "The Output(Out) should not be null if the Input(X) is set.");
+    framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
+  }
+};
+
+class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AssignOpProtoMaker(framework::OpProto *proto,
+                     framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
+             "could be LoDTensor, SelectedRows or LoDTensorArray.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
+              "is the same as input X.");
+    AddComment(R"DOC(Assign Operator
+
+Out = X,  when type in [LoDTensor/SelectedRows/LoDTensorArray]
+raise error if the type is not listed above.
+)DOC");
+  }
+};
+
+class AssignInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    if (context->HasInput("X")) {
+      auto type = context->GetInputsVarType("X")[0];
+      if (type == framework::VarDesc_VarType_SELECTED_ROWS ||
+          type == framework::VarDesc_VarType_LOD_TENSOR) {
+        context->SetOutputDim("Out", context->GetInputDim("X"));
+      }
+    }
+  }
+};
+
+class AssignGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *op = new framework::OpDescBind();
+    op->SetType("assign");
+    op->SetInput("X", OutputGrad("Out"));
+    op->SetOutput("Out", InputGrad("X"));
+    return std::unique_ptr<framework::OpDescBind>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker,
+                  ops::AssignInferShape, ops::AssignOpProtoMaker);
diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc
index f5784922af..6c3f67ec32 100644
--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
@@ -23,11 +23,11 @@ class AucOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input of Indices must be initialized.");
+                   "Input of Indices should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"),
-                   "Input of Label must be initialized.");
+                   "Input of Label should not be null.");
     auto inference_height = ctx->GetInputDim("Out")[0];
     auto label_height = ctx->GetInputDim("Label")[0];
 
@@ -39,10 +39,11 @@ class AucOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  // IndicateDataType
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Out")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
+        ctx.device_context());
   }
 };
 
@@ -52,20 +53,20 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Out",
              "A floating point 2D tensor, values are in the range [0, 1]."
-             "Each row is descend sorted. This input should be the"
+             "Each row is sorted in descending order. This input should be the"
              "output of topk."
              "Typically, this tensor indicates the probability of each label");
     AddInput("Indices",
              "An int 2D tensor, indicating the indices of original"
-             "tensor before sort. Typically, this tensor indicates which label"
-             "the probability stands for.");
+             "tensor before sorting. Typically, this tensor indicates which "
+             "label the probability stands for.");
     AddInput("Label",
              "A 2D int tensor indicating the label of the training data."
              "The height is batch size and width is always 1.");
     // TODO(typhoonzero): support weight input
     AddOutput("AUC",
               "A scalar representing the "
-              "current area-under-curve.");
+              "current area-under-the-curve.");
 
     AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
         .SetDefault("ROC");
@@ -74,19 +75,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
                  " roc curve.")
         .SetDefault(200);
 
-    AddComment(
-        R"DOC(Computes the AUC according forward output and label.
-Best to use for binary classification evaluations.
+    AddComment(R"DOC(
+Area Under The Curve (AUC) Operator.
 
+This implementation computes the AUC according to forward output and label.
+It is used very widely in binary classification evaluation. As a note:
 If input label contains values other than 0 and 1, it will be cast
-to bool.
-
-You can find the definations here: 
+to bool. You can find the relevant definitions here:
 https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
 
-Possible curves are:
-- ROC: Receiver operating characteristic
-- PR: Precision Recall
+There are two types of possible curves:
+1. ROC: Receiver operating characteristic
+2. PR: Precision Recall
 )DOC");
   }
 };
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 9c4bfd24c1..ac97bd83ab 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -19,9 +19,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
 using EigenArrayMap =
@@ -65,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
     const auto x_dims = ctx->GetInputDim("X");
     const TensorFormat tensor_format =
         StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "Input X must have 2 to 5 dimensions.");
+
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
                                              : x_dims[x_dims.size() - 1]);
 
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "Input x must have 3 to 5 dimensions.");
-
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
@@ -97,16 +95,16 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The input tensor");
     AddInput("Scale",
              "Scale is a 1-dimensional tensor of size C "
-             "to be applied to the output");
+             "that is applied to the output");
     AddInput("Bias",
              "Bias is a 1-dimensional tensor of size C "
-             "to be applied to the output");
+             "that is applied to the output");
     AddInput("Mean",
-             "The global mean (for training) or the "
+             "The global mean (for training) or "
              "estimated mean (for testing)");
     AddInput("Variance",
              "The global variance (for training) "
-             "or the estimated Variance (for testing)");
+             "or estimated Variance (for testing)");
     AddOutput("Y", "result after normalization");
     AddOutput("MeanOut",
               "Share memory with Mean. "
@@ -123,10 +121,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
               "will apply to output when training")
         .AsIntermediate();
     AddComment(R"DOC(
-https://arxiv.org/pdf/1502.03167.pdf
+Batch Normalization.
 
-NHWC `[batch, in_height, in_width, in_channels]`
-NCHW `[batch, in_channels, in_height, in_width]`
+Batch Norm has been implemented as discussed in the paper:
+https://arxiv.org/pdf/1502.03167.pdf
+Can be used as a normalizer function for conv2d and fully_connected operations.
+The required data format for this layer is one of the following:
+1. NHWC `[batch, in_height, in_width, in_channels]`
+2. NCHW `[batch, in_channels, in_height, in_width]`
 
 )DOC");
   }
@@ -145,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
 
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
@@ -299,7 +301,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
   }
 
-  framework::DataType IndicateDataType(
+ protected:
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
     const auto *var = ctx.InputVar(framework::GradVarName("Y"));
     if (var == nullptr) {
@@ -314,7 +317,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     if (t == nullptr) {
       PADDLE_THROW("can't find Y@GRAD");
     }
-    return framework::ToDataType(t->type());
+    return framework::OpKernelType(framework::ToDataType(t->type()),
+                                   ctx.device_context());
   }
 };
 
@@ -336,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T>
     // Get the size for each dimension.
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     const int N = x_dims[0];
     const int C =
         (tensor_format == TensorFormat::NCHW ? x_dims[1]
diff --git a/paddle/operators/batch_norm_op.cu b/paddle/operators/batch_norm_op.cu.cc
similarity index 93%
rename from paddle/operators/batch_norm_op.cu
rename to paddle/operators/batch_norm_op.cu.cc
index 726d1ea1b8..7b2f318700 100644
--- a/paddle/operators/batch_norm_op.cu
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -29,14 +29,21 @@ void ExtractNCWHD(const framework::DDim &dims,
                   const TensorFormat &tensor_format, int *N, int *C, int *H,
                   int *W, int *D) {
   *N = dims[0];
-  *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
-  *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
-  *W = dims.size() > 3
-           ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
-           : 1;
-  *D = dims.size() > 4
-           ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
-           : 1;
+  if (dims.size() == 2) {
+    *C = dims[1];
+    *H = 1;
+    *W = 1;
+    *D = 1;
+  } else {
+    *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
+    *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
+    *W = dims.size() > 3
+             ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
+             : 1;
+    *D = dims.size() > 4
+             ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
+             : 1;
+  }
 }
 
 template <typename T>
@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto *x = ctx.Input<Tensor>("X");
     const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
 
@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T>
 
     const auto &x_dims = x->dims();
 
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
 
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
new file mode 100644
index 0000000000..c796a0c5d0
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct BeamSearchDecodeFunctor {
+  BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
+                          const LoDTensorArray& step_scores,
+                          LoDTensor* id_tensor, LoDTensor* score_tensor)
+      : step_ids_(step_ids),
+        step_scores_(step_scores),
+        id_tensor_(id_tensor),
+        score_tensor_(score_tensor) {}
+
+  template <typename T>
+  void operator()() const;
+
+  const LoDTensorArray& step_ids_;
+  const LoDTensorArray& step_scores_;
+  LoDTensor* id_tensor_;
+  LoDTensor* score_tensor_;
+};
+
+template <typename T>
+void BeamSearchDecodeFunctor::operator()() const {
+  BeamSearchDecoder<T> beam_search_decoder;
+  beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_,
+                                   score_tensor_);
+}
+
+template <>
+void BeamSearchDecodeFunctor::operator()<bool>() const {
+  PADDLE_THROW("beam search decode op does not support bool!");
+}
+
+class BeamSearchDecodeOp : public framework::OperatorBase {
+ public:
+  BeamSearchDecodeOp(const std::string& type,
+                     const framework::VariableNameMap& inputs,
+                     const framework::VariableNameMap& outputs,
+                     const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    framework::ExecutionContext ctx(*this, scope, dev_ctx);
+
+    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
+    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
+    const size_t step_num = ids->size();
+    PADDLE_ENFORCE_GT(step_num, 0UL,
+                      "beam search steps should be larger than 0");
+    const size_t source_num = ids->at(0).lod().at(0).size() - 1;
+    PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0");
+
+    for (size_t i = 0; i < step_num; ++i) {
+      PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL,
+                        "Level of LodTensor should be 2");
+    }
+
+    // prepare output
+    LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
+    LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
+
+    framework::VisitDataType(
+        framework::ToDataType(scores->at(0).type()),
+        BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores));
+  }
+};
+
+class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BeamSearchDecodeOpProtoMaker(framework::OpProto* proto,
+                               framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Ids",
+             "(LodTensorArray)"
+             "score of the candidate words in each step");
+    AddInput("Scores",
+             "(LodTensorArray)"
+             "score of the candidate words in each step");
+    AddOutput("SentenceIds",
+              "(LodTensor)"
+              "All possible result sentences of word ids");
+    AddOutput("SentenceScores",
+              "(LodTensor)"
+              "All possible result sentences of word scores");
+    AddComment(R"DOC(
+Pack the result of Beam search op into SentenceIds and SentenceScores.
+)DOC");
+  }
+};
+
+class BeamSearchDecodeInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("Ids"),
+                   "BeamSearchDecodeOp must has input Ids");
+    PADDLE_ENFORCE(context->HasInput("Scores"),
+                   "BeamSearchDecodeOp must has input Scores");
+    PADDLE_ENFORCE(context->HasOutput("SentenceIds"),
+                   "BeamSearchDecodeOp must has output SentenceIds");
+    PADDLE_ENFORCE(context->HasOutput("SentenceScores"),
+                   "BeamSearchDecodeOp must has output SentenceScores");
+  }
+};
+
+class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDescBind& op_desc,
+                  framework::BlockDescBind* block) const override {
+    for (auto& o : op_desc.Output("SentenceIds")) {
+      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+    }
+    for (auto& o : op_desc.Output("SentenceScores")) {
+      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp,
+                  paddle::operators::BeamSearchDecodeOpProtoMaker,
+                  paddle::operators::BeamSearchDecodeInferShape,
+                  paddle::operators::BeamSearchDecodeInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/operators/beam_search_decode_op.h
new file mode 100644
index 0000000000..3b1c6cd7a1
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.h
@@ -0,0 +1,280 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using LoDTensorArray = framework::LoDTensorArray;
+
+// all the lod have 2 levels.
+// The First is source level, the second is sentence level.
+// source level describe how many candidate words for this source.
+// sentence level describe these candidates belong to which prefix
+const size_t kSourceLevel = 0;
+const size_t kSentenceLevel = 1;
+
+template <typename T>
+struct BeamNode {
+  BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {}
+
+  ~BeamNode() {
+    if (parent_) {
+      parent_->DropKid(this);
+      if (parent_->kids_.size() == 0UL) {
+        delete parent_;
+      }
+    }
+    VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_;
+  }
+
+  void AppendTo(BeamNode* parent) {
+    parent_ = parent;
+    parent->kids_.insert(this);
+  }
+
+  void DropKid(BeamNode* kid) { kids_.erase(kid); }
+
+  BeamNode* parent_ = nullptr;
+  std::unordered_set<BeamNode*> kids_;
+  int64_t word_id_;
+  T score_;
+};
+
+template <typename T>
+using BeamNodeVector = std::vector<std::unique_ptr<BeamNode<T>>>;
+
+template <typename T>
+struct Sentence {
+  std::vector<int64_t> word_ids;
+  std::vector<T> scores;
+};
+
+template <typename T>
+using SentenceVector = std::vector<Sentence<T>>;
+
+template <typename T>
+struct BeamSearchDecoder {
+  /**
+   * make a BeamNode and all it's related prefix BeanNode into a Sentence.
+   */
+  Sentence<T> MakeSentence(const BeamNode<T>* node) const;
+
+  /**
+   * Param:
+   *  cur_ids: LoDTensor of One step for word ID
+   *  cur_scores: LoDTensor of One Step for word score
+   *  prefixes_list: prefixes for each source sentence.
+   *  sentence_vector_list: result sentence_vector for each source sentence.
+   * Return:
+   *  a new prefixes list for each source of current step
+   */
+  std::vector<BeamNodeVector<T>> PackTwoSteps(
+      const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+      std::vector<BeamNodeVector<T>>& prefixes_list,
+      std::vector<SentenceVector<T>>* sentence_vector_list) const;
+
+  /**
+   * convert the result sentence_vector for each source sentence into two
+   * LodTensor.
+   * One is all candidate sentences with word id, one is all candidate sentences
+   * with word score.
+   * Param:
+   *  sentence_vector_list: sentence_vector for each source sentence.
+   *  id_tensor: result LoDTensor for sentences of id.
+   *  score_tensor: result LoDTensor for sentences of score.
+   */
+  void ConvertSentenceVectorToLodTensor(
+      std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
+      LoDTensor* score_tensor) const;
+
+  /**
+   * Pack all steps of id/score LodTensor into sentence LoDTensor
+   * it's main logic is:
+   * ```python
+   *   prefix
+   *   result_sentence
+   *   result_lod_tensor
+   *
+   *   for (step in steps):
+   *     prefix = PackTwoSteps(prefix, step, &result_sentence)
+   *   ConvertSentenceVector<T>ToLodTensor(result_sentence, &result_lod_tensor)
+   * ```
+   */
+  void PackAllSteps(const LoDTensorArray& step_ids,
+                    const LoDTensorArray& step_scores, LoDTensor* id_tensor,
+                    LoDTensor* score_tensor) const;
+};
+
+template <typename T>
+Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
+  Sentence<T> sentence;
+  while (node != nullptr) {
+    sentence.word_ids.emplace_back(node->word_id_);
+    sentence.scores.emplace_back(node->score_);
+    node = node->parent_;
+  }
+
+  std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids));
+  std::reverse(std::begin(sentence.scores), std::end(sentence.scores));
+
+  return sentence;
+}
+
+template <typename T>
+std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
+    const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+    std::vector<BeamNodeVector<T>>& prefixes_list,
+    std::vector<SentenceVector<T>>* sentence_vector_list) const {
+  std::vector<BeamNodeVector<T>> result;
+
+  for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1;
+       ++src_idx) {
+    size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx];
+    size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
+
+    BeamNodeVector<T> beam_nodes;
+
+    // if prefixes size is 0, it means this is the first step. In this step,
+    // all candidate id is the start of candidate sentences.
+    if (prefixes_list.empty()) {
+      PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
+                        cur_ids.lod().at(kSentenceLevel).back(),
+                        "in the first step");
+      for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) {
+        beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(new BeamNode<T>(
+            cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
+      }
+    } else {
+      BeamNodeVector<T>& prefixes = prefixes_list[src_idx];
+      SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
+
+      PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
+                        "prefix and candidate set number should be the same");
+
+      auto candidate_offset = cur_ids.lod()[kSentenceLevel];
+      for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) {
+        std::unique_ptr<BeamNode<T>>& prefix = prefixes[prefix_idx];
+        size_t candidate_start = candidate_offset[src_start + prefix_idx];
+        size_t candidate_end = candidate_offset[src_start + prefix_idx + 1];
+        if (candidate_start == candidate_end) {
+          VLOG(3) << "this sentence has no more candidate, "
+                     "add to result sentence and rm it from beam tree";
+          sentence_vector.push_back(MakeSentence(prefix.get()));
+          prefix.reset();
+        } else {
+          for (size_t candidate_idx = candidate_start;
+               candidate_idx < candidate_end; ++candidate_idx) {
+            auto* candidate =
+                new BeamNode<T>(cur_ids.data<int64_t>()[candidate_idx],
+                                cur_scores.data<T>()[candidate_idx]);
+            candidate->AppendTo(prefix.get());
+            beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(candidate));
+          }
+          prefix.release();
+        }
+      }
+    }
+    result.push_back(std::move(beam_nodes));
+  }
+  return result;
+}
+
+template <typename T>
+void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
+    std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
+    LoDTensor* score_tensor) const {
+  size_t src_num = sentence_vector_list.size();
+
+  PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0");
+
+  std::vector<size_t> source_level_lod = {0};
+  std::vector<size_t> sentence_level_lod = {0};
+  std::vector<int64_t> id_data;
+  std::vector<T> score_data;
+
+  for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+    for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
+      id_data.insert(id_data.end(), sentence.word_ids.begin(),
+                     sentence.word_ids.end());
+      score_data.insert(score_data.end(), sentence.scores.begin(),
+                        sentence.scores.end());
+      sentence_level_lod.push_back(sentence_level_lod.back() +
+                                   sentence.word_ids.size());
+    }
+    source_level_lod.push_back(source_level_lod.back() +
+                               sentence_vector_list[src_idx].size());
+  }
+
+  auto cpu_place = new paddle::platform::CPUPlace();
+  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
+
+  framework::LoD lod;
+  lod.push_back(source_level_lod);
+  lod.push_back(sentence_level_lod);
+
+  id_tensor->set_lod(lod);
+  id_tensor->Resize({static_cast<int64_t>(id_data.size())});
+  id_tensor->mutable_data<int64_t>(paddle::platform::CPUPlace());
+  framework::CopyFromVector<int64_t>(id_data, cpu_ctx, id_tensor);
+
+  score_tensor->set_lod(lod);
+  score_tensor->Resize({static_cast<int64_t>(score_data.size())});
+  score_tensor->mutable_data<T>(paddle::platform::CPUPlace());
+  framework::CopyFromVector<T>(score_data, cpu_ctx, score_tensor);
+}
+
+template <typename T>
+void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
+                                        const LoDTensorArray& step_scores,
+                                        LoDTensor* id_tensor,
+                                        LoDTensor* score_tensor) const {
+  PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
+  PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
+                    "step_ids and step_scores should be the same");
+  const size_t step_num = step_ids.size();
+  const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
+
+  PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0");
+
+  // previous prefixes for each step,
+  // the init length is 0, means this is the first step.
+  std::vector<BeamNodeVector<T>> beamnode_vector_list(0);
+  std::vector<SentenceVector<T>> sentence_vector_list(src_num);
+
+  // pack all steps for one batch first, then another batch
+  for (size_t step_id = 0; step_id < step_num; ++step_id) {
+    beamnode_vector_list =
+        PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
+                     beamnode_vector_list, &sentence_vector_list);
+  }
+  // append last beam_node to result
+  for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+    for (auto& beam_node : beamnode_vector_list.at(src_idx)) {
+      sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get()));
+      beam_node.reset();
+    }
+  }
+
+  ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
+                                   score_tensor);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/beam_search_decode_op_test.cc b/paddle/operators/beam_search_decode_op_test.cc
new file mode 100644
index 0000000000..5ac23991f3
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op_test.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+#include "gtest/gtest.h"
+
+using CPUPlace = paddle::platform::CPUPlace;
+using LoD = paddle::framework::LoD;
+using LoDTensor = paddle::framework::LoDTensor;
+using LoDTensorArray = paddle::framework::LoDTensorArray;
+
+template <typename T>
+using BeamNode = paddle::operators::BeamNode<T>;
+template <typename T>
+using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
+template <typename T>
+using Sentence = paddle::operators::Sentence<T>;
+template <typename T>
+using BeamNodeVector = paddle::operators::BeamNodeVector<T>;
+template <typename T>
+using SentenceVector = paddle::operators::SentenceVector<T>;
+
+namespace paddle {
+namespace test {
+
+void GenerateExample(const std::vector<size_t>& level_0,
+                     const std::vector<size_t>& level_1,
+                     const std::vector<int>& data, LoDTensorArray* ids,
+                     LoDTensorArray* scores) {
+  PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1,
+                    "source level is used to describe candidate set");
+  PADDLE_ENFORCE_EQ(level_1.back(), data.size(),
+                    "the lowest level is used to describe data"
+                    ", so it's last element should be data length");
+
+  CPUPlace place;
+
+  LoD lod;
+  lod.push_back(level_0);
+  lod.push_back(level_1);
+
+  // Ids
+  LoDTensor tensor_id;
+  tensor_id.set_lod(lod);
+  tensor_id.Resize({static_cast<int64_t>(data.size())});
+  // malloc memory
+  int64_t* id_ptr = tensor_id.mutable_data<int64_t>(place);
+  for (size_t i = 0; i < data.size(); ++i) {
+    id_ptr[i] = static_cast<int64_t>(data.at(i));
+  }
+
+  // Scores
+  LoDTensor tensor_score;
+  tensor_score.set_lod(lod);
+  tensor_score.Resize({static_cast<int64_t>(data.size())});
+  // malloc memory
+  float* score_ptr = tensor_score.mutable_data<float>(place);
+  for (size_t i = 0; i < data.size(); ++i) {
+    score_ptr[i] = static_cast<float>(data.at(i));
+  }
+
+  ids->push_back(tensor_id);
+  scores->push_back(tensor_score);
+}
+
+}  // namespace test
+}  // namespace paddle
+
+TEST(BeamSearchDecodeOp, DeleteBeamNode) {
+  auto* root = new BeamNode<float>(0, 0);
+  auto* b1 = new BeamNode<float>(1, 1);
+  auto* b2 = new BeamNode<float>(2, 2);
+  auto* b3 = new BeamNode<float>(3, 3);
+
+  b1->AppendTo(root);
+  b2->AppendTo(root);
+  b3->AppendTo(b1);
+
+  delete b3;
+  delete b2;
+}
+
+TEST(BeamSearchDecodeOp, MakeSentence) {
+  auto* root = new BeamNode<float>(0, 0);
+  auto* b1 = new BeamNode<float>(1, 1);
+  auto* end = new BeamNode<float>(2, 2);
+  b1->AppendTo(root);
+  end->AppendTo(b1);
+
+  BeamSearchDecoder<float> helper;
+  Sentence<float> sentence = helper.MakeSentence(end);
+  delete end;
+
+  std::vector<int64_t> expect_ids = {0, 1, 2};
+  ASSERT_EQ(sentence.word_ids, expect_ids);
+
+  std::vector<float> expect_scores = {0, 1, 2};
+  ASSERT_EQ(sentence.scores, expect_scores);
+}
+
+TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
+  CPUPlace place;
+
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 2, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
+      std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
+
+  std::vector<BeamNodeVector<float>> beamnode_vector_list;
+  std::vector<SentenceVector<float>> sentence_vector_list(
+      2, SentenceVector<float>());
+
+  BeamSearchDecoder<float> helper;
+  beamnode_vector_list = helper.PackTwoSteps(
+      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+  ASSERT_EQ(beamnode_vector_list.size(), 2UL);
+  ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
+  ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
+}
+
+TEST(BeamSearchDecodeOp, PackTwoSteps) {
+  CPUPlace place;
+
+  // first source has three prefix
+  BeamNodeVector<float> source0_prefixes;
+  source0_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(1, 1)));
+  source0_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(0, 0)));
+  source0_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(3, 3)));
+
+  // second source has two prefix
+  BeamNodeVector<float> source1_prefixes;
+  source1_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(4, 4)));
+  source1_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(5, 5)));
+
+  std::vector<BeamNodeVector<float>> beamnode_vector_list;
+  std::vector<SentenceVector<float>> sentence_vector_list(
+      2, SentenceVector<float>());
+
+  beamnode_vector_list.push_back(std::move(source0_prefixes));
+  beamnode_vector_list.push_back(std::move(source1_prefixes));
+
+  // generate data for one step
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+
+  paddle::test::GenerateExample(std::vector<size_t>{0, 3, 5},
+                                std::vector<size_t>{0, 1, 1, 3, 4, 5},
+                                std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
+
+  BeamSearchDecoder<float> helper1;
+  beamnode_vector_list = helper1.PackTwoSteps(
+      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+
+  ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
+  ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
+  ASSERT_EQ(beamnode_vector_list[0].size(), 3UL);
+  ASSERT_EQ(beamnode_vector_list[1].size(), 2UL);
+}
+
+TEST(BeamSearchDecodeOp, PackAllSteps) {
+  CPUPlace place;
+
+  // we will constuct a sample data with 3 steps and 2 source sentences
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
+      std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 1, 3, 5, 5, 6},
+      std::vector<int>{0, 1, 2, 3, 4, 5}, &ids, &scores);
+  paddle::test::GenerateExample(std::vector<size_t>{0, 3, 6},
+                                std::vector<size_t>{0, 0, 1, 2, 3, 4, 5},
+                                std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
+
+  ASSERT_EQ(ids.size(), 3UL);
+  ASSERT_EQ(scores.size(), 3UL);
+
+  BeamSearchDecoder<float> helper;
+
+  LoDTensor id_tensor;
+  LoDTensor score_tensor;
+  helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor);
+
+  LoD lod = id_tensor.lod();
+  std::vector<size_t> expect_source_lod = {0, 4, 8};
+  EXPECT_EQ(lod[0], expect_source_lod);
+  std::vector<size_t> expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19};
+  EXPECT_EQ(lod[1], expect_sentence_lod);
+  // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4
+  std::vector<int> expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5,
+                                  4, 3, 2, 4, 4, 3, 6, 5, 4};
+  ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size()));
+  for (size_t i = 0; i < expect_data.size(); ++i) {
+    ASSERT_EQ(id_tensor.data<int64_t>()[i],
+              static_cast<int64_t>(expect_data[i]));
+  }
+  for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) {
+    ASSERT_EQ(score_tensor.data<float>()[i],
+              static_cast<float>(id_tensor.data<int64_t>()[i]));
+  }
+}
diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc
new file mode 100644
index 0000000000..8c3e2a303f
--- /dev/null
+++ b/paddle/operators/beam_search_op.cc
@@ -0,0 +1,185 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/beam_search_op.h"
+
+#include <map>
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
+                            framework::LoDTensor *selected_ids,
+                            framework::LoDTensor *selected_scores) {
+  auto items = SelectTopBeamSizeItems();
+  auto selected_items = ToMap(items);
+  PruneEndidCandidates(pre_ids, &selected_items);
+  // calculate the output tensor's height
+  size_t num_instances = std::accumulate(
+      std::begin(items), std::end(items), 0,
+      [](size_t a, std::vector<Item> &b) { return a + b.size(); });
+  // the output tensor shape should be [num_instances, 1]
+  auto dims = framework::make_ddim(
+      std::vector<int64_t>({static_cast<int>(num_instances), 1}));
+  selected_ids->Resize(dims);
+  selected_scores->Resize(dims);
+
+  std::map<size_t /*offset*/, std::vector<Item>> hash;
+  framework::LoD new_lod;
+  auto *ids_data = selected_ids->mutable_data<int>(platform::CPUPlace());
+  auto *scores_data =
+      selected_scores->mutable_data<float>(platform::CPUPlace());
+
+  // fill in data
+  std::vector<size_t> low_level;
+  size_t low_offset = 0;
+  for (auto &items : selected_items) {
+    low_level.push_back(low_offset);
+    for (auto &item : items) {
+      ids_data[low_offset] = item.id;
+      scores_data[low_offset] = item.score;
+      low_offset++;
+    }
+  }
+  // fill lod
+  auto abs_lod = framework::ToAbsOffset(ids_->lod());
+  auto &high_level = abs_lod[lod_level_];
+  framework::LoD lod(2);
+  lod[0].assign(high_level.begin(), high_level.end());
+  lod[1].assign(low_level.begin(), low_level.end());
+  selected_ids->set_lod(lod);
+  selected_scores->set_lod(lod);
+}
+
+void BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
+                                      std::vector<std::vector<Item>> *items) {
+  auto *pre_ids_data = pre_ids.data<int>();
+
+  for (size_t offset = 0; offset < items->size(); offset++) {
+    auto prefix_id = pre_ids_data[offset];
+    if (prefix_id == end_id_) {
+      items->at(offset).clear();
+    }
+  }
+}
+
+std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
+    const std::vector<std::vector<Item>> &items) {
+  std::vector<std::vector<Item>> result;
+  for (auto &entries : items) {
+    for (const auto &item : entries) {
+      if (item.offset >= result.size()) {
+        result.resize(item.offset + 1);
+      }
+      result[item.offset].push_back(item);
+    }
+  }
+  return result;
+}
+
+std::vector<std::vector<BeamSearch::Item>>
+BeamSearch::SelectTopBeamSizeItems() {
+  std::vector<std::vector<Item>> result;
+  std::vector<Item> items;
+  // for each source sentence, select the top beam_size items across all
+  // candidate sets.
+  while (NextItemSet(&items)) {
+    std::nth_element(std::begin(items), std::begin(items) + beam_size_,
+                     std::end(items), [](const Item &a, const Item &b) {
+                       // TODO(superjom) make score's comparation customizable.
+                       // partial sort in descending order
+                       return a.score > b.score;
+                     });
+    // prune the top beam_size items.
+    if (items.size() > beam_size_) {
+      items.resize(beam_size_);
+    }
+    result.emplace_back(items);
+  }
+  return result;
+}
+
+// the candidates of a source
+bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
+  if (sent_offset_ >= ids_->NumElements(lod_level_)) {
+    return false;
+  }
+  // find the current candidates
+  auto ids = *ids_;
+  auto scores = *scores_;
+
+  auto source_abs_two_level_lod = framework::SliceInLevel(
+      ids.lod(), lod_level_, sent_offset_, sent_offset_ + 1);
+  source_abs_two_level_lod = framework::ToAbsOffset(source_abs_two_level_lod);
+  auto abs_lod = framework::ToAbsOffset(ids.lod());
+  PADDLE_ENFORCE_GE(source_abs_two_level_lod.size(), 2UL);
+
+  auto *ids_data = ids.data<int>();
+  auto *scores_data = scores.data<float>();
+
+  size_t instance_dim = 1;
+  for (int i = 1; i < ids.dims().size(); i++) {
+    instance_dim *= ids.dims()[i];
+  }
+
+  items->clear();
+  items->reserve(framework::product(ids.dims()));
+  for (size_t offset = abs_lod[lod_level_][sent_offset_];
+       offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
+    for (size_t d = 0; d < instance_dim; d++) {
+      const size_t dim_offset = offset * instance_dim + d;
+      items->emplace_back(offset, ids_data[dim_offset],
+                          scores_data[dim_offset]);
+    }
+  }
+
+  sent_offset_++;
+  return true;
+}
+
+class BeamSearchProtoAndCheckerMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  BeamSearchProtoAndCheckerMaker(framework::OpProto *proto,
+                                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // inputs and outputs stored in proto
+    AddInput("pre_ids", "ids in previous step");
+    AddInput("ids", "a LoDTensor of shape of [None,k]");
+    AddInput("scores",
+             "a LoDTensor that has the same shape and LoD with `ids`");
+    AddOutput("selected_ids",
+              "a LoDTensor that stores the IDs selected by beam search");
+    AddOutput(
+        "selected_scores",
+        "a LoDTensor that has the same shape and LoD with `selected_ids`");
+
+    // Attributes stored in AttributeMap
+    AddAttr<int>("level", "the level of LoDTensor");
+    AddAttr<int>("beam_size", "beam size for beam search");
+    AddAttr<int>("end_id",
+                 "the token id which indicates the end of a sequence");
+
+    AddComment(
+        "This is a beam search operator that help to generate sequences.");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp,
+                             paddle::operators::BeamSearchProtoAndCheckerMaker);
diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h
new file mode 100644
index 0000000000..cc556bfe42
--- /dev/null
+++ b/paddle/operators/beam_search_op.h
@@ -0,0 +1,226 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_TESTING
+#include "gtest/gtest.h"
+#endif
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+/*
+ * This is an implementation of beam search.
+ *
+ * To explain the details, lets take machine translation task for example, in
+ * this task, one source sentence is translated to multiple target sentences,
+ * during this period, one sentence will be translated to multiple translation
+ * prefixes(target sentence that have not ended), in each time step a prefix
+ * will have some candidates, input the candidate ids and their corresponding
+ * scores (probabilities), it will sort and select the top beam_size candidates
+ * for each source sentence, and store the selected candidates's score and their
+ * corresponding ids to LoDTensors.
+ *
+ * A detailed example:
+ *
+ * Input
+ *
+ * ids:
+ * LoD (should have 2 levels)
+ * first level: [0, 1, 4]
+ * second level: [0, 1, 2, 3, 4]
+ *
+ * tensor's data
+ * [
+ * [4, 2, 5]
+ * [2, 1, 3]
+ * [3, 5, 2]
+ * [8, 2, 1]
+ * ]
+ *
+ * scores:
+ * LoD same as `ids`
+ * tensor's data
+ * [
+ * [0.5, 0.3, 0.2]
+ * [0.6, 0.3, 0.1]
+ * [0.9, 0.5, 0.1]
+ * [0.7, 0.5, 0.1]
+ * ]
+ *
+ * the inputs means that there are 2 source sentences to translate, and the
+ * first source has 1 prefix, the second source has 2 prefix.
+ *
+ * lets assume beam size is 2, and the beam search's output should be
+ * LoD
+ * first level:
+ * [0, 1, 2]
+ * second level:
+ * [0, 2, 4]
+ *
+ * tensor's data
+ * [[
+ * 0.5,
+ * 0.3,
+ * 0.9,
+ * 0.7
+ * ]]
+ *
+ * TODO all the prune operations should be in the beam search, so it is better
+ * to split the beam search algorithm into a sequence of smaller operators, and
+ * the prune operators can be inserted in this sequence.
+ */
+class BeamSearch {
+ public:
+  // TODO(superjom) make type customizable
+  using id_t = size_t;
+  using score_t = float;
+  /*
+   * Input the arguments that needed by this class.
+   */
+  BeamSearch(const framework::LoDTensor& ids,
+             const framework::LoDTensor& scores, size_t level, size_t beam_size,
+             int end_id)
+      : beam_size_(beam_size),
+        ids_(&ids),
+        scores_(&scores),
+        lod_level_(level),
+        end_id_(end_id) {}
+
+  /*
+   * The main function of beam search.
+   *
+   * @selected_ids: a [None, 1]-shaped tensor with LoD.
+   *   In a machine translation model, it might be the candidate term id sets,
+   *   each set stored as a varience-length sequence.
+   *   The format might be described with a two-level LoD
+   *   - [[0 1]
+   *   -  [0 1 2]]
+   *   - [[]
+   *   -  [0 1]]
+   *   the first level of LoD tells that there are two source sentences. The
+   *   second level describes the details of the candidate id set's offsets in
+   * the
+   *   source sentences.
+   *
+   *  @selected_scores: a LoD tensor with the same shape and LoD with
+   * selected_ids.
+   *   It stores the corresponding scores of candidate ids in selected_ids.
+   *
+   * Return false if all the input tensor is empty, in machine translation task
+   * that means no candidates is provided, and the task will stop running.
+   */
+  void operator()(const framework::LoDTensor& pre_ids,
+                  framework::LoDTensor* selected_ids,
+                  framework::LoDTensor* selected_scores);
+
+ protected:
+  /*
+   * The basic items help to sort.
+   */
+  struct Item {
+    Item() {}
+    Item(size_t offset, size_t id, float score)
+        : offset(offset), id(id), score(score) {}
+    // offset in the lod_level_+1
+    size_t offset;
+    // the candidate id
+    id_t id;
+    // the corresponding score
+    score_t score;
+  };
+
+  void PruneEndidCandidates(const framework::LoDTensor& pre_ids,
+                            std::vector<std::vector<Item>>* items);
+
+  /*
+   * Transform the items into a map whose key is offset, value is the items.
+   * NOTE low performance
+   */
+  std::vector<std::vector<Item>> ToMap(
+      const std::vector<std::vector<Item>>& inputs);
+
+  /*
+   * For each source, select top beam_size records.
+   */
+  std::vector<std::vector<Item>> SelectTopBeamSizeItems();
+
+  /*
+   * Get the items of next source sequence, return false if no remaining items.
+   */
+  bool NextItemSet(std::vector<Item>* items);
+
+ private:
+  size_t beam_size_;
+  const framework::LoDTensor* ids_;
+  const framework::LoDTensor* scores_;
+  size_t lod_level_{0};
+  size_t sent_offset_{0};
+  int end_id_{0};
+};
+
+class BeamSearchOp : public framework::OperatorBase {
+ public:
+  BeamSearchOp(const std::string& type,
+               const framework::VariableNameMap& inputs,
+               const framework::VariableNameMap& outputs,
+               const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  BeamSearchOp(const BeamSearchOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not Implemented");
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    LOG(INFO) << "run beam search op";
+    auto ids_var = scope.FindVar(Input("ids"));
+    auto scores_var = scope.FindVar(Input("scores"));
+    auto pre_ids_var = scope.FindVar(Input("pre_ids"));
+    PADDLE_ENFORCE_NOT_NULL(ids_var);
+    PADDLE_ENFORCE_NOT_NULL(scores_var);
+    PADDLE_ENFORCE_NOT_NULL(pre_ids_var);
+
+    auto& ids = ids_var->Get<framework::LoDTensor>();
+    auto& scores = scores_var->Get<framework::LoDTensor>();
+    auto& pre_ids = pre_ids_var->Get<framework::LoDTensor>();
+    size_t level = Attr<int>("level");
+    size_t beam_size = Attr<int>("beam_size");
+    int end_id = Attr<int>("end_id");
+    LOG(INFO) << "init beam search";
+    BeamSearch alg(ids, scores, level, beam_size, end_id);
+
+    LOG(INFO) << "after beam search";
+    auto selected_ids_var = scope.FindVar(Output("selected_ids"));
+    auto selected_scores_var = scope.FindVar(Output("selected_scores"));
+    PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
+    PADDLE_ENFORCE_NOT_NULL(selected_scores_var);
+    auto& selected_ids_tensor =
+        *selected_ids_var->GetMutable<framework::LoDTensor>();
+    auto& selected_scores_tensor =
+        *selected_scores_var->GetMutable<framework::LoDTensor>();
+    LOG(INFO) << "run beam search";
+    alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
+    LOG(INFO) << "finish beam search";
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc
new file mode 100644
index 0000000000..c88b2c9beb
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -0,0 +1,167 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class BilinearTensorProductOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL,
+                      "The input(Weight) must be a 3D tensor.");
+    PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
+                      "The first dimension(batch_size) of input(X) must be "
+                      "equal to the first dimension of the input(Y).");
+    PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1],
+                      "The second dimension of input(X) must be equal to "
+                      "the second dimension of the input(Weight).");
+    PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2],
+                      "The second dimension of input(Y) must be equal to "
+                      "the third dimension of the input(Weight).");
+
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL,
+                     "The Input(Bias) must be a 2-D tensor with "
+                     "the 2nd dimension fixed to 1 (a row vector).");
+      PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0],
+                        "The second dimension of input(Bias) must be equal "
+                        "to the first dimension of the input(Weight).");
+    }
+
+    ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BilinearTensorProductOpMaker(framework::OpProto* proto,
+                               framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of bilinear_tensor_product operator.");
+    AddInput("Y", "The second input of bilinear_tensor_product operator.");
+    AddInput("Weight",
+             "The learnable parameters of bilinear_tensor_product operator.");
+    AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.")
+        .AsDispensable();
+    AddOutput("Out", "The output of bilinear_tensor_product operator.");
+    AddComment(R"DOC(
+Bilinear Tensor Product operator.
+Given input X and Y, a 3D tensor Weight and a Bias. Each column of the
+Output is computed by one slice $i = 1, . . . , k$ of the tensor:
+
+$$
+M =  (X W_i) * Y \\
+Out_i = \sum_j {M_j} + Bias_i
+$$
+
+Where $W_i$ is the $i$-th slice of Input(Weight);
+      $M_j$ is the $j$-th column of $M$;
+      $Out_i$ is the $i$-th column of Output(Out);
+      $Bias_i$ is a column vector, each element of it is equal to
+        the $i$-th element of $Bias$;
+
+)DOC");
+  }
+};
+
+class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(out_dims.size(), 2UL,
+                      "The input(Out@GRAD) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[0], out_dims[0],
+        "The first dimension(batch_size) of input(Out@GRAD) must be "
+        "equal to the first dimension of the Input(X).");
+    PADDLE_ENFORCE_EQ(
+        weight_dims[0], out_dims[1],
+        "The second dimension of input(Out@GRAD) must be equal to "
+        "the third dimension of the Input(Weight).");
+
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      PADDLE_ENFORCE_EQ(
+          bias_dims[1], out_dims[1],
+          "The second dimension of input(Out@GRAD) must be equal to "
+          "the second dimension of the Input(Bias).");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    auto weight_grad_name = framework::GradVarName("Weight");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+    if (ctx->HasOutput(weight_grad_name)) {
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
+            ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
+            ops::BilinearTensorProductOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu
new file mode 100644
index 0000000000..858d2668d0
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h
new file mode 100644
index 0000000000..1113a4c6f3
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.h
@@ -0,0 +1,184 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class BilinearTensorProductKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto output_mat = EigenMatrix<T>::From(*out);
+
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // Create the intermediate variable to caculate the result of
+    // Input(X) multiplied by Input(Weight_i), the formula is:
+    // left_mul = X Weight_i.
+    Tensor left_mul;
+    left_mul.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                             ctx.GetPlace());
+    auto left_mul_mat = EigenMatrix<T>::From(left_mul);
+
+    for (int i = 0; i < out_dim; ++i) {
+      auto output_col_vec = output_mat.chip(i, 1);
+      Tensor weight_mat =
+          weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
+      math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+                           batch_size, y_dim, x_dim, 1, x->data<T>(),
+                           weight_mat.data<T>(), 0, left_mul.data<T>());
+      output_col_vec.device(place) =
+          (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
+    }
+    if (bias) {
+      auto bias_vec = EigenMatrix<T>::From(*bias);
+      Eigen::DSizes<int, 2> bcast(batch_size, 1);
+      output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
+    }
+  }
+};
+
+template <typename Place, typename T>
+class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor* weight = ctx.Input<Tensor>("Weight");
+    Tensor* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* d_y = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    Tensor* d_weight = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    Tensor* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    const Tensor* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+
+    auto x_mat = EigenMatrix<T>::From(*x);
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto d_out_mat = EigenMatrix<T>::From(*d_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // Create the intermediate variable to caculate the Output(Y@Grad).
+    Tensor x_scale;
+    x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
+                            ctx.GetPlace());
+    auto x_scale_mat = EigenMatrix<T>::From(x_scale);
+
+    // Create the intermediate variable to caculate the Output(X@Grad).
+    Tensor y_scale;
+    y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                            ctx.GetPlace());
+    auto y_scale_mat = EigenMatrix<T>::From(y_scale);
+
+    math::SetConstant<Place, T> set_zero;
+
+    // Set Output(X@Grad) be zero.
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      set_zero(ctx.device_context(), d_x, static_cast<T>(0));
+    }
+
+    // Set Output(Y@Grad) be zero.
+    if (d_y) {
+      d_y->mutable_data<T>(ctx.GetPlace());
+      set_zero(ctx.device_context(), d_y, static_cast<T>(0));
+    }
+
+    // Caculate the Output(X@Grad) and Output(Y@Grad).
+    if (d_x || d_y) {
+      Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
+      Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        Tensor weight_i = weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        auto output_vec = d_out_mat.chip(i, 1);
+        if (d_x) {
+          y_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_x) *
+              y_mat;
+          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasTrans,
+                               batch_size, x_dim, y_dim, 1, y_scale.data<T>(),
+                               weight_i.data<T>(), 1, d_x->data<T>());
+        }
+        if (d_y) {
+          x_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_y) *
+              x_mat;
+          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+                               batch_size, y_dim, x_dim, 1, x_scale.data<T>(),
+                               weight_i.data<T>(), 1, d_y->data<T>());
+        }
+      }
+    }
+
+    // Caculate the gradient of Input(Weight).
+    if (d_weight) {
+      d_weight->mutable_data<T>(ctx.GetPlace());
+      Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        auto output_vec = d_out_mat.chip(i, 1);
+        x_scale_mat.device(place) =
+            output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                .broadcast(bcast_for_weight) *
+            x_mat;
+        math::gemm<Place, T>(ctx.device_context(), CblasTrans, CblasNoTrans,
+                             x_dim, y_dim, batch_size, 1, x_scale.data<T>(),
+                             y->data<T>(), 0, d_weight_i.data<T>());
+      }
+    }
+
+    // Caculate the gradient of Input(Bias).
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias);
+      d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc
index 19187894c3..3082a53ccf 100644
--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -23,13 +23,17 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   CastOpProtoMaker(framework::OpProto *proto,
                    framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input tensor of cast op");
-    AddOutput("Out", "the output tensor of cast op");
-    AddComment(R"DOC(Cast operator.
-cast the input tensor to other data type.
+    AddInput("X", "The input tensor of cast op");
+    AddOutput("Out", "The output tensor of cast op");
+    AddAttr<int>("out_dtype", "output data type");
+    AddAttr<int>("in_dtype", "input data type");
+    AddComment(R"DOC(
+Cast Operator.
+
+This Operator casts the input tensor to another data type and
+returns tha Output Tensor.
+
 )DOC");
-    AddAttr<int>("out_data_type", "output data type");
-    AddAttr<int>("in_data_type", "input data type");
   }
 };
 
@@ -54,8 +58,8 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
     grad->SetType("cast");
     grad->SetInput("X", OutputGrad("Out"));
     grad->SetOutput("Out", InputGrad("X"));
-    grad->SetAttr("out_data_type", GetAttr("in_data_type"));
-    grad->SetAttr("in_data_type", GetAttr("out_data_type"));
+    grad->SetAttr("out_dtype", GetAttr("in_dtype"));
+    grad->SetAttr("in_dtype", GetAttr("out_dtype"));
     return std::unique_ptr<framework::OpDescBind>(grad);
   }
 };
diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h
index ffdbff7030..850dc8e349 100644
--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
@@ -55,7 +55,7 @@ class CastOpKernel : public framework::OpKernel<InT> {
     auto* in = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
     framework::VisitDataType(
-        static_cast<framework::DataType>(context.Attr<int>("out_data_type")),
+        static_cast<framework::DataType>(context.Attr<int>("out_dtype")),
         CastOpFunctor<Place, InT>(in, out, context.device_context()));
   }
 };
diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc
new file mode 100644
index 0000000000..94127ab33e
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.cc
@@ -0,0 +1,146 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/chunk_eval_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ChunkEvalOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Inference"),
+                   "Input(Inference) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input(Label) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Precision"),
+                   "Output(Precision) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Recall"),
+                   "Output(Recall) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
+                   "Output(F1-Score) of ChunkEvalOp should not be null.");
+
+    auto inference_dim = ctx->GetInputDim("Inference");
+    auto label_dim = ctx->GetInputDim("Label");
+
+    PADDLE_ENFORCE(inference_dim == label_dim,
+                   "Inference's shape must be the same as Label's shape.");
+
+    ctx->SetOutputDim("Precision", {1});
+    ctx->SetOutputDim("Recall", {1});
+    ctx->SetOutputDim("F1-Score", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(framework::DataType::FP32,
+                                   ctx.device_context());
+  }
+};
+
+class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChunkEvalOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Inference",
+             "(Tensor, default: Tensor<int64_t>). "
+             "Predictions from the network.");
+    AddInput("Label",
+             "(Tensor, default: Tensor<int64_t>). The true tag sequences.");
+    AddOutput("Precision",
+              "(float). The evaluated precision (called positive predictive "
+              "value) of chunks on the given mini-batch.");
+    AddOutput("Recall",
+              "(float). The evaluated recall (true positive rate or "
+              "sensitivity) of chunks on the given mini-batch.");
+    AddOutput("F1-Score",
+              "(float). The evaluated F1-Score on the given mini-batch.");
+    AddAttr<int>("num_chunk_types",
+                 "(int). The number of chunk type. See below for details.");
+    AddAttr<std::string>(
+        "chunk_scheme",
+        "(string, default IOB). The labeling scheme indicating "
+        "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
+        "for details.")
+        .SetDefault("IOB");
+    AddAttr<std::vector<int>>("excluded_chunk_types",
+                              "(list<int>) A list including chunk type ids "
+                              "indicating chunk types that are not counted. "
+                              "See below for details.")
+        .SetDefault(std::vector<int>{});
+    AddComment(R"DOC(
+For some basics of chunking, please refer to
+‘Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>’.
+
+
+CheckEvalOp computes the precision, recall, and F1-score of chunk detection,
+and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
+Here is a NER example of labeling for these tagging schemes:
+
+ 	     Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
+  IO:    I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
+  IOB:   B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
+  IOE:   I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
+  IOBES: B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
+
+There are three chunk types(named entity types) including PER(person), ORG(organization)
+and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
+
+Since the calculations actually use label ids rather than labels, extra attention
+should be paid when mapping labels to ids to make CheckEvalOp work. The key point
+is that the listed equations are satisfied by ids.
+
+    tag_type = label % num_tag_type
+    chunk_type = label / num_tag_type
+
+where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
+is the num of chunk types, and `tag_type` get its value from the following table.
+
+    Scheme Begin Inside End   Single
+     plain   0     -      -     -
+     IOB     0     1      -     -
+     IOE     -     0      1     -
+     IOBES   0     1      2     3
+
+Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
+PER and LOC. To satisfy the above equations, the label map can be like this:
+
+    B-ORG  0
+    I-ORG  1
+    B-PER  2
+    I-PER  3
+    B-LOC  4
+    I-LOC  5
+    O      6
+
+It’s not hard to verify the equations noting that the num of chunk types
+is 3 and the num of tag types in IOB scheme is 2. For example, the label
+id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
+I-LOC is 2, which consistent with the results from the equations.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp,
+                             ops::ChunkEvalOpMaker);
+REGISTER_OP_CPU_KERNEL(chunk_eval,
+                       ops::ChunkEvalKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h
new file mode 100644
index 0000000000..dd88f2553b
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.h
@@ -0,0 +1,219 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <set>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename Place, typename T>
+class ChunkEvalKernel : public framework::OpKernel<T> {
+ public:
+  struct Segment {
+    int begin;
+    int end;
+    int type;
+    bool operator==(const Segment& y) const {
+      return begin == y.begin && end == y.end && type == y.type;
+    }
+  };
+
+  void GetSegments(const int64_t* label, int length,
+                   std::vector<Segment>& segments, int num_chunk_types,
+                   int num_tag_types, int other_chunk_type, int tag_begin,
+                   int tag_inside, int tag_end, int tag_single) const {
+    segments.clear();
+    segments.reserve(length);
+    int chunk_start = 0;
+    bool in_chunk = false;
+    int tag = -1;
+    int type = other_chunk_type;
+    for (int i = 0; i < length; ++i) {
+      int prev_tag = tag;
+      int prev_type = type;
+      PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types);
+      tag = label[i] % num_tag_types;
+      type = label[i] / num_tag_types;
+      if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type,
+                               tag_begin, tag_inside, tag_end, tag_single)) {
+        Segment segment{
+            chunk_start,  // begin
+            i - 1,        // end
+            prev_type,
+        };
+        segments.push_back(segment);
+        in_chunk = false;
+      }
+      if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
+                     tag_begin, tag_inside, tag_end, tag_single)) {
+        chunk_start = i;
+        in_chunk = true;
+      }
+    }
+    if (in_chunk) {
+      Segment segment{
+          chunk_start,  // begin
+          length - 1,   // end
+          type,
+      };
+      segments.push_back(segment);
+    }
+  }
+
+  bool ChunkEnd(int prev_tag, int prev_type, int tag, int type,
+                int other_chunk_type, int tag_begin, int tag_inside,
+                int tag_end, int tag_single) const {
+    if (prev_type == other_chunk_type) return false;
+    if (type == other_chunk_type) return true;
+    if (type != prev_type) return true;
+    if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single;
+    if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single;
+    if (prev_tag == tag_end) return true;
+    if (prev_tag == tag_single) return true;
+    return false;
+  }
+
+  bool ChunkBegin(int prev_tag, int prev_type, int tag, int type,
+                  int other_chunk_type, int tag_begin, int tag_inside,
+                  int tag_end, int tag_single) const {
+    if (prev_type == other_chunk_type) return type != other_chunk_type;
+    if (type == other_chunk_type) return false;
+    if (type != prev_type) return true;
+    if (tag == tag_begin) return true;
+    if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single;
+    if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single;
+    if (tag == tag_single) return true;
+    return false;
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    // initialize to parse configurations
+    int num_chunk_types, num_tag_types;
+    int other_chunk_type;
+    int tag_begin, tag_inside, tag_end, tag_single;
+    std::vector<Segment> label_segments;
+    std::vector<Segment> output_segments;
+    std::set<int> excluded_chunk_types;
+    int64_t num_output_segments = 0;
+    int64_t num_label_segments = 0;
+    int64_t num_correct = 0;
+    if (context.Attr<std::string>("chunk_scheme") == "IOB") {
+      num_tag_types = 2;
+      tag_begin = 0;
+      tag_inside = 1;
+      tag_end = -1;
+      tag_single = -1;
+    } else if (context.Attr<std::string>("chunk_scheme") == "IOE") {
+      num_tag_types = 2;
+      tag_begin = -1;
+      tag_inside = 0;
+      tag_end = 1;
+      tag_single = -1;
+    } else if (context.Attr<std::string>("chunk_scheme") == "IOBES") {
+      num_tag_types = 4;
+      tag_begin = 0;
+      tag_inside = 1;
+      tag_end = 2;
+      tag_single = 3;
+    } else if (context.Attr<std::string>("chunk_scheme") == "plain") {
+      num_tag_types = 1;
+      tag_begin = -1;
+      tag_inside = -1;
+      tag_end = -1;
+      tag_single = -1;
+    } else {
+      PADDLE_THROW("Unknown chunk scheme.");
+    }
+    other_chunk_type = num_chunk_types = context.Attr<int>("num_chunk_types");
+    excluded_chunk_types.insert(
+        context.Attr<std::vector<int>>("excluded_chunk_types").begin(),
+        context.Attr<std::vector<int>>("excluded_chunk_types").end());
+
+    auto* inference = context.Input<LoDTensor>("Inference");
+    auto* label = context.Input<LoDTensor>("Label");
+    auto* precision = context.Output<Tensor>("Precision");
+    auto* recall = context.Output<Tensor>("Recall");
+    auto* f1 = context.Output<Tensor>("F1-Score");
+
+    const int64_t* inference_data = inference->data<int64_t>();
+    const int64_t* label_data = label->data<int64_t>();
+    T* precision_data = precision->mutable_data<T>(context.GetPlace());
+    T* racall_data = recall->mutable_data<T>(context.GetPlace());
+    T* f1_data = f1->mutable_data<T>(context.GetPlace());
+
+    auto lod = label->lod();
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE(lod == inference->lod(),
+                   "LoD must be same between Inference and Label.");
+    int num_sequences = lod[0].size() - 1;
+    for (int i = 0; i < num_sequences; ++i) {
+      int seq_length = lod[0][i + 1] - lod[0][i];
+      EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
+                 output_segments, label_segments, num_output_segments,
+                 num_label_segments, num_correct, num_chunk_types,
+                 num_tag_types, other_chunk_type, tag_begin, tag_inside,
+                 tag_end, tag_single, excluded_chunk_types);
+    }
+    *precision_data = !num_output_segments ? 0 : static_cast<T>(num_correct) /
+                                                     num_output_segments;
+    *racall_data = !num_label_segments ? 0 : static_cast<T>(num_correct) /
+                                                 num_label_segments;
+    *f1_data = !num_correct ? 0 : 2 * (*precision_data) * (*racall_data) /
+                                      ((*precision_data) + (*racall_data));
+  }
+
+  void EvalOneSeq(const int64_t* output, const int64_t* label, int length,
+                  std::vector<Segment>& output_segments,
+                  std::vector<Segment>& label_segments,
+                  int64_t& num_output_segments, int64_t& num_label_segments,
+                  int64_t& num_correct, int num_chunk_types, int num_tag_types,
+                  int other_chunk_type, int tag_begin, int tag_inside,
+                  int tag_end, int tag_single,
+                  const std::set<int>& excluded_chunk_types) const {
+    GetSegments(output, length, output_segments, num_chunk_types, num_tag_types,
+                other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+    GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
+                other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+    size_t i = 0, j = 0;
+    while (i < output_segments.size() && j < label_segments.size()) {
+      if (output_segments[i] == label_segments[j] &&
+          excluded_chunk_types.count(output_segments[i].type) != 1) {
+        ++num_correct;
+      }
+      if (output_segments[i].end < label_segments[j].end) {
+        ++i;
+      } else if (output_segments[i].end > label_segments[j].end) {
+        ++j;
+      } else {
+        ++i;
+        ++j;
+      }
+    }
+    for (auto& segment : label_segments) {
+      if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments;
+    }
+    for (auto& segment : output_segments) {
+      if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc
new file mode 100644
index 0000000000..f73d55bbe3
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/clip_by_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ClipByNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipByNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipByNormOp should not be null.");
+    auto max_norm = ctx->Attrs().Get<float>("max_norm");
+    PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ClipByNormOpMaker(framework::OpProto* proto,
+                    framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input of clip_by_norm op."
+             "The number of dimensions must be between [1, 9].");
+    AddOutput("Out",
+              "(Tensor) The output of clip_by_norm op with shape as input(X)");
+    AddAttr<float>("max_norm", "(float) The maximum norm value.");
+    AddComment(R"DOC(
+ClipByNorm Operator.
+
+This operator limits the L2 norm of the input $X$ within $max\_norm$.
+If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be
+the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will
+be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
+shown in the following formula:
+
+$$
+Out = \frac{max\_norm * X}{norm(X)},
+$$
+
+where $norm(X)$ represents the L2 norm of $X$.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
+                             ops::ClipByNormOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    clip_by_norm, ops::ClipByNormKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/increment_op.cu b/paddle/operators/clip_by_norm_op.cu
similarity index 80%
rename from paddle/operators/increment_op.cu
rename to paddle/operators/clip_by_norm_op.cu
index 659c380d14..2593a24ebb 100644
--- a/paddle/operators/increment_op.cu
+++ b/paddle/operators/clip_by_norm_op.cu
@@ -12,8 +12,8 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/increment_op.h"
+#include "paddle/operators/clip_by_norm_op.h"
 
+namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    increment,
-    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>);
+    clip_by_norm, ops::ClipByNormKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h
new file mode 100644
index 0000000000..b26476cae9
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ClipByNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max_norm = context.Attr<T>("max_norm");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+
+    auto x = EigenVector<T>::Flatten(*input);
+    auto out = EigenVector<T>::Flatten(*output);
+    auto x_norm = x.square().sum().sqrt();
+    auto place = context.GetEigenDevice<Place>();
+
+    auto temp = (x_norm <= max_norm).template cast<T>().eval();
+    auto scaling = temp + (static_cast<T>(1) - temp) * max_norm / x_norm;
+    Eigen::array<int, 1> one_dim{{1}};
+    Eigen::DSizes<int, 1> m_dsize(input->numel());
+    out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index f80204c683..4ddf24dea3 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -49,8 +49,15 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<AttrType>(
         "max", "(float)Maximum value, above which element is replaced by max");
     AddComment(R"DOC(
-Clip operator limits the given input within an interval. The interval is
-specified with arguments 'min' and 'max'.
+Clip Operator.
+
+The clip operator limits the value of given input within an interval. The interval is
+specified with arguments 'min' and 'max':
+
+$$
+Out = \min(\max(X, min), max)
+$$
+
 )DOC");
   }
 };
diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc
new file mode 100644
index 0000000000..bf7e883681
--- /dev/null
+++ b/paddle/operators/compare_op.cc
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/compare_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename OpComment>
+class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CompareOpProtoMaker(framework::OpProto *proto,
+                      framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    OpComment comment;
+    AddInput("X",
+             string::Sprintf("(LoDTensor) the left hand operand of %s operator",
+                             comment.type));
+    AddInput("Y", string::Sprintf(
+                      "(LoDTensor) the right hand operand of %s operator",
+                      comment.type));
+    AddOutput("Out", string::Sprintf(
+                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(%s Operator
+
+It operates element-wise on X and Y, and returns the Out. Each of them is a
+N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
+calculated by %s
+)DOC",
+                               comment.type, comment.equation));
+  }
+};
+
+template <typename OpComment>
+class CompareOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X",
+                   comment.type);
+    PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y",
+                   comment.type);
+    auto dim_x = context->GetInputDim("X");
+    auto dim_y = context->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
+                      "The number of elements in X and Y should be same");
+
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class CompareOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+    // CompareOp kernel's device type is decided by input tensor place
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_LOGICAL_OP(op_type, _equation)                      \
+  struct _##op_type##Comment {                                       \
+    static char type[];                                              \
+    static char equation[];                                          \
+  };                                                                 \
+  char _##op_type##Comment::type[]{#op_type};                        \
+  char _##op_type##Comment::equation[]{_equation};                   \
+  REGISTER_OPERATOR(                                                 \
+      op_type, ::paddle::operators::CompareOp,                       \
+      ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
+      ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
+      ::paddle::framework::EmptyGradOpMaker);
+
+REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
+REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
+REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_OP(greater_than, "Out = X > Y");
+REGISTER_LOGICAL_KERNEL(greater_than, CPU,
+                        paddle::operators::GreaterThanFunctor);
+REGISTER_LOGICAL_OP(greater_equal, "Out = X >= Y");
+REGISTER_LOGICAL_KERNEL(greater_equal, CPU,
+                        paddle::operators::GreaterEqualFunctor);
+REGISTER_LOGICAL_OP(equal, "Out = X == Y");
+REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu
new file mode 100644
index 0000000000..6ac8c124b9
--- /dev/null
+++ b/paddle/operators/compare_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/compare_op.h"
+
+REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_KERNEL(greater_than, GPU,
+                        paddle::operators::GreaterThanFunctor);
+REGISTER_LOGICAL_KERNEL(greater_equal, GPU,
+                        paddle::operators::GreaterEqualFunctor);
+REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h
new file mode 100644
index 0000000000..afdf3ab3e0
--- /dev/null
+++ b/paddle/operators/compare_op.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct LessThanFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
+};
+
+template <typename T>
+struct LessEqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; }
+};
+
+template <typename T>
+struct GreaterThanFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; }
+};
+
+template <typename T>
+struct GreaterEqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; }
+};
+
+template <typename T>
+struct EqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const {
+    if (std::is_floating_point<T>::value) {
+      // This branch will be optimized while compiling if T is integer. It is
+      // safe to cast a and b to double.
+      return fabs(static_cast<double>(a - b)) < 1e-8;
+    } else {
+      return (a == b);
+    }
+  }
+};
+
+template <typename Place, typename Functor>
+class CompareOpKernel
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* y = context.Input<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
+    Functor binary_func;
+    platform::Transform<Place> trans;
+    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
+          y->data<T>(), out->mutable_data<bool>(context.GetPlace()),
+          binary_func);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor)                     \
+  REGISTER_OP_##dev##_KERNEL(                                              \
+      op_type,                                                             \
+      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+                                           functor<int>>,                  \
+      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+                                           functor<int64_t>>,              \
+      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+                                           functor<float>>,                \
+      ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
+                                           functor<double>>);
diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
index e11e51b458..6134ac78b1 100644
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -25,7 +25,7 @@ class ConcatOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                      "Inputs(X) of ConcatOp should be empty.")
+                      "Inputs(X) of ConcatOp should be empty.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ConcatOp should not be null.");
 
@@ -45,7 +45,7 @@ class ConcatOp : public framework::OperatorWithKernel {
         }
         PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
                           "Input tensors should have the same "
-                          "elements except the specify axis.")
+                          "elements except the specify axis.");
       }
     }
     ctx->SetOutputDim("Out", out_dims);
@@ -56,20 +56,24 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input tensors of concat operator.").AsDuplicable();
-    AddOutput("Out", "the output tensor of concat operator.");
-    AddComment(R"DOC(
-            Join the input tensors along with the axis.
-            Examples:
-              Input[0] = [[1,2],[3,4]]
-              Input[1] = [[5,6]]
-              axis = 0
-              Output = [[1,2],
-                        [3,4],
-                        [5,6]]
-        )DOC");
-    AddAttr<int>("axis", "The axis which the inputs will be joined with.")
+    AddInput("X", "Input tensors of concat operator.").AsDuplicable();
+    AddOutput("Out", "Output tensor of concat operator.");
+    AddAttr<int>("axis",
+                 "The axis along which the input tensors will be concatenated.")
         .SetDefault(0);
+    AddComment(R"DOC(
+Concat Operator.
+
+Concatenate the input tensors along dimension axis.
+Examples:
+  Input[0] = [[1,2],[3,4]]
+  Input[1] = [[5,6]]
+  axis = 0
+  Output = [[1,2],
+            [3,4],
+            [5,6]]
+
+)DOC");
   }
 };
 
diff --git a/paddle/operators/concat_op.cu b/paddle/operators/concat_op.cu.cc
similarity index 100%
rename from paddle/operators/concat_op.cu
rename to paddle/operators/concat_op.cu.cc
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index adcd867f50..b809bdc3a0 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -216,11 +216,12 @@ class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
 
     AddComment(R"DOC(
-Sample dependent Cond Operator:
-Given Cond[i] as a 1/0 vector to indicate true/false
-The equation is:
-Out[i] = subnet_t[i], if Cond[i] == true
-Out[i] = subnet_t[i], if Cond[i] == false
+Sample Dependent Conditional Operator.
+
+Given Cond[i] as a 1/0 vector to indicate true/false:
+Out[i] = subnet_true[i], if Cond[i] == true
+Out[i] = subnet_false[i], if Cond[i] == false
+
 )DOC");
   }
 };
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc
new file mode 100644
index 0000000000..d5b124682d
--- /dev/null
+++ b/paddle/operators/conditional_block_op.cc
@@ -0,0 +1,197 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <algorithm>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class ConditionalOp : public framework::OperatorBase {
+ public:
+  ConditionalOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  std::vector<const framework::LoDTensor *> InputTensors(
+      const framework::Scope &scope) const {
+    std::vector<const framework::LoDTensor *> retv;
+    auto xs = Inputs("X");
+    retv.resize(xs.size(), nullptr);
+    std::transform(
+        xs.begin(), xs.end(), retv.begin(),
+        [&scope](const std::string &var_name) -> const framework::LoDTensor * {
+          auto *var = scope.FindVar(var_name);
+          PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name);
+          return &var->Get<framework::LoDTensor>();
+        });
+    return retv;
+  }
+};
+
+class ConditionalBlockOp : public ConditionalOp {
+ public:
+  ConditionalBlockOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto xs = InputTensors(scope);
+    bool need_run = std::all_of(
+        xs.begin(), xs.end(),
+        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    if (need_run) {
+      auto *scope_var = scope.FindVar(Output("Scope"));
+      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+      auto *scopes = scope_var->GetMutable<std::vector<framework::Scope *>>();
+      scopes->resize(1);
+      scopes->front() = &scope.NewScope();
+      auto &cur_scope = *scopes->front();
+
+      auto *block = Attr<framework::BlockDescBind *>("block");
+      framework::Executor exec(dev_ctx);
+      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
+    }
+  }
+};
+
+class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConditionalBlockOpProtoMaker(framework::OpProto *proto,
+                               framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The conditional variable of this operator. If X is empty, the "
+             "whole sub-block will not be executed.")
+        .AsDuplicable();
+    AddInput("Params", "The input variables of the sub-block.").AsDuplicable();
+    AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
+    AddOutput("Scope",
+              "(std::vector<Scope*>) The step scope of conditional block. To "
+              "unify the conditional block, rnn and while op, the type of "
+              "scope is std::vector<Scope*>");
+    AddAttr<framework::BlockDescBind *>(
+        "block", "The step block of conditional block operator");
+    AddComment(R"DOC(Conditional block operator
+
+Run the sub-block if X is not empty. Params is the other inputs and Out is the
+outputs of the sub-block.
+)DOC");
+  }
+};
+
+class ConditionalBlockGradOp : public ConditionalOp {
+ public:
+  ConditionalBlockGradOp(const std::string &type,
+                         const framework::VariableNameMap &inputs,
+                         const framework::VariableNameMap &outputs,
+                         const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto xs = this->InputTensors(scope);
+    bool need_run = std::all_of(
+        xs.begin(), xs.end(),
+        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    if (need_run) {
+      auto *scope_var = scope.FindVar(Input("Scope"));
+      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+      auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
+      framework::Scope &cur_scope = *scopes[0];
+
+      auto *block = Attr<framework::BlockDescBind *>("block");
+      framework::Executor exec(dev_ctx);
+      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
+
+      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"),
+                                  Outputs(framework::GradVarName("Params")));
+
+      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"),
+                                  Outputs(framework::GradVarName("X")));
+    }
+  }
+
+ private:
+  void AssignLocalGradientToGlobal(
+      const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope,
+      const std::vector<std::string> &p_names,
+      const std::vector<std::string> &pg_names) const {
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      auto out_grad_name = pg_names[i];
+      auto in_grad_name = framework::GradVarName(p_names[i]);
+      auto *in_var = cur_scope.FindVar(in_grad_name);
+      if (in_var == nullptr) {
+        continue;
+      }
+      auto new_in_grad_name = cur_scope.Rename(in_grad_name);
+      auto assign =
+          framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}},
+                                          {{"Out", {out_grad_name}}}, {});
+      assign->Run(cur_scope, dev_ctx);
+      cur_scope.Rename(new_in_grad_name, in_grad_name);
+    }
+  }
+};
+
+class ConditionalBlockGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInputs("X"));
+    if (context->HasInputs("Params")) {
+      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params")));
+      context->SetOutputsDim(framework::GradVarName("Params"),
+                             context->GetInputsDim("Params"));
+    }
+    PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X")));
+    context->SetOutputsDim(framework::GradVarName("X"),
+                           context->GetInputsDim("X"));
+  }
+};
+
+class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto grad_op = new framework::OpDescBind();
+    grad_op->SetType("conditional_block_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput("Params", Input("Params"));
+    grad_op->SetInput("Out", Output("Out"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetInput("Scope", Output("Scope"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params"));
+    grad_op->SetBlockAttr("block", *this->grad_block_[0]);
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp,
+                  ops::ConditionalBlockOpProtoMaker,
+                  ops::ConditionalBlockGradMaker);
+REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp,
+                  ops::ConditionalBlockGradInferShape);
diff --git a/paddle/operators/conv2d_op.cc b/paddle/operators/conv2d_op.cc
deleted file mode 100644
index 1acb8415d0..0000000000
--- a/paddle/operators/conv2d_op.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/conv2d_op.h"
-
-namespace paddle {
-namespace operators {
-
-void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DOp should not be null.");
-
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-  int groups = ctx->Attrs().Get<int>("groups");
-  int input_channels = in_dims[1];
-  int output_channels = filter_dims[0];
-
-  PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
-  PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
-                    "The number of input channels should be equal to filter "
-                    "channels * groups.");
-  PADDLE_ENFORCE_EQ(
-      output_channels % groups, 0,
-      "The number of output channels should be divided by groups.");
-
-  auto output_height =
-      OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
-  auto output_width =
-      OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[0], output_height, output_width});
-}
-
-Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
-  AddInput(
-      "Input",
-      "The input tensor of convolution operator. "
-      "The format of input tensor is NCHW. Where N is batch size, C is the "
-      "number of channels, H and W is the height and width of image.");
-  AddInput("Filter",
-           "The filter tensor of convolution operator."
-           "The format of the filter tensor is MCHW, where M is the number of "
-           "output image channels, C is the number of input image channels, "
-           "H and W is height and width of filter. "
-           "If the groups attribute is greater than 1, C equal the number of "
-           "input image channels divided by the groups.");
-  AddOutput("Output",
-            "The output tensor of convolution operator."
-            "The format of output tensor is also NCHW.");
-  AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
-      .SetDefault({1, 1});
-  AddAttr<std::vector<int>>("paddings", "paddings of convolution operator.")
-      .SetDefault({0, 0});
-  AddAttr<int>(
-      "groups",
-      "group size of convolution operator. "
-      "Refer to grouped convolution in Alex Krizhevsky's paper: "
-      "when group=2, the first half of the filters are only connected to the "
-      "first half of the input channels, and the second half only connected "
-      "to the second half.")
-      .SetDefault(1);
-  AddComment(R"DOC(
-The convolution operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
-parameters is checked in the infer-shape.
-)DOC");
-}
-
-void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  if (ctx->HasOutput(framework::GradVarName("Input"))) {
-    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-  }
-  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(conv2d, ops::Conv2DOp, ops::Conv2DOpMaker, conv2d_grad,
-            ops::Conv2DOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv2d_op.h b/paddle/operators/conv2d_op.h
deleted file mode 100644
index 0621389a79..0000000000
--- a/paddle/operators/conv2d_op.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/im2col.h"
-#include "paddle/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-// Base convolution operator definations for other conv
-// like operators to reuse the implementation.
-inline int OutputSize(int input_size, int filter_size, int padding,
-                      int stride) {
-  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
-  return output_size;
-}
-
-// Define Op classes in .h file so that other conv
-// operator implementations can reuse the code.
-class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  Conv2DOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
-};
-
-class Conv2DOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class Conv2DOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-template <typename Place, typename T>
-class GemmConv2DKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    // The filter will be reshaped in the calculations,
-    // so here use an assignment operation,
-    // that avoids modifying the variable in the Scope.
-    Tensor filter = *context.Input<Tensor>("Filter");
-    Tensor* output = context.Output<Tensor>("Output");
-    output->mutable_data<T>(context.GetPlace());
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    int groups = context.Attr<int>("groups");
-
-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
-
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
-    // use col_shape in the im2col calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
-    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
-    col_matrix.Resize(col_matrix_shape);
-
-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
-
-    framework::DDim output_matrix_shape = {output_channels,
-                                           output_height * output_width};
-    // convolution operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
-    for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-      for (int g = 0; g < groups; g++) {
-        // im2col
-        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-        im2col(context.device_context(), in_slice, col, strides[0], strides[1],
-               paddings[0], paddings[0], paddings[1], paddings[1]);
-
-        // gemm
-        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        math::matmul<Place, T>(context.device_context(), filter_slice, false,
-                               col_matrix, false, T(1.0), &out_slice, T(0.0));
-      }
-    }
-  }
-};
-
-template <typename Place, typename T>
-class GemmConvGrad2DKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-
-    // The filter and filter_grad will be reshaped in the calculations,
-    // so here use an assignment operation,
-    // that avoids modifying the variable in the Scope.
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    int groups = context.Attr<int>("groups");
-
-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output_grad->dims()[1];
-    int output_height = output_grad->dims()[2];
-    int output_width = output_grad->dims()[3];
-
-    paddle::operators::math::Col2ImFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        col2im;
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
-    // use col_shape in the im2col and col2im calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
-    // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
-    col_matrix.Resize(col_matrix_shape);
-
-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
-    framework::DDim output_matrix_shape = {
-        output_grad->dims()[1],
-        output_grad->dims()[2] * output_grad->dims()[3]};
-
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
-
-    // convolution backward input operator:  gemm + col2im
-    // convolution backward weight operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
-
-    if (input_grad) {
-      input_grad->mutable_data<T>(context.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
-
-      for (int i = 0; i < batch_size; i++) {
-        Tensor out_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // gemm
-          Tensor out_grad_slice =
-              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-          math::matmul<Place, T>(context.device_context(), filter_slice, true,
-                                 out_grad_slice, false, T(1.0), &col_matrix,
-                                 T(0.0));
-
-          // col2im
-          Tensor in_grad_slice =
-              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-          col2im(context.device_context(), in_grad_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
-        }
-      }
-    }
-
-    if (filter_grad) {
-      filter_grad->mutable_data<T>(context.GetPlace());
-      Tensor filter_grad_ = *filter_grad;
-      filter_grad_.Resize(filter_matrix_shape);
-      auto t = framework::EigenVector<T>::Flatten(filter_grad_);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
-
-      for (int i = 0; i < batch_size; i++) {
-        Tensor out_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-        for (int g = 0; g < groups; g++) {
-          // im2col
-          Tensor out_grad_slice =
-              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-          im2col(context.device_context(), in_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
-
-          // gemm
-          Tensor filter_grad_slice =
-              filter_grad_.Slice(g * out_step, (g + 1) * out_step);
-          math::matmul<Place, T>(context.device_context(), out_grad_slice,
-                                 false, col_matrix, true, T(1.0),
-                                 &filter_grad_slice, T(1.0));
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cc b/paddle/operators/conv2d_transpose_cudnn_op.cc
deleted file mode 100644
index 8ce94e0f04..0000000000
--- a/paddle/operators/conv2d_transpose_cudnn_op.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/conv2d_transpose_op.h"
-
-namespace paddle {
-namespace operators {
-
-class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
- public:
-  CudnnConv2DTransposeOpMaker(framework::OpProto* proto,
-                              framework::OpAttrChecker* op_checker)
-      : Conv2DTransposeOpMaker(proto, op_checker) {
-    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
-        .SetDefault(std::vector<int>{1, 1});
-    AddAttr<int>("workspace_size_MB",
-                 "workspace size for cudnn, in MB, "
-                 "workspace is a section of GPU memory which will be "
-                 "allocated/freed each time the operator runs, larger "
-                 "workspace size can increase performance but also requires "
-                 "better hardward. This size should be carefully setted.")
-        .SetDefault(4096);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(conv2d_transpose_cudnn, ops::Conv2DTransposeOp,
-            ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad,
-            ops::Conv2DTransposeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose_cudnn,
-    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose_cudnn_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv2d_transpose_op.cc b/paddle/operators/conv2d_transpose_op.cc
deleted file mode 100644
index 348527728b..0000000000
--- a/paddle/operators/conv2d_transpose_op.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/conv2d_transpose_op.h"
-
-namespace paddle {
-namespace operators {
-
-void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DTransposeOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DTransposeOp should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DTransposeOp should not be null.");
-
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-
-  for (size_t i = 0; i < paddings.size(); ++i) {
-    PADDLE_ENFORCE_EQ(paddings[i], 0,
-                      "No Padding allowed in conv transpose op.");
-  }
-
-  PADDLE_ENFORCE_EQ(in_dims.size(), 4,
-                    "Conv2DTransposeOp input should be 4-D tensor.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
-                    "Conv2DTransposeOp filter should be 4-D tensor.");
-  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "input and kernel input dimension should be equal.");
-
-  auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
-  auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[1], output_height, output_width});
-}
-
-Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
-    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
-  AddInput(
-      "Input",
-      "(Tensor) The input tensor of convolution transpose operator. "
-      "The format of input tensor is NCHW. Where N is batch size, C is the "
-      "number of input channels, H and W is the height and width of image.");
-  AddInput("Filter",
-           "(Tensor) The filter tensor of convolution transpose operator."
-           "The format of the filter tensor is CMHW, where C is the number of "
-           "output image channels, M is the number of input image channels, "
-           "H and W is height and width of filter. "
-           "We enforce groups number == 1 and padding == 0 in "
-           "convolution transpose Scenario.");
-  AddOutput("Output",
-            "(Tensor) The output tensor of convolution transpose operator."
-            "The format of output tensor is also NCHW.");
-  AddAttr<std::vector<int>>("strides",
-                            "strides of convolution transpose operator.")
-      .SetDefault({1, 1});
-  AddAttr<std::vector<int>>("paddings",
-                            "paddings of convolution transpose operator.")
-      .SetDefault({0, 0});
-  AddComment(R"DOC(
-The convolution transpose operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
-parameters is checked in the infer-shape.
-)DOC");
-}
-
-void Conv2DTransposeOpGrad::InferShape(
-    framework::InferShapeContext* ctx) const {
-  auto in_dims = ctx->GetInputDim("Input");
-  auto filter_dims = ctx->GetInputDim("Filter");
-  if (ctx->HasOutput(framework::GradVarName("Input"))) {
-    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-  }
-  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(conv2d_transpose, ops::Conv2DTransposeOp,
-            ops::Conv2DTransposeOpMaker, conv2d_transpose_grad,
-            ops::Conv2DTransposeOpGrad);
-
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    conv2d_transpose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv2d_transpose_op.h b/paddle/operators/conv2d_transpose_op.h
deleted file mode 100644
index cab7788227..0000000000
--- a/paddle/operators/conv2d_transpose_op.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/im2col.h"
-#include "paddle/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-
-// Define Op classes in .h file so that other conv transpose
-// operator implementations can reuse the code.
-class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  Conv2DTransposeOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker);
-};
-
-class Conv2DTransposeOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override;
-};
-
-template <typename Place, typename T>
-class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    // The filter will be reshaped, so it should not be constant pointer
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    Tensor* output = context.Output<Tensor>("Output");
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-
-    // TODO(Zhuoyuan): Paddings can be added in future.
-    // groups will alway be disabled in conv2d_transpose.
-
-    const int batch_size = input->dims()[0];
-    const int m = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    const int k_h = filter.dims()[2];
-    const int k_w = filter.dims()[3];
-
-    const int c = output->dims()[1];  // output channels
-    const int o_h = output->dims()[2];
-    const int o_w = output->dims()[3];
-
-    paddle::operators::math::Col2ImFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        col2im;
-
-    // use col_shape in the im2col and col2im calculation
-    DDim col_shape = {c, k_h, k_w, h, w};
-
-    // use col_matrix_shape in the gemm calculation
-    DDim col_matrix_shape = {c * k_h * k_w, h * w};
-
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix;
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-
-    DDim output_shape = {c, o_h, o_w};
-    DDim input_matrix_shape = {m, h * w};
-
-    DDim filter_matrix_shape = {m, c * k_h * k_w};
-    filter.Resize(filter_matrix_shape);
-
-    // convolution transpose: gemm + col2im (similar to conv-backward on input)
-
-    output->mutable_data<T>(context.GetPlace());
-    auto t = framework::EigenVector<T>::Flatten(*output);
-    t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
-
-    for (int i = 0; i < batch_size; i++) {
-      // batch with size (M, h * w)
-      Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-      // filter size: (M, c * k_h * k_w)
-
-      // output size: (c, o_h, o_w)
-      Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
-
-      // col_matrix = filter * input_batch
-      // of shape (c * k_h * k_w, h * w)
-      math::matmul<Place, T>(context.device_context(), filter, true,
-                             input_batch, false, T(1.0), &col_matrix, T(0.0));
-      col2im(context.device_context(), output_batch, col, strides[0],
-             strides[1], 0, 0, 0, 0);
-    }
-  }
-};
-
-template <typename Place, typename T>
-class GemmConv2DTransposeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* input = context.Input<Tensor>("Input");
-    const Tensor* output_grad =
-        context.Input<Tensor>(framework::GradVarName("Output"));
-
-    // For filter, we do not use const pointer b/c we will do reshape,
-    // but we should avoid modifying its value.
-    Tensor filter = *context.Input<Tensor>("Filter");
-
-    Tensor* input_grad =
-        context.Output<Tensor>(framework::GradVarName("Input"));
-    Tensor* filter_grad =
-        context.Output<Tensor>(framework::GradVarName("Filter"));
-
-    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
-    // Actually, no paddings and groups allowed in conv transpose.
-    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-
-    const int batch_size = input->dims()[0];
-    const int m = input->dims()[1];
-    const int h = input->dims()[2];
-    const int w = input->dims()[3];
-
-    const int k_h = filter.dims()[2];
-    const int k_w = filter.dims()[3];
-
-    const int c = output_grad->dims()[1];  // output channels
-    const int o_h = output_grad->dims()[2];
-    const int o_w = output_grad->dims()[3];
-
-    // Only im2col functor required for bp to get to the right shape
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
-
-    // use col_shape in the im2col and col2im calculation
-    DDim col_shape = {c, k_h, k_w, h, w};
-
-    // use col_matrix_shape in the gemm calculation
-    DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
-
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-
-    DDim output_shape = {c, o_h, o_w};
-    DDim input_matrix_shape = {m, h * w};
-
-    DDim filter_matrix_shape = {m, c * k_h * k_w};
-    filter.Resize(filter_matrix_shape);
-
-    // convolution transpose grad on input:
-    // im2col + gemm (similar to conv-forward)
-    // input need to compute gradient
-    if (input_grad) {
-      Tensor col_matrix;
-      col_matrix.ShareDataWith(col);
-      DDim col_matrix_shape = {c * k_h * k_w, h * w};
-      col_matrix.Resize(col_matrix_shape);
-
-      input_grad->mutable_data<T>(context.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
-
-      for (int i = 0; i < batch_size; i++) {
-        // batch with size (c, o_h * o_w)
-        Tensor output_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_shape);
-        // filter of size (m, c * k_h * k_w)
-
-        // batch with size (m, h, w)
-        Tensor input_grad_batch =
-            input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
-
-        // im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w)
-        im2col(context.device_context(), output_grad_batch, col, strides[0],
-               strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
-
-        // gemm: dx = filter * dy
-        // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, c, h)
-        math::matmul<Place, T>(context.device_context(), filter, false,
-                               col_matrix, false, T(1.0), &input_grad_batch,
-                               T(0.0));
-      }
-    }
-
-    // filter gradient required
-    if (filter_grad) {
-      Tensor col_matrix_f;
-      col_matrix_f.ShareDataWith(col);
-      DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
-      col_matrix_f.Resize(col_matrix_shape_f);
-
-      filter_grad->mutable_data<T>(context.GetPlace());
-      Tensor filter_grad_ = *filter_grad;
-      filter_grad_.Resize(filter_matrix_shape);
-      auto t = framework::EigenVector<T>::Flatten(filter_grad_);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
-
-      for (int i = 0; i < batch_size; ++i) {
-        // batch with size (c, o_h, o_w)
-        Tensor output_grad_batch =
-            output_grad->Slice(i, i + 1).Resize(output_shape);
-        // input batch
-        Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
-
-        // im2col: (c * h * w, k_h * k_w)
-        im2col(context.device_context(), output_grad_batch, col, strides[0],
-               strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
-
-        // gemm: d_filter = x * y_grad^T
-        // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, c, h)
-        math::matmul<Place, T>(context.device_context(), in_batch, false,
-                               col_matrix_f, true, T(1.0), &filter_grad_,
-                               T(1.0));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc
index 62190ebc21..0dd8c13b2a 100644
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -12,18 +12,31 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 
 namespace paddle {
 namespace operators {
 
-class CudnnConvOpMaker : public Conv2DOpMaker {
+class CudnnConv2DOpMaker : public Conv2DOpMaker {
  public:
-  CudnnConvOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  CudnnConv2DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
       : Conv2DOpMaker(proto, op_checker) {
-    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
-        .SetDefault(std::vector<int>{1, 1});
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardware. This size should be chosen carefully.")
+        .SetDefault(4096);
+  }
+};
+
+class CudnnConv3DOpMaker : public Conv3DOpMaker {
+ public:
+  CudnnConv3DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
+      : Conv3DOpMaker(proto, op_checker) {
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
                  "workspace is a section of GPU memory which will be "
@@ -38,10 +51,24 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
-            ops::Conv2DOpGrad);
+REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker,
+            conv2d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker,
+            conv3d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv2d_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+    conv2d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
+
+REGISTER_OP_CPU_KERNEL(conv3d_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv_cudnn_grad,
-    ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    conv3d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv_cudnn_op.cu b/paddle/operators/conv_cudnn_op.cu.cc
similarity index 71%
rename from paddle/operators/conv_cudnn_op.cu
rename to paddle/operators/conv_cudnn_op.cu.cc
index e2eb157f40..bc265dcc4f 100644
--- a/paddle/operators/conv_cudnn_op.cu
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -15,7 +15,7 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memory.h"
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cudnn_helper.h"
 
@@ -27,9 +27,9 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
-using CUDADeviceContext = platform::CUDADeviceContext;
 
-static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;
+static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
+    static_cast<size_t>(1024) * 1024 * 1024;
 
 template <typename T>
 class CudnnConvOpKernel : public framework::OpKernel<T> {
@@ -45,7 +45,8 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
-    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
+    int64_t user_workspace_size =
+        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -57,6 +58,21 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     ScopedFilterDescriptor filter_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 1)
+    // cudnn 7 can support groups, no need to do it mannually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
@@ -64,19 +80,34 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
         layout, framework::vectorize2int(output->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
 
     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+    int output_channels = filter->dims()[0];
+    int output_height, output_width, output_depth;
+    if (output->dims().size() == 5) {
+      output_depth = output->dims()[2];
+      output_height = output->dims()[3];
+      output_width = output->dims()[4];
+    } else {
+      output_depth = 1;
+      output_height = output->dims()[2];
+      output_width = output->dims()[3];
+    }
 
-    int group_offset_in = input_channels / groups * input_height * input_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
-        output_channels / groups * output_height * output_width;
+        output_channels / groups * output_height * output_width * output_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn conv workspace ---------------------
     void* cudnn_workspace = nullptr;
@@ -134,17 +165,32 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
-    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
+    int64_t user_workspace_size =
+        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
 
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_grad_desc;
-    ScopedTensorDescriptor input_grad_desc;
 
     ScopedFilterDescriptor filter_desc;
     ScopedFilterDescriptor filter_grad_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 1)
+    // cudnn 7 can support groups, no need to do it mannually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
@@ -153,22 +199,35 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
             layout, framework::vectorize2int(output_grad->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
-    cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;
-
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
 
     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+
     int output_grad_channels = filter->dims()[0];
-    int output_grad_height = output_grad->dims()[2];
-    int output_grad_width = output_grad->dims()[3];
+    int output_grad_height, output_grad_width, output_grad_depth;
+    if (input->dims().size() == 5) {
+      output_grad_depth = output_grad->dims()[2];
+      output_grad_height = output_grad->dims()[3];
+      output_grad_width = output_grad->dims()[4];
+    } else {
+      output_grad_depth = 1;
+      output_grad_height = output_grad->dims()[2];
+      output_grad_width = output_grad->dims()[3];
+    }
 
-    int group_offset_in = input_channels / groups * input_height * input_width;
-    int group_offset_out =
-        output_grad_channels / groups * output_grad_height * output_grad_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
+    int group_offset_out = output_grad_channels / groups * output_grad_height *
+                           output_grad_width * output_grad_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn backward algorithm ---------------------
     cudnnConvolutionBwdDataAlgo_t data_algo;
@@ -181,8 +240,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 
     auto handle = ctx.cuda_device_context().cudnn_handle();
     if (input_grad) {
-      cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(input_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
               handle, cudnn_filter_desc,
@@ -191,19 +248,17 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
               cudnn_output_grad_desc, cudnn_conv_desc,
               // dxDesc: Handle to the previously initialized output tensor
               // descriptor.
-              cudnn_input_grad_desc,
+              cudnn_input_desc,
               CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
               workspace_size_limit, &data_algo));
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
-              cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size));
+              cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }
 
     if (filter_grad) {
-      cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(filter_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
@@ -223,34 +278,30 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
-    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset input_grad.
+
       for (int i = 0; i < groups; i++) {
         PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
             handle, &alpha, cudnn_filter_desc,
             filter_data + i * group_offset_filter, cudnn_output_grad_desc,
             output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_input_grad_desc, input_grad_data + i * group_offset_in));
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + i * group_offset_in));
       }
     }
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*filter_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset filter_grad.
       for (int i = 0; i < groups; i++) {
         PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
             handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
             cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
             cudnn_conv_desc, filter_algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_filter_grad_desc,
+            workspace_size_in_bytes, &beta, cudnn_filter_desc,
             filter_grad_data + i * group_offset_filter));
       }
     }
@@ -262,6 +313,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>);
-REGISTER_OP_GPU_KERNEL(conv_cudnn_grad,
-                       paddle::operators::CudnnConvGradOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
+                       paddle::operators::CudnnConvOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad,
+                       paddle::operators::CudnnConvGradOpKernel<float>,
+                       paddle::operators::CudnnConvGradOpKernel<double>);
+
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
+                       paddle::operators::CudnnConvOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad,
+                       paddle::operators::CudnnConvGradOpKernel<float>,
+                       paddle::operators::CudnnConvGradOpKernel<double>);
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
new file mode 100644
index 0000000000..462e6d9cbc
--- /dev/null
+++ b/paddle/operators/conv_op.cc
@@ -0,0 +1,250 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_op.h"
+
+namespace paddle {
+namespace operators {
+
+void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Input"),
+                 "Input(Input) of ConvOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                 "Input(Filter) of ConvOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                 "Output(Output) of ConvOp should not be null.");
+
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+  int groups = ctx->Attrs().Get<int>("groups");
+  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
+  int input_channels = in_dims[1];
+  int output_channels = filter_dims[0];
+
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "Conv intput should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(), filter_dims.size(),
+      "Conv input dimension and filter dimension should be the same.");
+  PADDLE_ENFORCE(
+      in_dims.size() - strides.size() == 2U,
+      "Conv input dimension and strides dimension should be consistent.");
+  PADDLE_ENFORCE_EQ(
+      paddings.size(), strides.size(),
+      "Conv paddings dimension and Conv strides dimension should be the same.");
+  PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
+                    "The number of input channels should be equal to filter "
+                    "channels * groups.");
+  PADDLE_ENFORCE_EQ(
+      output_channels % groups, 0,
+      "The number of output channels should be divided by groups.");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] -
+                           (dilations[i] * (filter_dims[i + 2] - 1) + 1) >
+                       0,
+                   "Due to the settings of paddings, filter_dims and "
+                   "dilations, the output size is less than 0, please check "
+                   "again.");
+    output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                      dilations[i], paddings[i], strides[i]));
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+}
+
+Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
+                             framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCHW, where N is batch size, C is the "
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution operator. "
+           "The format of the filter tensor is MCHW, where M is the number of "
+           "output image channels, C is the number of input image channels, "
+           "H is the height of the filter, and W is the width of the filter. "
+           "If the groups attribute is greater than 1, C equals the number of "
+           "input image channels divided by the groups.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator. "
+            "The format of output tensor is also NCHW.");
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int> default:{1, 1}), the "
+                            "strides(h_stride, w_stride) of "
+                            "convolution operator.")
+      .SetDefault({1, 1});
+  AddAttr<std::vector<int>>("paddings",
+                            "(vector<int> default:{0, 0}), the "
+                            "paddings(h_pad, w_pad) of "
+                            "convolution operator.")
+      .SetDefault({0, 0});
+  AddAttr<int>(
+      "groups",
+      "(int default:1), the groups number of the convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
+      .SetDefault(1);
+  AddAttr<std::vector<int>>("dilations",
+                            "(vector<int> default:{1, 1}), the "
+                            "dilations(h_dilation, w_dilation) of "
+                            "convolution operator.")
+      .SetDefault({1, 1});
+  AddComment(R"DOC(
+Convolution Operator.
+
+The convolution operation calculates the output based on the input, filter
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input) and Output(Output) are in NCHW format. Where N is batch
+size, C is the number of channels, H is the height of the feature, and W is
+the width of the feature.
+Filters(Input) is MCHW format. Where M is the number of output image channels, C is
+the number of input image channels, H is the height of the filter, and W
+is the width of the filter.
+Parameters(strides, paddings, dilations) are two elements. These two elements represent
+height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+  Output:
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
+  Where
+$$
+       H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
+$$
+)DOC");
+}
+
+Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
+                             framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCDHW. Where N is batch size, C is the "
+      "number of channels, D is the depth of the feature, H is the height of "
+      "the feature, "
+      "and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution operator. "
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "output image channels, C is the number of input image channels, "
+           "D is the depth of the filter, H is the height of the filter, and W "
+           "is the width of the filter."
+           "If the groups attribute is greater than 1, C equals the number of "
+           "input image channels divided by the groups.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator."
+            "The format of output tensor is also NCDHW.");
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int>, default:{1, 1, 1}), the "
+                            "strides(d_stride, h_stride, w_stride) of "
+                            "convolution operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>("paddings",
+                            "(vector<int>, default:{0, 0, 0}), the "
+                            "paddings(d_pad, h_pad, w_pad) of convolution "
+                            "operator.")
+      .SetDefault({0, 0, 0});
+  AddAttr<int>(
+      "groups",
+      "(int default:1), the groups number of the convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
+      .SetDefault(1);
+  AddAttr<std::vector<int>>("dilations",
+                            "(vector<int> default:{1, 1, 1}), the "
+                            "dilations(d_dilation, h_dilation, w_dilation) of "
+                            "convolution operator.")
+      .SetDefault({1, 1, 1});
+
+  AddComment(R"DOC(
+Convolution3D Operator.
+
+The convolution operation calculates the output based on the input, filter
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input) and output(Output) are in NCDHW format, where N is batch
+size, C is the number of channels,D is the depth of the feature, H is the height of
+the feature, and W is the width of the feature.
+Filters(Input) is MCDHW format, where M is the number of output image channels,
+C is the number of input image channels, D is the depth of the filter,
+H is the height of the filter, and W is the width of the filter.
+Parameters(strides, paddings, dilations) are three elements. These three elements
+represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$
+  Output:
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\
+       H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
+  $$
+)DOC");
+}
+
+void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  if (ctx->HasOutput(framework::GradVarName("Input"))) {
+    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+  }
+  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
+            ops::ConvOpGrad);
+namespace ops = paddle::operators;
+REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
+            ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
+
+REGISTER_OP_CPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
new file mode 100644
index 0000000000..546451234a
--- /dev/null
+++ b/paddle/operators/conv_op.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
+
+REGISTER_OP_GPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
new file mode 100644
index 0000000000..09bff0a68d
--- /dev/null
+++ b/paddle/operators/conv_op.h
@@ -0,0 +1,345 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/im2col.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Base convolution operator definations for other conv
+// like operators to reuse the implementation.
+inline int OutputSize(int input_size, int filter_size, int dilation,
+                      int padding, int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  const int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  return output_size;
+}
+inline bool IsExpand(std::vector<int64_t>& filter_dim,
+                     std::vector<int>& strides, std::vector<int>& paddings,
+                     std::vector<int>& dilations) {
+  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
+  for (size_t j = 0; j < strides.size(); ++j) {
+    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
+    strides_1 = strides_1 && (strides[j] == 1);
+    padding_0 = padding_0 && (paddings[j] == 0);
+    dilation_1 = dilation_1 && (dilations[j] == 1);
+  }
+  return !(filter_1 && strides_1 && padding_0 && dilation_1);
+}
+
+// Define Op classes in .h file so that other conv
+// operator implementations can reuse the code.
+class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv2DOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+class ConvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+class ConvOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+template <typename Place, typename T>
+class GemmConvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    // The filter will be reshaped in the calculations,
+    // so here use an assignment operation,
+    // that avoids modifying the variable in the Scope.
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+
+    int groups = context.Attr<int>("groups");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+
+    // use col_shape in the im2col calculation
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = input->dims()[1] / groups;
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+    }
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
+    // o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, data_dim + 1);
+
+    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+    Tensor col;
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    if (is_expand) {
+      col.mutable_data<T>(col_shape, context.GetPlace());
+      col_matrix.ShareDataWith(col);
+      col_matrix.Resize(col_matrix_shape);
+    }
+
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
+
+    framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                           filter.numel() / filter.dims()[0]};
+    filter.Resize(filter_matrix_shape);
+
+    framework::DDim output_matrix_shape = {
+        output->dims()[1],
+        output->numel() / (output->dims()[0] * output->dims()[1])};
+
+    // convolution operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+    math::Vol2ColFunctor<Place, T> vol2col;
+    math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+
+    for (int i = 0; i < batch_size; i++) {
+      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+      for (int g = 0; g < groups; g++) {
+        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+        if (!is_expand) {
+          col.ShareDataWith(in_slice);
+          col_matrix.ShareDataWith(col);
+          col_matrix.Resize(col_matrix_shape);
+        } else if (data_dim == 2U) {
+          // im2col
+          im2col(context.device_context(), in_slice, dilations, strides,
+                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                  paddings[1]},
+                 &col);
+        } else if (data_dim == 3U) {
+          // vol2col
+          vol2col(context.device_context(), in_slice, dilations, strides,
+                  paddings, &col);
+        }
+
+        // gemm
+        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+        math::matmul<Place, T>(context.device_context(), filter_slice, false,
+                               col_matrix, false, T(1.0), &out_slice, T(0.0));
+      }
+    }
+  }
+};
+
+template <typename Place, typename T>
+class GemmConvGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+    // The filter and filter_grad will be reshaped in the calculations,
+    // so here use an assignment operation,
+    // that avoids modifying the variable in the Scope.
+    Tensor filter = *context.Input<Tensor>("Filter");
+
+    if (!input_grad && !filter_grad) return;
+
+    int groups = context.Attr<int>("groups");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(
+        framework::vectorize(output_grad->dims()));
+
+    // use col_shape in the im2col calculation
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = input->dims()[1] / groups;
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+    }
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (i_c/g * k_h * k_w, o_h * o_w)
+    // or
+    // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, data_dim + 1);
+
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
+
+    framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                           filter.numel() / filter.dims()[0]};
+    filter.Resize(filter_matrix_shape);
+
+    framework::DDim output_matrix_shape = {
+        output_grad->dims()[1],
+        output_grad->numel() /
+            (output_grad->dims()[0] * output_grad->dims()[1])};
+
+    // convolution backward input operator:  gemm + col2im(or col2vol)
+    // convolution backward weight operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
+
+    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+    Tensor col;
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    if (is_expand) {
+      col.mutable_data<T>(col_shape, context.GetPlace());
+      col_matrix.ShareDataWith(col);
+      col_matrix.Resize(col_matrix_shape);
+    }
+
+    math::SetConstant<Place, T> set_zero;
+
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      set_zero(context.device_context(), input_grad, static_cast<T>(0));
+
+      math::Col2VolFunctor<Place, T> col2vol;
+      math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
+
+      for (int i = 0; i < batch_size; i++) {
+        Tensor out_grad_batch =
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
+        for (int g = 0; g < groups; g++) {
+          // gemm
+          Tensor out_grad_slice =
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+
+          Tensor in_grad_slice =
+              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
+
+          if (!is_expand) {
+            col_matrix.ShareDataWith(in_grad_slice);
+            col_matrix.Resize(col_matrix_shape);
+          }
+          math::matmul<Place, T>(context.device_context(), filter_slice, true,
+                                 out_grad_slice, false, T(1.0), &col_matrix,
+                                 T(0.0));
+
+          if (is_expand && data_dim == 2U) {
+            col2im(context.device_context(), col, dilations, strides,
+                   std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                    paddings[1]},
+                   &in_grad_slice);
+          } else if (is_expand && data_dim == 3U) {
+            col2vol(context.device_context(), col, dilations, strides, paddings,
+                    &in_grad_slice);
+          }
+        }
+      }
+    }
+
+    if (filter_grad) {
+      filter_grad->mutable_data<T>(context.GetPlace());
+      Tensor filter_grad_ = *filter_grad;
+      filter_grad_.Resize(filter_matrix_shape);
+      set_zero(context.device_context(), filter_grad, static_cast<T>(0));
+      math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+      math::Vol2ColFunctor<Place, T> vol2col;
+      for (int i = 0; i < batch_size; i++) {
+        Tensor out_grad_batch =
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+        for (int g = 0; g < groups; g++) {
+          // im2col
+          Tensor out_grad_slice =
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+          if (!is_expand) {
+            col.ShareDataWith(in_slice);
+            col_matrix.ShareDataWith(col);
+            col_matrix.Resize(col_matrix_shape);
+          } else if (data_dim == 2U) {
+            im2col(context.device_context(), in_slice, dilations, strides,
+                   std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                    paddings[1]},
+                   &col);
+          } else if (data_dim == 3U) {
+            vol2col(context.device_context(), in_slice, dilations, strides,
+                    paddings, &col);
+          }
+
+          // gemm
+          Tensor filter_grad_slice =
+              filter_grad_.Slice(g * out_step, (g + 1) * out_step);
+          math::matmul<Place, T>(context.device_context(), out_grad_slice,
+                                 false, col_matrix, true, T(1.0),
+                                 &filter_grad_slice, T(1.0));
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/conv_shift_op.cc b/paddle/operators/conv_shift_op.cc
index 6156a2d6af..a4150a5664 100644
--- a/paddle/operators/conv_shift_op.cc
+++ b/paddle/operators/conv_shift_op.cc
@@ -96,14 +96,13 @@ as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401
 
 The equation is:
 
-  \f[
-      Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}
-  \f]
+$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$
 
-where X's index is computed modulo M, and b's index is computed modulo N.
+where X's index is computed modulo M, and Y's index is computed modulo N.
+
+Both inputs X and Y can carry LoD (Level of Details) information.
+However, the output only shares the LoD information with input X.
 
-Both of the input `X` and `Y` can carry LoD (Level of Details) information.
-However, the output only shares the LoD information with input `X`.
 )DOC");
   }
 };
diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu
index 145e966fe9..95e13c38a8 100644
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/operators/conv_shift_op.cu
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include "paddle/operators/conv_shift_op.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/platform/cuda_helper.h"
 
 namespace paddle {
@@ -22,7 +23,7 @@ using framework::Tensor;
 
 namespace {
 
-inline int div_up(int x, int y) { return (x + y - 1) / y; }
+inline int DivUp(int x, int y) { return (x + y - 1) / y; }
 
 // Some notes on the design:
 //
@@ -33,9 +34,9 @@ inline int div_up(int x, int y) { return (x + y - 1) / y; }
 // y is fairly small. For large y, it would probably be more efficient
 // to also tile across y.
 template <typename T>
-__global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width,
-                                   int y_width, int y_half_width,
-                                   int batch_size) {
+__global__ void ConvShiftForward(const T *x, const T *y, int x_width,
+                                 int y_width, int y_half_width, int batch_size,
+                                 T *out) {
   extern __shared__ T mem[];
 
   int tx = threadIdx.x;
@@ -62,25 +63,26 @@ __global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width,
   if (tx < num_x) {
     int load_i = (i - y_half_width + x_width) % x_width;
     sx[tx] = x[k * x_width + load_i];
-  } else {
-    return;
   }
   __syncthreads();
 
-  // Compute dot product of sx[tx:tx + y_width] and sy.
-  T sum = 0;
-  for (int j = 0; j < y_width; ++j) {
-    sum += sx[tx + j] * sy[j];
-  }
+  if (tx < num_x) {
+    // Compute dot product of sx[tx:tx + y_width] and sy.
+    T sum = 0;
+    for (int j = 0; j < y_width; ++j) {
+      sum += sx[tx + j] * sy[j];
+    }
 
-  // Save to out[k, i].
-  out[k * x_width + i] = sum;
+    // Save to out[k, i].
+    out[k * x_width + i] = sum;
+  }
 }
 
 // Compute x gradient - initial naive implementation with atomic add.
 template <typename T>
-__global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width,
-                              int y_width, int y_half_width, int batch_size) {
+__global__ void ConvShiftGradX(const T *dout, const T *y, int x_width,
+                               int y_width, int y_half_width, int batch_size,
+                               T *dx) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
   int j = blockIdx.y;                             // y index
   int k = blockIdx.z;                             // batch index
@@ -94,8 +96,8 @@ __global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width,
 
 // Compute y gradient - initial naive implementation with atomic add.
 template <typename T>
-__global__ void conv_shift_dy(const T *x, const T *dout, T *dy, int x_width,
-                              int y_width, int y_half_width, int batch_size) {
+__global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width,
+                            int y_half_width, int batch_size, T *dy) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
   int j = blockIdx.y;                             // y index
   int k = blockIdx.z;                             // batch index
@@ -125,17 +127,15 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     int y_half_width = (y_width - 1) / 2;
 
     const int x_per_block = 256;
-    int num_x_blocks = div_up(x_width, x_per_block);
+    int num_x_blocks = DivUp(x_width, x_per_block);
     int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T);
 
     dim3 grid_dim(num_x_blocks, batch_size);
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
 
-    conv_shift_forward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
-        x_data, y_data, out_data, x_width, y_width, y_half_width, batch_size);
+    ConvShiftForward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
+        x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data);
   }
 };
 
@@ -159,27 +159,26 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
     int y_width = Y->dims()[1];
     int y_half_width = (y_width - 1) / 2;
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto &device_ctx = context.cuda_device_context();
+    math::SetConstant<platform::GPUPlace, T> zero;
 
     const int x_per_block = 256;
-    int num_x_blocks = div_up(x_width, x_per_block);
+    int num_x_blocks = DivUp(x_width, x_per_block);
     dim3 grid_dim(num_x_blocks, y_width, batch_size);
 
     if (dX) {
       T *dx_data = dX->mutable_data<T>(context.GetPlace());
-      cudaMemsetAsync(dx_data, 0, dX->numel() * sizeof(T), stream);
-      conv_shift_dx<T><<<grid_dim, x_per_block, 0, stream>>>(
-          dout_data, y_data, dx_data, x_width, y_width, y_half_width,
-          batch_size);
+      zero(device_ctx, dX, static_cast<T>(0.0));
+      ConvShiftGradX<T><<<grid_dim, x_per_block, 0, device_ctx.stream()>>>(
+          dout_data, y_data, x_width, y_width, y_half_width, batch_size,
+          dx_data);
     }
     if (dY) {
       T *dy_data = dY->mutable_data<T>(context.GetPlace());
-      cudaMemsetAsync(dy_data, 0, dY->numel() * sizeof(T), stream);
-      conv_shift_dy<T><<<grid_dim, x_per_block, 0, stream>>>(
-          x_data, dout_data, dy_data, x_width, y_width, y_half_width,
-          batch_size);
+      zero(device_ctx, dY, static_cast<T>(0.0));
+      ConvShiftDy<T><<<grid_dim, x_per_block, 0, device_ctx.stream()>>>(
+          x_data, dout_data, x_width, y_width, y_half_width, batch_size,
+          dy_data);
     }
   }
 };
diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc
new file mode 100644
index 0000000000..0192178ce3
--- /dev/null
+++ b/paddle/operators/conv_transpose_cudnn_op.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_transpose_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
+ public:
+  CudnnConv2DTransposeOpMaker(framework::OpProto* proto,
+                              framework::OpAttrChecker* op_checker)
+      : Conv2DTransposeOpMaker(proto, op_checker) {
+    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
+        .SetDefault({1, 1});
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardward. This size should be carefully setted.")
+        .SetDefault(4096);
+  }
+};
+
+class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
+ public:
+  CudnnConv3DTransposeOpMaker(framework::OpProto* proto,
+                              framework::OpAttrChecker* op_checker)
+      : Conv3DTransposeOpMaker(proto, op_checker) {
+    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
+        .SetDefault({1, 1, 1});
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardward. This size should be carefully setted.")
+        .SetDefault(4096);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp,
+            ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad,
+            ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose_cudnn,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose_cudnn_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
+
+REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp,
+            ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad,
+            ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_cudnn,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_cudnn_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cu b/paddle/operators/conv_transpose_cudnn_op.cu.cc
similarity index 89%
rename from paddle/operators/conv2d_transpose_cudnn_op.cu
rename to paddle/operators/conv_transpose_cudnn_op.cu.cc
index 61fcfb3bd8..494904fe52 100644
--- a/paddle/operators/conv2d_transpose_cudnn_op.cu
+++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc
@@ -15,7 +15,7 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memory.h"
-#include "paddle/operators/conv2d_transpose_op.h"
+#include "paddle/operators/conv_transpose_op.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cudnn_helper.h"
 
@@ -27,7 +27,6 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
-using CUDADeviceContext = platform::CUDADeviceContext;
 
 static constexpr size_t kConvCudnnWorkspaceLimitBytes = 1024 * 1024 * 1024;
 
@@ -55,15 +54,21 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
     ScopedTensorDescriptor output_desc;
     ScopedFilterDescriptor filter_desc;
     ScopedConvolutionDescriptor conv_desc;
-    DataLayout layout = DataLayout::kNCHW;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
 
-    // N, M, H, W
+    // (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
-    // N, C, O_h, O_w
+    // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         layout, framework::vectorize2int(output->dims()));
-    // M, C, K_h, K_w
+    // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()));
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
@@ -137,13 +142,13 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
 
-    // Input: (N, M, H, W)
+    // Input: (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
-    // Output: (N, C, O_H, O_W)
+    // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         layout, framework::vectorize2int(output_grad->dims()));
-    // Filter (M, C, K_H, K_W)
+    // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()));
 
@@ -201,10 +206,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
-
+      // Because beta is zero, it is unnecessary to reset input_grad.
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_output_desc, output_grad_data,
           cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
@@ -215,9 +217,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*filter_grad);
-      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
-          t.constant(static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
           handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
@@ -235,6 +235,15 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
-                       ops::CudnnConvTransposeOpKernel<float>);
+                       ops::CudnnConvTransposeOpKernel<float>,
+                       ops::CudnnConvTransposeOpKernel<double>);
 REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
-                       ops::CudnnConvTransposeGradOpKernel<float>);
+                       ops::CudnnConvTransposeGradOpKernel<float>,
+                       ops::CudnnConvTransposeGradOpKernel<double>);
+
+REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn,
+                       ops::CudnnConvTransposeOpKernel<float>,
+                       ops::CudnnConvTransposeOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad,
+                       ops::CudnnConvTransposeGradOpKernel<float>,
+                       ops::CudnnConvTransposeGradOpKernel<double>);
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
new file mode 100644
index 0000000000..678b192dea
--- /dev/null
+++ b/paddle/operators/conv_transpose_op.cc
@@ -0,0 +1,217 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_transpose_op.h"
+
+namespace paddle {
+namespace operators {
+
+void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Input"),
+                 "Input(Input) of ConvTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                 "Input(Filter) of ConvTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                 "Output(Output) of ConvTransposeOp should not be null.");
+
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "ConvTransposeOp intput should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
+                    "ConvTransposeOp input dimension and filter dimension "
+                    "should be the same.");
+  PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
+                 "ConvTransposeOp input dimension and strides dimension should "
+                 "be consistent.");
+  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
+                    "ConvTransposeOp paddings dimension and strides "
+                    "dimension should be the same.");
+  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
+                    "In ConvTransposeOp, The input channel should be the same "
+                    "as the number of filters.");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
+                           filter_dims[i + 2]);
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+}
+
+Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
+    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution transpose operator. "
+      "The format of input tensor is NCHW. Where N is batch size, C is the "
+      "number of input channels, H is the height of the feature, and "
+      "W is the width of the feature.");
+  AddInput(
+      "Filter",
+      "(Tensor) The filter tensor of convolution transpose operator. "
+      "The format of the filter tensor is MCHW, where M is the number of "
+      "input feature channels, C is the number of "
+      "output feature channels,"
+      "H is the height of the filter, and W is the width of the filter. "
+      "We enforce groups number == 1 in the convolution transpose scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator. "
+            "The format of output tensor is also NCHW.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
+      "convolution transpose operator.")
+      .SetDefault({1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
+      "transpose operator.")
+      .SetDefault({0, 0});
+  AddComment(R"DOC(
+Convolution2D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the
+number of channels, H is the height of the feature, and W is the width of the feature.
+Filter(Input) is in MCHW format. Where M is the number of input feature channels,
+C is the number of output feature channels, H is the height of the filter,
+and W is the width of the filter.
+Parameters(strides, paddings) are two elements. These two elements represent height
+and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
+  Output:
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\
+       W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f
+  $$
+)DOC");
+}
+
+Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
+    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput("Input",
+           "(Tensor) The input tensor of convolution transpose operator."
+           "The format of input tensor is NCDHW. Where N is batch size, C is "
+           "the number of channels, D is the depth of the feature, H is the "
+           "height of the feature, and "
+           "W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution transpose operator."
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "input feature channels, C is the number of "
+           "output feature channels, D "
+           "is the depth of the filter, H is the height of the filter, and "
+           "W is the width of the filter."
+           "We enforce groups number == 1 and padding == 0 in "
+           "the convolution3d transpose scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator."
+            "The format of output tensor is also NCDHW."
+            "Where N is batch size, C is "
+            "the number of channels, D is the depth of the feature, H is the "
+            "height of the feature, and W is the width of the feature.");
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int> default:{1, 1, 1}), the "
+                            "strides{d_stride, h_stride, w_stride} of "
+                            "convolution transpose operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>("paddings",
+                            "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
+                            "h_pad, w_pad) of convolution transpose operator.")
+      .SetDefault({0, 0, 0});
+  AddComment(R"DOC(
+Convolution3D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
+number of channels, D is the depth of the feature, H is the height of the feature,
+and W is the width of the feature.
+Filter(Input) is in MCDHW format. Where M is the number of input feature channels,
+C is the number of output feature channels, D is the depth of the filter,H is the
+height of the filter, and W is the width of the filter.
+Parameters(strides, paddings) are three elements. These three elements represent
+depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:   
+  Input:
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
+  Output:
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\
+       H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\
+       W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f
+  $$
+)DOC");
+}
+
+void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  if (ctx->HasOutput(framework::GradVarName("Input"))) {
+    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+  }
+  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
+            conv2d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
+
+REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
+            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc
new file mode 100644
index 0000000000..4165eb0c7b
--- /dev/null
+++ b/paddle/operators/conv_transpose_op.cu.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_transpose_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(
+    conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
+
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
new file mode 100644
index 0000000000..1cacb770e6
--- /dev/null
+++ b/paddle/operators/conv_transpose_op.h
@@ -0,0 +1,287 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/im2col.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+// Define Op classes in .h file so that other conv transpose
+// operator implementations can reuse the code.
+class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv2DTransposeOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker);
+};
+
+class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DTransposeOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker);
+};
+
+class ConvTransposeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+class ConvTransposeOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+template <typename Place, typename T>
+class GemmConvTransposeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    // The filter will be reshaped, so it should not be constant pointer
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    // groups will alway be disabled in conv2dtranspose.
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w}
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
+    // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
+
+    // use col_shape in the im2col and col2im (or vol2col and col2vol)
+    // calculation
+    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = output->dims()[1];
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
+    }
+    DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
+
+    Tensor col;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+
+    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+    DDim output_shape =
+        framework::slice_ddim(output->dims(), 1, output->dims().size());
+
+    // input matrix size: (m, h * w) or (m, d * h * w)
+    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
+
+    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    filter.Resize(filter_matrix_shape);
+
+    output->mutable_data<T>(context.GetPlace());
+    math::SetConstant<Place, T> set_zero;
+    set_zero(context.device_context(), output, static_cast<T>(0));
+
+    math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
+    math::Col2VolFunctor<Place, T> col2vol;
+    std::vector<int> dilations({1, 1, 1});
+
+    // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
+    // on input)
+    for (int i = 0; i < batch_size; i++) {
+      // batch with size (m, h * w) or (m, d * h * w)
+      Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
+
+      // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+      Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
+
+      // col_matrix = filter * input_batch
+      // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+      math::matmul<Place, T>(context.device_context(), filter, true,
+                             input_batch, false, static_cast<T>(1.0),
+                             &col_matrix, static_cast<T>(0.0));
+
+      if (data_dim == 2U) {
+        // col2im: col_matrix -> dy
+        // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
+        col2im(context.device_context(), col,
+               std::vector<int>{dilations[0], dilations[1]}, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &output_batch);
+      } else if (data_dim == 3U) {
+        // col2vol: col_matrix -> dy
+        // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
+        col2vol(context.device_context(), col, dilations, strides, paddings,
+                &output_batch);
+      }
+    }
+  }
+};
+
+template <typename Place, typename T>
+class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    // For filter, we do not use const pointer b/c we will do reshape,
+    // but we should avoid modifying its value.
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+
+    if ((!input_grad) && (!filter_grad)) return;
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w}
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
+    // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
+
+    // use col_shape in the im2col and col2im (or vol2col and col2vol)
+    // calculation
+    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = output_grad->dims()[1];
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
+    }
+    DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
+
+    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+    DDim output_shape = framework::slice_ddim(output_grad->dims(), 1,
+                                              output_grad->dims().size());
+
+    // input matrix size: (m, h * w) or (m, d * h * w)
+    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
+
+    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    filter.Resize(filter_matrix_shape);
+
+    // convolution transpose grad on input:
+    // im2col + gemm (similar to conv-forward)
+    // input need to compute gradient
+    if (input_grad || filter_grad) {
+      Tensor col;
+      col.mutable_data<T>(col_shape, context.GetPlace());
+      // col_matrix shares the same piece of data with col,
+      // but will be reshaped into a two-dimensional matrix shape
+      // to call the matrix multiplication interface.
+      Tensor col_matrix;
+      col_matrix.ShareDataWith(col);
+      col_matrix.Resize(col_matrix_shape);
+
+      Tensor filter_grad_;
+      math::SetConstant<Place, T> set_zero;
+
+      math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+      math::Vol2ColFunctor<Place, T> vol2col;
+      std::vector<int> dilations({1, 1, 1});
+
+      if (input_grad) {
+        input_grad->mutable_data<T>(context.GetPlace());
+        set_zero(context.device_context(), input_grad, static_cast<T>(0));
+      }
+      if (filter_grad) {  // filter size (m, c, k_h, k_w)
+        filter_grad->mutable_data<T>(context.GetPlace());
+        set_zero(context.device_context(), filter_grad, static_cast<T>(0));
+        filter_grad_ = *filter_grad;
+        filter_grad_.Resize(filter_matrix_shape);
+      }
+
+      for (int i = 0; i < batch_size; i++) {
+        // batch with size (c, o_h * o_w)
+        Tensor output_grad_batch =
+            output_grad->Slice(i, i + 1).Resize(output_shape);
+
+        if (data_dim == 2U) {
+          // im2col: dy -> col matrix
+          // from (c, o_h, o_w) to (c * k_h * k_w, h * w)
+          im2col(context.device_context(), output_grad_batch,
+                 std::vector<int>{dilations[0], dilations[1]}, strides,
+                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                  paddings[1]},
+                 &col);
+        } else if (data_dim == 3U) {
+          // vol2col: dy -> col_matrix
+          // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
+          vol2col(context.device_context(), output_grad_batch, dilations,
+                  strides, paddings, &col);
+        }
+
+        if (input_grad) {
+          // batch with size (m, h, w)
+          Tensor input_grad_batch =
+              input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
+          // gemm: dx = filter * dy
+          // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w)
+          // or
+          // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
+          // d, h, w)
+          math::matmul<Place, T>(context.device_context(), filter, false,
+                                 col_matrix, false, static_cast<T>(1.0),
+                                 &input_grad_batch, static_cast<T>(0.0));
+        }
+        if (filter_grad) {
+          // input batch
+          Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
+          // gemm: d_filter = x * dy^T
+          // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, k_h * k_w)
+          // or
+          // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
+          // k_h * k_w)
+          math::matmul<Place, T>(context.device_context(), in_batch, false,
+                                 col_matrix, true, static_cast<T>(1.0),
+                                 &filter_grad_, static_cast<T>(1.0));
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index 68c56f531f..62a4e484ec 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -132,7 +132,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
       // compute dy
       if (out_grad_y) {
         out_grad_y->mutable_data<T>(context.GetPlace());
-        auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1);
+        auto dy = EigenVector<T>::Flatten(*out_grad_y);
         auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast;
         dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}}));
       }
diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc
index d1ce74c4b9..291b23ed1b 100644
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
@@ -36,17 +36,18 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
         "w. See more details in comments of the linear_chain_crf operator.");
     AddInput(
         "Label",
-        "(LoDTensor,  LoDTensor<int>). The ground truth with shape "
+        "(LoDTensor,  LoDTensor<int64_t>). The ground truth with shape "
         "[N x 1]. This input is optional. See more details in the operator's "
         "comments.")
         .AsDispensable();
-    AddOutput("ViterbiPath",
-              "(LoDTensor, LoDTensor<int>). The decoding results. What to "
-              "return changes depending on whether the Input(Label) (the groud "
-              "truth) is given. See more details in the operator's comment.");
+    AddOutput(
+        "ViterbiPath",
+        "(LoDTensor, LoDTensor<int64_t>). The decoding results. What to "
+        "return changes depending on whether the Input(Label) (the ground "
+        "truth) is given. See more details in the operator's comment.");
     AddComment(R"DOC(
 The crf_decoding operator reads the emission feature weights and the transition
-freature weights learned by the linear_chain_crf operator. It implements the
+feature weights learned by the linear_chain_crf operator. It implements the
 Viterbi algorithm which is a dynamic programming algorithm for finding the most
 likely sequence of hidden states, called the Viterbi path, that results in a
 sequence of observed tags.
@@ -60,14 +61,14 @@ operator.
 
 When Input(Label) is given, the crf_decoding operator returns a row vector
 with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
-prediction, or 1 indicating a tag is correctly predicted. Such an ouput is the
+prediction, or 1 indicating a tag is correctly predicted. Such an output is the
 input to chunk_eval operator.
 
 2. Input(Label) is not given:
 
 This is the standard decoding process.
 
-The crf_decoding operator returns a row vecotr with shape [N x 1] whose values
+The crf_decoding operator returns a row vector with shape [N x 1] whose values
 range from 0 to maximum tag number - 1. Each element indicates an index of a
 predicted tag.
 )DOC");
@@ -120,9 +121,11 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
+        ctx.device_context());
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h
index 526e0c5dcb..57b5e21b3a 100644
--- a/paddle/operators/crf_decoding_op.h
+++ b/paddle/operators/crf_decoding_op.h
@@ -43,9 +43,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     const size_t level = 0;
     const size_t seq_num = lod[level].size() - 1;
 
-    int* path = decoded_path->mutable_data<int>(platform::CPUPlace());
-    math::SetConstant<platform::CPUPlace, int>()(ctx.device_context(),
-                                                 decoded_path, 0);
+    int64_t* path = decoded_path->mutable_data<int64_t>(platform::CPUPlace());
+    math::SetConstant<platform::CPUPlace, int64_t>()(ctx.device_context(),
+                                                     decoded_path, 0);
     for (size_t i = 0; i < seq_num; ++i) {
       int start_pos = static_cast<int>(lod[level][i]);
       int end_pos = static_cast<int>(lod[level][i + 1]);
@@ -57,7 +57,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     if (label) {
       PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL,
                         "The Input(Label) should be a sequence.");
-      const int* label_value = label->data<int>();
+      const int64_t* label_value = label->data<int64_t>();
       size_t batch_size = emission_weights->dims()[0];
       for (size_t i = 0; i < batch_size; ++i) {
         path[i] = label_value[i] == path[i] ? 1 : 0;
@@ -76,7 +76,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
 
     const T* x = emission_weights.data<T>();
     const T* w = transition_weights.data<T>();
-    int* path = decoded_path->data<int>();
+    int64_t* path = decoded_path->data<int64_t>();
 
     // alpha is a memo table. An element alpha(k, v) records the score of the
     // best sequence of tags from position 1 to position k with v being the end
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 24df1fcada..1e82742eaf 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -51,9 +51,11 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of cross_entropy
   // is determined by its input "X".
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
@@ -98,9 +100,11 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of cross_entropy
   // is determined by its input "X".
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
@@ -114,21 +118,17 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
              "where N is the batch size and D is the number of classes. "
              "This input is a probability computed by the previous operator, "
              "which is almost always the result of a softmax operator.");
-    AddInput(
-        "Label",
-        "(Tensor, default Tensor<int>), the ground truth which is "
-        "a 2-D tensor. "
-        "When soft_label is set to false, Label is a Tensor<int> with shape "
-        "[N x 1]. "
-        "When soft_label is set to true, Label is a Tensor<float/double> "
-        "with shape [N x K].");
+    AddInput("Label",
+             "(Tensor), the ground truth which is a 2-D tensor. When "
+             "soft_label is set to false, Label is a Tensor<int64> with shape "
+             "[N x 1]. When soft_label is set to true, Label is a "
+             "Tensor<float/double> with shape [N x K].");
     AddOutput("Y",
-              "(Tensor, default Tensor<float>), a 2-D tensor "
-              "with shape [N x 1]. The cross entropy loss.");
-    AddAttr<bool>(
-        "soft_label",
-        "(bool, default false), a flag to indicate whether to interpretate "
-        "the given labels as soft labels.")
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+              "[N x 1]. The cross entropy loss.");
+    AddAttr<bool>("soft_label",
+                  "(bool, default false), a flag indicating whether to "
+                  "interpretate the given labels as soft labels.")
         .SetDefault(false);
     AddComment(R"DOC(
 CrossEntropy Operator.
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index a523cb6fce..6212e39dfd 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -23,8 +23,6 @@ template <typename T>
 __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                            const int64_t* label, const int N,
                                            const int D) {
-  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
-  // CUDA_1D_KERNEL_LOOP(i, N) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     int idx = i * D + label[i];
@@ -82,24 +80,19 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
 
     int block = 512;
     int grid = (batch_size * class_num + block - 1) / block;
+    auto stream = ctx.cuda_device_context().stream();
 
     if (ctx.Attr<bool>("soft_label")) {
       auto* label_data = label->data<T>();
-      SoftCrossEntropyGradientKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(dx_data, dy_data, x_data, label_data,
-                                           batch_size, class_num);
+      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
     } else {
       math::SetConstant<platform::GPUPlace, T> functor;
       functor(ctx.device_context(), dx, 0);
       auto* label_data = label->data<int64_t>();
       grid = (batch_size + block - 1) / block;
-      CrossEntropyGradientKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(dx_data, dy_data, x_data, label_data,
-                                           batch_size, class_num);
+      CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
     }
   }
 };
diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt
new file mode 100644
index 0000000000..f6bdc63cc2
--- /dev/null
+++ b/paddle/operators/detail/CMakeLists.txt
@@ -0,0 +1 @@
+grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc
new file mode 100644
index 0000000000..89dc504522
--- /dev/null
+++ b/paddle/operators/detail/recv_impl.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+Status SendRecvServerImpl::SendVariable(ServerContext *context,
+                                        const VariableMessage *in_var,
+                                        VariableMessage *out_var) {
+  framework::LoDTensor t;
+  // TODO(typhoonzero): desirealize in_tensor and run pserver network.
+  std::istringstream iss(in_var->serialized());
+  framework::DeserializeFromStream(iss, &t);
+  lodtensor_queue_.Push(std::move(t));
+  // Block util the sub graph is done.
+  t = lodtensor_return_queue_.Pop();
+  std::ostringstream oss;
+  // FIXME(typhoonzero): get context from op.
+  framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
+  std::string *varname = out_var->mutable_varname();
+  *varname = in_var->varname();
+  std::string *serialized = out_var->mutable_serialized();
+  *serialized = oss.str();
+
+  return Status::OK;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/conv2d_transpose_op.cu b/paddle/operators/detail/safe_ref.h
similarity index 54%
rename from paddle/operators/conv2d_transpose_op.cu
rename to paddle/operators/detail/safe_ref.h
index 931ac9eed2..b71af17309 100644
--- a/paddle/operators/conv2d_transpose_op.cu
+++ b/paddle/operators/detail/safe_ref.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
@@ -12,13 +12,20 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/conv2d_transpose_op.h"
+#pragma once
 
-namespace ops = paddle::operators;
-
-REGISTER_OP_GPU_KERNEL(
-    conv2d_transpose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    conv2d_transpose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>);
+namespace paddle {
+namespace operators {
+namespace detail {
+/**
+ * Get Reference From Pointer with check. The error message is printf format,
+ * and passed by `args`
+ */
+template <typename T, typename... ARGS>
+inline T &Ref(T *ptr, ARGS &&... args) {
+  PADDLE_ENFORCE(ptr != nullptr, args...);
+  return *ptr;
+}
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc
new file mode 100644
index 0000000000..da1ddf75d2
--- /dev/null
+++ b/paddle/operators/detail/send_impl.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "send_recv_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+bool RPCClient::SendVariable(const framework::Scope& scope,
+                             const std::string& inname,
+                             const std::string& outname) {
+  ClientContext context;
+  VariableMessage msg, out_msg;
+  // FIXME(typhoonzero): pass device context to here.
+  auto ctx = platform::CPUDeviceContext();
+  auto* var = scope.FindVar(inname);
+  PADDLE_ENFORCE(var);
+  // TODO(typhoonzero): support SelectedRows
+  PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                 "Only support LoDTensor, %s has wrong type", inname);
+  const framework::LoDTensor& tensor = var->Get<framework::LoDTensor>();
+  std::ostringstream oss;
+  framework::SerializeToStream(oss, tensor, ctx);
+  msg.set_varname(inname);
+  msg.set_serialized(oss.str());
+  Status status = stub_->SendVariable(&context, msg, &out_msg);
+  if (!status.ok()) {
+    return false;
+  }
+  std::istringstream iss(out_msg.serialized());
+  framework::LoDTensor ret_tensor;
+  framework::DeserializeFromStream(iss, &ret_tensor);
+  auto* outvar = scope.FindVar(outname);
+  framework::LoDTensor* out_tensor = outvar->GetMutable<framework::LoDTensor>();
+  // FIXME(typhoonzero): do not copy.
+  framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor);
+  return true;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
new file mode 100644
index 0000000000..07ff9d2c62
--- /dev/null
+++ b/paddle/operators/detail/send_recv.proto
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+syntax = "proto3";
+
+package sendrecv;
+
+service SendRecvService {
+  // For parameter server round-robin like hashing, do not split tensors.
+  // Send and recv only one tensor
+  rpc SendVariable(VariableMessage) returns (VariableMessage) {}
+}
+
+// VariableMessage is serialized paddle variable message.
+// It can be:
+// Tensor
+// LoDTensor
+// SelectedRows
+message VariableMessage {
+  string varname = 1;
+  bytes serialized = 2;
+}
+
+message VoidMessage {}
diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h
new file mode 100644
index 0000000000..b9a5340a86
--- /dev/null
+++ b/paddle/operators/detail/send_recv_impl.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+// #include <grpc++/channel.h>
+// #include <grpc++/client_context.h>
+// #include <grpc++/create_channel.h>
+// #include <grpc++/security/credentials.h>
+#include "paddle/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/operators/detail/send_recv.pb.h"
+
+#include <grpc++/grpc++.h>
+
+using grpc::Channel;
+using grpc::Server;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerBuilder;
+
+using grpc::ClientContext;
+using grpc::ClientReader;
+using grpc::ClientReaderWriter;
+using grpc::ClientWriter;
+using grpc::Status;
+using sendrecv::SendRecvService;
+using sendrecv::VariableMessage;
+using sendrecv::VoidMessage;
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class SendRecvServerImpl final : public SendRecvService::Service {
+ public:
+  explicit SendRecvServerImpl() {}
+
+  Status SendVariable(ServerContext *context, const VariableMessage *in_var,
+                      VariableMessage *out_var) override;
+
+  const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
+
+  void Push(const framework::LoDTensor &tensor) {
+    this->lodtensor_return_queue_.Push(tensor);
+  }
+
+ private:
+  SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
+  SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
+  SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
+  SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
+};
+
+// RPCClient is a class to send tensors to pserver sub-network
+// using different hashing methods.
+class RPCClient {
+ public:
+  RPCClient(std::shared_ptr<Channel> channel)
+      : stub_(SendRecvService::NewStub(channel)) {}
+
+  bool SendVariable(const framework::Scope &scope, const std::string &inname,
+                    const std::string &outname);
+
+ private:
+  std::unique_ptr<SendRecvService::Stub> stub_;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h
new file mode 100644
index 0000000000..4489921757
--- /dev/null
+++ b/paddle/operators/detail/simple_block_queue.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+template <typename T>
+class SimpleBlockQueue {
+ private:
+  std::mutex mutex_;
+  std::condition_variable condition_;
+  std::deque<T> queue_;
+
+ public:
+  void Push(T const& value) {
+    {
+      std::unique_lock<std::mutex> lock(this->mutex_);
+      queue_.push_front(value);
+    }
+    this->condition_.notify_one();
+  }
+
+  T Pop() {
+    std::unique_lock<std::mutex> lock(this->mutex_);
+    this->condition_.wait(lock, [=] { return !this->queue_.empty(); });
+    T rc(std::move(this->queue_.back()));
+    this->queue_.pop_back();
+    return rc;
+  }
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index 818146aca7..932c0bf8fb 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -30,7 +30,7 @@ class DropoutOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim("Out", x_dims);
-    if (ctx->Attrs().Get<bool>("is_training") == true) {
+    if (ctx->Attrs().Get<bool>("is_test") == false) {
       ctx->SetOutputDim("Mask", x_dims);
     }
     ctx->ShareLoD("X", /*->*/ "Out");
@@ -49,7 +49,7 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
         .SetDefault(.5f);
-    AddAttr<bool>("is_training", "True if in training phase.").SetDefault(true);
+    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
 
     AddComment(R"DOC(
@@ -71,8 +71,8 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_training"), true,
-                      "GradOp is only callable when is_training is true");
+    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
+                      "GradOp is only callable when is_test is false");
 
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null.");
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index 30c769000f..db3578b9bf 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -59,7 +59,7 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     auto Y = EigenMatrix<T>::Reshape(*y, 1);
 
     auto place = context.GetEigenDevice<Place>();
-    if (context.Attr<bool>("is_training")) {
+    if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int size = framework::product(mask->dims());
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index 6000b75fec..d9a130fdc0 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -35,7 +35,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
-    if (context.Attr<bool>("is_training")) {
+    if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int seed = context.Attr<int>("seed");
@@ -65,8 +65,8 @@ template <typename Place, typename T>
 class DropoutGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(context.Attr<bool>("is_training"),
-                   "GradOp is only callable when is_training is true");
+    PADDLE_ENFORCE(!context.Attr<bool>("is_test"),
+                   "GradOp is only callable when is_test is false");
 
     auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
     auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc
deleted file mode 100644
index d48cc4e8df..0000000000
--- a/paddle/operators/dynamic_recurrent_op.cc
+++ /dev/null
@@ -1,418 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve .
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/dynamic_recurrent_op.h"
-
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Scope;
-using framework::TensorArray;
-using framework::LoDTensor;
-using framework::Variable;
-using framework::OperatorBase;
-using framework::DySeqMetaBatch;
-
-namespace detail {
-
-inline void CreateVariables(Scope& scope,
-                            const std::vector<std::string>& var_names) {
-  for (const auto& name : var_names) {
-    scope.Var(name);
-  }
-}
-
-/*
- * The inputs with sequence should be reordered when they are split, so the
- * boot_states should be reordered in the same order.
- *
- * NOTE This may require that the `pre_state` of the first time step should just
- * copy the `boot_state` rather than reference it, for that the content should
- * be reordered, but the RNN op should not change the `boot_state` as an input
- * variable's content.
- */
-inline void ReorderInitialState(const DySeqMetaBatch& metas,
-                                const LoDTensor& boot_state, LoDTensor* tensor,
-                                const platform::Place& dst_place) {
-  for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
-    auto slice = tensor->Slice(seq_id, seq_id + 1);
-    auto boot_slice =
-        boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
-    // TODO(superjom) pass in device context as an argument
-    slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext());
-  }
-}
-
-inline void RestoreInitialState(const DySeqMetaBatch& metas,
-                                const LoDTensor& tensor, LoDTensor* boot_state,
-                                const platform::Place& dst_place) {
-  for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
-    auto slice = tensor.Slice(seq_id, seq_id + 1);
-    auto boot_slice =
-        boot_state->Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
-    boot_slice.CopyFrom(slice, dst_place, platform::CPUDeviceContext());
-  }
-}
-
-}  // namespace detail
-
-// Implementation for forward propagation.
-template <>
-void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kForward>(
-    const framework::Scope& scope, const framework::OperatorBase& op,
-    const platform::DeviceContext& dev_ctx) {
-  SetComputeMode(ComputeMode::kForward);
-  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
-  SplitInputs();
-  CreateScopes();
-  WriteStepInputs();
-  InitStates();
-  WriteStepOutputs();
-  RunSteps();
-  ConcatOutputs();
-}
-
-// Implementation for backward propagation.
-template <>
-void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kBackward>(
-    const framework::Scope& scope, const framework::OperatorBase& op,
-    const platform::DeviceContext& dev_ctx) {
-  SetComputeMode(ComputeMode::kBackward);
-  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
-  SplitInputs();
-  WriteStepInputs();
-  InitStates();
-  WriteStepOutputs();
-  RunSteps();
-  // copy boot-states' gradients back.
-  for (const auto& state : arg_.states) {
-    ExportInitialStateGradient(state);
-  }
-
-  ConcatOutputs();
-}
-
-void RNNAlgorithm::SplitInputs() {
-  // TODO(superjom) make level a config
-  // TODO(superjom) check all the inputs has the same LoD
-  int level = 0;
-  for (const auto& item : cache_.inputs) {
-    const auto& var = item.second;
-    const auto& tensor = var->Get<LoDTensor>();
-    TensorArray& ta = step_inputs_[item.first];
-
-    dy_seq_metas_[item.first] =
-        ta.Unpack(tensor, level, true /*length_descend*/);
-
-    if (cache_.num_steps) {
-      PADDLE_ENFORCE_EQ(ta.size(), cache_.num_steps,
-                        "inputs should have the same steps");
-    } else {
-      cache_.num_steps = ta.size();
-    }
-  }
-}
-
-void RNNAlgorithm::WriteStepInputs() {
-  for (const auto& item : cache_.inputs) {
-    auto ta_it = step_inputs_.find(item.first);
-    PADDLE_ENFORCE(ta_it != step_inputs_.end(),
-                   "step_inputs_ not compatible with memory set");
-    TensorArray& ta = ta_it->second;
-    for (size_t step = 0; step < ta.size(); step++) {
-      auto tensor = ta.Read(step);
-      auto& step_scope = cache_.GetScope(step);
-      Variable* var = step_scope.FindVar(item.first);
-      if (var == nullptr) {
-        var = step_scope.Var(item.first);
-      }
-      var->GetMutable<LoDTensor>()->ShareDataWith(tensor);
-    }
-  }
-}
-
-void RNNAlgorithm::WriteStepOutputs() {
-  // initialize step outputs
-  for (const auto& item : cache_.outputs) {
-    step_outputs_.emplace(item.first, TensorArray());
-  }
-  PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL);
-}
-
-void RNNAlgorithm::CreateScopes() {
-  PADDLE_ENFORCE_GT(cache_.num_steps, 0);
-  // resize scopes
-  size_t num_scopes_need_create = cache_.num_steps - cache_.scopes->size();
-  for (size_t i = 0; i < num_scopes_need_create; i++) {
-    cache_.scopes->emplace_back(&cache_.scope->NewScope());
-  }
-
-  // init temporary inputs
-  PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first");
-  std::vector<std::string> states;
-  std::vector<std::string> ex_states;
-  std::vector<std::string> step_unit_outputs;
-  std::transform(arg_.states.begin(), arg_.states.end(),
-                 std::back_inserter(states),
-                 [](const rnn::StateAttr& m) { return m.var; });
-  std::transform(arg_.states.begin(), arg_.states.end(),
-                 std::back_inserter(ex_states),
-                 [](const rnn::StateAttr& m) { return m.pre_var; });
-  for (const auto& item : step_unit_->Outputs()) {
-    for (const auto& var : item.second) {
-      step_unit_outputs.push_back(var);
-    }
-  }
-
-  for (size_t step = 0; step < cache_.num_steps; step++) {
-    auto& scope = cache_.GetScope(step);
-    detail::CreateVariables(scope, arg_.inlinks);
-    detail::CreateVariables(scope, arg_.outlinks);
-    detail::CreateVariables(scope, states);
-    detail::CreateVariables(scope, ex_states);
-    detail::CreateVariables(scope, step_unit_outputs);
-  }
-}
-
-void RNNAlgorithm::ConcatOutputs() {
-  // TODO(superjom) transform this to a config
-  int level = 0;
-  for (size_t step = 0; step < cache_.num_steps; step++) {
-    auto& scope = cache_.GetScope(step);
-    for (auto& item : step_outputs_) {
-      auto* var = scope.FindVar(item.first);
-      PADDLE_ENFORCE_NOT_NULL(var);
-      auto* tensor = var->GetMutable<LoDTensor>();
-      tensor->mutable_data<value_type>(platform::CPUPlace());
-      item.second.WriteShared(step, *tensor);
-    }
-  }
-  // the inputs' lods should be the same, so randomly get one lod.
-  const auto& some_lod =
-      cache_.scope->FindVar(arg_.inlinks.front())->Get<LoDTensor>().lod();
-  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
-  for (auto& item : step_outputs_) {
-    auto tensor = item.second.Pack(level, some_meta, some_lod);
-    auto* output = cache_.outputs[item.first]->GetMutable<LoDTensor>();
-    const_cast<LoDTensor*>(output)->ShareDataWith(tensor);
-  }
-}
-
-void RNNAlgorithm::RunSteps() {
-  if (IsBackward()) {
-    // call stepnet in all the time steps reversely
-    for (int step = cache_.num_steps - 1; step >= 0; step--) {
-      auto& step_scope = cache_.GetScope(step);
-      step_unit_->Run(step_scope, *cache_.dev_ctx);
-    }
-  } else {
-    for (size_t step = 0; step < cache_.num_steps; step++) {
-      auto& step_scope = cache_.GetScope(step);
-      step_unit_->Run(step_scope, *cache_.dev_ctx);
-    }
-  }
-}
-
-void RNNAlgorithm::InitStates() {
-  for (size_t step = 0; step < cache_.num_steps; step++) {
-    for (const auto& state : arg_.states) {
-      CreateState(state, step);
-      LinkState(state, step);
-    }
-  }
-}
-
-void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) {
-  auto& scope = cache_.GetScope(step);
-  auto& state = *cache_.GetTensor(scope, state_attr.var);
-  auto& boot_state = *cache_.GetTensor(*cache_.scope, state_attr.boot_var);
-
-  size_t num_instances =
-      step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
-  auto dims = boot_state.dims();
-  dims[0] = num_instances;
-
-  state.Resize(dims);
-  state.mutable_data<value_type>(platform::CPUPlace());
-  states_[state_attr.var].WriteShared(step, state);
-}
-
-void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) {
-  auto& scope = cache_.GetScope(step);
-  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
-
-  // process the first state's boot-state(the 0-step in forward mode or the
-  // last step in backward mode)
-  // Only forward mode need to link the boot-state to the `pre-state` in first
-  // time step. In backward mode, need to copy the gradient of `pre-state` in
-  // first time step to the gradient of `boot-state`.
-  if (step == 0 && IsForward()) {
-    LinkInitialState(state);
-  } else {
-    size_t num_instances =
-        step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
-    auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var);
-    // shink and share from previous state
-    auto shrinked_pre_state = pre_state->Slice(0, num_instances);
-    state_pre.ShareDataWith(shrinked_pre_state);
-  }
-}
-
-void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) {
-  // all the step_inputs' metas should be the same, just randomly select one
-  // and get the dyseq meta.
-  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
-  auto& scope = cache_.GetScope(0);
-  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
-  auto* pre_state = cache_.GetTensor(*cache_.scope, state.boot_var);
-  pre_state->mutable_data<float>(platform::CPUPlace());
-  // allocate state
-  state_pre.Resize(pre_state->dims());
-  state_pre.mutable_data<value_type>(platform::CPUPlace());
-  detail::ReorderInitialState(some_meta, *pre_state, &state_pre,
-                              pre_state->place());
-}
-
-void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) {
-  // all the step_inputs' metas should be the same, just randomly select one
-  // and get the dyseq meta.
-  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
-  auto& scope = cache_.GetScope(0);
-
-  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
-  auto& pre_state = *cache_.GetTensor(*cache_.scope, state.boot_var);
-  pre_state.Resize(state_pre.dims());
-  detail::RestoreInitialState(some_meta, state_pre, &pre_state,
-                              pre_state.place());
-}
-
-void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name,
-                                  const paddle::framework::OperatorBase& op,
-                                  const paddle::framework::Scope& scope,
-                                  platform::DeviceContext const* dev_ctx,
-                                  rnn::Argument* arg) {
-  this->scope = &scope;
-  InitArgument(name, op, arg);
-  CacheScopes(scope, *arg);
-  CacheInlinks(scope, arg->inlinks);
-  CacheOutlinks(scope, arg->outlinks);
-  this->dev_ctx = dev_ctx;
-}
-
-void RNNAlgorithm::ArgCache::InitArgument(const rnn::ArgumentName& name,
-                                          const OperatorBase& op,
-                                          rnn::Argument* arg) {
-  rnn::InitArgument(name, arg, op, false /*is_grad*/);
-}
-
-void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope,
-                                         const rnn::Argument& arg) {
-  auto scopes_var = scope.FindVar(arg.step_scopes);
-  PADDLE_ENFORCE(scopes_var != nullptr,
-                 "the step_scopes output argument [%s] should be created first "
-                 "by framework.",
-                 arg.step_scopes);
-  this->scopes = scopes_var->GetMutable<std::vector<Scope*>>();
-}
-
-void RNNAlgorithm::ArgCache::CacheInlinks(
-    const Scope& scope, const std::vector<std::string>& names) {
-  for (auto name : names) {
-    auto* var = GetVariable(scope, name);
-    inputs[name] = var;
-  }
-}
-
-void RNNAlgorithm::ArgCache::CacheOutlinks(
-    const Scope& scope, const std::vector<std::string>& names) {
-  for (auto name : names) {
-    auto* var = GetVariable(scope, name);
-    outputs[name] = var;
-  }
-}
-
-Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope,
-                                              const std::string& name) {
-  auto* var = scope.FindVar(name);
-  PADDLE_ENFORCE_NOT_NULL(var, "variable [%s] not exist in scope", name);
-  return var;
-}
-
-LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope,
-                                             const std::string& name) {
-  auto* var = GetVariable(scope, name);
-  return var->GetMutable<LoDTensor>();
-}
-
-const std::array<rnn::ArgumentName, 2> RNNAlgorithm::kArgNames{
-    {rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs",
-                       "states", "ex_states", "initial_states"},
-     rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD",
-                       "inputs@GRAD", "states", "ex_states",
-                       "initial_states@GRAD"}}};
-
-void DynamicRecurrentOp::Run(const framework::Scope& scope,
-                             const platform::DeviceContext& dev_ctx) const {
-  rnn.Run<RNNAlgorithm::ComputeMode::kForward>(
-      scope, *dynamic_cast<const OperatorBase*>(this), dev_ctx);
-}
-
-void DynamicRecurrentGradientOp::Run(
-    const Scope& scope, const platform::DeviceContext& dev_ctx) const {
-  rnn.Run<RNNAlgorithm::ComputeMode::kBackward>(
-      scope, *dynamic_cast<const OperatorBase*>(this), dev_ctx);
-}
-
-class DynamicRecurrentOpProtoAndCheckerMaker
-    : public framework::OpProtoAndCheckerMaker {
- public:
-  DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto,
-                                         framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    const auto& name =
-        RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward];
-    // inputs and outputs stored in proto
-    AddInput(name.inlinks,
-             "The inputs that need to be segmented for each step.")
-        .AsDuplicable();
-    AddInput(name.initial_states, "Variables to initialize the states.")
-        .AsDuplicable();
-
-    AddOutput(name.outlinks,
-              "The outputs that need to be concatenated for all steps.")
-        .AsDuplicable();
-    AddOutput(name.step_scopes, "step scopes");
-
-    // Attributes stored in AttributeMap
-    AddAttr<std::vector<std::string>>(name.ex_states, "names of ex_states");
-    AddAttr<std::vector<std::string>>(name.states, "names of states");
-
-    AddComment(R"DOC(
-Dynamic Recurrent Operator.
-
-This is a RNN operator for varience-length sequences.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp,
-            paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker,
-            dynamic_recurrent_grad,
-            paddle::operators::DynamicRecurrentGradientOp);
diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h
deleted file mode 100644
index 5b0548c3a4..0000000000
--- a/paddle/operators/dynamic_recurrent_op.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_TESTING
-#include "gtest/gtest.h"
-#endif
-
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/tensor_array.h"
-#include "paddle/framework/variable.h"
-#include "paddle/operators/rnn/recurrent_op_utils.h"
-
-namespace paddle {
-namespace operators {
-
-class RNNAlgorithm {
- public:
-  enum ComputeMode { kForward = 0, kBackward = 1 };
-  static const std::array<rnn::ArgumentName, 2> kArgNames;
-  using value_type = float;
-
-  /*
-   * Different `Run` method for forward and backward, `_` is just for template
-   * specifialization.
-   */
-  template <ComputeMode _>
-  void Run(const framework::Scope& scope, const framework::OperatorBase& op,
-           const platform::DeviceContext& dev_ctx);
-  /*
-   * Split the inputs(LoDTensors) to segments for each time step.
-   */
-  void SplitInputs();
-
-  /*
-   * Create step-scopes to store temporary outputs in each time steps.
-   */
-  void CreateScopes();
-
-  /*
-   * Link TensorArray steps to the corresponding variables located in
-   * step-scopes.
-   */
-  void WriteStepInputs();
-
-  /*
-   * Write output of each step to the corresponding TensorArray.
-   */
-  void WriteStepOutputs();
-
-  /*
-   * Initialize the states, each state will have a corresponding pre-state,
-   * which share the memory with the state in the previous time state. The
-   * pre-state in the first time step will be initialized with an zero tensor or
-   * a tensor in parent scope if is provided.
-   */
-  void InitStates();
-
-  /*
-   * Create state variables for each time step.
-   */
-  void CreateState(const rnn::StateAttr& state, size_t step);
-
-  /*
-   * Link pre-state variable in current scope to the state variable in the
-   * previous time step (scope) by reference.
-   */
-  void LinkState(const rnn::StateAttr& state, size_t step);
-
-  /*
-   * Link the pre-state of the first time step to the `boot-state` in parent's
-   * scope.
-   */
-  void LinkInitialState(const rnn::StateAttr& state);
-
-  /*
-   * Copy the gradient from `pre-state` in the first step-scope to the
-   * `boot-state` in parent's scope.
-   */
-  void ExportInitialStateGradient(const rnn::StateAttr& state);
-
-  /*
-   * Calculate time steps.
-   */
-  void RunSteps();
-
-  /*
-   * Concatenate outputs in each time step and generate a LoDTensor.
-   */
-  void ConcatOutputs();
-
-  void SetComputeMode(ComputeMode mode) { mode_ = mode; }
-  bool IsForward() const { return mode_ == ComputeMode::kForward; }
-  bool IsBackward() const { return mode_ == ComputeMode::kBackward; }
-
-  /*
-   * set a step unit that is created according to a RecurrentOp's step unit.
-   */
-  void SetStepUnit(std::unique_ptr<framework::OperatorBase> step_unit) {
-    PADDLE_ENFORCE_NOT_NULL(step_unit);
-    step_unit_ = std::move(step_unit);
-  }
-  const framework::OperatorBase& GetStepUnit() const { return *step_unit_; }
-
-  const framework::TensorArray& state(const std::string& name) const {
-    auto it = states_.find(name);
-    PADDLE_ENFORCE(it != states_.end());
-    return it->second;
-  }
-  const framework::TensorArray& step_input(const std::string& name) const {
-    auto it = step_inputs_.find(name);
-    PADDLE_ENFORCE(it != step_inputs_.end());
-    return it->second;
-  }
-  const framework::TensorArray& step_output(const std::string& name) const {
-    auto it = step_outputs_.find(name);
-    PADDLE_ENFORCE(it != step_outputs_.end());
-    return it->second;
-  }
-
- protected:
-  struct ArgCache {
-    framework::Scope const* scope;
-    std::vector<framework::Scope*>* scopes;
-    std::map<std::string, framework::Variable*> inputs;
-    std::map<std::string, framework::Variable*> outputs;
-    platform::DeviceContext const* dev_ctx;
-
-    size_t num_steps{0};
-
-    void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op,
-              const framework::Scope& scope,
-              platform::DeviceContext const* dev_ctx, rnn::Argument* arg);
-
-    framework::Scope& GetScope(size_t index) {
-      PADDLE_ENFORCE_LT(index, num_steps);
-      return *scopes->at(index);
-    }
-
-    framework::LoDTensor* GetTensor(const framework::Scope& scope,
-                                    const std::string& name);
-
-   private:
-    void InitArgument(const rnn::ArgumentName& name,
-                      const framework::OperatorBase& op, rnn::Argument* arg);
-    void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg);
-    void CacheInlinks(const framework::Scope& scope,
-                      const std::vector<std::string>& names);
-    void CacheOutlinks(const framework::Scope& scope,
-                       const std::vector<std::string>& names);
-    framework::Variable* GetVariable(const framework::Scope& scope,
-                                     const std::string& name);
-  };
-
- private:
-  std::unique_ptr<framework::OperatorBase> step_unit_;
-  std::map<std::string, framework::TensorArray> states_;
-  std::map<std::string, framework::TensorArray> step_inputs_;
-  std::map<std::string, framework::TensorArray> step_outputs_;
-  std::map<std::string, std::vector<framework::DySeqMeta>> dy_seq_metas_;
-  rnn::Argument arg_;
-  ArgCache cache_;
-  ComputeMode mode_{ComputeMode::kForward};
-
-#ifdef PADDLE_WITH_TESTING
-  // test forward
-  friend class RNNAlgorithmTestHelper;
-  FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs);
-  FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache);
-  FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes);
-  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs);
-  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs);
-  FRIEND_TEST(RNNAlgorithmTestHelper, InitStates);
-  FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs);
-// TODO(superjom) test backward
-#endif
-};
-
-class DynamicRecurrentOp : public framework::OperatorBase {
- public:
-  DynamicRecurrentOp(const std::string& type,
-                     const framework::VariableNameMap& inputs,
-                     const framework::VariableNameMap& outputs,
-                     const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  DynamicRecurrentOp(const DynamicRecurrentOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    PADDLE_THROW("Not implemented");
-  }
-
-  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
-
-  mutable RNNAlgorithm rnn;
-};
-
-class DynamicRecurrentGradientOp : public framework::OperatorBase {
- public:
-  DynamicRecurrentGradientOp(const std::string& type,
-                             const framework::VariableNameMap& inputs,
-                             const framework::VariableNameMap& outputs,
-                             const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    PADDLE_THROW("Not implemented");
-  }
-
-  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
-
-  mutable RNNAlgorithm rnn;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc
deleted file mode 100644
index 8d840e259b..0000000000
--- a/paddle/operators/dynamic_recurrent_op_test.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-#include "paddle/operators/dynamic_recurrent_op.h"
-
-#include <gtest/gtest.h>
-
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/op_desc.h"
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/net_op.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Scope;
-using framework::TensorArray;
-using framework::LoDTensor;
-using framework::Variable;
-
-class TestOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-  DEFINE_OP_CLONE_METHOD(TestOp);
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
-};
-
-void OpDescNewVar(const std::string& param_name,
-                  std::initializer_list<const char*> arguments,
-                  paddle::framework::OpDesc::Var* var) {
-  var->set_parameter(param_name);
-  for (auto& arg_name : arguments) {
-    var->add_arguments(arg_name);
-  }
-}
-
-// create a LoD tensor in scope with specific dims
-LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims,
-                     const platform::Place& place) {
-  auto* var = scope.Var(name);
-  auto* tensor = var->GetMutable<LoDTensor>();
-  tensor->Resize(dims);
-  tensor->mutable_data<float>(place);
-  return tensor;
-}
-
-class RNNAlgorithmTestHelper : public ::testing::Test {
- protected:
-  const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0];
-
-  virtual void SetUp() override {
-    CreateGlobalVariables();
-
-    auto op_desc = CreateOpDesc();
-    op = paddle::framework::OpRegistry::CreateOp(op_desc);
-    dop = &(dynamic_cast<DynamicRecurrentOp*>(op.get())->rnn);
-    InitCacheManually();
-    InitStepNet();
-  }
-
-  framework::OpDesc CreateOpDesc() {
-    // create op
-    paddle::framework::OpDesc op_desc;
-    op_desc.set_type("dynamic_recurrent");
-
-    OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs());
-    OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs());
-    OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs());
-    OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs());
-
-    // set pre-states
-    auto pre_memories = op_desc.mutable_attrs()->Add();
-    pre_memories->set_name(argname.ex_states);
-    pre_memories->set_type(paddle::framework::AttrType::STRINGS);
-    auto pre_memories_item = pre_memories->add_strings();
-    *pre_memories_item = "mem@pre";
-
-    // set states
-    auto memories = op_desc.mutable_attrs()->Add();
-    memories->set_name(argname.states);
-    memories->set_type(paddle::framework::AttrType::STRINGS);
-    auto memories_item = memories->add_strings();
-    *memories_item = "mem";
-    return op_desc;
-  }
-
-  void CreateGlobalVariables() {
-    platform::CPUPlace place;
-    scope.Var("step_scopes");
-    CreateVar(scope, "boot_mem", framework::make_ddim({10, 20}), place);
-    CreateVar(scope, "out0", framework::make_ddim({10, 20}), place);
-    auto* in0 = CreateVar(scope, "in0", framework::make_ddim({10, 8}), place);
-    // 10 instanes with 4 sentences, length is 4, 3, 2, 1 respectively.
-    framework::LoD in0_lod(1);
-    for (int x : std::vector<int>{0, 4, 7, 9, 10}) {
-      in0_lod[0].push_back(x);
-    }
-    in0->set_lod(in0_lod);
-    in0->Resize(framework::make_ddim({10, 8}));
-    // set the content, each sentence content is seqid.batchid
-    // the seqid starts from 0
-    int start = 0;
-    for (size_t seqid = 0; seqid < in0_lod.size() - 1; seqid++) {
-      for (size_t batchid = 0;
-           batchid < in0_lod[0][seqid + 1] - in0_lod[0][seqid]; batchid++) {
-        float v = seqid + batchid * 0.1;
-
-        for (size_t dim = 0; dim < 8; dim++) {
-          in0->data<float>()[start * 8 + dim] = v;
-        }
-        start++;
-      }
-    }
-  }
-
-  void InitCacheManually() {
-    dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context,
-                     &dop->arg_);
-  }
-
-  void InitStepNet() {
-    std::unique_ptr<framework::OperatorBase> stepnet{new NetOp};
-    dynamic_cast<NetOp*>(stepnet.get())
-        ->AppendOp(std::unique_ptr<TestOp>(new TestOp(
-            "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}},
-            {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {})));
-    dop->SetStepUnit(std::move(stepnet));
-  }
-
- protected:
-  RNNAlgorithm* dop;
-  std::unique_ptr<framework::OperatorBase> op;
-  paddle::platform::CPUDeviceContext device_context;
-  paddle::framework::Scope scope;
-};
-
-TEST_F(RNNAlgorithmTestHelper, CreateCache) {
-  const rnn::Argument& arg = dop->arg_;
-  ASSERT_EQ(arg.inlinks.size(), 1UL);
-  ASSERT_EQ(arg.outlinks.size(), 1UL);
-}
-
-TEST_F(RNNAlgorithmTestHelper, SplitInputs) {
-  dop->SplitInputs();
-  auto& in0_ta = dop->step_inputs_["in0"];
-  ASSERT_EQ(in0_ta.size(), 4UL);
-
-  const auto& batch0 = in0_ta.Read(0);
-  const auto& batch1 = in0_ta.Read(1);
-  const auto& batch2 = in0_ta.Read(2);
-  const auto& batch3 = in0_ta.Read(3);
-  EXPECT_EQ(batch0.dims()[0], 4);
-  EXPECT_EQ(batch1.dims()[0], 3);
-  EXPECT_EQ(batch2.dims()[0], 2);
-  EXPECT_EQ(batch3.dims()[0], 1);
-}
-
-TEST_F(RNNAlgorithmTestHelper, CreateScopes) {
-  dop->SplitInputs();
-  dop->CreateScopes();
-  ASSERT_EQ(dop->cache_.num_steps, 4UL);
-  ASSERT_EQ(dop->cache_.scopes->size(), 4UL);
-}
-
-TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) {
-  dop->SplitInputs();
-  dop->CreateScopes();
-  dop->WriteStepInputs();
-
-  for (size_t step = 0; step < dop->cache_.num_steps; step++) {
-    auto& scope = dop->cache_.GetScope(step);
-    for (auto name : std::vector<std::string>({"in0"})) {
-      ASSERT_TRUE(scope.FindVar(name) != nullptr);
-    }
-  }
-}
-
-TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) {
-  dop->SplitInputs();
-  dop->CreateScopes();
-  dop->WriteStepInputs();
-  dop->WriteStepOutputs();
-
-  for (size_t step = 0; step < dop->cache_.num_steps; step++) {
-    auto& scope = dop->cache_.GetScope(step);
-    for (auto name : std::vector<std::string>({"out0"})) {
-      ASSERT_TRUE(scope.FindVar(name));
-    }
-  }
-}
-
-TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) {
-  // Let's leave this test to python unittest.
-}
-
-TEST_F(RNNAlgorithmTestHelper, InitStates) {
-  dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward);
-  dop->SplitInputs();
-  dop->CreateScopes();
-  dop->WriteStepInputs();
-  dop->WriteStepOutputs();
-  dop->InitStates();
-
-  for (size_t step = 0; step < dop->cache_.num_steps; step++) {
-    auto& scope = dop->cache_.GetScope(step);
-    auto state = scope.FindVar("mem");
-    ASSERT_TRUE(state != nullptr);
-
-    auto* pre_state = scope.FindVar("mem@pre");
-    ASSERT_TRUE(pre_state != nullptr);
-
-    auto* boot_state = scope.FindVar("boot_mem");
-    ASSERT_TRUE(boot_state != nullptr);
-  }
-}
-
-}  // operators
-}  // namespace paddle
diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc
index d9bc80c869..432b9ba6f7 100644
--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
@@ -22,7 +22,7 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker {
   ElementwiseAddOpMaker(framework::OpProto* proto,
                         framework::OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("add", "Out = X + Y");
+    SetComment("Add", "$Out = X + Y$");
     AddComment(comment_);
   }
 };
@@ -34,7 +34,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker,
             elementwise_add_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu
index 85d063a76b..7591428ac7 100644
--- a/paddle/operators/elementwise_add_op.cu
+++ b/paddle/operators/elementwise_add_op.cu
@@ -19,7 +19,13 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, double>,
+    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, int>,
+    ops::ElementwiseAddKernel<paddle::platform::GPUPlace, int64_t>);
 REGISTER_OP_GPU_KERNEL(
     elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, double>,
+    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, int>,
+    ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, int64_t>);
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
index f04fe3ec60..921dc5f6a6 100644
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -19,11 +19,48 @@
 namespace paddle {
 namespace operators {
 
+template <typename T>
+struct AddFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+};
+
 template <typename Place, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseCompute<EigenAddFunctor, Place, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    TransformFunctor<AddFunctor<T>, T, Place> functor(
+        x, y, z, ctx.device_context(), AddFunctor<T>());
+
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                      "Rank of first input must >= rank of second input.");
+
+    if (x_dims == y_dims) {
+      functor.Run();
+      return;
+    }
+
+    int axis = ctx.Attr<int>("axis");
+    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+    PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                   "Axis should be in range [0, x_dims)");
+
+    int pre, n, post;
+    get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+    if (post == 1) {
+      functor.RunRowWise(n, pre);
+      return;
+    } else {
+      functor.RunMidWise(n, pre, post);
+      return;
+    }
   }
 };
 
diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc
index 3f56344d00..7a325199bd 100644
--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
@@ -22,7 +22,7 @@ class ElementwiseDivOpMaker : public ElementwiseOpMaker {
   ElementwiseDivOpMaker(framework::OpProto* proto,
                         framework::OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Div", "Out = X / Y");
+    SetComment("Div", "$Out = X / Y$");
     AddComment(comment_);
   }
 };
@@ -35,7 +35,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
             elementwise_div_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
-    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_div_grad,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu
index b96aa31748..de4d0c3344 100644
--- a/paddle/operators/elementwise_div_op.cu
+++ b/paddle/operators/elementwise_div_op.cu
@@ -19,7 +19,13 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     elementwise_div,
-    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, double>,
+    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, int>,
+    ops::ElementwiseDivKernel<paddle::platform::GPUPlace, int64_t>);
 REGISTER_OP_GPU_KERNEL(
     elementwise_div_grad,
-    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, double>,
+    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, int>,
+    ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, int64_t>);
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
index da7765aa6a..8851267a52 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -23,7 +23,7 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker {
   ElementwiseMulOpMaker(framework::OpProto* proto,
                         framework::OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Mul", "Out = X ⊙ Y");
+    SetComment("Mul", "$Out = X \\odot\\ Y$");
     AddComment(comment_);
   }
 };
@@ -37,8 +37,12 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>);
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul_grad,
     ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu
index 056f081d3e..b0dfdee1cc 100644
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -20,8 +20,12 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, double>);
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, double>,
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, int>,
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, int64_t>);
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul_grad,
     ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>,
-    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, double>);
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, double>,
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, int>,
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, int64_t>);
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
index fce4b24a22..ea533503e4 100644
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -35,7 +35,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
     auto x_dim = ctx->GetInputDim("X");
     auto y_dim = ctx->GetInputDim("Y");
     PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                      "Rank of first input must >= rank of second input.")
+                      "Rank of first input must >= rank of second input.");
     ctx->SetOutputDim("Out", x_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
@@ -46,37 +46,42 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
   ElementwiseOpMaker(framework::OpProto* proto,
                      framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", R"DOC(
-The first input of elementwise op, it's a tensor of any dimensions.
-)DOC");
-    AddInput("Y", R"DOC(
-The sencond input of elementwise op, it's a tensor and it's dimensions
-must be small or equal to X's dimensions.
-)DOC");
+    AddInput("X", "(Tensor) The first input tensor of elementwise op");
+    AddInput("Y", "(Tensor) The second input tensor of elementwise op");
+    AddOutput("Out", "The output of elementwise op");
     AddAttr<int>("axis",
-                 R"DOC(
-When the shape(Y) does not equal the shape(X),Y will be broadcasted
-to match the shape of X and axis should be dimension index Y in X
-        )DOC")
+                 "(int, default -1) The starting dimension index "
+                 "for broadcasting Y onto X")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
-
-    AddOutput("Out", "The output of elementwise op");
     comment_ = R"DOC(
-Limited elementwise {name} operator.The equation is: Out = {equation}.
-1. The shape of Y should be same with X or
-2. Y's shape is a subset of X.
-   Y will be broadcasted to match the shape of X and axis should be dimension index Y in X.
-
-   example:
-      shape(X) = (2, 3, 4, 5), shape(Y) = (,)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
-      shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
-      shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+Limited Elementwise {name} Operator.
+
+The equation is:
+
+{equation}
+
+X is a tensor of any dimension and the dimensions of tensor Y must be smaller than
+or equal to the dimensions of X. 
+
+There are two cases for this operator:
+1. The shape of Y is same with X;
+2. The shape of Y is a subset of X.
+
+For case 2:
+Y will be broadcasted to match the shape of X and axis should be 
+the starting dimension index for broadcasting Y onto X.
+
+example:
+  shape(X) = (2, 3, 4, 5), shape(Y) = (,)
+  shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
+  shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+  shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+  shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
 
 Both the input X and Y can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input X.
+or not. But the output only shares the LoD information with input X.
+
 )DOC";
     AddComment(comment_);
   }
@@ -115,7 +120,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.")
+                      "Rank of first input must >= rank of second input.");
 
     auto x_grad_name = framework::GradVarName("X");
     auto y_grad_name = framework::GradVarName("Y");
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h
index 488a35aafc..ca3542e783 100644
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -16,6 +16,11 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
+#include "paddle/platform/transform.h"
+
+#ifdef __NVCC__
+#include <thrust/iterator/iterator_adaptor.h>
+#endif
 
 #include "paddle/operators/math/math_function.h"
 
@@ -54,6 +59,160 @@ inline void get_mid_dims(const framework::DDim& x_dims,
   }
 }
 
+template <typename T, typename Place>
+class RowwiseTransformIterator;
+template <typename T, typename Place>
+class MidWiseTransformIterator;
+
+template <typename T>
+class RowwiseTransformIterator<T, platform::CPUPlace> {
+ public:
+  RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
+
+  RowwiseTransformIterator<T, platform::CPUPlace>& operator++() {
+    ++i_;
+    if (UNLIKELY(i_ == n_)) {
+      i_ = 0;
+    }
+    return *this;
+  }
+
+  bool operator==(
+      const RowwiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+    return (ptr_ + i_) == &(*rhs);
+  }
+
+  bool operator!=(
+      const RowwiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+    return (ptr_ + i_) != &(*rhs);
+  }
+
+  const T& operator*() { return ptr_[i_]; }
+
+ private:
+  const T* ptr_;
+  int i_;
+  int64_t n_;
+};
+
+template <typename T>
+class MidWiseTransformIterator<T, platform::CPUPlace> {
+ public:
+  MidWiseTransformIterator(const T* ptr, int n, int post)
+      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
+
+  MidWiseTransformIterator<T, platform::CPUPlace>& operator++() {
+    ++j_;
+    i_ = j_ / post_;
+    if (UNLIKELY(i_ == n_)) {
+      j_ = 0;
+      i_ = 0;
+    }
+    return *this;
+  }
+
+  bool operator==(
+      const MidWiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+    return (ptr_ + i_) == &(*rhs);
+  }
+
+  bool operator!=(
+      const MidWiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+    return (ptr_ + i_) != &(*rhs);
+  }
+
+  const T& operator*() { return ptr_[i_]; }
+
+ private:
+  const T* ptr_;
+  int i_;
+  int64_t j_;
+  int64_t n_;
+  int post_;
+};
+
+#ifdef __NVCC__
+template <typename T>
+class RowwiseTransformIterator<T, platform::GPUPlace>
+    : public thrust::iterator_adaptor<
+          RowwiseTransformIterator<T, platform::GPUPlace>, const T*> {
+ public:
+  typedef thrust::iterator_adaptor<
+      RowwiseTransformIterator<T, platform::GPUPlace>, const T*>
+      super_t;
+  HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
+      : super_t(x), begin_(x), n_(n){};
+  friend class thrust::iterator_core_access;
+
+ private:
+  unsigned int n_;
+  const T* begin_;
+  HOSTDEVICE typename super_t::reference dereference() const {
+    return *(begin_ + (this->base() - begin_) % n_);
+  }
+};
+
+template <typename T>
+class MidWiseTransformIterator<T, platform::GPUPlace>
+    : public thrust::iterator_adaptor<
+          MidWiseTransformIterator<T, platform::GPUPlace>, const T*> {
+ public:
+  typedef thrust::iterator_adaptor<
+      MidWiseTransformIterator<T, platform::GPUPlace>, const T*>
+      super_t;
+  HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
+      : super_t(x), begin_(x), n_(n), post_(post){};
+  friend class thrust::iterator_core_access;
+
+ private:
+  unsigned int post_;
+  unsigned int n_;
+  const T* begin_;
+  HOSTDEVICE typename super_t::reference dereference() const {
+    return *(begin_ + (((this->base() - begin_) / post_) % n_));
+  }
+};
+#endif
+
+template <typename Functor, typename T, typename Place>
+class TransformFunctor {
+ public:
+  TransformFunctor(const framework::Tensor* x, const framework::Tensor* y,
+                   framework::Tensor* z, const platform::DeviceContext& ctx,
+                   Functor func)
+      : x_(x->data<T>()),
+        y_(y->data<T>()),
+        z_(z->mutable_data<T>(ctx.GetPlace())),
+        nx_(x->numel()),
+        ctx_(ctx),
+        func_(func) {}
+
+  inline void Run() const {
+    platform::Transform<Place> trans;
+    trans(ctx_, x_, x_ + nx_, y_, z_, func_);
+  }
+
+  inline void RunRowWise(int n, int pre) const {
+    platform::Transform<Place> trans;
+    trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator<T, Place>(y_, n), z_,
+          func_);
+  }
+
+  inline void RunMidWise(int n, int pre, int post) const {
+    platform::Transform<Place> trans;
+    trans(ctx_, x_, x_ + nx_, MidWiseTransformIterator<T, Place>(y_, n, post),
+          z_, func_);
+  }
+
+ private:
+  const T* x_;
+  const T* y_;
+  T* z_;
+  int64_t nx_;
+  const platform::DeviceContext& ctx_;
+  Functor func_;
+};
+
 #define EIGEN_FUNCTOR(name, eigen_op)                                          \
   struct Eigen##name##Functor {                                                \
     template <typename Place, typename T>                                      \
@@ -106,7 +265,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) {
   auto x_dims = x->dims();
   auto y_dims = y->dims();
   PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                    "Rank of first input must >= rank of second input.")
+                    "Rank of first input must >= rank of second input.");
 
   if (x_dims == y_dims) {
     functor f;
diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc
index 3e4f98fdb3..95d7979e39 100644
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -22,7 +22,7 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker {
   ElementwiseSubOpMaker(framework::OpProto* proto,
                         framework::OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Sub", "Out = X - Y");
+    SetComment("Sub", "$Out = X - Y$");
     AddComment(comment_);
   }
 };
@@ -34,7 +34,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
             elementwise_sub_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
-    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub_grad,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu
index 0efb92fce9..ec23bec35f 100644
--- a/paddle/operators/elementwise_sub_op.cu
+++ b/paddle/operators/elementwise_sub_op.cu
@@ -19,7 +19,13 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     elementwise_sub,
-    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, double>,
+    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, int>,
+    ops::ElementwiseSubKernel<paddle::platform::GPUPlace, int64_t>);
 REGISTER_OP_GPU_KERNEL(
     elementwise_sub_grad,
-    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, double>,
+    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, int>,
+    ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, int64_t>);
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
new file mode 100644
index 0000000000..282775fcda
--- /dev/null
+++ b/paddle/operators/expand_op.cc
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto x_dims = ctx->GetInputDim("X");
+
+    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims.size()), expand_times.size(),
+                      "The number of Attr(expand_times)'s value must be equal "
+                      "to the rank of Input(X).");
+    PADDLE_ENFORCE_LE(x_dims.size(), 6,
+                      "The rank of Input(X) must not be greater than 6.");
+
+    std::vector<int64_t> out_shape(x_dims.size());
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_GE(expand_times[i], 1,
+                        "Each value of Attr(expand_times) should not be "
+                        "less than 1.");
+      out_shape[i] = x_dims[i] * expand_times[i];
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    if (out_shape[0] == x_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+};
+
+class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
+             "X is the input tensor to be expanded.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
+              "The rank of Output(Out) is same as Input(X) except that each "
+              "dimension size of Output(Out) is equal to corresponding "
+              "dimension size of Input(X) multiplying corresponding value of "
+              "Attr(expand_times).");
+    AddAttr<std::vector<int>>("expand_times",
+                              "Expand times number for each dimension.");
+    AddComment(R"DOC(
+Expand operator tiles the input by given times number. You should set times
+number for each dimension by providing attribute 'expand_times'. The rank of X
+should be in [1, 6]. Please notice that size of 'expand_times' must be same with
+X's rank. Following is a using case:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+
+Attr(expand_times):  [1, 2, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+
+)DOC");
+  }
+};
+
+class ExpandGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
+                        "Each dimension size of Input(Out@GRAD) should be "
+                        "equal to multiplication of crroresponding dimension "
+                        "size of Input(X) and Attr(expand_times) value.");
+    }
+
+    auto x_grad_name = framework::GradVarName("X");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
+            ops::ExpandGradOp);
+REGISTER_OP_CPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    expand_grad, ops::ExpandGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/expand_op.cu
similarity index 75%
rename from paddle/operators/fill_zeros_like_op.cu
rename to paddle/operators/expand_op.cu
index fdbcf520a0..6744562b6c 100644
--- a/paddle/operators/fill_zeros_like_op.cu
+++ b/paddle/operators/expand_op.cu
@@ -13,10 +13,11 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/fill_zeros_like_op.h"
+
+#include "paddle/operators/expand_op.h"
 
 namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    fill_zeros_like,
-    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
+    expand_grad, ops::ExpandGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
new file mode 100644
index 0000000000..4d7996ad1e
--- /dev/null
+++ b/paddle/operators/expand_op.h
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   You may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_TEMPLATE(z, n, data) \
+  case n + 1: {                     \
+    Expand<n + 1>(context);         \
+    break;                          \
+  }
+#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
+#define COND(n)                                               \
+  BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \
+                         BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_GRAD_CASE(n)                                        \
+  case n: {                                                        \
+    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                         \
+  }
+#define EXPAND_GRAD_TEMPLATE(z, n, data) \
+  BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
+#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ExpandKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {
+      REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED)
+      default:
+        PADDLE_ENFORCE(false,
+                       "Only support tensor with rank being between 1 and 6.");
+    }
+  }
+
+ protected:
+  template <int Rank>
+  void Expand(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    auto x_dims = in0->dims();
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      bcast_dims[i] = expand_times[i];
+    }
+    auto x = EigenTensor<T, Rank>::From(*in0);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0);
+    auto place = context.GetEigenDevice<Place>();
+    y.device(place) = x.broadcast(bcast_dims);
+  }
+};
+
+template <typename Place, typename T>
+class ExpandGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto x_dims = in0->dims();
+    // 1. reshape_dims_vec is the broadcast parameter. For each dimension i,
+    //    if expand_times[i] > 1 and x_dims[i] > 1, i will be splitted to two
+    //    dimensions [expand_times[i], x_dims[i]].
+    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
+    //    each dimension expanded, the gradients should be summed to original
+    //    size.
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      if (expand_times[i] == 1) {
+        reshape_dims_vec.push_back(x_dims[i]);
+      } else {
+        if (x_dims[i] == 1) {
+          reduce_dims_vec.push_back(reshape_dims_vec.size());
+          reshape_dims_vec.push_back(expand_times[i]);
+        } else {
+          reduce_dims_vec.push_back(reshape_dims_vec.size());
+          reshape_dims_vec.push_back(expand_times[i]);
+          reshape_dims_vec.push_back(x_dims[i]);
+        }
+      }
+    }
+
+    int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED +
+               reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1;
+    // no need reduce, just copy
+    if (reduce_dims_vec.size() == 0) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      framework::CopyFrom(*in0, context.GetPlace(), context.device_context(),
+                          out0);
+    } else {
+      switch (dims) {
+        REP_EXPAND_GRAD_TEMPLATE(72)
+        default:
+          PADDLE_ENFORCE(
+              false, "Only support tensor with rank being between 1 and 6.");
+      }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void ExpandBackward(const framework::ExecutionContext& context,
+                      const std::vector<int>& reshape_dims_vec,
+                      const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1;
+    size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1;
+    PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reshape dimensions.");
+    PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reduce dimensions.");
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto x = EigenVector<T>::Flatten(*(context.Input<Tensor>("X")));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims / MAX_RANK_SUPPORTED + 1> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims % MAX_RANK_SUPPORTED + 1> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(context.GetEigenDevice<Place>()) =
+        out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
index 0e5b263eae..ee43c22fb1 100644
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -47,7 +47,7 @@ class FeedOp : public framework::OperatorBase {
     auto &feed_list = feed_var->Get<framework::FeedFetchList>();
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
-    out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx);
+    framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item);
     out_item->set_lod(feed_item.lod());
   }
 };
@@ -59,8 +59,13 @@ class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of feed op");
     AddOutput("Out", "The output of feed op");
-    AddComment("feed op, it should not be configured by users directly");
-    AddAttr<int>("col", "column of feed");
+    AddAttr<int>("col", "(int) The column of feed");
+    AddComment(R"DOC(
+Feed Operator.
+
+It should not be configured by users directly.
+
+)DOC");
   }
 };
 
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
index f1086e3dc7..1ae07194c2 100644
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -51,7 +51,7 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
-    dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx);
+    CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
     dev_ctx.Wait();
     dst_item.set_lod(src_item.lod());
 
@@ -66,8 +66,13 @@ class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of fetch op");
     AddOutput("Out", "The output of fetch op");
-    AddComment("fetch op, it should not be configured by users directly");
-    AddAttr<int>("col", "column of fetch");
+    AddAttr<int>("col", "(int) The column of fetch");
+    AddComment(R"DOC(
+Fetch Operator.
+
+It should not be configured by users directly.
+
+)DOC");
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc
index 0244adb423..892922cd3a 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -34,21 +34,26 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
     std::vector<int64_t> shape_int64(shape.size(), 0);
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                    [](int a) { return static_cast<int64_t>(a); });
-    auto dims = framework::make_ddim(shape_int64);
+    auto output_dim = framework::make_ddim(shape_int64);
 
-    int dim_idx = ctx->Attrs().Get<int>("dim_idx");
-    PADDLE_ENFORCE_GE(dim_idx, 0);
-    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), dim_idx);
-    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), dim_idx);
+    int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
+    PADDLE_ENFORCE_GE(input_dim_idx, 0);
+    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
 
-    dims[dim_idx] = ctx->GetInputDim("Input")[dim_idx];
-    ctx->SetOutputDim("Out", dims);
+    int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
+    PADDLE_ENFORCE_GE(output_dim_idx, 0);
+    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
+
+    output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
+    ctx->SetOutputDim("Out", output_dim);
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+    return framework::OpKernelType(
+        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        ctx.device_context());
   }
 };
 
@@ -58,7 +63,7 @@ class FillConstantBatchSizeLikeOpMaker
   FillConstantBatchSizeLikeOpMaker(framework::OpProto *proto,
                                    framework::OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::DataType::FP32);
@@ -69,22 +74,34 @@ class FillConstantBatchSizeLikeOpMaker
               "(Tensor) Tensor of specified shape will be filled "
               "with the specified value");
     AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
-    AddAttr<int>("dim_idx",
-                 "(int, default 0) the index of batch size dimension")
+    AddAttr<int>("input_dim_idx",
+                 "(int, default 0) The index of input's batch size dimension")
+        .SetDefault(0);
+    AddAttr<int>("output_dim_idx",
+                 "(int, default 0) The index of output's batch size dimension")
         .SetDefault(0);
     AddAttr<float>("value", "(float, default 0) The value to be filled")
         .SetDefault(0.0f);
-    AddComment(R"DOC(Fill up a variable with specified constant value.)DOC");
+    AddComment(R"DOC(
+FillConstantBatchSizeLike Operator.
+
+Fill up a variable with specified constant value.
+
+)DOC");
   }
 };
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fill_constant_batch_size_like,
-                             ops::FillConstantBatchSizeLikeOp,
-                             ops::FillConstantBatchSizeLikeOpMaker);
+REGISTER_OPERATOR(fill_constant_batch_size_like,
+                  ops::FillConstantBatchSizeLikeOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::FillConstantBatchSizeLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
     fill_constant_batch_size_like,
     ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, double>);
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, double>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, int>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace,
+                                           int64_t>);
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
similarity index 81%
rename from paddle/operators/fill_constant_batch_size_like_op.cu
rename to paddle/operators/fill_constant_batch_size_like_op.cu.cc
index cfa5df001e..9e7a1eeab8 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cu
+++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
@@ -12,12 +12,14 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_constant_batch_size_like_op.h"
+#include "paddle/framework/op_registry.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
     fill_constant_batch_size_like,
     ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, double>);
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, double>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, int>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace,
+                                           int64_t>);
diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/operators/fill_constant_batch_size_like_op.h
index a360e6683e..339d97a30a 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/operators/fill_constant_batch_size_like_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -27,9 +27,8 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(ctx.GetPlace());
     auto value = ctx.Attr<float>("value");
 
-    auto out_eigen = framework::EigenVector<T>::Flatten(*out);
-    auto place = ctx.GetEigenDevice<Place>();
-    out_eigen.device(place) = out_eigen.constant(static_cast<T>(value));
+    math::SetConstant<Place, T> setter;
+    setter(ctx.device_context(), out, static_cast<T>(value));
   }
 };
 
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
index 7a861b6cfc..3d5f84bc23 100644
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -12,30 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/fill_constant_op.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 
-class FillConstantOp : public framework::OperatorWithKernel {
+class FillConstantInferShape : public framework::InferShapeBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FillConstantOp should not be null.");
     auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    std::vector<int64_t> shape_int64(shape.size(), 0);
-    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
-                   [](int a) { return static_cast<int64_t>(a); });
-    auto dims = framework::make_ddim(shape_int64);
-    ctx->SetOutputDim("Out", dims);
+    ctx->SetOutputDim("Out", framework::make_ddim(shape));
   }
+};
 
- protected:
-  framework::DataType IndicateDataType(
-      const framework::ExecutionContext &ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+class FillConstantOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto data_type = static_cast<framework::DataType>(Attr<int>("dtype"));
+    auto value = Attr<float>("value");
+    auto force_cpu = Attr<bool>("force_cpu");
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    if (force_cpu) {
+      auto cpu = platform::CPUPlace();
+      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
+    } else {
+      out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type));
+    }
+    math::set_constant(dev_ctx, &out, value);
   }
 };
 
@@ -44,26 +55,33 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
   FillConstantOpMaker(framework::OpProto *proto,
                       framework::OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::DataType::FP32);
     AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
     AddAttr<float>("value", "(float, default 0) The value to be filled")
         .SetDefault(0.0f);
+    AddAttr<bool>("force_cpu",
+                  "(bool, default false) Force fill output variable to cpu "
+                  "memory. Otherwise, fill output variable to the running "
+                  "device")
+        .SetDefault(false);
     AddOutput("Out",
               "(Tensor) Tensor of specified shape will be filled "
               "with the specified value");
-    AddComment(R"DOC(Fill up a variable with specified constant value.)DOC");
+    AddComment(R"DOC(
+FillConstantBatchSizeLike Operator.
+
+Fill up a variable with specified constant value.
+
+)DOC");
   }
 };
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
-                             ops::FillConstantOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
-    ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>);
+REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
+                  ops::FillConstantInferShape, ops::FillConstantOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
index ed529ac40a..95fb5932b8 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -37,11 +37,13 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
                        framework::OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of fill-zeros-like op.");
-    AddOutput("Y", "The varibale will be filled up with zeros.");
+    AddOutput("Y", "The variable will be filled up with zeros.");
     AddComment(R"DOC(
-Fill up a vriable with zeros.
+FillZerosLike Operator.
+
+Fill up a variable with zeros.
+The output will have the same size as the input.
 
-The output will have the same size with input.
 )DOC");
   }
 };
@@ -52,5 +54,8 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp,
                              ops::FillZerosLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    fill_zeros_like,
-    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
+    fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int64_t>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, double>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, bool>);
diff --git a/paddle/operators/fill_constant_op.cu b/paddle/operators/fill_zeros_like_op.cu.cc
similarity index 65%
rename from paddle/operators/fill_constant_op.cu
rename to paddle/operators/fill_zeros_like_op.cu.cc
index a57b11c6cb..1501a17441 100644
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/fill_zeros_like_op.cu.cc
@@ -12,12 +12,13 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
+#include "paddle/operators/fill_zeros_like_op.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/fill_constant_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>);
+    fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int>,
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int64_t>,
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>,
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, double>,
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, bool>);
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index cdf56a723b..7e7d78eea2 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -23,10 +23,11 @@ template <typename Place, typename T>
 class FillZerosLikeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* output = context.Output<framework::Tensor>("Y");
-    output->mutable_data<T>(context.GetPlace());
-    auto t = framework::EigenVector<T>::Flatten(*output);
-    t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+    auto* out = context.Output<framework::Tensor>("Y");
+    out->mutable_data<T>(context.GetPlace());
+
+    math::SetConstant<Place, T> setter;
+    setter(context.device_context(), out, static_cast<T>(0));
   }
 };
 
diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc
new file mode 100644
index 0000000000..cb7ae69196
--- /dev/null
+++ b/paddle/operators/ftrl_op.cc
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/ftrl_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FTRLOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("SquaredAccumulator"),
+                   "Input(SquaredAccumulator) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LinearAccumulator"),
+                   "Input(LinearAccumulator) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of FTRL should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("SquaredAccumOut"),
+                   "Output(SquaredAccumOut) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("LinearAccumOut"),
+                   "Output(LinearAccumOut) of FTRL should not be null.");
+
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
+                      "Two input of FTRL Op's dimension must be same.");
+
+    auto lr_dim = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
+                      "Learning Rate should be a scalar.");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("SquaredAccumOut", param_dim);
+    ctx->SetOutputDim("LinearAccumOut", param_dim);
+  }
+};
+
+class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FTRLOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter value that has to be updated.");
+    AddInput("SquaredAccumulator",
+             "(Tensor, default Tensor<float>) "
+             "Accumulator that accumulates squared gradients.");
+    AddInput("LinearAccumulator",
+             "(Tensor, default Tensor<float>) "
+             "Accumulator that accumulates linear gradients.");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter.");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "The learning rate should be a tensor of size 1.");
+
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
+    AddOutput("SquaredAccumOut",
+              "(Tensor) Output accumulated squared"
+              " gradients.");
+    AddOutput("LinearAccumOut",
+              "(Tensor) Output accumulated linear"
+              " gradients.");
+
+    AddAttr<float>("l1",
+                   "(float, default 0.0) "
+                   "L1 regularization strength.")
+        .SetDefault(0.0f);
+    AddAttr<float>("l2",
+                   "(float, default 0.0) "
+                   "L2 regularization strength.")
+        .SetDefault(0.0f);
+    AddAttr<float>("lr_power",
+                   "(float, default -0.5f) "
+                   "Learning Rate Power.")
+        .SetDefault(-0.5f);
+    AddComment(R"DOC(
+FTRL (Follow The Regularized Leader) Operator.
+
+Optimizer that implements the FTRL algorithm:
+
+$$
+new\_accum = squared\_accum + grad^2 \\
+if (lr\_power == -0.5) {
+   linear\_accum += grad - (\surd(new\_accum) - \surd(squared\_accum)) /
+                   (learning\_rate * param) \\
+} else {
+   linear\_accum += grad -
+                  (new\_accum^{-lr\_power} - accum^{-lr\_power}) /
+                  (learning\_rate * param) \\
+}
+
+x = (l1 * sign(linear\_accum) - linear\_accum)
+if (lr\_power == -0.5) {
+   y = \frac{\surd(new\_accum)}{learning\_rate} + (2 * l2) \\
+   pre\_shrink = \frac{x}{y} \\
+   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\
+} else {
+   y = \frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) \\
+   pre\_shrink = \frac{x}{y} \\
+   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\
+}
+squared\_accum += grad^2;
+$$
+
+The paper that proposed Follow The Regularized Leader (FTRL):
+(https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker);
+REGISTER_OP_CPU_KERNEL(ftrl,
+                       ops::FTRLOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/ftrl_op.cu b/paddle/operators/ftrl_op.cu
new file mode 100644
index 0000000000..97b36dade6
--- /dev/null
+++ b/paddle/operators/ftrl_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/ftrl_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(ftrl,
+                       ops::FTRLOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/ftrl_op.h b/paddle/operators/ftrl_op.h
new file mode 100644
index 0000000000..b040162f8d
--- /dev/null
+++ b/paddle/operators/ftrl_op.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class FTRLOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out = ctx.Output<Tensor>("ParamOut");
+    auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
+    auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    sq_accum_out->mutable_data<T>(ctx.GetPlace());
+    lin_accum_out->mutable_data<T>(ctx.GetPlace());
+
+    auto grad = ctx.Input<Tensor>("Grad");
+
+    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
+    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
+    auto lr_power = static_cast<T>(ctx.Attr<float>("lr_power"));
+
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto sq_accum =
+        EigenVector<T>::Flatten(*ctx.Input<Tensor>("SquaredAccumulator"));
+    auto lin_accum =
+        EigenVector<T>::Flatten(*ctx.Input<Tensor>("LinearAccumulator"));
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto s_acc_out = EigenVector<T>::Flatten(*sq_accum_out);
+    auto l_acc_out = EigenVector<T>::Flatten(*lin_accum_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+
+    auto new_accum = sq_accum + g * g;
+    // Special case for lr_power = -0.5
+    if (lr_power == static_cast<T>(-0.5)) {
+      l_acc_out.device(place) =
+          lin_accum + g -
+          ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p;
+    } else {
+      l_acc_out.device(place) =
+          lin_accum + g -
+          ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) /
+           lr.broadcast(grad_dsize)) *
+              p;
+    }
+
+    auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out);
+    if (lr_power == static_cast<T>(-0.5)) {
+      auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) +
+               l_acc_out.constant(static_cast<T>(2) * l2);
+      auto pre_shrink = x / y;
+      p_out.device(place) =
+          (l_acc_out.abs() > l_acc_out.constant(l1))
+              .select(pre_shrink, p.constant(static_cast<T>(0)));
+    } else {
+      auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) +
+               l_acc_out.constant(static_cast<T>(2) * l2);
+      auto pre_shrink = x / y;
+      p_out.device(place) =
+          (l_acc_out.abs() > l_acc_out.constant(l1))
+              .select(pre_shrink, p.constant(static_cast<T>(0)));
+    }
+
+    s_acc_out.device(place) = sq_accum + g * g;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index f6c7f472da..8f80fb1625 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -40,9 +40,11 @@ class GatherOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
@@ -55,9 +57,11 @@ class GatherGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
@@ -67,11 +71,28 @@ class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The source input of gather op");
     AddInput("Index", "The index input of gather op");
-    AddOutput("Out", "The output of add op");
+    AddOutput("Out", "The output of gather op");
     AddComment(R"DOC(
-Gather Operator by selecting from the first axis,
+Gather Operator.
+
+$Out = X[Index]$
+
+Out is obtained by gathering entries of the outer-most dimension 
+of X indexed by Index and concatenate them together.
+
+Example:
+
+X = [[1, 2],
+     [3, 4],
+     [5, 6]]
+
+Index = [[1, 2]]
+
+Then:
+
+Out = [[3, 4],
+       [5, 6]]
 
-Out = X[Index]
 )DOC");
   }
 };
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index be7f542a7a..254c83e137 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -57,9 +57,11 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+    return framework::OpKernelType(
+        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        ctx.device_context());
   }
 };
 
@@ -68,21 +70,35 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
   GaussianRandomOpMaker(framework::OpProto* proto,
                         framework::OpAttrChecker* op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddOutput("Out", "output matrix of random op");
-    AddComment(R"DOC(
-GaussianRandom operator.
-Use to initialize tensor with gaussian random generator.
-)DOC");
+    AddOutput("Out", "Output matrix of gaussian random op");
 
-    AddAttr<std::vector<int>>("shape", "The dimension of random tensor.");
-    AddAttr<float>("mean", "mean of random tensor.").SetDefault(.0f);
-    AddAttr<float>("std", "std of random tensor.").SetDefault(1.0f);
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) "
+                              "The dimension of random tensor.");
+    AddAttr<float>("mean",
+                   "(float, default 0.0) "
+                   "mean of random tensor.")
+        .SetDefault(.0f);
+    AddAttr<float>("std",
+                   "(float, default 1.0) "
+                   "std of random tensor.")
+        .SetDefault(1.0f);
     AddAttr<int>("seed",
+                 "(int, default 0) "
                  "Random seed of generator."
-                 "0 means use system wide seed")
+                 "0 means use system wide seed.")
         .SetDefault(0);
-    AddAttr<int>("data_type", "output data type")
+    AddAttr<int>("dtype",
+                 "(int, default 5(FP32)) "
+                 "Output data type.")
         .SetDefault(framework::DataType::FP32);
+
+    AddComment(R"DOC(
+GaussianRandom Operator.
+
+Used to initialize tensors with gaussian random generator.
+
+)DOC");
   }
 };
 
diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc
new file mode 100644
index 0000000000..5aa03f8916
--- /dev/null
+++ b/paddle/operators/gru_op.cc
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/gru_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class GRUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(%s) of GRUOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
+                   "Output(%s) of GRUOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+                   "Output(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                   "Output(%s) of GRUOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_dims[1], frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+    }
+    ctx->SetOutputDim("BatchGate", input_dims);
+    ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
+    ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
+    ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
+    ctx->ShareLoD("Input", "Hidden");
+  }
+};
+
+class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) The first input is a LodTensor, which supports "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTenosr is a matrix with shape (T X 3D), where, T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) The initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size, D is the hidden size.")
+        .AsDispensable();
+    AddInput(
+        "Weight",
+        "(Tensor) The learnable hidden-hidden weight matrix with shape "
+        "(D x 3D), where D is the hidden size. The elements continuous in "
+        "memory can be divided into two parts. The first part are weights of "
+        "the update gate and reset gate with shape (D x 2D), and the second "
+        "part are weights of output candidate with shape (D x D).");
+    AddInput("Bias",
+             "(Tensor, optional) Bias vector with shape (1 x 3D) concating "
+             "bias of the update gate, reset gate and output candidate.")
+        .AsDispensable();
+    AddOutput("BatchGate",
+              "(LoDTensor) To compute with batches, sequence data will be "
+              "reorganized into several successive batches each containing "
+              "data from the same time step. The LoDTensor BatchGate contains "
+              "the update gate, reset gate and output candidate values "
+              "organized in batches. The LoD size is 2. The first LoD contains "
+              "the batch offsets and the second LoD contains the indexes in "
+              "the raw sequence data.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchResetHiddenPrev",
+        "(LoDTensor) The reseted hidden state LoDTensor organized in batches. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchHidden",
+        "(LoDTensor) The hidden state LoDTensor organized in batches.  "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "Hidden",
+        "(LoDTensor) the hidden state LoDTensor organized in sequences. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.");
+    AddAttr<std::string>("activation",
+                         "(string, default tanh) "
+                         "The activation type used for output candidate {h}_t.")
+        .SetDefault("tanh");
+    AddAttr<std::string>(
+        "gate_activation",
+        "(string, default sigmoid) "
+        "The activation type used in update gate and reset gate.")
+        .SetDefault("sigmoid");
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
+                  "whether to compute reversed GRU.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+GRU Operator implements part calculations of the complete GRU as following:
+
+\f[
+update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+\f]
+
+@note To implement the complete GRU, fully-connected operator must be used  
+before to feed xu, xr and xc as the Input of GRU operator.
+)DOC");
+  }
+};
+
+class GRUGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUGradOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUGradOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+                   "Input(%s) of GRUGradOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
+                   "Input(%s) of GRUGradOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
+                   "Input(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
+                   "Input(%s) of GRUGradOp should not be null.", "Hidden");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
+                   "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    int weight_height = weight_dims[0];
+    int weight_width = weight_dims[1];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_height, frame_size,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    PADDLE_ENFORCE_EQ(
+        weight_width, frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+      auto h0_grad_name = framework::GradVarName("H0");
+      if (ctx->HasOutput(h0_grad_name))
+        ctx->SetOutputDim(h0_grad_name, h0_dims);
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+    auto input_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(input_grad_name))
+      ctx->SetOutputDim(input_grad_name, input_dims);
+    auto weight_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(weight_grad_name))
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
+REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel<paddle::platform::CPUPlace, float>,
+                       ops::GRUKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(gru_grad,
+                       ops::GRUGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::GRUGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/gru_op.cu.cc b/paddle/operators/gru_op.cu.cc
new file mode 100644
index 0000000000..0ceff94ec3
--- /dev/null
+++ b/paddle/operators/gru_op.cu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/gru_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel<paddle::platform::GPUPlace, float>,
+                       ops::GRUKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(gru_grad,
+                       ops::GRUGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::GRUGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
new file mode 100644
index 0000000000..564489d3a9
--- /dev/null
+++ b/paddle/operators/gru_op.h
@@ -0,0 +1,247 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+inline void ReorderInitState(const platform::DeviceContext& ctx,
+                             const framework::Tensor& src, const size_t* index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<Place, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
+
+template <typename Place, typename T>
+class GRUKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* input = context.Input<LoDTensor>("Input");
+    auto* h0 = context.Input<Tensor>("H0");
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(context.GetPlace());
+    auto* batch_reset_hidden_prev =
+        context.Output<LoDTensor>("BatchResetHiddenPrev");
+    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<LoDTensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+
+    context.ShareLoD("Input", "Hidden");
+
+    auto hidden_dims = hidden->dims();
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    auto& dev_ctx = context.device_context();
+    to_batch(dev_ctx, *input, *batch_gate, true, is_reverse);
+
+    if (bias) {
+      math::RowwiseAdd<Place, T> add_bias;
+      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
+    }
+
+    int frame_size = hidden_dims[1];
+    math::hl_gru_value<T> gru_value;
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+    Tensor ordered_h0;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      // Since the batch computing for GRU reorders the input sequences
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<Place, T>(context.device_context(), *h0, order,
+                                 &ordered_h0, true);
+      gru_value.prev_out_value = ordered_h0.data<T>();
+    } else {
+      gru_value.prev_out_value = nullptr;
+    }
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+      gru_value.output_value = hidden_t.data<T>();
+      gru_value.gate_value = gate_t.data<T>();
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
+      math::GRUUnitFunctor<Place, T>::compute(
+          dev_ctx, gru_value, frame_size, cur_batch_size,
+          math::ActiveType(context.Attr<std::string>("activation")),
+          math::ActiveType(context.Attr<std::string>("gate_activation")));
+      gru_value.prev_out_value = gru_value.output_value;
+    }
+
+    math::Batch2LoDTensorFunctor<Place, T> to_seq;
+    batch_hidden->set_lod(batch_gate->lod());
+    to_seq(dev_ctx, *batch_hidden, *hidden);
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+template <typename Place, typename T>
+class GRUGradKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* h0 = context.Input<Tensor>("H0");
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* batch_gate = context.Input<LoDTensor>("BatchGate");
+    auto* batch_reset_hidden_prev =
+        context.Input<LoDTensor>("BatchResetHiddenPrev");
+    auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
+    auto* hidden = context.Input<LoDTensor>("Hidden");
+    auto* hidden_grad =
+        context.Input<LoDTensor>(framework::GradVarName("Hidden"));
+    auto* input_grad =
+        context.Output<LoDTensor>(framework::GradVarName("Input"));
+    auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
+    auto* weight_grad =
+        context.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto gate_dims = batch_gate->dims();
+    auto hidden_dims = hidden->dims();
+    int frame_size = hidden_dims[1];
+
+    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
+    batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
+    batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
+    batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
+                                                 context.GetPlace());
+    math::SetConstant<Place, T> zero;
+    auto& dev_ctx = context.device_context();
+    zero(dev_ctx, &batch_hidden_grad, static_cast<T>(0.0));
+    zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
+    zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
+
+    Tensor ordered_h0, ordered_h0_grad;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      ReorderInitState<Place, T>(context.device_context(), *h0, order,
+                                 &ordered_h0, true);
+    }
+    if (h0_grad) {
+      ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
+      zero(context.device_context(), &ordered_h0_grad, static_cast<T>(0.0));
+    }
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    batch_hidden_grad.set_lod(batch_hidden->lod());
+    to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
+
+    math::hl_gru_value<T> gru_value;
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+
+    math::hl_gru_grad<T> gru_grad;
+    if (weight_grad) {
+      gru_grad.gate_weight_grad =
+          weight_grad->mutable_data<T>(context.GetPlace());
+      zero(dev_ctx, weight_grad, static_cast<T>(0.0));
+      gru_grad.state_weight_grad =
+          weight_grad->data<T>() + 2 * frame_size * frame_size;
+    } else {
+      gru_grad.gate_weight_grad = nullptr;
+      gru_grad.state_weight_grad = nullptr;
+    }
+
+    auto batch_starts = batch_hidden_grad.lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      gru_value.gate_value = gate_t.data<T>();
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
+
+      Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
+      gru_grad.output_grad = hidden_grad_t.data<T>();
+      Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
+      gru_grad.gate_grad = gate_grad_t.data<T>();
+      Tensor reset_hidden_prev_grad_t =
+          batch_reset_hidden_prev_grad.Slice(bstart, bend);
+      gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data<T>();
+      if (n == 0) {
+        gru_value.prev_out_value = h0 ? ordered_h0.data<T>() : nullptr;
+        gru_grad.prev_out_grad =
+            h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
+      } else {
+        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+        Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
+        gru_value.prev_out_value = hidden_prev_t.data<T>();
+        Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
+        gru_grad.prev_out_grad = hidden_prev_grad_t.data<T>();
+      }
+
+      math::GRUUnitGradFunctor<Place, T>::compute(
+          dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size,
+          math::ActiveType(context.Attr<std::string>("activation")),
+          math::ActiveType(context.Attr<std::string>("gate_activation")));
+    }
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      math::Batch2LoDTensorFunctor<Place, T> to_seq;
+      batch_gate_grad.set_lod(batch_gate->lod());
+      to_seq(dev_ctx, batch_gate_grad, *input_grad);
+    }
+    if (bias_grad) {
+      bias_grad->mutable_data<T>(context.GetPlace());
+      math::ColwiseSum<Place, T> col_sum;
+      col_sum(dev_ctx, batch_gate_grad, bias_grad);
+    }
+    if (h0 && h0_grad) {
+      ReorderInitState<Place, T>(context.device_context(), ordered_h0_grad,
+                                 order, h0_grad, false);
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc
index 8d9723289d..877c969103 100644
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -80,19 +80,21 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("HiddenPrev",
              "(Tensor) Matrix with shape [batch_size, frame_size] for the "
              "states of previous time step.");
-    AddInput("Weight",
-             "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. "
-             "The elements continuous in memory can be divided into two parts. "
-             "The first part are weights of the update gate and reset gate "
-             "with shape [frame_size, frame_size * 2], and the second part are "
-             "weights of output candidate with shape [frame_size, frame_size]");
-    AddInput("Bias",
-             "(Tensor) Bias vector with shape [1, frame_size * 3] concating "
-             "bias of the update gate, reset gate and output candidate.")
+    AddInput(
+        "Weight",
+        "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. "
+        "The elements continuous in memory can be divided into two parts. "
+        "The first part are weights of the update gate and reset gate "
+        "with shape [frame_size, frame_size * 2], and the second part are "
+        "weights of output candidate with shape [frame_size, frame_size].");
+    AddInput(
+        "Bias",
+        "(Tensor) Bias vector with shape [1, frame_size * 3] concatenating "
+        "bias of the update gate, reset gate and output candidate.")
         .AsDispensable();
     AddOutput("Gate",
               "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
-              "output of update gate, reset gate and output candidate")
+              "output of update gate, reset gate and output candidate.")
         .AsIntermediate();
     AddOutput("ResetHiddenPrev",
               "(Tensor) Matrix with shape [batch_size, frame_size] for the "
@@ -112,16 +114,20 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(sigmoid)
         .InEnum({identity, sigmoid, tanh, relu});
     AddComment(R"DOC(
-GRUUnitOp implements part calculations of the GRU unit as following:
+GRUUnit Operator implements partial calculations of the GRU unit as following:
 
-\f[
-update \ gate: u_t = actGate(xu_t + W_u * hidden_prev + bias_u) \\
-reset \ gate: r_t = actGate(xr_t + W_r * hidden_prev + bias_r)  \\
-output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, hidden_prev) + bias_c) \\
-output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_prev)
-\f]
+$$
+update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+$$
+
+which is same as one time step of GRU Operator.
+
+@note To implement the complete GRU unit, fully-connected operator must be 
+used before to feed xu, xr and xc as the Input of GRUUnit operator.
 
-The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp.
 )DOC");
   }
 };
@@ -145,12 +151,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
                    "ResetHiddenPrev");
     PADDLE_ENFORCE(ctx->HasInput("Hidden"),
                    "Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")),
-                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
-                   "Gate");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")),
-                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
-                   "ResetHiddenPrev");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
                    "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
                    "Hidden");
diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h
index c53e7d9827..3398c0934e 100644
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
@@ -28,6 +28,10 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
 
 template <typename Place, typename T>
@@ -110,7 +114,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     auto c = g.slice(c_offsets, extents);  // output candidate
 
     // calculate final output
-    h.device(place) = u * (h_p - c) + c;
+    h.device(place) = u * (c - h_p) + h_p;
   }
 };
 
@@ -146,35 +150,27 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     auto* weight_grad =
         context.Output<Tensor>(framework::GradVarName("Weight"));
     auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
-    input_grad->mutable_data<T>(context.GetPlace());
-    hidden_prev_grad->mutable_data<T>(context.GetPlace());
-    weight_grad->mutable_data<T>(context.GetPlace());
     Tensor gate_grad;
-    gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
     Tensor reset_hidden_prev_grad;
-    reset_hidden_prev_grad.mutable_data<T>(reset_hidden_prev->dims(),
-                                           context.GetPlace());
-
-    int batch_size = input->dims()[0];
-    int frame_size = hidden_prev->dims()[1];
 
     const T* hidden_prev_data = hidden_prev->data<T>();
-    T* hidden_prev_grad_data = hidden_prev_grad->data<T>();
     const T* weight_data = weight->data<T>();
-    T* weight_grad_data = weight_grad->data<T>();
-    T* gate_grad_data = gate_grad.data<T>();
+    T* gate_grad_data =
+        gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
     const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.data<T>();
+    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
+        reset_hidden_prev->dims(), context.GetPlace());
 
     auto h_p = EigenMatrix<T>::From(*hidden_prev);
     auto g = EigenMatrix<T>::From(*gate);
     auto d_h = EigenMatrix<T>::From(*hidden_grad);
-    auto d_x = EigenMatrix<T>::From(*input_grad);
-    auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
     auto d_g = EigenMatrix<T>::From(gate_grad);
     auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
     auto place = context.GetEigenDevice<Place>();
 
+    int batch_size = input->dims()[0];
+    int frame_size = hidden_prev->dims()[1];
+
     Eigen::array<int, 2> extents({{batch_size, frame_size}});
     Eigen::array<int, 2> u_offsets({{0, 0}});
     auto u = g.slice(u_offsets, extents);  // update gate
@@ -185,42 +181,56 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
 
     // backward for unactivated update gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
-                   d_g.slice(u_offsets, extents), d_h * (h_p - c));
+                   d_g.slice(u_offsets, extents), d_h * (c - h_p));
     // backward for unactivated output candidate
     ActGradCompute(context.Attr<int>("activation"), place, c, c,
-                   d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u));
+                   d_g.slice(c_offsets, extents), d_h * u);
     // backward for reset_hidden_prev
     math::gemm<Place, T>(context.device_context(), false, true, batch_size,
                          frame_size, frame_size, 1,
                          gate_grad_data + frame_size * 2, frame_size * 3,
                          weight_data + frame_size * frame_size * 2, frame_size,
                          0, reset_hidden_prev_grad_data, frame_size);
-    // backward for state_weight
-    math::gemm<Place, T>(
-        context.device_context(), true, false, frame_size, frame_size,
-        batch_size, 1, reset_hidden_prev_data, frame_size,
-        gate_grad_data + frame_size * 2, frame_size * 3, 0,
-        weight_grad_data + frame_size * frame_size * 2, frame_size);
     // backward for unactivated reset gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
                    d_g.slice(r_offsets, extents), d_r_h_p * h_p);
-    // backward for update_gate_weight and reset_gate_weight
-    math::gemm<Place, T>(context.device_context(), true, false, frame_size,
-                         frame_size * 2, batch_size, 1, hidden_prev_data,
-                         frame_size, gate_grad_data, frame_size * 3, 0,
-                         weight_grad_data, frame_size * 2);
+    // backward for weight
+    if (weight_grad) {
+      T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
+      // backward for state_weight
+      math::gemm<Place, T>(
+          context.device_context(), true, false, frame_size, frame_size,
+          batch_size, 1, reset_hidden_prev_data, frame_size,
+          gate_grad_data + frame_size * 2, frame_size * 3, 0,
+          weight_grad_data + frame_size * frame_size * 2, frame_size);
+
+      // backward for update_gate_weight and reset_gate_weight
+      math::gemm<Place, T>(context.device_context(), true, false, frame_size,
+                           frame_size * 2, batch_size, 1, hidden_prev_data,
+                           frame_size, gate_grad_data, frame_size * 3, 0,
+                           weight_grad_data, frame_size * 2);
+    }
     // backward for hidden_prev
-    d_h_p.device(place) = d_r_h_p * r + d_h * u;
-    math::gemm<Place, T>(context.device_context(), false, true, batch_size,
-                         frame_size, frame_size * 2, 1, gate_grad_data,
-                         frame_size * 3, weight_data, frame_size * 2, 1,
-                         hidden_prev_grad_data, frame_size);
+    if (hidden_prev_grad) {
+      T* hidden_prev_grad_data =
+          hidden_prev_grad->mutable_data<T>(context.GetPlace());
+      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+      d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
+      math::gemm<Place, T>(context.device_context(), false, true, batch_size,
+                           frame_size, frame_size * 2, 1, gate_grad_data,
+                           frame_size * 3, weight_data, frame_size * 2, 1,
+                           hidden_prev_grad_data, frame_size);
+    }
     // backward for input
-    d_x.device(place) = d_g;
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      auto d_x = EigenMatrix<T>::From(*input_grad);
+      d_x.device(place) = d_g;
+    }
     // backward for bias
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
-      auto d_b = EigenMatrix<T>::From(*bias_grad);
+      auto d_b = EigenVector<T>::Flatten(*bias_grad);
       d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
     }
   }
diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc
new file mode 100644
index 0000000000..1e13897bb6
--- /dev/null
+++ b/paddle/operators/hinge_loss_op.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/hinge_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class HingeLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+
+    auto pred_dims = ctx->GetInputDim("Logits");
+    auto label_dims = ctx->GetInputDim("Labels");
+
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Logits) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Logits) contains a real value, "
+                      "so the 2nd dimension of Input(Logits) must be 1.");
+
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Logits", "Loss");
+  }
+};
+
+template <typename AttrType>
+class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HingeLossOpMaker(framework::OpProto* proto,
+                   framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Logits",
+             "The input value (Logits) of Hinge loss op."
+             "Logits is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Hinge loss op."
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the hinge loss.");
+    AddComment(R"DOC(
+HingeLoss Operator.
+
+Let x be a logit (prediction) and y be the actual label. The logit can
+take any values from (-inf, inf), but the labels should be either -1 or 1.
+Then, the hinge loss is computed as follows:
+
+$$
+L_(x, y) = max(1 - y.x, 0) 
+$$
+
+Note that the labels passed as input will have values as either 0 or 1.
+
+)DOC");
+  }
+};
+
+class HingeLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
+                   "Input(Logits@GRAD) should not be null.");
+
+    auto pred_dims = ctx->GetInputDim("Logits");
+    auto lab_dims = ctx->GetInputDim("Labels");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
+
+    auto pred_grad_name = framework::GradVarName("Logits");
+    ctx->SetOutputDim(pred_grad_name, pred_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
+            hinge_loss_grad, ops::HingeLossGradOp);
+REGISTER_OP_CPU_KERNEL(hinge_loss,
+                       ops::HingeLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    hinge_loss_grad,
+    ops::HingeLossGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu
new file mode 100644
index 0000000000..ec20b08e30
--- /dev/null
+++ b/paddle/operators/hinge_loss_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/hinge_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(hinge_loss,
+                       ops::HingeLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    hinge_loss_grad,
+    ops::HingeLossGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/hinge_loss_op.h b/paddle/operators/hinge_loss_op.h
new file mode 100644
index 0000000000..c0be496f9c
--- /dev/null
+++ b/paddle/operators/hinge_loss_op.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T, typename AttrType = T>
+class HingeLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* pred = context.Input<framework::Tensor>("Logits");
+    auto* label = context.Input<framework::Tensor>("Labels");
+    auto* loss = context.Output<framework::Tensor>("Loss");
+    auto place = context.GetEigenDevice<Place>();
+
+    auto x = framework::EigenVector<T>::Flatten(*pred);
+    auto y = framework::EigenVector<T>::Flatten(*label);
+    loss->mutable_data<T>(context.GetPlace());
+    auto l = framework::EigenVector<T>::Flatten(*loss);
+    l.device(place) =
+        (static_cast<T>(1) - x * (static_cast<T>(2) * y - static_cast<T>(1)))
+            .cwiseMax(static_cast<T>(0));
+  }
+};
+
+template <typename Place, typename T, typename AttrType = T>
+class HingeLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* pred = context.Input<framework::Tensor>("Logits");
+    auto* label = context.Input<framework::Tensor>("Labels");
+    auto* dloss =
+        context.Input<framework::Tensor>(framework::GradVarName("Loss"));
+    auto* dpred =
+        context.Output<framework::Tensor>(framework::GradVarName("Logits"));
+    auto place = context.GetEigenDevice<Place>();
+
+    auto x = framework::EigenVector<T>::Flatten(*pred);
+    auto y = framework::EigenVector<T>::Flatten(*label);
+    auto dl = framework::EigenVector<T>::Flatten(*dloss);
+
+    if (dpred) {
+      dpred->mutable_data<T>(context.GetPlace());
+      auto dx = framework::EigenVector<T>::Flatten(*dpred);
+      auto alt_labels = static_cast<T>(2) * y - static_cast<T>(1);
+      dx.device(place) =
+          dl * ((x * alt_labels) < static_cast<T>(1)).template cast<T>() *
+          (-alt_labels);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc
index 2d9449f5ca..938803d5b3 100644
--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
@@ -59,20 +59,29 @@ class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
               "The shape is same as Input(X) and will be reused in backward.")
         .AsIntermediate();
     AddOutput("Out",
-              "The output tensor with shape [batch_size, 1] which represents "
-              "the huber loss.");
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the huber loss.");
     AddAttr<AttrType>("delta", "Hyper parameter in huber loss.");
     AddComment(R"DOC(
+HuberLoss Operator.
+
 Huber loss is a loss function used in robust regression. We define X as the
 input value and Y as the target value. Huber loss can evaluate the fitness of
 X to Y. Different from MSE loss, Huber loss is more robust for outliers. The
 shape of X and Y are [batch_size, 1]. The equation is:
 
-L_{\delta}(y, f(x)) =
+$$
+Out_{\delta}(X, Y)_i =
 \begin{cases}
-0.5 * (y - f(x))^2, \quad |y - f(x)| \leq \delta \\
-\delta * (|y - f(x)| - 0.5 * \delta),   \quad otherwise
+0.5 * (Y_i - X_i)^2,
+\quad |Y_i - X_i| \leq \delta \\
+\delta * (|Y_i - X_i| - 0.5 * \delta),
+\quad otherwise
 \end{cases}
+$$
+
+In the above equation, $Out_\delta(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
+element of Out, X and Y.
 
 )DOC");
   }
diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc
index 139392c691..54911267e3 100644
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@@ -12,26 +12,62 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/increment_op.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
-class IncrementOp : public framework::OperatorWithKernel {
+class IncrementInferShape : public framework::InferShapeBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of IncrementOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of IncrementOp should not be null.");
+    PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X")));
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
-template <typename AttrType>
+struct IncrementFunctor {
+  IncrementFunctor(const framework::LoDTensor &x, framework::LoDTensor *out,
+                   float value)
+      : x_(x), out_(out), value_(value) {}
+
+  template <typename T>
+  void operator()() const {
+    *out_->data<T>() = *x_.data<T>() + static_cast<T>(value_);
+  }
+
+  const framework::LoDTensor &x_;
+  framework::LoDTensor *out_;
+  float value_;
+};
+
+class IncrementOp : public framework::OperatorBase {
+ public:
+  IncrementOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+
+    PADDLE_ENFORCE(platform::is_cpu_place(x.place()));
+    out.Resize(x.dims());
+    out.mutable_data(x.place(), x.type());
+    float value = Attr<float>("step");
+    VLOG(10) << Output("Out") << " increase " << Input("X") << " with "
+             << value;
+    framework::VisitDataType(framework::ToDataType(out.type()),
+                             IncrementFunctor(x, &out, value));
+  }
+};
+
 class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   IncrementOpMaker(framework::OpProto *proto,
@@ -39,14 +75,18 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The input tensor of increment operator");
     AddOutput("Out", "(Tensor) The output tensor of increment operator.");
-    AddComment(R"DOC(Increment operator
+    AddAttr<float>("step",
+                   "(float, default 1.0) "
+                   "The step size by which the "
+                   "input tensor will be incremented.")
+        .SetDefault(1.0);
+    AddComment(R"DOC(
+Increment Operator.
+
+The equation is: 
+$$Out = X + step$$
 
-The equation is: Out = X + step
 )DOC");
-    AddAttr<AttrType>("step",
-                      "The step size by which the "
-                      "input tensor will be incremented.")
-        .SetDefault(1.0);
   }
 };
 
@@ -56,10 +96,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
 
   std::unique_ptr<framework::OpDescBind> Apply() const override {
     auto *grad_op = new framework::OpDescBind();
-    grad_op->SetType("scale");
-    grad_op->SetInput("X", OutputGrad("Out"));
-    grad_op->SetOutput("Out", InputGrad("X"));
-    grad_op->SetAttr("scale", 1.0f);
+    grad_op->SetType("increment");
+    grad_op->SetInput("X", Output("Out"));
+    grad_op->SetOutput("Out", Input("X"));
+    grad_op->SetAttr("step", -boost::get<float>(GetAttr("step")));
     return std::unique_ptr<framework::OpDescBind>(grad_op);
   }
 };
@@ -68,8 +108,5 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker<float>,
-                  ops::IncrementGradOpMaker);
-REGISTER_OP_CPU_KERNEL(increment,
-                       ops::IncrementKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementInferShape,
+                  ops::IncrementOpMaker, ops::IncrementGradOpMaker);
diff --git a/paddle/operators/increment_op.h b/paddle/operators/increment_op.h
deleted file mode 100644
index 342e254fc4..0000000000
--- a/paddle/operators/increment_op.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-template <typename Place, typename T, typename AttrType = T>
-class IncrementKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    tensor->mutable_data<T>(in->place());
-
-    auto step = static_cast<T>(context.Attr<AttrType>("step"));
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place = context.GetEigenDevice<Place>();
-    eigen_out.device(place) = eigen_in + step;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc
new file mode 100644
index 0000000000..54fecf44e8
--- /dev/null
+++ b/paddle/operators/is_empty_op.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+constexpr char kInput[] = "X";
+constexpr char kOutput[] = "Out";
+
+class IsEmptyOp : public framework::OperatorBase {
+ public:
+  IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    // get input
+    auto *var = scope.FindVar(Input(kInput));
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto &tensor = var->Get<framework::LoDTensor>();
+    // get output
+    auto *out = scope.FindVar(Output(kOutput));
+    PADDLE_ENFORCE_NOT_NULL(out);
+    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
+
+    out_tensor->Resize({1});
+    out_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
+        framework::product(tensor.dims()) == 0;
+  }
+};
+
+class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IsEmptyOpProtoMaker(framework::OpProto *proto,
+                      framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kInput, "(Tensor) Tensor which is to be checked.");
+    AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not.");
+    AddComment(R"DOC(
+IsEmpty Operator which checks whether a tensor is empty.
+
+It will just return product(tensor.ddims()) > 0;
+              )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp,
+                             paddle::operators::IsEmptyOpProtoMaker);
diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc
index 1d111696cf..02ebf02296 100644
--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
@@ -57,7 +57,7 @@ L1 Norm Operator.
 
 Computes the L1 norm of a tensor.
 
-Out = sum (abs(X))
+$$Out = \sum{|X|}$$
 
 )DOC");
   }
diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h
index de459818ad..3c60dc3dc7 100644
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
@@ -29,7 +29,7 @@ class L1NormKernel : public framework::OpKernel<T> {
     Out->mutable_data<T>(context.GetPlace());
 
     auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto out = framework::EigenScalar<T>::From(*Out);
     auto place = context.GetEigenDevice<Place>();
 
     out.device(place) = x.abs().sum();
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index 6864e3b0b7..8e079a14e0 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -23,57 +23,59 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
                         framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Emission",
-             "(LoDTensor, default: LoDTensor<float>). "
-             "A 2-D LoDTensor with shape [N x D] where N is the size of the "
+             "(LoDTensor, default LoDTensor<float>) "
+             "A 2-D LoDTensor with shape [N x D], where N is the size of the "
              "mini-batch and D is the total tag number. The unscaled emission "
              "weight matrix for the linear chain CRF. ");
     AddInput("Transition",
-             "(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
+             "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
              "[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
              "operator. See more details in the operator's comments.");
     AddInput("Label",
-             "(LoDTensor, default: LoDTensor<int>). A LoDTensor with shape "
+             "(LoDTensor, default LoDTensor<int64_t>) A LoDTensor with shape "
              "[N x 1], where N is the total element number in a mini-batch. "
              "The ground truth.");
     AddOutput(
         "Alpha",
-        "(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
-        "The forward vectors for the entire batch. Denote it as \f$\alpha\f$. "
-        "\f$\alpha$\f is a memo table used to calculate the normalization "
-        "factor in CRF. \f$\alpha[k, v]$\f stores the unnormalized "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
+        "The forward vectors for the entire batch. Denote it as $\alpha$. "
+        "$\alpha$ is a memo table used to calculate the normalization "
+        "factor in CRF. $\alpha[k, v]$ stores the unnormalized "
         "probabilites of all possible unfinished sequences of tags that end at "
-        "position \f$k$\f with tag \f$v$\f. For each \f$k$\f, "
-        "\f$\alpha[k, v]$\f is a vector of length \f$D$\f with a component for "
-        "each tag value \f$v$\f. This vector is called a forward vecotr and "
+        "position $k$ with tag $v$. For each $k$, "
+        "$\alpha[k, v]$ is a vector of length $D$ with a component for "
+        "each tag value $v$. This vector is called a forward vecotr and "
         "will also be used in backward computations.")
         .AsIntermediate();
     AddOutput(
         "EmissionExps",
-        "(Tensor, default: Tensor<float>). A 2-D Tensor with shape [N x D]. "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
         "The exponentials of Input(Emission). This is an intermediate "
         "computational result in forward computation, and will be reused in "
         "backward computation.")
         .AsIntermediate();
     AddOutput(
         "TransitionExps",
-        "(Tensor, default: Tensor<float>). A 2-D Tensor with shape "
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
         "[(D + 2) x D]. The exponentials of Input(Transition). This is an "
         "intermediate computational result in forward computation, and "
         "will be reused in backward computation.")
         .AsIntermediate();
     AddOutput(
         "LogLikelihood",
-        "(Tensor, default: Tensor<float>). The logarithm of the conditional "
+        "(Tensor, default Tensor<float>) The logarithm of the conditional "
         "likelihood of each training sample in a mini-batch. This is a 2-D "
         "tensor with shape [S x 1], where S is the sequence number in a "
         "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
         "The output is no longer a LoDTensor.");
     AddComment(R"DOC(
+LinearChainCRF Operator.
+
 Conditional Random Field defines an undirected probabilistic graph with nodes
 denoting random variables and edges denoting dependencies between these
-variables. CRF learns the conditional probability \f$P(Y|X)\f$, where
-\f$X = (x_1, x_2, ... , x_n)\f$ are structured inputs and
-\f$Y = (y_1, y_2, ... , y_n)\f$ are labels for the inputs.
+variables. CRF learns the conditional probability $P(Y|X)$, where
+$X = (x_1, x_2, ... , x_n)$ are structured inputs and
+$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs.
 
 Linear chain CRF is a special case of CRF that is useful for sequence labeling
 task. Sequence labeling tasks do not assume a lot of conditional
@@ -82,29 +84,29 @@ and output must be linear sequences. Thus, the graph of such a CRF is a simple
 chain or a line, which results in the linear chain CRF.
 
 This operator implements the Forward-Backward algorithm for the linear chain
-CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf and
-http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for reference.
+CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
+http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
 
 Equation:
-
-- Denote Input(Emission) to this operator as \f$x\f$ here.
-- The first D values of Input(Transition) to this operator are for starting
-weights, denoted as \f$a\f$ here.
-- The next D values of Input(Transition) of this operator are for ending
-weights, denoted as \f$b\f$ here.
-- The remaning values of Input(Transition) are for transition weights,
-denoted as \f$w\f$ here.
-- Denote Input(Label) as \f$s\f$ here.
-
-The probability of a sequence \f$s\f$ of length \f$L\f$ is defined as:
-\f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
-                 + \sum_{l=1}^L x_{s_l}
-                 + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
-where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
-all possible sequences is \f$1\f$, and \f$x\f$ is the emission feature weight
+1. Denote Input(Emission) to this operator as $x$ here.
+2. The first D values of Input(Transition) to this operator are for starting
+weights, denoted as $a$ here.
+3. The next D values of Input(Transition) of this operator are for ending
+weights, denoted as $b$ here.
+4. The remaning values of Input(Transition) are for transition weights,
+denoted as $w$ here.
+5. Denote Input(Label) as $s$ here.
+
+The probability of a sequence $s$ of length $L$ is defined as:
+$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
+                + \sum_{l=1}^L x_{s_l}
+                + \sum_{l=2}^L w_{s_{l-1},s_l})$$
+
+where $Z$ is a normalization value so that the sum of $P(s)$ over
+all possible sequences is 1, and $x$ is the emission feature weight
 to the linear chain CRF.
 
-Finaly, the linear chain CRF operator outputs the logarithm of the conditional
+Finally, the linear chain CRF operator outputs the logarithm of the conditional
 likelihood of each training sample in a mini-batch.
 
 NOTE:
@@ -182,9 +184,11 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of linear_chain_crf
   // is determined by its input "Emission".
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
+        ctx.device_context());
   }
 };
 
@@ -239,10 +243,13 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of output of the linear_chain_crf_grad
   // operator is determined by its input: gradients of LogLikelihood.
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(
-        ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))->type());
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))
+                ->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
index ddf7398175..014bbfa758 100644
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -195,7 +195,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     auto copyLoDTensor = [](const platform::DeviceContext& ctx,
                             const LoDTensor& src, LoDTensor* dst) {
       dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      dst->CopyFrom(src, platform::CPUPlace(), ctx);
+      framework::CopyFrom(src, platform::CPUPlace(), ctx, dst);
     };
 
     copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
@@ -203,8 +203,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
 
     transition_weights_dst->mutable_data<T>(transition_weights_src.dims(),
                                             platform::CPUPlace());
-    transition_weights_dst->CopyFrom(transition_weights_src,
-                                     platform::CPUPlace(), ctx);
+    framework::CopyFrom(transition_weights_src, platform::CPUPlace(), ctx,
+                        transition_weights_dst);
   }
 
   void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
@@ -219,7 +219,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
                          Tensor* dst) {
       dst->mutable_data<T>(platform::GPUPlace());
-      dst->CopyFrom(src, platform::GPUPlace(), ctx);
+      framework::CopyFrom(src, platform::GPUPlace(), ctx, dst);
     };
     copyTensor(ctx, emission_exps_src, emission_exps_dst);
     copyTensor(ctx, transition_exps_src, transition_exps_dst);
@@ -271,7 +271,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     ll -= std::log(sum);
     // Now ll is equal to -log(Z).
 
-    const int* lbl = label.data<int>();
+    const int64_t* lbl = label.data<int64_t>();
     PADDLE_ENFORCE_LT(
         static_cast<size_t>(*std::max_element(lbl, lbl + seq_length)), tag_num,
         "An invalid tag label that execesses the largest tag number.");
@@ -410,12 +410,12 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     // Copy the inputs from GPU memory to CPU memory when this operators runs on
     // GPU device.
     label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace());
-    label_dst->CopyFrom(label_src, platform::CPUPlace(), ctx);
+    framework::CopyFrom(label_src, platform::CPUPlace(), ctx, label_dst);
 
     auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
                          Tensor* dst) {
       dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      dst->CopyFrom(src, platform::CPUPlace(), ctx);
+      framework::CopyFrom(src, platform::CPUPlace(), ctx, dst);
     };
     copyTensor(ctx, emission_exps_src, emission_exps_dst);
     copyTensor(ctx, transition_exps_src, transition_exps_dst);
@@ -434,7 +434,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
                          Tensor* dst) {
       if (src && dst) {
         dst->mutable_data<T>(platform::GPUPlace());
-        dst->CopyFrom(*src, platform::GPUPlace(), ctx);
+        framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst);
       }
     };
     copyTensor(ctx, emission_grad_src, emission_grad_dst);
@@ -449,7 +449,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
                            Tensor* emission_grad) const {
     const T* w_exps = transition_exps.data<T>();
     const T* x_exps = emission_exps.data<T>();
-    const int* label_value = label.data<int>();
+    const int64_t* label_value = label.data<int64_t>();
     T* beta_value = beta->data<T>();
 
     auto x_dims = emission_exps.dims();
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
index 2d4eff0c35..4e58b84430 100644
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase {
                    out_var_name);
 
     auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-
-    uint32_t version;
-    fin.read(reinterpret_cast<char *>(&version), sizeof(version));
-    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-    framework::TensorDesc desc;
-    {  // int32_t size
-       // proto buffer
-      int32_t size;
-      fin.read(reinterpret_cast<char *>(&size), sizeof(size));
-      std::unique_ptr<char[]> buf(new char[size]);
-      fin.read(reinterpret_cast<char *>(buf.get()), size);
-      PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                     "Cannot parse tensor desc");
-    }
-    {  // read tensor
-      std::vector<int64_t> dims;
-      dims.reserve(static_cast<size_t>(desc.dims().size()));
-      std::copy(desc.dims().begin(), desc.dims().end(),
-                std::back_inserter(dims));
-      tensor->Resize(framework::make_ddim(dims));
-
-      void *buf;
-      platform::Place cpu = platform::CPUPlace();
-      switch (desc.data_type()) {
-        case framework::FP32:
-          buf = tensor->mutable_data<float>(cpu);
-          break;
-        case framework::FP64:
-          buf = tensor->mutable_data<double>(cpu);
-          break;
-        case framework::INT32:
-          buf = tensor->mutable_data<int>(cpu);
-          break;
-        case framework::INT64:
-          buf = tensor->mutable_data<int64_t>(cpu);
-          break;
-        default:
-          PADDLE_THROW("DataType %d not supported", desc.data_type());
-      }
-      fin.read(static_cast<char *>(buf), tensor->memory_size());
-    }
-    {  // read lod
-      uint64_t lod_level;
-      fin.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
-      auto &lod = *tensor->mutable_lod();
-      lod.resize(lod_level);
-      for (uint64_t i = 0; i < lod_level; ++i) {
-        uint64_t size;
-        fin.read(reinterpret_cast<char *>(&size), sizeof(size));
-        std::vector<size_t> tmp(size / sizeof(size_t));
-        fin.read(reinterpret_cast<char *>(tmp.data()),
-                 static_cast<std::streamsize>(size));
-        lod[i] = tmp;
-      }
-    }
+    framework::DeserializeFromStream(fin, tensor);
 
     auto place = dev_ctx.GetPlace();
     if (platform::is_gpu_place(place)) {
@@ -105,7 +51,7 @@ class LoadOp : public framework::OperatorBase {
       out_var->Clear();
       tensor = out_var->GetMutable<framework::LoDTensor>();
       tensor->set_lod(cpu_tensor.lod());
-      tensor->CopyFrom(cpu_tensor, place, dev_ctx);
+      CopyFrom(cpu_tensor, place, dev_ctx, tensor);
     }
   }
 };
@@ -115,14 +61,18 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   LoadOpProtoMaker(framework::OpProto *proto,
                    framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddOutput("Out", "The tensor need to be loaded");
-    AddComment(R"DOC(Load Operator
-Load operator will load a tensor variable from disk file.
-)DOC");
+    AddOutput("Out", "(Tensor) The tensor need to be loaded");
     AddAttr<std::string>("file_path",
+                         "(string) "
                          "Variable will be loaded from \"file_path\".")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddComment(R"DOC(
+Load Operator.
+
+Load operator will load a tensor variable from disk file.
+
+)DOC");
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc
new file mode 100644
index 0000000000..b2f4ec57fa
--- /dev/null
+++ b/paddle/operators/lod_array_length_op.cc
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class LoDArrayLengthOp : public framework::OperatorBase {
+ public:
+  LoDArrayLengthOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    out.Resize({1});
+    auto cpu = platform::CPUPlace();
+    *out.mutable_data<int64_t>(cpu) = static_cast<int64_t>(x.size());
+  }
+};
+
+class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDArrayLengthProtoMaker(framework::OpProto *proto,
+                           framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensorArray) The input tensor array.");
+    AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t");
+    AddComment(R"DOC(
+LoDArrayLength Operator.
+
+This operator obtains the length of lod tensor array:
+
+$$Out = len(X)$$
+
+NOTE: The output is a CPU Tensor since the control variable should be only in
+CPU and the length of LoDTensorArray should be used as control variables.
+
+)DOC");
+  }
+};
+
+class LoDArrayLengthInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"));
+    PADDLE_ENFORCE(context->HasOutput("Out"));
+    context->SetOutputDim("Out", {1});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lod_array_length, ops::LoDArrayLengthOp,
+                  ops::LoDArrayLengthInferShape, ops::LoDArrayLengthProtoMaker,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc
index be198951c2..f7d4db1947 100644
--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -28,6 +28,7 @@ class LoDRankTableOp : public framework::OperatorBase {
     auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
+    VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
     out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
   }
 };
@@ -65,7 +66,8 @@ class LoDRankTableInferVarType : public framework::VarTypeInference {
   void operator()(const framework::OpDescBind &op_desc,
                   framework::BlockDescBind *block) const override {
     for (auto &o : op_desc.Output("Out")) {
-      block->Var(o)->SetType(framework::VarDesc::LOD_RANK_TABLE);
+      block->FindRecursiveOrCreateVar(o)->SetType(
+          framework::VarDesc::LOD_RANK_TABLE);
     }
   }
 };
diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc
new file mode 100644
index 0000000000..32831cb1e2
--- /dev/null
+++ b/paddle/operators/lod_reset_op.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LoDResetOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // input check
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LoDResetOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LoDResetOp should not be null.");
+    // If target LoD is not set form Input(), then it must be set from Attr().
+    if (!ctx->HasInput("TargetLoD")) {
+      auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
+      PADDLE_ENFORCE(level0.size() > 1,
+                     "Target LoD is not found, should be set to be a valid one "
+                     "through Input() or Attr().");
+    }
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDResetOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
+    AddInput("TargetLoD",
+             "(Tensor, optional) The target level 0 LoD from Input().")
+        .AsDispensable();
+    AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator.");
+    AddAttr<std::vector<int>>("target_lod",
+                              "The target level 0 LoD from Attr().")
+        .SetDefault(std::vector<int>{});
+    AddComment(R"DOC(LoDReset operator
+
+Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or
+Attr(target_lod), or set LoD for Input(X) if it doesn't have one.
+Currently the lod_reset operator only supports the reset of level 0 LoD.
+At least one of Input(TargetLoD) and Attr(target_lod) must be set,
+and if both of them are set, Input(TargetLoD) will be chosen as the
+target LoD.
+
+An example:
+Given a float LoDTensor X with shape (6, 1), its transpose form represents
+
+    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+
+with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like
+
+    [1.0, 2.0], [3.0, 4.0, 5.0], [6.0].
+
+If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and
+the sequences that the LoDTensor Output(Out) contains becomes:
+
+    [1.0, 2.0, 3.0, 4.0], [5.0, 6.0].
+
+)DOC");
+  }
+};
+
+class LoDResetGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
+            ops::LoDResetGradOp);
+REGISTER_OP_CPU_KERNEL(lod_reset,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu
new file mode 100644
index 0000000000..5244a17c3a
--- /dev/null
+++ b/paddle/operators/lod_reset_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(lod_reset,
+                       ops::LoDResetKernel<paddle::platform::GPUPlace, float>,
+                       ops::LoDResetKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::GPUPlace, float>,
+    ops::LoDResetGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h
new file mode 100644
index 0000000000..cbcbf80adc
--- /dev/null
+++ b/paddle/operators/lod_reset_op.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class LoDResetKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    auto* in = ctx.Input<framework::LoDTensor>("X");
+    auto* lod_t = ctx.Input<framework::Tensor>("TargetLoD");
+
+    std::vector<int> level0;
+    if (lod_t) {
+      auto* lod = lod_t->data<int>();
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        framework::Tensor lod_cpu;
+        framework::CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context(),
+                            &lod_cpu);
+        lod = lod_cpu.data<int>();
+      }
+      level0 = std::vector<int>(lod, lod + lod_t->numel());
+    } else {
+      level0 = ctx.Attr<std::vector<int>>("target_lod");
+    }
+
+    PADDLE_ENFORCE(level0.size() > 1UL,
+                   "The size of target LoD should be greater than 1.");
+    PADDLE_ENFORCE(level0[0] == 0,
+                   "Target LoD should be a vector starting from 0.");
+    PADDLE_ENFORCE(level0.back() == in->dims()[0],
+                   "Target LoD should be a vector end with the "
+                   "first dimension of Input(X).");
+    for (size_t i = 0; i < level0.size() - 1; ++i) {
+      PADDLE_ENFORCE(level0[i + 1] > level0[i],
+                     "Target LoD should be an ascending vector.");
+    }
+
+    out->ShareDataWith(*in);
+    // cast level0 to size_t
+    std::vector<size_t> ulevel0(level0.size(), 0);
+    std::transform(level0.begin(), level0.end(), ulevel0.begin(),
+                   [](int a) { return static_cast<size_t>(a); });
+    framework::LoD target_lod;
+    target_lod.push_back(ulevel0);
+    out->set_lod(target_lod);
+  }
+};
+
+template <typename Place, typename T>
+class LoDResetGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    d_x->ShareDataWith(*d_out);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc
new file mode 100644
index 0000000000..b970bf3177
--- /dev/null
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -0,0 +1,163 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+struct CopyRange {
+  size_t begin;
+  size_t end;
+};
+
+class LoDTensorToArrayOp : public framework::OperatorBase {
+ public:
+  LoDTensorToArrayOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
+                          Input("X"))
+                  .Get<framework::LoDTensor>();
+    auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")))
+                           .Get<framework::LoDRankTable>();
+    auto &out = *detail::Ref(scope.FindVar(Output("Out")))
+                     .GetMutable<framework::LoDTensorArray>();
+    auto &items = rank_table.items();
+    auto max_seq_len = items[0].length;
+    auto rank_level = rank_table.level();
+
+    PADDLE_ENFORCE_LT(rank_level, x.lod().size(),
+                      "Input should be a LOD tensor, and size is at least %d",
+                      rank_level + 1);
+    out.resize(max_seq_len);
+    std::vector<std::vector<CopyRange>> copy_ranges(max_seq_len);
+
+    // set out[i] lod
+    for (size_t t = 0; t < max_seq_len; t++) {
+      auto &lod = *out[t].mutable_lod();
+      lod.clear();
+      for (auto &item : items) {
+        if (t >= item.length) {
+          break;
+        }
+        size_t start_idx = x.lod()[rank_level][item.index] + t;
+        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x.lod(), start_idx, start_idx + 1, rank_level + 1);
+        auto &lod_length = lod_and_offset.first;
+        framework::AppendLoD(&lod, lod_length);
+        size_t start_offset = lod_and_offset.second.first;
+        size_t end_offset = lod_and_offset.second.second;
+        copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
+      }
+    }
+    for (size_t i = 0; i < max_seq_len; ++i) {
+      auto &ranges = copy_ranges[i];
+      size_t height = std::accumulate(
+          ranges.begin(), ranges.end(), 0UL,
+          [](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
+      auto x_dim = x.dims();
+      x_dim[0] = static_cast<int64_t>(height);
+      out[i].Resize(x_dim);
+      out[i].mutable_data(x.place(), x.type());
+      size_t offset = 0;
+      for (auto &each_range : ranges) {
+        size_t len = each_range.end - each_range.begin;
+        if (len == 0) {
+          continue;
+        }
+        // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
+        auto slice = out[i].Slice(static_cast<int>(offset),
+                                  static_cast<int>(offset + len));
+        framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
+                                    static_cast<int>(each_range.end)),
+                            x.place(), dev_ctx, &slice);
+        offset += len;
+      }
+    }
+  }
+};
+
+class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDTensorToArrayOpProtoMaker(framework::OpProto *proto,
+                               framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "");
+    AddInput("RankTable", "");
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class LoDTensorToArrayInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of LoDTensorToArrayOp should not be null.");
+    PADDLE_ENFORCE(
+        context->HasInput("RankTable"),
+        "Input(RankTable) of LoDTensorToArrayOp should not be null.");
+
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "Output(Out) of LoDTensorToArrayOp should not be null.");
+
+    auto x_dim = context->GetInputDim("X");
+    // The first dim of each LoDTensor in Output can only be set at run-time.;
+    // We still have to Resize each LoDTensor in Output.
+    context->SetOutputDim("Out", x_dim);
+  }
+};
+
+class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDescBind &op_desc,
+                  framework::BlockDescBind *block) const override {
+    for (auto &out_var : op_desc.Output("Out")) {
+      block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+    }
+  }
+};
+
+class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("array_to_lod_tensor");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("RankTable", Input("RankTable"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lod_tensor_to_array, ops::LoDTensorToArrayOp,
+                  ops::LoDTensorToArrayOpProtoMaker,
+                  ops::LoDTensorToArrayInferShape,
+                  ops::LoDTensorToArrayInferVarType,
+                  ops::LoDTensorToArrayGradMaker);
diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc
new file mode 100644
index 0000000000..257e5c8a49
--- /dev/null
+++ b/paddle/operators/log_loss_op.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/log_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LogLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Predicted) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Predicted) contains a real value, "
+                      "so the 2nd dimension of Input(X) must be 1.");
+
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Predicted", "Loss");
+  }
+};
+
+template <typename AttrType>
+class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogLossOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Predicted",
+             "The input value (Predicted) of Log loss op."
+             "Predicted is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Log loss op."
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the log loss.");
+    AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
+    AddComment(R"DOC(
+LogLoss Operator.
+
+Log loss is a loss function used for binary classification. Log Loss quantifies
+the accuracy of a classifier by penalising false classifications. Minimising the
+Log Loss is equivalent to maximising the accuracy of the classifier. We define
+Predicted as the values predicted by our model and Labels as the target ground
+truth value. Log loss can evaluate how close the predicted values are to the
+target. The shapes of Predicted and Labels are both [batch_size, 1].
+The equation is:
+
+$$
+Loss = - Labels * log(Predicted + \epsilon) -
+        (1 - Labels) * log(1 - Predicted + \epsilon)
+$$
+
+)DOC");
+  }
+};
+
+class LogLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
+                   "Output(Predicted@GRAD) should not be null.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
+
+    auto pred_grad_name = framework::GradVarName("Predicted");
+    ctx->SetOutputDim(pred_grad_name, pred_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
+            ops::LogLossGradOp);
+REGISTER_OP_CPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu
new file mode 100644
index 0000000000..6c189ef341
--- /dev/null
+++ b/paddle/operators/log_loss_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/log_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h
new file mode 100644
index 0000000000..73404fce91
--- /dev/null
+++ b/paddle/operators/log_loss_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T, typename AttrType = T>
+class LogLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* loss_out = ctx.Output<Tensor>("Loss");
+
+    loss_out->mutable_data<T>(ctx.GetPlace());
+
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto loss = EigenVector<T>::Flatten(*loss_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    loss.device(place) = (-(label * (prediction + epsilon).log()) -
+                          ((static_cast<T>(1) - label) *
+                           (static_cast<T>(1) - prediction + epsilon).log()));
+  }
+};
+
+template <typename Place, typename T, typename AttrType = T>
+class LogLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
+
+    auto dl = EigenVector<T>::Flatten(*dloss);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    if (dpred) {
+      dpred->mutable_data<T>(ctx.GetPlace());
+      auto dx = framework::EigenVector<T>::Flatten(*dpred);
+      dx.device(place) = dl * (-(label / (prediction + epsilon)) +
+                               ((static_cast<T>(1) - label) /
+                                (static_cast<T>(1) - prediction + epsilon)));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc
new file mode 100644
index 0000000000..c818d5e9c1
--- /dev/null
+++ b/paddle/operators/logical_op.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/logical_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename OpComment>
+class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BinaryLogicalOpProtoMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    OpComment comment;
+    AddInput("X",
+             string::Sprintf("(LoDTensor) Left hand operand of %s operator",
+                             comment.type));
+    AddInput("Y",
+             string::Sprintf("(LoDTensor) Right hand operand of %s operator",
+                             comment.type));
+    AddOutput("Out", string::Sprintf(
+                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(%s Operator
+
+It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean tensors.
+Each element of Out is calculated by %s
+)DOC",
+                               comment.type, comment.equation));
+  }
+};
+
+template <typename OpComment>
+class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  UnaryLogicalOpProtoMaker(framework::OpProto *proto,
+                           framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    OpComment comment;
+    AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator",
+                                  comment.type));
+    AddOutput("Out", string::Sprintf(
+                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(%s Operator
+
+It operates element-wise on X, and returns the Out. X and Out are N-dim boolean tensors.
+Each element of Out is calculated by %s
+)DOC",
+                               comment.type, comment.equation));
+  }
+};
+
+template <typename OpComment>
+class BinaryLogicalOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of %s operator must not be null", comment.type);
+    PADDLE_ENFORCE(context->HasInput("Y"),
+                   "Input(Y) of %s operator must not be null", comment.type);
+    auto dim_x = context->GetInputDim("X");
+    auto dim_y = context->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
+                      "The number of elements in X and Y should be same");
+
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+template <typename OpComment>
+class UnaryLogicalOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of %s operator must not be null", comment.type);
+    auto dim_x = context->GetInputDim("X");
+
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class LogicalOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+    // LogicalOp kernel's device type is decided by input tensor place
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_BINARY_LOGICAL_OP(op_type, _equation)                     \
+  struct _##op_type##Comment {                                             \
+    static char type[];                                                    \
+    static char equation[];                                                \
+  };                                                                       \
+  char _##op_type##Comment::type[]{#op_type};                              \
+  char _##op_type##Comment::equation[]{_equation};                         \
+  REGISTER_OPERATOR(                                                       \
+      op_type, ::paddle::operators::LogicalOp,                             \
+      ::paddle::operators::BinaryLogicalOpProtoMaker<_##op_type##Comment>, \
+      ::paddle::operators::BinaryLogicalOpInferShape<_##op_type##Comment>, \
+      ::paddle::framework::EmptyGradOpMaker);
+
+#define REGISTER_UNARY_LOGICAL_OP(op_type, _equation)                     \
+  struct _##op_type##Comment {                                            \
+    static char type[];                                                   \
+    static char equation[];                                               \
+  };                                                                      \
+  char _##op_type##Comment::type[]{#op_type};                             \
+  char _##op_type##Comment::equation[]{_equation};                        \
+  REGISTER_OPERATOR(                                                      \
+      op_type, ::paddle::operators::LogicalOp,                            \
+      ::paddle::operators::UnaryLogicalOpProtoMaker<_##op_type##Comment>, \
+      ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \
+      ::paddle::framework::EmptyGradOpMaker);
+
+REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$");
+REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU,
+                               paddle::operators::LogicalAndFunctor);
+REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$");
+REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU,
+                               paddle::operators::LogicalOrFunctor);
+REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
+REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
+                              paddle::operators::LogicalNotFunctor);
+REGISTER_BINARY_LOGICAL_OP(logical_xor,
+                           "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$");
+REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
+                               paddle::operators::LogicalXorFunctor);
diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu
new file mode 100644
index 0000000000..d41239b2ca
--- /dev/null
+++ b/paddle/operators/logical_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/logical_op.h"
+
+REGISTER_BINARY_LOGICAL_KERNEL(logical_and, GPU,
+                               paddle::operators::LogicalAndFunctor);
+REGISTER_BINARY_LOGICAL_KERNEL(logical_or, GPU,
+                               paddle::operators::LogicalOrFunctor);
+REGISTER_UNARY_LOGICAL_KERNEL(logical_not, GPU,
+                              paddle::operators::LogicalNotFunctor);
+REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, GPU,
+                               paddle::operators::LogicalXorFunctor);
diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h
new file mode 100644
index 0000000000..6e78a7d6ed
--- /dev/null
+++ b/paddle/operators/logical_op.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct LogicalAndFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; }
+};
+
+template <typename T>
+struct LogicalOrFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; }
+};
+
+template <typename T>
+struct LogicalNotFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a) const { return !a; }
+};
+
+template <typename T>
+struct LogicalXorFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const {
+    return (a || b) && !(a && b);
+  }
+};
+
+template <typename Place, typename Functor>
+class BinaryLogicalOpKernel
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* y = context.Input<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
+    Functor binary_func;
+    platform::Transform<Place> trans;
+    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
+          y->data<T>(), out->mutable_data<bool>(context.GetPlace()),
+          binary_func);
+  }
+};
+
+template <typename Place, typename Functor>
+class UnaryLogicalOpKernel
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    Functor unary_func;
+    platform::Transform<Place> trans;
+    trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
+          out->mutable_data<bool>(context.GetPlace()), unary_func);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \
+  REGISTER_OP_##dev##_KERNEL(                                 \
+      op_type, ::paddle::operators::BinaryLogicalOpKernel<    \
+                   ::paddle::platform::dev##Place, functor<bool>>);
+
+#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \
+  REGISTER_OP_##dev##_KERNEL(                                \
+      op_type, ::paddle::operators::UnaryLogicalOpKernel<    \
+                   ::paddle::platform::dev##Place, functor<bool>>);
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
index 0b361e20f2..93e812ac5b 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -41,9 +41,11 @@ class LookupTableOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<LoDTensor>("W")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
+        ctx.device_context());
   }
 };
 
@@ -53,21 +55,27 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                      framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("W",
-             "An input represents embedding tensors,"
-             " which is a learnable parameter.");
+             "An input represents embedding tensors, "
+             "which is a learnable parameter.");
     AddInput("Ids",
-             "An input with type int32 or int64"
-             "contains the ids to be looked up in W."
-             "Ids must be a column vector with rank = 2."
-             "The 2nd dimension size must be 1");
-    AddOutput("Out", "The lookup results, which have the same type with W.");
-    AddAttr<bool>("is_sparse", "Sparse update").SetDefault(false);
+             "An input with type int32 or int64 "
+             "contains the ids to be looked up in W. "
+             "Ids must be a column vector with rank = 2. "
+             "The 2nd dimension size must be 1.");
+    AddOutput("Out", "The lookup results, which have the same type as W.");
+    AddAttr<bool>("is_sparse",
+                  "(boolean, default false) "
+                  "Sparse update")
+        .SetDefault(false);
     AddComment(R"DOC(
+Lookup Table Operator.
+
 This operator is used to perform lookups on the parameter W,
 then concatenated into a dense tensor.
 
-The input `Ids` can carry the LoD (Level of Details) information,
-or not. And the output only shares the LoD with input `Ids`.
+The input Ids can carry the LoD (Level of Details) information,
+or not. And the output only shares the LoD information with input Ids.
+
 )DOC");
   }
 };
@@ -91,9 +99,11 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<LoDTensor>("W")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index c7ba172066..84b044184a 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -74,10 +74,10 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTable<T, 128, 8, 8><<<
-        grids, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                               context.device_context())
-                               .stream()>>>(output, table, ids, N, K, D);
+    LookupTable<
+        T, 128, 8,
+        8><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+        output, table, ids, N, K, D);
   }
 };
 
@@ -95,9 +95,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       auto* ids_data = ids->data<int64_t>();
       auto ids_dim = ids->dims();
 
-      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                        context.device_context())
-                        .stream();
+      auto stream = context.cuda_device_context().stream();
       // copy GPU memory to CPU pinned memory
       framework::Vector<int64_t> new_rows;
       new_rows.resize(ids_dim[0]);
@@ -136,11 +134,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
 
       dim3 threads(128, 8);
       dim3 grids(8, 1);
-      LookupTableGrad<T, 128, 8,
-                      8><<<grids, threads, 0,
-                           reinterpret_cast<const platform::CUDADeviceContext&>(
-                               context.device_context())
-                               .stream()>>>(d_table, d_output, ids, N, K, D);
+      LookupTableGrad<
+          T, 128, 8,
+          8><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          d_table, d_output, ids, N, K, D);
     }
   }
 };
diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc
index 89ea6bfdbd..e20340e77b 100644
--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
@@ -19,6 +19,103 @@ namespace operators {
 
 using framework::Tensor;
 
+template <typename T>
+struct LRNFunctor<platform::CPUPlace, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta) {
+    auto x_v = framework::EigenVector<T>::Flatten(input);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+
+    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
+    e_mid = e_mid.constant(k);
+
+    auto e_x = framework::EigenTensor<T, 4>::From(input);
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch >= 0 && ch < C) {
+            auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            auto r = e_x.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                               Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            s += alpha * r.square();
+          }
+        }
+      }
+    }
+
+    auto out_e = framework::EigenVector<T>::Flatten(*out);
+    out_e = x_v * e_mid.reshape(Eigen::DSizes<int, 1>(e_mid.size())).pow(-beta);
+  }
+};
+template struct LRNFunctor<platform::CPUPlace, float>;
+template struct LRNFunctor<platform::CPUPlace, double>;
+
+template <typename T>
+struct LRNGradFunctor<platform::CPUPlace, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta) {
+    T ratio = -2 * alpha * beta;
+    auto x_g_e = framework::EigenVector<T>::Flatten(*x_g);
+    x_g_e = x_g_e.constant(0.0);
+
+    auto e_x = framework::EigenTensor<T, 4>::From(x);
+    auto e_x_g = framework::EigenTensor<T, 4>::From(*x_g);
+    auto e_out = framework::EigenTensor<T, 4>::From(out);
+    auto e_out_g = framework::EigenTensor<T, 4>::From(out_g);
+    auto e_mid = framework::EigenTensor<T, 4>::From(mid);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        auto i_x = e_x.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                             Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_x_g = e_x_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                     Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_mid = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        i_x_g = i_mid.pow(-beta) * i_out_g;
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch < 0 || ch >= C) {
+            continue;
+          }
+
+          auto c_out = e_out.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_mid = e_mid.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                       Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          i_x_g += ratio * c_out_g * c_out * i_x / c_mid;
+        }
+      }
+    }
+  }
+};
+template struct LRNGradFunctor<platform::CPUPlace, float>;
+template struct LRNGradFunctor<platform::CPUPlace, double>;
+
 class LRNOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -45,72 +142,70 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   LRNOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", R"DOC(
- (Tensor) The input of LRN operator. It must be a 4D tenor with NCHW format.
- )DOC");
-
+    AddInput("X",
+             "(Tensor) The input of LRN operator. "
+             "It must be a 4D tenor with NCHW format.");
     AddOutput("Out",
               "(Tensor) The output of LRN operator, which is also the 4D "
               "tensor with NCHW format.");
-    AddOutput("MidOut", R"Doc(
-(Tensor)Middle result of lrn op.It's computed in forward process 
-and also used in backward process.
-    )Doc");
-
-    AddAttr<int>("n", R"DOC(
-(int, default 5)n is “adjacent” kernel maps at the same spatial position.
-        )DOC")
+    AddOutput("MidOut",
+              "(Tensor) Middle result of LRN operator. It's computed in "
+              "forward process and also used in backward process.");
+
+    AddAttr<int>("n",
+                 "(int default 5) "
+                 "n is the \"adjacent\" kernel that maps "
+                 "at the same spatial position.")
         .SetDefault(5)
         .GreaterThan(0);
 
-    AddAttr<T>("k", R"DOC(
-(float, default 2.0)k is the bias.
-        )DOC")
+    AddAttr<T>("k",
+               "(float, default 2.0) "
+               "k is the bias.")
         .SetDefault(2.0)
         .GreaterThan(0.0);
 
-    AddAttr<T>("alpha", R"DOC(
-(float, default 0.0001)alpha is the scale number.
-        )DOC")
+    AddAttr<T>("alpha",
+               "(float, default 0.0001) "
+               "alpha is the scale number.")
         .SetDefault(0.0001)
         .GreaterThan(0.0);
 
-    AddAttr<T>("beta", R"DOC(
-(float, default 0.75)beta is the power number.
-        )DOC")
+    AddAttr<T>("beta",
+               "(float, default 0.75) "
+               "beta is the power number.")
         .SetDefault(0.75)
         .GreaterThan(0.0);
 
     AddComment(R"DOC(
- Local Response Normalization.
-
- This Function comes from the paper
- "ImageNet Classification with Deep Convolutional Neural Networks".
+Local Response Normalization Operator.
 
- The original formula is:
+This operator comes from the paper:
+<<ImageNet Classification with Deep Convolutional Neural Networks>>.
 
-                                Input(i, x, y)
- Output(i, x, y) = ----------------------------------------------
-                                 -- upper
-                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
-                                 -- j = lower
+The original formula is:
 
- upper is `min(C, c + n/2)`
- lower if `max(0, c - n/2)`
+$$
+Output(i, x, y) = Input(i, x, y) / \left(
+k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)}
+(Input(j, x, y))^2
+\right)^{\beta}
+$$
 
- Function implementation:
+Function implementation:
 
- inputs and outpus is NCHW format, while input.shape.ndims() is equal 4.
- And the meaning of each dimension(0-3) is respectively batch size,
- feature maps, rows and columns.
+Inputs and outpus are in NCHW format, while input.shape.ndims() equals 4.
+And dimensions 0 ~ 3 represent batch size, feature maps, rows,
+and columns, respectively.
 
- Input and Output in the above formula is for each map(i) of one image, and
- Input(i, x, y), Output(i, x, y) represents an element in an image.
+Input and Output in the formula above is for each map(i) of one image, and
+Input(i, x, y), Output(i, x, y) represents an element in an image.
 
- C is the number of feature maps of one image, and n is a hyper-parameters
- is configured when Function is initialized. The sum in the denominator
- is the sum of the same position in the neighboring maps.
-    )DOC");
+C is the number of feature maps of one image. n is a hyper-parameter
+configured when operator is initialized. The sum in the denominator
+is the sum of the same positions in the neighboring maps.
+    
+)DOC");
   }
 };
 
@@ -121,8 +216,7 @@ class LRNOpGrad : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("MidOut")),
-                   "Input(MidOut@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
 
diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu
index 607dc6d86a..e9a8671233 100644
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/operators/lrn_op.cu
@@ -12,11 +12,167 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/operators/lrn_op.h"
 
-namespace ops = paddle::operators;
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C,
+                                   int H, int W, int size, T k, T alpha) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < img_size) {
+    const int w = idx % W;
+    const int h = (idx / W) % H;
+    const int n = idx / W / H;
+    const int offset = (n * C * H + h) * W + w;
+
+    in += offset;
+    mid += offset;
+    const int step = H * W;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
+    T accum = 0;
+    int index = 0;
+    while (index < C + post_pad) {
+      if (index < C) {
+        T val = in[index * step];
+        accum += val * val;
+      }
+      if (index >= size) {
+        T val = in[(index - size) * step];
+        accum -= val * val;
+      }
+      if (index >= post_pad) {
+        mid[(index - post_pad) * step] = k + accum * alpha;
+      }
+      ++index;
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid,
+                                T negative_beta, T* out) {
+  const int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < input_size) {
+    out[index] = in[index] * pow(mid[index], negative_beta);
+  }
+}
+
+template <typename T>
+void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs,
+                    T* outputs, T* mid, int N, int C, int H, int W, int n, T k,
+                    T alpha, T beta) {
+  int img_size = N * H * W;
+  const int block_size = 1024;
+  int grid_size = (img_size + block_size - 1) / block_size;
+
+  KeCMRNormFillScale<
+      T><<<grid_size, block_size, 0, ctx.cuda_device_context().stream()>>>(
+      img_size, inputs, mid, C, H, W, n, k, alpha);
+
+  int input_size = N * H * W * C;
+  grid_size = (input_size + block_size - 1) / block_size;
+  KeCMRNormOutput<
+      T><<<grid_size, block_size, 0, ctx.cuda_device_context().stream()>>>(
+      input_size, inputs, mid, -beta, outputs);
+}
+
+template <typename T>
+struct LRNFunctor<platform::GPUPlace, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta) {
+    CrossMapNormal<T>(
+        ctx, input.data<T>(), out->mutable_data<T>(ctx.GetPlace()),
+        mid->mutable_data<T>(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta);
+  }
+};
+
+template struct LRNFunctor<platform::GPUPlace, float>;
+template struct LRNFunctor<platform::GPUPlace, double>;
 
+template <typename T>
+__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out,
+                              const T* mid, T* x_g, const T* out_g, int C,
+                              int H, int W, int size, T negative_beta,
+                              T ratio) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < img_size) {
+    const int w = idx % W;
+    const int h = (idx / W) % H;
+    const int n = idx / W / H;
+    const int offset = (n * C * H + h) * W + w;
+    x += offset;
+    out += offset;
+    mid += offset;
+    out_g += offset;
+    x_g += offset;
+
+    const int step = H * W;
+    const int pre_pad = size - (size + 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
+    int index = 0;
+    T accum = 0;
+    // TODO(gongwb): optimize this with thread shared array.
+    while (index < C + post_pad) {
+      if (index < C) {
+        x_g[index * step] = 0.0;
+        accum += out_g[index * step] * out[index * step] / mid[index * step];
+      }
+      if (index >= size) {
+        accum -= out_g[(index - size) * step] * out[(index - size) * step] /
+                 mid[(index - size) * step];
+      }
+      if (index >= post_pad) {
+        x_g[(index - post_pad) * step] +=
+            out_g[(index - post_pad) * step] *
+                pow(mid[(index - post_pad) * step], negative_beta) -
+            ratio * x[(index - post_pad) * step] * accum;
+      }
+      ++index;
+    }
+  }
+}
+
+template <typename T>
+void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x,
+                        const T* out, const T* mid, T* x_g, const T* out_g,
+                        int N, int C, int H, int W, int n, T alpha, T beta) {
+  int img_size = N * H * W;
+
+  const int block_size = 1024;
+  int grid_size = (img_size + block_size - 1) / block_size;
+
+  KeCMRNormDiff<
+      T><<<grid_size, block_size, 0, ctx.cuda_device_context().stream()>>>(
+      img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta,
+      2.0f * alpha * beta);
+}
+
+template <typename T>
+struct LRNGradFunctor<platform::GPUPlace, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta) {
+    CrossMapNormalGrad<T>(ctx, x.data<T>(), out.data<T>(), mid.data<T>(),
+                          x_g->mutable_data<T>(ctx.GetPlace()), out_g.data<T>(),
+                          N, C, H, W, n, alpha, beta);
+  }
+};
+
+template struct LRNGradFunctor<platform::GPUPlace, float>;
+template struct LRNGradFunctor<platform::GPUPlace, double>;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(lrn_grad,
                        ops::LRNGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h
index 606c657443..aa7539db4a 100644
--- a/paddle/operators/lrn_op.h
+++ b/paddle/operators/lrn_op.h
@@ -21,6 +21,14 @@
 namespace paddle {
 namespace operators {
 
+template <typename place, typename T>
+struct LRNFunctor {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta);
+};
+
 template <typename Place, typename T>
 class LRNKernel : public framework::OpKernel<T> {
  public:
@@ -31,8 +39,8 @@ class LRNKernel : public framework::OpKernel<T> {
   // f(x) represents outputs
   void Compute(const framework::ExecutionContext& ctx) const override {
     // input
-    const Tensor* x = ctx.Input<Tensor>("X");
-    auto x_dims = x->dims();
+    const Tensor& x = *ctx.Input<Tensor>("X");
+    auto x_dims = x.dims();
 
     // NCHW
     int N = x_dims[0];
@@ -57,38 +65,20 @@ class LRNKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0");
     PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0");
 
-    auto x_v = framework::EigenVector<T>::Flatten(*x);
-
-    const int start = -(n - 1) / 2;
-    const int end = start + n;
-
-    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
-    e_mid.device(ctx.GetEigenDevice<Place>()) = e_mid.constant(k);
-
-    auto e_x = framework::EigenTensor<T, 4>::From(*x);
-    for (int m = 0; m < N; m++) {
-      for (int i = 0; i < C; i++) {
-        for (int c = start; c <= end; c++) {
-          int ch = i + c;
-          if (ch >= 0 && ch < C) {
-            auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-            auto r = e_x.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                               Eigen::array<int, 4>({{1, 1, H, W}}));
-
-            s.device(ctx.GetEigenDevice<Place>()) += alpha * r.square();
-          }
-        }
-      }
-    }
-
-    auto out_e = framework::EigenVector<T>::Flatten(*out);
-    out_e.device(ctx.GetEigenDevice<Place>()) =
-        x_v * e_mid.reshape(Eigen::DSizes<int, 1>(e_mid.size())).pow(-beta);
+    LRNFunctor<Place, T> f;
+    f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta);
   }
 };
 
+template <typename Place, typename T>
+struct LRNGradFunctor {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta);
+};
+
 /**
  * \brief Backward calculation for normalization with across maps.
  *
@@ -97,7 +87,7 @@ class LRNKernel : public framework::OpKernel<T> {
  * The implementation of this Function is derived from the
  * CrossMapNormalFunc implementation.
  *
- * InputGrad = OutputGrad * denoms ^ (-beta)
+ * InputGrad = OutputGrad * MidOut ^ (-beta)
  *    -- upper
  *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue
  *    -- lower
@@ -113,18 +103,15 @@ class LRNGradKernel : public framework::OpKernel<T> {
  public:
   using Tensor = framework::Tensor;
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* out = ctx.Input<Tensor>("Out");
-    const Tensor* out_g = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    const Tensor* mid = ctx.Input<Tensor>("MidOut");
+    const Tensor& x = *ctx.Input<Tensor>("X");
+    const Tensor& out = *ctx.Input<Tensor>("Out");
+    const Tensor& out_g = *ctx.Input<Tensor>(framework::GradVarName("Out"));
+    const Tensor& mid = *ctx.Input<Tensor>("MidOut");
 
     auto x_g = ctx.Output<Tensor>(framework::GradVarName("X"));
     x_g->mutable_data<T>(ctx.GetPlace());
 
-    auto x_g_e = framework::EigenVector<T>::Flatten(*x_g);
-    x_g_e.device(ctx.GetEigenDevice<Place>()) = x_g_e.constant(0.0);
-
-    auto x_dims = x->dims();
+    auto x_dims = x.dims();
     int N = x_dims[0];
     int C = x_dims[1];
     int H = x_dims[2];
@@ -133,51 +120,9 @@ class LRNGradKernel : public framework::OpKernel<T> {
     int n = ctx.Attr<int>("n");
     T alpha = ctx.Attr<T>("alpha");
     T beta = ctx.Attr<T>("beta");
-    T ratio = -2 * alpha * beta;
-
-    auto e_x = framework::EigenTensor<T, 4>::From(*x);
-    auto e_x_g = framework::EigenTensor<T, 4>::From(*x_g);
-    auto e_out = framework::EigenTensor<T, 4>::From(*out);
-    auto e_out_g = framework::EigenTensor<T, 4>::From(*out_g);
-    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
-
-    const int start = -(n - 1) / 2;
-    const int end = start + n;
-    for (int m = 0; m < N; m++) {
-      for (int i = 0; i < C; i++) {
-        auto i_x = e_x.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                             Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_x_g = e_x_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                     Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        auto i_mid = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
-                                 Eigen::array<int, 4>({{1, 1, H, W}}));
-
-        i_x_g.device(ctx.GetEigenDevice<Place>()) = i_mid.pow(-beta) * i_out_g;
-        for (int c = start; c <= end; c++) {
-          int ch = i + c;
-          if (ch < 0 || ch >= C) {
-            continue;
-          }
-
-          auto c_out = e_out.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                   Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          auto c_mid = e_mid.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                   Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          auto c_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
-                                       Eigen::array<int, 4>({{1, 1, H, W}}));
-
-          i_x_g.device(ctx.GetEigenDevice<Place>()) +=
-              ratio * c_out_g * c_out * i_x / c_mid;
-        }
-      }
-    }
+
+    LRNGradFunctor<Place, T> f;
+    f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta);
   }
 };
 
diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc
index 94342d9407..fa8e5f2da8 100644
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -24,6 +24,11 @@ class LSTMOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Input"),
                    "Input(Input) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTM should not be null.");
+
     PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                    "Output(Hidden) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Cell"),
@@ -59,11 +64,13 @@ class LSTMOp : public framework::OperatorWithKernel {
                       "The second dimension of Input(Weight) "
                       "should be 4 * %d.",
                       frame_size);
+
     auto b_dims = ctx->GetInputDim("Bias");
     PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
     PADDLE_ENFORCE_EQ(b_dims[0], 1,
                       "The first dimension of Input(Bias) should be 1.");
-    if (ctx->Attrs().Get<bool>("usePeepholes")) {
+
+    if (ctx->Attrs().Get<bool>("use_peepholes")) {
       PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
                         "The second dimension of Input(Bias) should be "
                         "7 * %d if enable peepholes connection",
@@ -74,6 +81,7 @@ class LSTMOp : public framework::OperatorWithKernel {
                         "4 * %d if disable peepholes connection",
                         frame_size);
     }
+
     framework::DDim out_dims({in_dims[0], frame_size});
     ctx->SetOutputDim("Hidden", out_dims);
     ctx->SetOutputDim("Cell", out_dims);
@@ -84,10 +92,11 @@ class LSTMOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(
-        ctx.Input<framework::LoDTensor>("Input")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
   }
 };
 
@@ -103,7 +112,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("H0",
              "(Tensor, optional) the initial hidden state is an optional "
              "input. This is a tensor with shape (N x D), where N is the "
-             "batch size, D is the hidden size.")
+             "batch size and D is the hidden size.")
         .AsDispensable();
     AddInput("C0",
              "(Tensor, optional) the initial cell state is an optional "
@@ -117,14 +126,13 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Bias",
              "(Tensor) the learnable weights, which contains two parts: "
              "input-hidden bias weight and peephole connections weight if "
-             "setting `usePeepholes` True. "
-             "1. `usePeepholes = False` "
+             "setting `use_peepholes` True. "
+             "1. `use_peepholes = False` "
              " - The shape is (1 x 4D). "
              " - Bias = {b_c, b_i, b_f, b_o}."
-             "2. `usePeepholes = True` "
+             "2. `use_peepholes = True` "
              " - The shape is (1 x 7D). "
-             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.")
-        .AsDispensable();
+             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
     AddOutput("Hidden",
               "(LoDTensor) the hidden state of LSTM operator. "
               "The shape is (T x D), and lod is the same with the `Input`.");
@@ -134,85 +142,85 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("BatchGate",
               "(LoDTensor) This LoDTensor contains input gate, forget gate "
               "and output gate after the nonlinear computation. This "
-              "LoDTensor has the same shape with the reorganized input, which "
+              "LoDTensor has the same shape as the reorganized input, which "
               "is also be called batch input. The LoD size is 2. The first "
               "LoD is the batch offsets and the second LoD contains the "
               "indexes, which denote the position of reorganized sequence "
               "in the raw input.")
         .AsIntermediate();
     AddOutput("BatchCellPreAct",
-              "(LoDTensor) This LoDTensor is got in the forward and used "
+              "(LoDTensor) This LoDTensor is obtained in the forward and used "
               "in the backward.")
         .AsIntermediate();
-    AddAttr<bool>("usePeepholes",
+    AddAttr<bool>("use_peepholes",
                   "(bool, defalut: True) "
                   "whether to enable diagonal/peephole connections.")
         .SetDefault(true);
-    AddAttr<bool>("isReverse",
+    AddAttr<bool>("is_reverse",
                   "(bool, defalut: False) "
                   "whether to compute reversed LSTM.")
         .SetDefault(false);
     AddAttr<std::string>(
-        "gateActivation",
+        "gate_activation",
         "(string, default: sigmoid)"
         "The activation for input gate, forget gate and output "
         "gate, `sigmoid` by default.")
-        .SetDefault("sigmoid");
-    AddAttr<std::string>("cellActivation",
+        .SetDefault("sigmoid")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("cell_activation",
                          "(string, default: tanh)"
                          "The activation for cell output, `tanh` by defalut.")
-        .SetDefault("tanh");
-    AddAttr<std::string>("candidateActivation",
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("candidate_activation",
                          "(string, default: tanh)"
                          "The activation for candidate hidden state, "
                          "`tanh` by default.")
-        .SetDefault("tanh");
-    AddComment(R"DOC(Long-Short Term Memory (LSTM) Operator
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddComment(R"DOC(
+Long-Short Term Memory (LSTM) Operator.
 
-The defalut implementation is diagonal/peephole connection [1], the formula is
-as follows
+The defalut implementation is diagonal/peephole connection
+(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
 
-    i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i)
+$$
+i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\
 
-    f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f)
+f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\
 
-    \tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
+\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\
 
-    o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o)
+o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\
 
-    c_t = f_t ⊙ c_{t-1} + i_t ⊙ \tilde{c_t}
+c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
 
-    h_t = o_t ⊙ act_h(c_t)
+h_t = o_t \odot act_h(c_t)
+$$
 
-where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix
-of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$
-are diagonal weight matrices for peephole connections. In our implenmention,
-We use vectors to reprenset these diagonal weight matrices. The b terms
-denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$
-is the non-line actications, such as logistic sigmoid function, and
-\f$i, f, o\f$ and \f$c\f$ are respectively the input gate, forget gate,
-output gate and cell activation vectors, all of which are the same size as
-the cell output activation vector \f$h\f$.
+where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
+of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
+are diagonal weight matrices for peephole connections. In our implementation,
+we use vectors to reprenset these diagonal weight matrices. The b terms
+denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
+is the non-line activations, such as logistic sigmoid function, and
+$i, f, o$ and $c$ are the input gate, forget gate, output gate,
+and cell activation vectors, respectively, all of which have the same size as
+the cell output activation vector $h$.
 
-The ⊙ is the element-wise product of the vectors, \f$act_g\f$ and \f$act_h\f$
-are the cell input and cell output activation functions, `tanh` is usually
-used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
+The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
+are the cell input and cell output activation functions and `tanh` is usually
+used for them. $\tilde{c_t}$ is also called candidate hidden state,
 which is computed based on the current input and the previous hidden state.
 
-Set `usePeepholes` False to disable peephole connection [2]. The formula
-is omitted here.
+Set `use_peepholes` False to disable peephole connection. The formula
+is omitted here, please refer to the paper
+http://www.bioinf.jku.at/publications/older/2604.pdf for details.
 
-@note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
-operations on the input x_{t} were NOT included in this operator.
+Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
+operations on the input $x_{t}$ are NOT included in this operator.
 Users can choose to use fully-connect operator before LSTM operator.
 
-[1] Hasim Sak, Andrew Senior, and Francoise Beaufays. Long short-term memory
-recurrent neural network architectures for large scale acoustic modeling.
-INTERSPEECH, 2014.
-
-[2] S. Hochreiter and J. Schmidhuber. Long Short-Term Memory.
-Neural Computation, 9(8):1735-1780, 1997.
-
 )DOC");
   }
 };
@@ -228,30 +236,35 @@ class LSTMGradOp : public framework::OperatorWithKernel {
                    "Input(Hidden) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Cell"),
                    "Input(Cell) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTM should not be null.");
 
     PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
                    "Input(BatchGate) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
                    "Input(BatchGate) of LSTM should not be null.");
 
-    auto in_g_name = framework::GradVarName("Input");
-    if (ctx->HasOutput(in_g_name))
-      ctx->SetOutputDim(in_g_name, ctx->GetInputDim("Input"));
-
-    auto w_g_name = framework::GradVarName("Weight");
-    if (ctx->HasOutput(w_g_name))
-      ctx->SetOutputDim(w_g_name, ctx->GetInputDim("Weight"));
-
-    auto b_g_name = framework::GradVarName("Bias");
-    if (ctx->HasOutput(b_g_name))
-      ctx->SetOutputDim(b_g_name, ctx->GetInputDim("Bias"));
+    auto SetOutGradDim = [&ctx](const std::string& name) {
+      auto g_name = framework::GradVarName(name);
+      if (ctx->HasOutput(g_name))
+        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
+    };
+
+    SetOutGradDim("Input");
+    SetOutGradDim("Weight");
+    SetOutGradDim("Bias");
+    SetOutGradDim("H0");
+    SetOutGradDim("C0");
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(
-        ctx.Input<framework::LoDTensor>("Input")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/lstm_op.cu b/paddle/operators/lstm_op.cu.cc
similarity index 97%
rename from paddle/operators/lstm_op.cu
rename to paddle/operators/lstm_op.cu.cc
index 9ad5694155..610cbb03e8 100644
--- a/paddle/operators/lstm_op.cu
+++ b/paddle/operators/lstm_op.cu.cc
@@ -12,7 +12,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/operators/lstm_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h
index af088b80b4..a78f548aaf 100644
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
@@ -24,9 +24,14 @@ namespace operators {
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+template <typename Place, typename T>
+inline void ReorderInitState(const platform::DeviceContext& ctx,
+                             const framework::Tensor& src, const size_t* index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<Place, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
 
 template <typename Place, typename T>
 class LSTMKernel : public framework::OpKernel<T> {
@@ -36,6 +41,9 @@ class LSTMKernel : public framework::OpKernel<T> {
     auto* weight = ctx.Input<Tensor>("Weight");
     auto* bias = ctx.Input<Tensor>("Bias");
 
+    auto* hidden_t0 = ctx.Input<Tensor>("H0");
+    auto* cell_t0 = ctx.Input<Tensor>("C0");
+
     auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
     batch_gate->mutable_data<T>(ctx.GetPlace());
     auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
@@ -43,12 +51,7 @@ class LSTMKernel : public framework::OpKernel<T> {
     auto* cell_out = ctx.Output<LoDTensor>("Cell");
     cell_out->mutable_data<T>(ctx.GetPlace());
 
-    // Now the function ShareLoD in InferShape is not implemented.
-    // So copy LoD here.
-    ctx.ShareLoD("Input", "Hidden");
-    ctx.ShareLoD("Input", "Cell");
-
-    bool is_reverse = ctx.Attr<bool>("isReverse");
+    bool is_reverse = ctx.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<Place, T> to_batch;
     auto& device_ctx = ctx.device_context();
     to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
@@ -58,32 +61,37 @@ class LSTMKernel : public framework::OpKernel<T> {
     framework::DDim dims({in_dims[0], frame_size});
 
     if (bias) {
-      Eigen::array<int, 2> extents({{1, 4 * frame_size}});
-      Eigen::array<int, 2> offsets({{0, 0}});
-      auto b = EigenMatrix<T>::From(*bias);
-      auto gate = EigenMatrix<T>::From(*batch_gate);
-      gate.device(ctx.GetEigenDevice<Place>()) =
-          gate +
-          b.slice(offsets, extents)
-              .reshape(Eigen::array<int, 2>({{1, frame_size * 4}}))
-              .broadcast(
-                  Eigen::array<int, 2>({{static_cast<int>(in_dims[0]), 1}}));
+      Tensor b = *bias;
+      b.Resize({bias->numel(), 1});
+      Tensor gate_bias = b.Slice(0, 4 * frame_size);
+      math::RowwiseAdd<Place, T> add_bias;
+      add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
     }
 
     math::LstmMetaValue<T> lstm_value;
-    if (bias) {
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
       T* bias_data = const_cast<T*>(bias->data<T>());
       // the code style in LstmMetaValue will be updated later.
 
-      lstm_value.checkIg = bias_data + 4 * frame_size;
-      lstm_value.checkFg = lstm_value.checkIg + frame_size;
-      lstm_value.checkOg = lstm_value.checkFg + frame_size;
+      lstm_value.check_ig = bias_data + 4 * frame_size;
+      lstm_value.check_fg = lstm_value.check_ig + frame_size;
+      lstm_value.check_og = lstm_value.check_fg + frame_size;
     } else {
-      lstm_value.checkIg = nullptr;
-      lstm_value.checkFg = nullptr;
-      lstm_value.checkOg = nullptr;
+      lstm_value.check_ig = nullptr;
+      lstm_value.check_fg = nullptr;
+      lstm_value.check_og = nullptr;
+    }
+    lstm_value.prev_state_value = nullptr;
+    Tensor ordered_c0;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (cell_t0) {
+      // Since the batch computing for LSTM reorders the input sequence
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<Place, T>(device_ctx, *cell_t0, order, &ordered_c0,
+                                 true);
+      lstm_value.prev_state_value = ordered_c0.data<T>();
     }
-    lstm_value.prevStateValue = nullptr;
 
     // Use the local variable as here.
     LoDTensor batch_hidden, batch_cell;
@@ -94,9 +102,9 @@ class LSTMKernel : public framework::OpKernel<T> {
 
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
-    auto gate_act = ctx.Attr<std::string>("gateActivation");
-    auto cell_act = ctx.Attr<std::string>("cellActivation");
-    auto cand_act = ctx.Attr<std::string>("candidateActivation");
+    auto gate_act = ctx.Attr<std::string>("gate_activation");
+    auto cell_act = ctx.Attr<std::string>("cell_activation");
+    auto cand_act = ctx.Attr<std::string>("candidate_activation");
 
     for (size_t n = 0; n < num_batch; n++) {
       int bstart = static_cast<int>(batch_starts[n]);
@@ -109,24 +117,37 @@ class LSTMKernel : public framework::OpKernel<T> {
 
       int cur_batch_size = bend - bstart;
 
-      if (n != 0) {
+      if (n > 0) {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
         int pre_h_end = pre_h_start + cur_batch_size;
         auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
         math::matmul<Place, T>(device_ctx, pre_hidden_t, false, *weight, false,
                                static_cast<T>(1.0), &gate_t,
                                static_cast<T>(1.0));
+      } else if (hidden_t0) {
+        // If n == 0 and there is no initialized hidden state, that is to say
+        // the H0 is zeros, the calculation W_h * H0 will be skiped.
+        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
+
+        // Since the batch computing for LSTM reorders the input sequence
+        // according to their length. The initialized hidden state also needs
+        // to reorder.
+        Tensor ordered_h0;
+        ReorderInitState<Place, T>(device_ctx, *hidden_t0, order, &ordered_h0,
+                                   true);
+        math::matmul<Place, T>(device_ctx, ordered_h0, false, *weight, false,
+                               static_cast<T>(1.0), &gate_t,
+                               static_cast<T>(1.0));
       }
-      // else if : FIXME support the initial hidden and cell
 
-      lstm_value.gateValue = gate_t.data<T>();
-      lstm_value.outputValue = out_t.data<T>();
-      lstm_value.stateValue = cell_t.data<T>();
-      lstm_value.stateActiveValue = cell_pre_act_t.data<T>();
+      lstm_value.gate_value = gate_t.data<T>();
+      lstm_value.output_value = out_t.data<T>();
+      lstm_value.state_value = cell_t.data<T>();
+      lstm_value.state_active_value = cell_pre_act_t.data<T>();
       math::LstmUnitFunctor<Place, T>::compute(device_ctx, lstm_value,
                                                frame_size, cur_batch_size,
                                                gate_act, cell_act, cand_act);
-      lstm_value.prevStateValue = lstm_value.stateValue;
+      lstm_value.prev_state_value = lstm_value.state_value;
     }
 
     math::Batch2LoDTensorFunctor<Place, T> to_seq;
@@ -160,6 +181,12 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
     auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
+    auto* h0 = ctx.Input<Tensor>("H0");
+    auto* c0 = ctx.Input<Tensor>("C0");
+
+    auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
+    auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
+
     auto& device_ctx = ctx.device_context();
     math::SetConstant<Place, T> zero;
     if (weight_g) {
@@ -167,68 +194,78 @@ class LSTMGradKernel : public framework::OpKernel<T> {
       zero(device_ctx, weight_g, static_cast<T>(0.0));
     }
 
+    // ordered_h0/c0 is the reordered hidden/cell initialization.
+    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
+    // initialization.
+    Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (c0) {
+      ReorderInitState<Place, T>(device_ctx, *c0, order, &ordered_c0, true);
+    }
+    if (c0 && c0_g) {
+      ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
+    }
+
     auto in_dims = input->dims();
     auto out_dims = hidden_g->dims();
     int frame_size = static_cast<int>(in_dims[1] / 4);
     PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
 
     math::LstmMetaValue<T> lstm_value;
-    if (bias) {
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
       T* bias_data = const_cast<T*>(bias->data<T>());
-      lstm_value.checkIg = bias_data + 4 * frame_size;
-      lstm_value.checkFg = lstm_value.checkIg + frame_size;
-      lstm_value.checkOg = lstm_value.checkFg + frame_size;
+      lstm_value.check_ig = bias_data + 4 * frame_size;
+      lstm_value.check_fg = lstm_value.check_ig + frame_size;
+      lstm_value.check_og = lstm_value.check_fg + frame_size;
     } else {
-      lstm_value.checkIg = nullptr;
-      lstm_value.checkFg = nullptr;
-      lstm_value.checkOg = nullptr;
+      lstm_value.check_ig = nullptr;
+      lstm_value.check_fg = nullptr;
+      lstm_value.check_og = nullptr;
     }
 
     math::LstmMetaGrad<T> lstm_grad;
+
     if (bias && bias_g) {
-      T* bias_g_data = const_cast<T*>(bias_g->mutable_data<T>(ctx.GetPlace()));
+      bias_g->mutable_data<T>(ctx.GetPlace());
       zero(device_ctx, bias_g, static_cast<T>(0.0));
-      lstm_grad.checkIgGrad = bias_g_data + 4 * frame_size;
-      lstm_grad.checkFgGrad = lstm_grad.checkIgGrad + frame_size;
-      lstm_grad.checkOgGrad = lstm_grad.checkFgGrad + frame_size;
+    }
+    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_g_data = bias_g->data<T>();
+      lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
+      lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
+      lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
     } else {
-      lstm_grad.checkIgGrad = nullptr;
-      lstm_grad.checkFgGrad = nullptr;
-      lstm_grad.checkOgGrad = nullptr;
+      lstm_grad.check_ig_grad = nullptr;
+      lstm_grad.check_fg_grad = nullptr;
+      lstm_grad.check_og_grad = nullptr;
     }
 
     math::LoDTensor2BatchFunctor<Place, T> to_batch;
 
-    // use the local variable as here.
-    LoDTensor batch_hidden;
-    batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
-    batch_hidden.set_lod(batch_gate->lod());
-    to_batch(device_ctx, *hidden_out, batch_hidden, false);
-
-    LoDTensor batch_hidden_g;
-    batch_hidden_g.mutable_data<T>(out_dims, ctx.GetPlace());
-    batch_hidden_g.set_lod(batch_gate->lod());
-    to_batch(device_ctx, *hidden_g, batch_hidden_g, false);
+    auto ToBatch = [&batch_gate, &to_batch](
+        const platform::DeviceContext& ctx, const framework::LoDTensor& src,
+        const framework::DDim& dims, framework::LoDTensor& dst) {
+      dst.mutable_data<T>(dims, ctx.GetPlace());
+      dst.set_lod(batch_gate->lod());
+      to_batch(ctx, src, dst, false);
+    };
 
-    LoDTensor batch_cell;
-    batch_cell.mutable_data<T>(out_dims, ctx.GetPlace());
-    batch_cell.set_lod(batch_gate->lod());
-    to_batch(device_ctx, *cell_out, batch_cell, false);
+    LoDTensor batch_hidden, batch_hidden_g, batch_cell;
+    ToBatch(device_ctx, *hidden_out, out_dims, batch_hidden);
+    ToBatch(device_ctx, *hidden_g, out_dims, batch_hidden_g);
+    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);
 
-    LoDTensor batch_cell_g;
+    LoDTensor batch_cell_g, batch_gate_g;
     batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
-    batch_cell_g.set_lod(batch_gate->lod());
     // TODO(qingqing) support the case output cell has gradient.
     // to_batch(device_ctx, *cell_g, batch_cell_g, false);
     zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
-
-    LoDTensor batch_gate_g;
     batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
     batch_gate_g.set_lod(batch_gate->lod());
 
-    auto gate_act = ctx.Attr<std::string>("gateActivation");
-    auto cell_act = ctx.Attr<std::string>("cellActivation");
-    auto cand_act = ctx.Attr<std::string>("candidateActivation");
+    auto gate_act = ctx.Attr<std::string>("gate_activation");
+    auto cell_act = ctx.Attr<std::string>("cell_activation");
+    auto cand_act = ctx.Attr<std::string>("candidate_activation");
 
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
@@ -239,26 +276,26 @@ class LSTMGradKernel : public framework::OpKernel<T> {
       Tensor gate = batch_gate->Slice(bstart, bend);
       Tensor cell = batch_cell.Slice(bstart, bend);
       Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
-      lstm_value.gateValue = gate.data<T>();
-      lstm_value.stateValue = cell.data<T>();
-      lstm_value.stateActiveValue = cell_pre_act.data<T>();
+      lstm_value.gate_value = gate.data<T>();
+      lstm_value.state_value = cell.data<T>();
+      lstm_value.state_active_value = cell_pre_act.data<T>();
 
       Tensor out_g = batch_hidden_g.Slice(bstart, bend);
       Tensor gate_g = batch_gate_g.Slice(bstart, bend);
       Tensor cell_g = batch_cell_g.Slice(bstart, bend);
-      lstm_grad.stateGrad = cell_g.data<T>();
-      lstm_grad.gateGrad = gate_g.data<T>();
-      lstm_grad.outputGrad = out_g.data<T>();
+      lstm_grad.state_grad = cell_g.data<T>();
+      lstm_grad.gate_grad = gate_g.data<T>();
+      lstm_grad.output_grad = out_g.data<T>();
 
-      if (n) {
+      if (n > 0) {
         int bstart_pre = static_cast<int>(batch_starts[n - 1]);
         Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
         Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
-        lstm_value.prevStateValue = cell_pre.data<T>();
-        lstm_grad.prevStateGrad = cell_pre_g.data<T>();
+        lstm_value.prev_state_value = cell_pre.data<T>();
+        lstm_grad.prev_state_grad = cell_pre_g.data<T>();
       } else {
-        lstm_value.prevStateValue = nullptr;
-        lstm_grad.prevStateGrad = nullptr;
+        lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
+        lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
       }
 
       int cur_batch_size = bend - bstart;
@@ -266,7 +303,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
           device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
           gate_act, cell_act, cand_act);
 
-      if (n != 0) {
+      if (n > 0) {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
         int pre_h_end = pre_h_start + cur_batch_size;
         auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
@@ -280,6 +317,19 @@ class LSTMGradKernel : public framework::OpKernel<T> {
                                  static_cast<T>(1.0), weight_g,
                                  static_cast<T>(1.0));
         }
+      } else {
+        if (h0 && weight_g) {
+          ReorderInitState<Place, T>(device_ctx, *h0, order, &ordered_h0, true);
+          math::matmul<Place, T>(device_ctx, ordered_h0, true, gate_g, false,
+                                 static_cast<T>(1.0), weight_g,
+                                 static_cast<T>(1.0));
+        }
+        if (h0 && h0_g) {
+          ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
+          math::matmul<Place, T>(device_ctx, gate_g, false, *weight, true,
+                                 static_cast<T>(1.0), &ordered_h0_g,
+                                 static_cast<T>(0.0));
+        }
       }
     }
 
@@ -291,16 +341,18 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     }
     if (bias && bias_g) {
       /* backward bias */
-      int m = static_cast<int>(batch_gate_g.dims()[0]);
-      int n = static_cast<int>(batch_gate_g.dims()[1]);
-
-      Tensor ones;
-      ones.mutable_data<T>({m}, ctx.GetPlace());
-      math::SetConstant<Place, T> set;
-      set(device_ctx, &ones, static_cast<T>(1.0));
+      Tensor b_g = *bias_g;
+      b_g.Resize({bias_g->numel(), 1});
+      Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
+      math::ColwiseSum<Place, T> col_sum;
+      col_sum(device_ctx, batch_gate_g, &gate_bias_g);
+    }
 
-      math::gemv<Place, T>(device_ctx, true, m, n, 1., batch_gate_g.data<T>(),
-                           ones.data<T>(), 0., bias_g->data<T>());
+    if (h0 && h0_g) {
+      ReorderInitState<Place, T>(device_ctx, ordered_h0_g, order, h0_g, false);
+    }
+    if (c0 && c0_g) {
+      ReorderInitState<Place, T>(device_ctx, ordered_c0_g, order, c0_g, false);
     }
   }
 };
diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
index 5d63017208..18b9cdf2a3 100644
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -34,10 +34,10 @@ class LstmUnitOp : public framework::OperatorWithKernel {
     auto c_prev_dims = ctx->GetInputDim("C_prev");
 
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
-    PADDLE_ENFORCE(x_dims[0] == c_prev_dims[0],
-                   "Batch size of inputs and states must be equal");
-    PADDLE_ENFORCE(x_dims[1] == c_prev_dims[1] * 4,
-                   "Dimension of FC should equal to prev state * 4");
+    PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0],
+                      "Batch size of inputs and states must be equal");
+    PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4,
+                      "Dimension of FC should equal to prev state * 4");
 
     int b_size = c_prev_dims[0];  // batch size
     int s_dim = c_prev_dims[1];   // state dim
@@ -57,17 +57,22 @@ class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
         "The cell state tensor of last time-step in the Lstm Unit operator.");
     AddOutput("C", "The cell tensor of Lstm Unit operator.");
     AddOutput("H", "The hidden state tensor of Lstm Unit operator.");
-
-    AddComment(R"DOC(Lstm-Unit Operator
+    AddAttr<float>("forget_bias",
+                   "(float, default 0.0) "
+                   "The forget bias of Lstm Unit.")
+        .SetDefault(0.0);
+    AddComment(R"DOC(
+Lstm Unit Operator
 
 Equation:
-  i, f, o, j = split(X)
-  C = C_prev * sigm(f + forget_bias) + sigm(i) * tanh(j)
-  H = C * sigm(o)
+
+$$
+i, f, o, j = split(X) \\
+C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\
+H = C * sigm(o)
+$$
 
 )DOC");
-    AddAttr<float>("forget_bias", "The forget bias of Lstm Unit.")
-        .SetDefault(0.0);
   }
 };
 
diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc
index 638a99addc..d7e8a0ea76 100644
--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -55,8 +55,6 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
              "(2-D tensor with shape [batch_size x 1]) "
              "The label indicating X1 ranked higher than X2 or not, "
              "can only be +1 or -1.");
-    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
-        .SetDefault(static_cast<T>(0));
     AddOutput("Activated",
               "(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
               "to indicate whether each element of Output(Out) is activated.")
@@ -64,23 +62,26 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out",
               "(2-D tensor with shape [batch_size x 1]) "
               "The output loss of MarginRankLoss operator.");
+    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
+        .SetDefault(static_cast<T>(0));
     AddComment(R"DOC(
+MarginRankLoss Operator.
 
-MarginRankLoss operator measures the loss given a pair of training sample
+This operator measures the loss given a pair of training sample
 {`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` 
-indicating X1 is ranked higher than `X2`, otherwise `Label = -1`. The loss 
-turns out
+indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss 
+is calculated as:
 
-loss(X1, X2, Label) = max(0, -Label * (X1 - X2) + margin).
+$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$
 
-The attribute `margin` involved here helps make the predictions more robust.
+The attribute `margin` here helps make the predictions more robust.
 Denote the item ranked higher as the positive sample, otherwise the negative 
 sample. If the score of the two samples satisfies 
 
-positive sample - negative sample < margin,
+$positive sample - negative sample < margin$
 
-the pair of samples will contribute to the final loss, which will backpropogate 
-and train the ranking model to enlarge the difference of the two score.
+the pair of samples will contribute to the final loss, which will backpropagate 
+and train the ranking model to enlarge the difference between the two scores.
 
 For batch input with size `batch_size`, `X1`, `X2` and `Label`
 all have the same shape [batch_size x 1].
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index b330f30d21..bf47879f77 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,31 +1,35 @@
 add_subdirectory(detail)
 
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
     nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
     nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
     nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
-    nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
-    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
+    nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
+    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
     nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
     nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
     nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
-    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
+    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
     nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
+    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
+    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
-    cc_library(softmax SRCS softmax.cc DEPS operator)
-    cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
+    cc_library(softmax SRCS softmax.cc DEPS device_context)
+    cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
     cc_library(pooling SRCS pooling.cc DEPS device_context)
     cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
     cc_library(vol2col SRCS vol2col.cc DEPS device_context)
-    cc_library(context_project SRCS context_project.cc DEPS device_context)
+    cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
     cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
     cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
     cc_library(maxouting SRCS maxouting.cc DEPS device_context)
+    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
+    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
 endif()
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h
index e028336041..d853507188 100644
--- a/paddle/operators/math/context_project.h
+++ b/paddle/operators/math/context_project.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/operators/math/im2col.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -24,9 +24,6 @@ namespace math {
 
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 /*
  * \brief Context projection concatenates features in adjacent time-steps in
@@ -88,13 +85,18 @@ template <typename Place, typename T>
 class ContextProjectFunctor {
  public:
   void operator()(const platform::DeviceContext& context, const LoDTensor& in,
-                  const Tensor& padding_data, Tensor& col,
-                  bool padding_trainable, int context_start, int context_length,
-                  int context_stride, int up_pad, int down_pad) {
+                  const Tensor& padding_data, bool padding_trainable,
+                  const int context_start, const int context_length,
+                  const int context_stride, const int up_pad,
+                  const int down_pad, Tensor* col) {
     auto lod_level_0 = in.lod()[0];
 
     math::Im2ColFunctor<math::ColFormat::kOCF, Place, float> im2col_ocf;
 
+    std::vector<int> dilation({1, 1});
+    std::vector<int> padding({up_pad, 0, down_pad, 0});
+    std::vector<int> stride({context_stride, 1});
+
     int input_row_begin, input_row_end;
     int sequence_height, sequence_width;
     sequence_width = in.dims()[1];
@@ -105,8 +107,8 @@ class ContextProjectFunctor {
                             : static_cast<int>(lod_level_0[i]);
       input_row_end = static_cast<int>(lod_level_0[i + 1]);
 
-      Tensor out_t = col.Slice(static_cast<int>(lod_level_0[i]),
-                               static_cast<int>(lod_level_0[i + 1]));
+      Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
+                                static_cast<int>(lod_level_0[i + 1]));
 
       sequence_height = static_cast<int>(out_t.dims()[0]);
 
@@ -123,17 +125,14 @@ class ContextProjectFunctor {
             {1, input_row_end - input_row_begin,
              sequence_width});  // input_channels, input_height, input_width
         in_t.Resize(framework::make_ddim(input_shape));
-
-        im2col_ocf(context, in_t, out_t,
-                   /*stride_height*/ context_stride, /*stride_width*/ 1, up_pad,
-                   down_pad, 0, 0);
+        im2col_ocf(context, in_t, dilation, stride, padding, &out_t);
         out_t.Resize({sequence_height, context_length * sequence_width});
       }
     }
     if (padding_trainable) {
       for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-        Tensor out_t = col.Slice(static_cast<int>(lod_level_0[i]),
-                                 static_cast<int>(lod_level_0[i + 1]));
+        Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
+                                  static_cast<int>(lod_level_0[i + 1]));
 
         sequence_height = static_cast<int>(out_t.dims()[0]);
 
@@ -150,9 +149,7 @@ class ContextProjectFunctor {
             Tensor out_t_sub = out_t.Slice(k * context_length,
                                            k * context_length + padding_size);
             Tensor w_sub = padding_data.Slice(k, k + padding_size);
-            auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub);
-            auto w_sub_e = EigenMatrix<T>::From(w_sub);
-            out_t_sub_e.device(*context.GetEigenDevice<Place>()) = w_sub_e;
+            framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub);
           }
         }
         if (down_pad > 0) {  // add down pad
@@ -182,9 +179,7 @@ class ContextProjectFunctor {
                 (down_pad_begin_row + t) * context_length);
             Tensor w_sub = padding_data.Slice(
                 up_pad + padding_idx, up_pad + padding_idx + padding_size);
-            auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub);
-            auto w_sub_e = EigenMatrix<T>::From(w_sub);
-            out_t_sub_e.device(*context.GetEigenDevice<Place>()) = w_sub_e;
+            framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub);
           }
         }
         out_t.Resize({sequence_height, context_length * sequence_width});
@@ -196,14 +191,19 @@ class ContextProjectFunctor {
 template <typename Place, typename T>
 class ContextProjectGradFunctor {
  public:
-  void operator()(const platform::DeviceContext& context, LoDTensor& in,
-                  Tensor& padding_data, Tensor& col, bool padding_trainable,
-                  int context_start, int context_length, int context_stride,
-                  int up_pad, int down_pad, bool input_grad, bool pad_grad) {
+  void operator()(const platform::DeviceContext& context, const LoDTensor& in,
+                  bool padding_trainable, const int context_start,
+                  const int context_length, const int context_stride,
+                  const int up_pad, const int down_pad, bool pad_grad,
+                  bool input_grad, Tensor* padding_data, Tensor* col) {
     auto lod_level_0 = in.lod()[0];
 
     math::Col2ImFunctor<math::ColFormat::kOCF, Place, float> col2im_ocf;
 
+    std::vector<int> dilation({1, 1});
+    std::vector<int> padding({up_pad, 0, down_pad, 0});
+    std::vector<int> stride({context_stride, 1});
+
     int input_row_begin, input_row_end;
     int sequence_height, sequence_width;
     sequence_width = in.dims()[1];
@@ -215,8 +215,8 @@ class ContextProjectGradFunctor {
                               : static_cast<int>(lod_level_0[i]);
         input_row_end = static_cast<int>(lod_level_0[i + 1]);
 
-        Tensor out_t = col.Slice(static_cast<int>(lod_level_0[i]),
-                                 static_cast<int>(lod_level_0[i + 1]));
+        Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
+                                  static_cast<int>(lod_level_0[i + 1]));
 
         sequence_height = static_cast<int>(out_t.dims()[0]);
 
@@ -234,9 +234,7 @@ class ContextProjectGradFunctor {
                sequence_width});  // input_channels, input_height, input_width
           in_t.Resize(framework::make_ddim(input_shape));
 
-          col2im_ocf(context, in_t, out_t,
-                     /*stride_height*/ context_stride, /*stride_width*/ 1,
-                     up_pad, down_pad, 0, 0);
+          col2im_ocf(context, out_t, dilation, stride, padding, &in_t);
           out_t.Resize({sequence_height, context_length * sequence_width});
         }
       }
@@ -244,8 +242,8 @@ class ContextProjectGradFunctor {
     if (pad_grad) {
       if (padding_trainable) {
         for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-          Tensor out_t = col.Slice(static_cast<int>(lod_level_0[i]),
-                                   static_cast<int>(lod_level_0[i + 1]));
+          Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
+                                    static_cast<int>(lod_level_0[i + 1]));
 
           sequence_height = static_cast<int>(out_t.dims()[0]);
           out_t.Resize({sequence_height * context_length, sequence_width});
@@ -259,11 +257,9 @@ class ContextProjectGradFunctor {
                   k + context_length < up_pad ? context_length : up_pad - k;
               Tensor out_t_sub = out_t.Slice(k * context_length,
                                              k * context_length + padding_size);
-              Tensor w_sub = padding_data.Slice(k, k + padding_size);
-              auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub);
-              auto w_sub_e = EigenMatrix<T>::From(w_sub);
-              w_sub_e.device(*context.GetEigenDevice<Place>()) =
-                  w_sub_e + out_t_sub_e;
+              Tensor w_sub = padding_data->Slice(k, k + padding_size);
+              axpy<Place, T>(context, w_sub.numel(), static_cast<T>(1),
+                             out_t_sub.data<T>(), w_sub.data<T>());
             }
           }
           if (down_pad > 0) {
@@ -292,12 +288,10 @@ class ContextProjectGradFunctor {
               Tensor out_t_sub = out_t.Slice(
                   (down_pad_begin_row + t) * context_length - padding_size,
                   (down_pad_begin_row + t) * context_length);
-              Tensor w_sub = padding_data.Slice(
+              Tensor w_sub = padding_data->Slice(
                   up_pad + padding_idx, up_pad + padding_idx + padding_size);
-              auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub);
-              auto w_sub_e = EigenMatrix<T>::From(w_sub);
-              w_sub_e.device(*context.GetEigenDevice<Place>()) =
-                  w_sub_e + out_t_sub_e;
+              axpy<Place, T>(context, w_sub.numel(), static_cast<T>(1),
+                             out_t_sub.data<T>(), w_sub.data<T>());
             }
           }
           out_t.Resize({sequence_height, context_length * sequence_width});
diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h
index 0ab6827ffa..70ed9ddd55 100644
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
@@ -14,7 +14,6 @@
 
 #pragma once
 #include "paddle/framework/eigen.h"
-#include "paddle/framework/operator.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/hostdevice.h"
 
diff --git a/paddle/operators/math/detail/CMakeLists.txt b/paddle/operators/math/detail/CMakeLists.txt
index 92eac9d362..0df1c060f9 100644
--- a/paddle/operators/math/detail/CMakeLists.txt
+++ b/paddle/operators/math/detail/CMakeLists.txt
@@ -1,3 +1 @@
-if(WITH_AVX)
-    cc_library(activation_functions SRCS avx_functions.cc)
-endif()
+cc_library(activation_functions SRCS avx_functions.cc)
diff --git a/paddle/operators/math/detail/avx_functions.cc b/paddle/operators/math/detail/avx_functions.cc
index 6d9df654a4..921364788c 100644
--- a/paddle/operators/math/detail/avx_functions.cc
+++ b/paddle/operators/math/detail/avx_functions.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifdef __AVX__
+
 #include <immintrin.h>
 #include "paddle/operators/math/detail/activation_functions.h"
 // TODO(qingqing) refine this dependence
@@ -84,3 +86,5 @@ __m256 Identity(const __m256 a, const __m256 b) { return a; }
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
+
+#endif
diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h
new file mode 100644
index 0000000000..4c67dec9cb
--- /dev/null
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -0,0 +1,428 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+#ifndef __NVCC__
+
+template <class OpResetOutput, typename T>
+void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
+                                       T *gate_value, T *reset_output_value,
+                                       T *prev_output_value, int frame_size,
+                                       activation_mode_t active_gate) {
+  T r_value_update_gate;
+  T r_value_reset_gate;
+  T r_value_reset_output;
+  T r_prev_out = 0;
+  T *update_gate = gate_value;
+  T *reset_gate = gate_value + frame_size;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_reset_gate = reset_gate[i];
+    if (prev_output_value) {
+      r_prev_out = prev_output_value[i];
+    }
+
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                    r_value_reset_output, active_gate);
+
+    update_gate[i] = r_value_update_gate;
+    reset_gate[i] = r_value_reset_gate;
+    reset_output_value[i] = r_value_reset_output;
+  }
+}
+
+template <class OpFinalOutput, typename T>
+void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
+                                       T *gate_value, T *prev_output_value,
+                                       T *output_value, int frame_size,
+                                       activation_mode_t active_node) {
+  T r_value_update_gate;
+  T r_value_frame_state;
+  T r_prev_out = 0;
+  T r_output;
+  T *update_gate = gate_value;
+  T *frame_state = gate_value + frame_size * 2;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_frame_state = frame_state[i];
+    if (prev_output_value) {
+      r_prev_out = prev_output_value[i];
+    }
+
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                    r_output, active_node);
+
+    frame_state[i] = r_value_frame_state;
+    output_value[i] = r_output;
+  }
+}
+
+template <class OpResetOutput, typename T>
+void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
+                                     T *gate_value, T *reset_output_value,
+                                     T *prev_output_value, int frame_size,
+                                     activation_mode_t active_gate) {
+#ifdef __AVX__
+  __m256 r_value_update_gate;
+  __m256 r_value_reset_gate;
+  __m256 r_value_reset_output;
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
+  __m256 *update_gate = (__m256 *)gate_value;
+  __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_reset_gate = reset_gate[i];
+    if (prev_output_value) {
+      r_prev_out = ((__m256 *)prev_output_value)[i];
+    }
+
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                    r_value_reset_output, active_gate);
+
+    update_gate[i] = r_value_update_gate;
+    reset_gate[i] = r_value_reset_gate;
+    ((__m256 *)reset_output_value)[i] = r_value_reset_output;
+  }
+#endif
+}
+
+template <class OpFinalOutput, typename T>
+void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
+                                     T *gate_value, T *prev_output_value,
+                                     T *output_value, int frame_size,
+                                     activation_mode_t active_node) {
+#ifdef __AVX__
+  __m256 r_value_update_gate;
+  __m256 r_value_frame_state;
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
+  __m256 r_output;
+  __m256 *update_gate = (__m256 *)gate_value;
+  __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_frame_state = frame_state[i];
+    if (prev_output_value) {
+      r_prev_out = ((__m256 *)prev_output_value)[i];
+    }
+
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                    r_output, active_node);
+
+    frame_state[i] = r_value_frame_state;
+    ((__m256 *)output_value)[i] = r_output;
+  }
+#endif
+}
+
+template <class OpResetOutput, typename T>
+inline void forward_reset_output(OpResetOutput op_reset_output,
+                                 hl_gru_value<T> value, int frame_size,
+                                 int batch_size,
+                                 activation_mode_t active_gate) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_reset_output(
+          op_reset_output, value.gate_value, value.reset_output_value,
+          value.prev_out_value, frame_size, active_gate);
+    } else {
+      hl_naive_gru_forward_reset_output(
+          op_reset_output, value.gate_value, value.reset_output_value,
+          value.prev_out_value, frame_size, active_gate);
+    }
+
+    value.gate_value += frame_size * 3;
+    value.reset_output_value += frame_size;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
+    }
+  }
+}
+
+template <class OpFinalOutput, typename T>
+inline void forward_final_output(OpFinalOutput op_final_output,
+                                 hl_gru_value<T> value, int frame_size,
+                                 int batch_size,
+                                 activation_mode_t active_node) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
+                                      value.prev_out_value, value.output_value,
+                                      frame_size, active_node);
+    } else {
+      hl_naive_gru_forward_final_output(
+          op_final_output, value.gate_value, value.prev_out_value,
+          value.output_value, frame_size, active_node);
+    }
+
+    value.gate_value += frame_size * 3;
+    value.output_value += frame_size;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
+    }
+  }
+}
+
+template <class OpStateGrad, typename T>
+void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+                                      T *gate_grad, T *prev_out_value,
+                                      T *prev_out_grad, T *output_grad,
+                                      int frame_size,
+                                      activation_mode_t active_node) {
+  T r_update_gate_value;
+  T r_update_gate_grad;
+  T r_frame_state_value;
+  T r_frame_state_grad;
+  T r_out_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T *update_gate_value = gate_value;
+  T *update_gate_grad = gate_grad;
+  T *frame_state_value = gate_value + frame_size * 2;
+  T *frame_state_grad = gate_grad + frame_size * 2;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_frame_state_value = frame_state_value[i];
+    r_out_grad = output_grad[i];
+    if (prev_out_value) {
+      r_prev_out_value = prev_out_value[i];
+    }
+    if (prev_out_grad) {
+      r_prev_out_grad = prev_out_grad[i];
+    }
+
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                  r_out_grad, active_node);
+
+    update_gate_grad[i] = r_update_gate_grad;
+    frame_state_grad[i] = r_frame_state_grad;
+    if (prev_out_grad) {
+      prev_out_grad[i] = r_prev_out_grad;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+                                      T *gate_grad, T *prev_out_value,
+                                      T *prev_out_grad, T *reset_output_grad,
+                                      int frame_size,
+                                      activation_mode_t active_gate) {
+  T r_update_gate_value;
+  T r_update_gate_grad;
+  T r_reset_gate_value;
+  T r_reset_gate_grad;
+  T r_reset_output_grad = 0;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T *update_gate_value = gate_value;
+  T *update_gate_grad = gate_grad;
+  T *reset_gate_value = gate_value + frame_size;
+  T *reset_gate_grad = gate_grad + frame_size;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_update_gate_grad = update_gate_grad[i];
+    r_reset_gate_value = reset_gate_value[i];
+
+    if (prev_out_value && prev_out_grad) {
+      r_reset_output_grad = reset_output_grad[i];
+    }
+    if (prev_out_value) {
+      r_prev_out_value = prev_out_value[i];
+    }
+    if (prev_out_grad) {
+      r_prev_out_grad = prev_out_grad[i];
+    }
+
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                  r_reset_output_grad, active_gate);
+
+    update_gate_grad[i] = r_update_gate_grad;
+    reset_gate_grad[i] = r_reset_gate_grad;
+    if (prev_out_grad) {
+      prev_out_grad[i] = r_prev_out_grad;
+    }
+  }
+}
+
+template <class OpStateGrad, typename T>
+void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+                                    T *gate_grad, T *prev_out_value,
+                                    T *prev_out_grad, T *output_grad,
+                                    int frame_size,
+                                    activation_mode_t active_node) {
+#ifdef __AVX__
+  __m256 r_update_gate_value;
+  __m256 r_update_gate_grad;
+  __m256 r_frame_state_value;
+  __m256 r_frame_state_grad;
+  __m256 r_out_grad;
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+  __m256 *update_gate_value = (__m256 *)gate_value;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
+  __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
+  __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_frame_state_value = frame_state_value[i];
+    r_out_grad = ((__m256 *)output_grad)[i];
+    if (prev_out_value) {
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+    }
+    if (prev_out_grad) {
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+    }
+
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                  r_out_grad, active_node);
+
+    update_gate_grad[i] = r_update_gate_grad;
+    frame_state_grad[i] = r_frame_state_grad;
+    if (prev_out_grad) {
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+    }
+  }
+#endif
+}
+
+template <class OpResetGrad, typename T>
+void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+                                    T *gate_grad, T *prev_out_value,
+                                    T *prev_out_grad, T *reset_output_grad,
+                                    int frame_size,
+                                    activation_mode_t active_gate) {
+#ifdef __AVX__
+  __m256 r_update_gate_value;
+  __m256 r_update_gate_grad;
+  __m256 r_reset_gate_value;
+  __m256 r_reset_gate_grad;
+  __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+  __m256 *update_gate_value = (__m256 *)gate_value;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
+  __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
+  __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_update_gate_grad = update_gate_grad[i];
+    r_reset_gate_value = reset_gate_value[i];
+
+    if (prev_out_value && prev_out_grad) {
+      r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
+    }
+    if (prev_out_value) {
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+    }
+    if (prev_out_grad) {
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+    }
+
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                  r_reset_output_grad, active_gate);
+
+    update_gate_grad[i] = r_update_gate_grad;
+    reset_gate_grad[i] = r_reset_gate_grad;
+    if (prev_out_grad) {
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+    }
+  }
+#endif
+}
+
+template <class OpStateGrad, typename T>
+inline void backward_state_grad(OpStateGrad op_state_grad,
+                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                int frame_size, int batch_size,
+                                activation_mode_t active_node) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_state_grad(
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
+    } else {
+      hl_naive_gru_backward_state_grad(
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
+    }
+
+    value.gate_value += frame_size * 3;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
+    }
+
+    grad.gate_grad += frame_size * 3;
+    grad.output_grad += frame_size;
+    if (grad.prev_out_grad) {
+      grad.prev_out_grad += frame_size;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+inline void backward_reset_grad(OpResetGrad op_reset_grad,
+                                hl_gru_value<T> value, hl_gru_grad<T> grad,
+                                int frame_size, int batch_size,
+                                activation_mode_t active_gate) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_reset_grad(
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
+    } else {
+      hl_naive_gru_backward_reset_grad(
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
+    }
+
+    value.gate_value += frame_size * 3;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
+    }
+
+    grad.gate_grad += frame_size * 3;
+    grad.reset_output_grad += frame_size;
+    if (grad.prev_out_grad) {
+      grad.prev_out_grad += frame_size;
+    }
+  }
+}
+
+#endif
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h
new file mode 100644
index 0000000000..d2edcb7f25
--- /dev/null
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -0,0 +1,203 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/device_context.h"
+
+#include <glog/logging.h>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
+template <class OpResetOutput, bool is_batch, typename T>
+__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
+                                        T *gate_value, T *reset_output_value,
+                                        T *prev_output_value, int frame_size,
+                                        int batch_size,
+                                        activation_mode_t active_gate) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    reset_output_value += batch_idx * frame_size;
+  }
+
+  T r_prev_out = 0;
+  T r_value_reset_output;
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+  T r_value_reset_gate = gate_value[frame_idx + frame_size * 1];
+
+  if (prev_output_value) {
+    if (is_batch) prev_output_value += batch_idx * frame_size;
+    r_prev_out = prev_output_value[frame_idx];
+  }
+
+  op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                  r_value_reset_output, active_gate);
+
+  gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
+  gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
+  reset_output_value[frame_idx] = r_value_reset_output;
+}
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
+template <class OpFinalOutput, bool is_batch, typename T>
+__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
+                                        T *gate_value, T *prev_output_value,
+                                        T *output_value, int frame_size,
+                                        int batch_size,
+                                        activation_mode_t active_node) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    output_value += batch_idx * frame_size;
+  }
+
+  T r_output;
+  T r_prev_out = 0;
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+  T r_value_frame_state = gate_value[frame_idx + frame_size * 2];
+
+  if (prev_output_value) {
+    if (is_batch) prev_output_value += batch_idx * frame_size;
+    r_prev_out = prev_output_value[frame_idx];
+  }
+
+  op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                  r_output, active_node);
+
+  gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
+  output_value[frame_idx] = r_output;
+}
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
+template <class OpStateGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
+                                       T *gate_grad, T *prev_out_value,
+                                       T *prev_out_grad, T *output_grad,
+                                       int frame_size, int batch_size,
+                                       activation_mode_t active_node) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    gate_grad += batch_idx * 3 * frame_size;
+    output_grad += batch_idx * frame_size;
+  }
+
+  T r_update_gate_grad;
+  T r_frame_state_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+  T r_frame_state_value = gate_value[frame_idx + frame_size * 2];
+  T r_out_grad = output_grad[frame_idx];
+
+  if (prev_out_value && prev_out_grad) {
+    if (is_batch) prev_out_value += batch_idx * frame_size;
+    r_prev_out_value = prev_out_value[frame_idx];
+
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
+    r_prev_out_grad = prev_out_grad[frame_idx];
+  }
+
+  op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                r_out_grad, active_node);
+
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+  gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
+  if (prev_out_grad) {
+    prev_out_grad[frame_idx] = r_prev_out_grad;
+  }
+}
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
+template <class OpResetGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
+                                       T *gate_grad, T *prev_out_value,
+                                       T *prev_out_grad, T *reset_output_grad,
+                                       int frame_size, int batch_size,
+                                       activation_mode_t active_gate) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    gate_grad += batch_idx * 3 * frame_size;
+    reset_output_grad += batch_idx * frame_size;
+  }
+
+  T r_reset_gate_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T r_reset_output_grad = 0;
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+  T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0];
+  T r_reset_gate_value = gate_value[frame_idx + frame_size * 1];
+
+  if (prev_out_value && prev_out_grad) {
+    if (is_batch) prev_out_value += batch_idx * frame_size;
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
+    r_prev_out_value = prev_out_value[frame_idx];
+    r_prev_out_grad = prev_out_grad[frame_idx];
+    r_reset_output_grad = reset_output_grad[frame_idx];
+  }
+
+  op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                r_reset_output_grad, active_gate);
+
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+  gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
+  if (prev_out_grad) {
+    prev_out_grad[frame_idx] = r_prev_out_grad;
+  }
+}
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h
new file mode 100644
index 0000000000..acd84be01d
--- /dev/null
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -0,0 +1,164 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/platform/hostdevice.h"
+
+#include <type_traits>
+
+// TODO(guosheng): refine code style in gru_kernel
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+namespace forward {
+
+template <typename T>
+class gru_resetOutput {
+ public:
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
+                             T &prev_out, T &value_reset_output,
+                             activation_mode_t act_gate) {
+    value_update_gate = activation(value_update_gate, act_gate);
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = prev_out * value_reset_gate;
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &value_reset_gate, __m256 &prev_out,
+                             __m256 &value_reset_output,
+                             activation_mode_t act_gate) {
+    value_update_gate = activation(value_update_gate, act_gate);
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
+  }
+#endif
+#endif
+};
+
+template <typename T>
+class gru_finalOutput {
+ public:
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
+                             T &prev_out, T &value_output,
+                             activation_mode_t act_input) {
+    value_frame_state = activation(value_frame_state, act_input);
+    value_output = prev_out - (value_update_gate * prev_out) +
+                   (value_update_gate * value_frame_state);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &value_frame_state, __m256 &prev_out,
+                             __m256 &value_output,
+                             activation_mode_t act_input) {
+    value_frame_state = activation(value_frame_state, act_input);
+    value_output = _mm256_add_ps(
+        _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
+        _mm256_mul_ps(value_update_gate, value_frame_state));
+  }
+#endif
+#endif
+};
+}  // namespace forward
+
+namespace backward {
+
+template <typename T>
+class gru_stateGrad {
+ public:
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+                             T &value_frame_state, T &grad_frame_state,
+                             T &value_prev_out, T &grad_prev_out,
+                             T &grad_output, activation_mode_t act_input) {
+    grad_update_gate = (grad_output * value_frame_state);
+    grad_update_gate -= (grad_output * value_prev_out);
+    grad_prev_out -= (grad_output * value_update_gate);
+    grad_prev_out += grad_output;
+    grad_frame_state = activation(grad_output * value_update_gate,
+                                  value_frame_state, act_input);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &grad_update_gate,
+                             __m256 &value_frame_state,
+                             __m256 &grad_frame_state, __m256 &value_prev_out,
+                             __m256 &grad_prev_out, __m256 &grad_output,
+                             activation_mode_t act_input) {
+    grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
+    grad_update_gate = _mm256_sub_ps(
+        grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
+    grad_prev_out = _mm256_add_ps(
+        _mm256_sub_ps(grad_prev_out,
+                      _mm256_mul_ps(grad_output, value_update_gate)),
+        grad_output);
+    grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate),
+                                  value_frame_state, act_input);
+  }
+#endif
+#endif
+};
+
+template <typename T>
+class gru_resetGrad {
+ public:
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+                             T &value_reset_gate, T &grad_reset_gate,
+                             T &value_prev_out, T &grad_prev_out,
+                             T &grad_reset_output, activation_mode_t act_gate) {
+    grad_reset_gate = (grad_reset_output * value_prev_out);
+    grad_prev_out += (grad_reset_output * value_reset_gate);
+    grad_update_gate =
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &grad_update_gate, __m256 &value_reset_gate,
+                             __m256 &grad_reset_gate, __m256 &value_prev_out,
+                             __m256 &grad_prev_out, __m256 &grad_reset_output,
+                             activation_mode_t act_gate) {
+    grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
+    grad_prev_out = _mm256_add_ps(
+        grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
+    grad_update_gate =
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
+  }
+#endif
+#endif
+};
+
+}  // namespace backward
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/operators/math/detail/lstm_cpu_kernel.h
index f5b0dd85c9..a734ad31ee 100644
--- a/paddle/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
@@ -26,274 +26,284 @@ namespace detail {
 
 template <class T, class Op>
 void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                     int frameSize,
+                                     int frame_size,
                                      activation_mode_t active_node,
                                      activation_mode_t active_gate,
                                      activation_mode_t active_state) {
-  T rValueIn;
-  T rValueIg;
-  T rValueFg;
-  T rValueOg;
-  T rCheckI;
-  T rCheckF;
-  T rCheckO;
-  T rState;
-  T rPrevState = 0;
-  T rStateAtv;
-  T rOut;
-
-  T *valueIn = value.gateValue;
-  T *valueIg = value.gateValue + frameSize;
-  T *valueFg = value.gateValue + frameSize * 2;
-  T *valueOg = value.gateValue + frameSize * 3;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = value.checkIg[i];
-    rCheckF = value.checkFg[i];
-    rCheckO = value.checkOg[i];
-
-    if (value.prevStateValue) {
-      rPrevState = value.prevStateValue[i];
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_checkI;
+  T r_checkF;
+  T r_checkO;
+  T r_state;
+  T r_prev_state = 0;
+  T r_state_atv;
+  T r_out;
+
+  T *value_in = value.gate_value;
+  T *value_ig = value.gate_value + frame_size;
+  T *value_fg = value.gate_value + frame_size * 2;
+  T *value_og = value.gate_value + frame_size * 3;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
+
+    if (value.prev_state_value) {
+      r_prev_state = value.prev_state_value[i];
     }
 
-    op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
-       rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
-
-    valueIn[i] = rValueIn;
-    valueIg[i] = rValueIg;
-    valueFg[i] = rValueFg;
-    valueOg[i] = rValueOg;
-    value.stateValue[i] = rState;
-    value.stateActiveValue[i] = rStateAtv;
-    value.outputValue[i] = rOut;
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
+       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
+       active_gate, active_state);
+
+    value_in[i] = r_value_in;
+    value_ig[i] = r_value_ig;
+    value_fg[i] = r_value_fg;
+    value_og[i] = r_value_og;
+    value.state_value[i] = r_state;
+    value.state_active_value[i] = r_state_atv;
+    value.output_value[i] = r_out;
   }
 }
 
 template <class T, class Op>
 void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
-                                      LstmMetaGrad<T> grad, int frameSize,
+                                      LstmMetaGrad<T> grad, int frame_size,
                                       activation_mode_t active_node,
                                       activation_mode_t active_gate,
                                       activation_mode_t active_state) {
-  T rValueIn;
-  T rValueIg;
-  T rValueFg;
-  T rValueOg;
-  T rGradIn;
-  T rGradIg;
-  T rGradFg;
-  T rGradOg;
-  T rPrevState = 0;
-  T rPrevStateGrad;
-  T rState;
-  T rStateGrad;
-  T rStateAtv;
-  T rOutputGrad;
-  T rCheckI;
-  T rCheckF;
-  T rCheckO;
-  T rCheckIGrad;
-  T rCheckFGrad;
-  T rCheckOGrad;
-
-  T *valueIn = value.gateValue;
-  T *valueIg = value.gateValue + frameSize;
-  T *valueFg = value.gateValue + frameSize * 2;
-  T *valueOg = value.gateValue + frameSize * 3;
-  T *gradIn = grad.gateGrad;
-  T *gradIg = grad.gateGrad + frameSize;
-  T *gradFg = grad.gateGrad + frameSize * 2;
-  T *gradOg = grad.gateGrad + frameSize * 3;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = value.checkIg[i];
-    rCheckF = value.checkFg[i];
-    rCheckO = value.checkOg[i];
-    rState = value.stateValue[i];
-    rStateAtv = value.stateActiveValue[i];
-    rOutputGrad = grad.outputGrad[i];
-    rStateGrad = grad.stateGrad[i];
-    if (value.prevStateValue) {
-      rPrevState = value.prevStateValue[i];
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_grad_in;
+  T r_grad_ig;
+  T r_grad_fg;
+  T r_grad_og;
+  T r_prev_state = 0;
+  T r_prev_state_grad;
+  T r_state;
+  T r_state_grad;
+  T r_state_atv;
+  T r_output_grad;
+  T r_checkI;
+  T r_checkF;
+  T r_checkO;
+  T r_checkIGrad;
+  T r_checkFGrad;
+  T r_checkOGrad;
+
+  T *value_in = value.gate_value;
+  T *value_ig = value.gate_value + frame_size;
+  T *value_fg = value.gate_value + frame_size * 2;
+  T *value_og = value.gate_value + frame_size * 3;
+  T *grad_in = grad.gate_grad;
+  T *grad_ig = grad.gate_grad + frame_size;
+  T *grad_fg = grad.gate_grad + frame_size * 2;
+  T *grad_og = grad.gate_grad + frame_size * 3;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
+    r_state = value.state_value[i];
+    r_state_atv = value.state_active_value[i];
+    r_output_grad = grad.output_grad[i];
+    r_state_grad = grad.state_grad[i];
+    if (value.prev_state_value) {
+      r_prev_state = value.prev_state_value[i];
     }
 
-    op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
-       rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
-       rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
-       rCheckOGrad, active_node, active_gate, active_state);
-
-    gradIn[i] = rGradIn;
-    gradIg[i] = rGradIg;
-    gradFg[i] = rGradFg;
-    gradOg[i] = rGradOg;
-    grad.stateGrad[i] = rStateGrad;
-
-    if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad;
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad;
-      if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad;
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
+       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
+       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
+       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
+       active_state);
+
+    grad_in[i] = r_grad_in;
+    grad_ig[i] = r_grad_ig;
+    grad_fg[i] = r_grad_fg;
+    grad_og[i] = r_grad_og;
+    grad.state_grad[i] = r_state_grad;
+
+    if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad;
+      if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad;
     }
-    if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad;
+    if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad;
   }
 }
 
 template <class T, class Op>
-void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, int frameSize,
+void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
+                                   int frame_size,
                                    activation_mode_t active_node,
                                    activation_mode_t active_gate,
                                    activation_mode_t active_state) {
 #ifdef __AVX__
-  __m256 rValueIn;
-  __m256 rValueIg;
-  __m256 rValueFg;
-  __m256 rValueOg;
-  __m256 rCheckI;
-  __m256 rCheckF;
-  __m256 rCheckO;
-  __m256 rState;
-  __m256 rPrevState = _mm256_set1_ps(0.0f);
-  __m256 rStateAtv;
-  __m256 rOut;
-
-  __m256 *valueIn = (__m256 *)value.gateValue;
-  __m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
-  __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
-  __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = ((__m256 *)value.checkIg)[i];
-    rCheckF = ((__m256 *)value.checkFg)[i];
-    rCheckO = ((__m256 *)value.checkOg)[i];
-
-    if (value.prevStateValue) {
-      rPrevState = ((__m256 *)value.prevStateValue)[i];
+  __m256 r_value_in;
+  __m256 r_value_ig;
+  __m256 r_value_fg;
+  __m256 r_value_og;
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
+  __m256 r_state;
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
+  __m256 r_state_atv;
+  __m256 r_out;
+
+  __m256 *value_in = (__m256 *)value.gate_value;
+  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
+  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
+  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    if (value.check_ig) {
+      r_checkI = ((__m256 *)value.check_ig)[i];
+      r_checkF = ((__m256 *)value.check_fg)[i];
+      r_checkO = ((__m256 *)value.check_og)[i];
     }
 
-    op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
-       rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
+    if (value.prev_state_value) {
+      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+    }
 
-    valueIn[i] = rValueIn;
-    valueIg[i] = rValueIg;
-    valueFg[i] = rValueFg;
-    valueOg[i] = rValueOg;
-    ((__m256 *)value.stateValue)[i] = rState;
-    ((__m256 *)value.stateActiveValue)[i] = rStateAtv;
-    ((__m256 *)value.outputValue)[i] = rOut;
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
+       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
+       active_gate, active_state);
+
+    value_in[i] = r_value_in;
+    value_ig[i] = r_value_ig;
+    value_fg[i] = r_value_fg;
+    value_og[i] = r_value_og;
+    ((__m256 *)value.state_value)[i] = r_state;
+    ((__m256 *)value.state_active_value)[i] = r_state_atv;
+    ((__m256 *)value.output_value)[i] = r_out;
   }
 #endif
 }
 
 template <class T, class Op>
 void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
-                                    LstmMetaGrad<T> grad, int frameSize,
+                                    LstmMetaGrad<T> grad, int frame_size,
                                     activation_mode_t active_node,
                                     activation_mode_t active_gate,
                                     activation_mode_t active_state) {
 #ifdef __AVX__
-  __m256 rValueIn;
-  __m256 rValueIg;
-  __m256 rValueFg;
-  __m256 rValueOg;
-  __m256 rGradIn;
-  __m256 rGradIg;
-  __m256 rGradFg;
-  __m256 rGradOg;
-  __m256 rPrevState = _mm256_set1_ps(0.0f);
-  __m256 rPrevStateGrad;
-  __m256 rStateGrad;
-  __m256 rState;
-  __m256 rStateAtv;
-  __m256 rOutputGrad;
-  __m256 rCheckI;
-  __m256 rCheckF;
-  __m256 rCheckO;
-  __m256 rCheckIGrad;
-  __m256 rCheckFGrad;
-  __m256 rCheckOGrad;
-
-  __m256 *valueIn = (__m256 *)value.gateValue;
-  __m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
-  __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
-  __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
-  __m256 *gradIn = (__m256 *)grad.gateGrad;
-  __m256 *gradIg = (__m256 *)(grad.gateGrad + frameSize);
-  __m256 *gradFg = (__m256 *)(grad.gateGrad + frameSize * 2);
-  __m256 *gradOg = (__m256 *)(grad.gateGrad + frameSize * 3);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = ((__m256 *)value.checkIg)[i];
-    rCheckF = ((__m256 *)value.checkFg)[i];
-    rCheckO = ((__m256 *)value.checkOg)[i];
-    rState = ((__m256 *)value.stateValue)[i];
-    rStateAtv = ((__m256 *)value.stateActiveValue)[i];
-    rOutputGrad = ((__m256 *)grad.outputGrad)[i];
-    rStateGrad = ((__m256 *)grad.stateGrad)[i];
-    if (value.prevStateValue) {
-      rPrevState = ((__m256 *)value.prevStateValue)[i];
+  __m256 r_value_in;
+  __m256 r_value_ig;
+  __m256 r_value_fg;
+  __m256 r_value_og;
+  __m256 r_grad_in;
+  __m256 r_grad_ig;
+  __m256 r_grad_fg;
+  __m256 r_grad_og;
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
+  __m256 r_prev_state_grad;
+  __m256 r_state_grad;
+  __m256 r_state;
+  __m256 r_state_atv;
+  __m256 r_output_grad;
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
+  __m256 r_checkIGrad;
+  __m256 r_checkFGrad;
+  __m256 r_checkOGrad;
+
+  __m256 *value_in = (__m256 *)value.gate_value;
+  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
+  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
+  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
+  __m256 *grad_in = (__m256 *)grad.gate_grad;
+  __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size);
+  __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2);
+  __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    if (value.check_ig) {
+      r_checkI = ((__m256 *)value.check_ig)[i];
+      r_checkF = ((__m256 *)value.check_fg)[i];
+      r_checkO = ((__m256 *)value.check_og)[i];
+    }
+    r_state = ((__m256 *)value.state_value)[i];
+    r_state_atv = ((__m256 *)value.state_active_value)[i];
+    r_output_grad = ((__m256 *)grad.output_grad)[i];
+    r_state_grad = ((__m256 *)grad.state_grad)[i];
+    if (value.prev_state_value) {
+      r_prev_state = ((__m256 *)value.prev_state_value)[i];
     }
 
-    op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
-       rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
-       rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
-       rCheckOGrad, active_node, active_gate, active_state);
-
-    gradIn[i] = rGradIn;
-    gradIg[i] = rGradIg;
-    gradFg[i] = rGradFg;
-    gradOg[i] = rGradOg;
-    ((__m256 *)grad.stateGrad)[i] = rStateGrad;
-
-    if (grad.prevStateGrad) ((__m256 *)grad.prevStateGrad)[i] = rPrevStateGrad;
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) ((__m256 *)grad.checkIgGrad)[i] += rCheckIGrad;
-      if (grad.checkFgGrad) ((__m256 *)grad.checkFgGrad)[i] += rCheckFGrad;
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
+       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
+       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
+       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
+       active_state);
+
+    grad_in[i] = r_grad_in;
+    grad_ig[i] = r_grad_ig;
+    grad_fg[i] = r_grad_fg;
+    grad_og[i] = r_grad_og;
+    ((__m256 *)grad.state_grad)[i] = r_state_grad;
+
+    if (grad.prev_state_grad)
+      ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad;
+      if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad;
     }
-    if (grad.checkOgGrad) ((__m256 *)grad.checkOgGrad)[i] += rCheckOGrad;
+    if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad;
   }
 #endif
 }
 
 template <class T, class Op>
-void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frameSize,
+void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate,
                       activation_mode_t active_state) {
-  if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
+    avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
                                      active_gate, active_state);
   } else {
-    naive_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
+    naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
                                        active_gate, active_state);
   }
 }
 
 template <class T, class Op>
 void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frameSize, activation_mode_t active_node,
+                       int frame_size, activation_mode_t active_node,
                        activation_mode_t active_gate,
                        activation_mode_t active_state) {
-  if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
+    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
                                       active_gate, active_state);
   } else {
-    naive_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
-                                        active_gate, active_state);
+    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
+                                        active_node, active_gate, active_state);
   }
 }
 
diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h
index 41a54a359d..91bfedea53 100644
--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
@@ -26,187 +26,192 @@ namespace math {
 namespace detail {
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class T, class Op, bool isBatch>
-__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frameSize,
-                              int batchSize, activation_mode_t active_node,
+template <class T, class Op, bool is_batch>
+__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
+                              int batch_size, activation_mode_t active_node,
                               activation_mode_t active_gate,
                               activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
 
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    value.gateValue += batchIdx * frameSize * 4;
-    value.outputValue += batchIdx * frameSize;
-    value.stateValue += batchIdx * frameSize;
-    value.stateActiveValue += batchIdx * frameSize;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    value.gate_value += batch_idx * frame_size * 4;
+    value.output_value += batch_idx * frame_size;
+    value.state_value += batch_idx * frame_size;
+    value.state_active_value += batch_idx * frame_size;
   }
 
-  T rState;
-  T rPrevState = 0;
-  T rStateAtv;
-  T rOut;
-  T rValueIn;
-  T rValueIg;
-  T rValueFg;
-  T rValueOg;
-  T rCheckI = value.checkIg[frameIdx];
-  T rCheckF = value.checkFg[frameIdx];
-  T rCheckO = value.checkOg[frameIdx];
+  T r_state;
+  T r_prev_state = 0;
+  T r_state_atv;
+  T r_out;
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
 
-  rValueIn = value.gateValue[frameIdx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
+  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
+  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
+  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
 
-  if (value.prevStateValue) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
-    rPrevState = value.prevStateValue[frameIdx];
+  r_value_in = value.gate_value[frame_idx];
+  r_value_ig = value.gate_value[frame_idx + frame_size];
+  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
+  r_value_og = value.gate_value[frame_idx + frame_size * 3];
+
+  if (value.prev_state_value) {
+    if (is_batch) value.prev_state_value += batch_idx * frame_size;
+    r_prev_state = value.prev_state_value[frame_idx];
   }
 
-  op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
-     rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
+  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
+     r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate,
+     active_state);
 
-  value.gateValue[frameIdx] = rValueIn;
-  value.gateValue[frameIdx + frameSize] = rValueIg;
-  value.gateValue[frameIdx + frameSize * 2] = rValueFg;
-  value.gateValue[frameIdx + frameSize * 3] = rValueOg;
+  value.gate_value[frame_idx] = r_value_in;
+  value.gate_value[frame_idx + frame_size] = r_value_ig;
+  value.gate_value[frame_idx + frame_size * 2] = r_value_fg;
+  value.gate_value[frame_idx + frame_size * 3] = r_value_og;
 
-  value.stateValue[frameIdx] = rState;
-  value.stateActiveValue[frameIdx] = rStateAtv;
-  value.outputValue[frameIdx] = rOut;
+  value.state_value[frame_idx] = r_state;
+  value.state_active_value[frame_idx] = r_state_atv;
+  value.output_value[frame_idx] = r_out;
 }
 
 /*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
  */
-template <class T, class Op, bool isBatch>
+template <class T, class Op, bool is_batch>
 __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
-                               LstmMetaGrad<T> grad, int frameSize,
-                               int batchSize, activation_mode_t active_node,
+                               LstmMetaGrad<T> grad, int frame_size,
+                               int batch_size, activation_mode_t active_node,
                                activation_mode_t active_gate,
                                activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
 
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    value.gateValue += batchIdx * frameSize * 4;
-    value.stateValue += batchIdx * frameSize;
-    value.stateActiveValue += batchIdx * frameSize;
-    grad.gateGrad += batchIdx * frameSize * 4;
-    grad.stateGrad += batchIdx * frameSize;
-    grad.outputGrad += batchIdx * frameSize;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    value.gate_value += batch_idx * frame_size * 4;
+    value.state_value += batch_idx * frame_size;
+    value.state_active_value += batch_idx * frame_size;
+    grad.gate_grad += batch_idx * frame_size * 4;
+    grad.state_grad += batch_idx * frame_size;
+    grad.output_grad += batch_idx * frame_size;
   }
 
-  T rValueIn;
-  T rValueIg;
-  T rValueFg;
-  T rValueOg;
-  T rGradIn;
-  T rGradIg;
-  T rGradFg;
-  T rGradOg;
-  T rPrevState = 0;
-  T rPrevStateGrad;
-  T rState;
-  T rStateGrad;
-  T rStateAtv;
-  T rOutputGrad;
-  T rCheckI = value.checkIg[frameIdx];
-  T rCheckF = value.checkFg[frameIdx];
-  T rCheckO = value.checkOg[frameIdx];
-  T rCheckIGrad;
-  T rCheckFGrad;
-  T rCheckOGrad;
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_grad_in;
+  T r_grad_ig;
+  T r_grad_fg;
+  T r_grad_og;
+  T r_prev_state = 0;
+  T r_prev_state_grad;
+  T r_state;
+  T r_state_grad;
+  T r_state_atv;
+  T r_output_grad;
+  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
+  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
+  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
+
+  T r_checkIGrad;
+  T r_checkFGrad;
+  T r_checkOGrad;
 
-  rValueIn = value.gateValue[frameIdx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
-  rState = value.stateValue[frameIdx];
-  rStateAtv = value.stateActiveValue[frameIdx];
-  rOutputGrad = grad.outputGrad[frameIdx];
-  rStateGrad = grad.stateGrad[frameIdx];
+  r_value_in = value.gate_value[frame_idx];
+  r_value_ig = value.gate_value[frame_idx + frame_size];
+  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
+  r_value_og = value.gate_value[frame_idx + frame_size * 3];
+  r_state = value.state_value[frame_idx];
+  r_state_atv = value.state_active_value[frame_idx];
+  r_output_grad = grad.output_grad[frame_idx];
+  r_state_grad = grad.state_grad[frame_idx];
 
-  if (value.prevStateValue) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
-    rPrevState = value.prevStateValue[frameIdx];
+  if (value.prev_state_value) {
+    if (is_batch) value.prev_state_value += batch_idx * frame_size;
+    r_prev_state = value.prev_state_value[frame_idx];
   }
 
-  op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg,
-     rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad,
-     rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad,
-     active_node, active_gate, active_state);
+  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
+     r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
+     r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
+     r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
+     active_state);
 
-  grad.gateGrad[frameIdx] = rGradIn;
-  grad.gateGrad[frameIdx + frameSize] = rGradIg;
-  grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
-  grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
-  grad.stateGrad[frameIdx] = rStateGrad;
-  if (grad.prevStateGrad) {
-    if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
-    grad.prevStateGrad[frameIdx] = rPrevStateGrad;
+  grad.gate_grad[frame_idx] = r_grad_in;
+  grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
+  grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg;
+  grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og;
+  grad.state_grad[frame_idx] = r_state_grad;
+  if (grad.prev_state_grad) {
+    if (is_batch) grad.prev_state_grad += batch_idx * frame_size;
+    grad.prev_state_grad[frame_idx] = r_prev_state_grad;
   }
 
-  if (isBatch) {
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad)
-        paddle::platform::CudaAtomicAdd(grad.checkIgGrad + frameIdx,
-                                        rCheckIGrad);
-      if (grad.checkFgGrad)
-        paddle::platform::CudaAtomicAdd(grad.checkFgGrad + frameIdx,
-                                        rCheckFGrad);
+  if (is_batch) {
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad)
+        paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx,
+                                        r_checkIGrad);
+      if (grad.check_fg_grad)
+        paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx,
+                                        r_checkFGrad);
     }
-    if (grad.checkOgGrad)
-      paddle::platform::CudaAtomicAdd(grad.checkOgGrad + frameIdx, rCheckOGrad);
+    if (grad.check_og_grad)
+      paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx,
+                                      r_checkOGrad);
   } else {
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
-      if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;
+      if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad;
     }
-    if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
+    if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad;
   }
 }
 
 template <class T, class Op>
 void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
-                      LstmMetaValue<T> value, int frameSize, int batchSize,
+                      LstmMetaValue<T> value, int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate,
                       activation_mode_t active_state) {
   dim3 threads;
   dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
+  if (batch_size == 1) {
+    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+    int frame_blocks = (frame_size + 1024 - 1) / 1024;
+    threads = dim3(frame_per_block, 1);
+    grid = dim3(frame_blocks, 1);
   } else {
-    /* framePerBlock = 32 batchPerBlock = 32 */
+    /* frame_per_block = 32 batch_per_block = 32 */
     threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
   }
 
   auto stream =
       reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
-  if (batchSize == 1) {
+  if (batch_size == 1) {
     KeLstmForward<T, Op,
-                  /* isBatch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, frameSize, batchSize, active_node, active_gate,
+                  /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+        op, value, frame_size, batch_size, active_node, active_gate,
         active_state);
   } else {
     KeLstmForward<T, Op,
-                  /* isBatch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, frameSize, batchSize, active_node, active_gate,
+                  /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+        op, value, frame_size, batch_size, active_node, active_gate,
         active_state);
   }
 }
@@ -214,41 +219,36 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
 template <class T, class Op>
 void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
                        LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frameSize, int batchSize,
+                       int frame_size, int batch_size,
                        activation_mode_t active_node,
                        activation_mode_t active_gate,
                        activation_mode_t active_state) {
   dim3 threads;
   dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
+  if (batch_size == 1) {
+    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+    int frame_blocks = (frame_size + 1024 - 1) / 1024;
+    threads = dim3(frame_per_block, 1);
+    grid = dim3(frame_blocks, 1);
   } else {
-    /* framePerBlock = 32 batchPerBlock = 16 */
+    /* frame_per_block = 32 batch_per_block = 16 */
     threads = dim3(32, 16);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 16 - 1) / 16);
+    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
   }
 
   auto stream =
       reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
-  if (batchSize == 1) {
+  if (batch_size == 1) {
     KeLstmBackward<T, Op,
-                   /* isBatch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, grad, frameSize, batchSize, active_node, active_gate,
+                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+        op, value, grad, frame_size, batch_size, active_node, active_gate,
         active_state);
   } else {
     KeLstmBackward<T, Op,
-                   /* isBatch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, grad, frameSize, batchSize, active_node, active_gate,
+                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+        op, value, grad, frame_size, batch_size, active_node, active_gate,
         active_state);
   }
-
-  cudaStreamSynchronize(stream);
-  // TODO(qingqing): Add cuda error check for each kernel.
-  cudaError_t err = cudaGetLastError();
-  PADDLE_ENFORCE(err, cudaGetErrorString(err));
 }
 
 }  // namespace detail
diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/operators/math/detail/lstm_kernel.h
index 9daaf91981..78f9a249a3 100644
--- a/paddle/operators/math/detail/lstm_kernel.h
+++ b/paddle/operators/math/detail/lstm_kernel.h
@@ -27,19 +27,19 @@ namespace forward {
 template <class T>
 class lstm {
  public:
-  HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
-                             T &prevState, T &state, T &stateAtv, T &output,
+  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
+                             T &prev_state, T &state, T &state_atv, T &output,
                              T &checkI, T &checkF, T &checkO,
                              activation_mode_t active_node,
                              activation_mode_t active_gate,
                              activation_mode_t active_state) {
-    valueIn = activation(valueIn, active_node);
-    valueIg = activation(valueIg + prevState * checkI, active_gate);
-    valueFg = activation(valueFg + prevState * checkF, active_gate);
-    state = valueIn * valueIg + prevState * valueFg;
-    valueOg = activation(valueOg + state * checkO, active_gate);
-    stateAtv = activation(state, active_state);
-    output = valueOg * stateAtv;
+    value_in = activation(value_in, active_node);
+    value_ig = activation(value_ig + prev_state * checkI, active_gate);
+    value_fg = activation(value_fg + prev_state * checkF, active_gate);
+    state = value_in * value_ig + prev_state * value_fg;
+    value_og = activation(value_og + state * checkO, active_gate);
+    state_atv = activation(state, active_state);
+    output = value_og * state_atv;
   }
 #ifndef __NVCC__
 #ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
@@ -48,24 +48,27 @@ class lstm {
   // Only float support AVX optimization
   static const bool avx = std::is_same<T, float>::value;
 
-  HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg,
-                             __m256 &valueOg, __m256 &prevState, __m256 &state,
-                             __m256 &stateAtv, __m256 &output, __m256 &checkI,
+  HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig,
+                             __m256 &value_fg, __m256 &value_og,
+                             __m256 &prev_state, __m256 &state,
+                             __m256 &state_atv, __m256 &output, __m256 &checkI,
                              __m256 &checkF, __m256 &checkO,
                              activation_mode_t active_node,
                              activation_mode_t active_gate,
                              activation_mode_t active_state) {
-    valueIn = activation(valueIn, active_node);
-    valueIg = activation(
-        _mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)), active_gate);
-    valueFg = activation(
-        _mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)), active_gate);
-    state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg),
-                          _mm256_mul_ps(prevState, valueFg));
-    valueOg = activation(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)),
-                         active_gate);
-    stateAtv = activation(state, active_state);
-    output = _mm256_mul_ps(valueOg, stateAtv);
+    value_in = activation(value_in, active_node);
+    value_ig =
+        activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)),
+                   active_gate);
+    value_fg =
+        activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)),
+                   active_gate);
+    state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig),
+                          _mm256_mul_ps(prev_state, value_fg));
+    value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)),
+                          active_gate);
+    state_atv = activation(state, active_state);
+    output = _mm256_mul_ps(value_og, state_atv);
   }
 #endif
 #endif
@@ -78,25 +81,26 @@ namespace backward {
 template <class T>
 class lstm {
  public:
-  HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
-                             T &gradIn, T &gradIg, T &gradFg, T &gradOg,
-                             T &prevState, T &prevStateGrad, T &state,
-                             T &stateGrad, T &stateAtv, T &outputGrad,
+  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
+                             T &grad_in, T &grad_ig, T &grad_fg, T &grad_og,
+                             T &prev_state, T &prev_state_grad, T &state,
+                             T &state_grad, T &state_atv, T &output_grad,
                              T &checkI, T &checkF, T &checkO, T &checkIGrad,
                              T &checkFGrad, T &checkOGrad,
                              activation_mode_t active_node,
                              activation_mode_t active_gate,
                              activation_mode_t active_state) {
-    gradOg = activation(outputGrad * stateAtv, valueOg, active_gate);
-    stateGrad += activation(outputGrad * valueOg, stateAtv, active_state) +
-                 gradOg * checkO;
-    gradIn = activation(stateGrad * valueIg, valueIn, active_node);
-    gradIg = activation(stateGrad * valueIn, valueIg, active_gate);
-    gradFg = activation(stateGrad * prevState, valueFg, active_gate);
-    prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg;
-    checkIGrad = gradIg * prevState;
-    checkFGrad = gradFg * prevState;
-    checkOGrad = gradOg * state;
+    grad_og = activation(output_grad * state_atv, value_og, active_gate);
+    state_grad += activation(output_grad * value_og, state_atv, active_state) +
+                  grad_og * checkO;
+    grad_in = activation(state_grad * value_ig, value_in, active_node);
+    grad_ig = activation(state_grad * value_in, value_ig, active_gate);
+    grad_fg = activation(state_grad * prev_state, value_fg, active_gate);
+    prev_state_grad =
+        grad_ig * checkI + grad_fg * checkF + state_grad * value_fg;
+    checkIGrad = grad_ig * prev_state;
+    checkFGrad = grad_fg * prev_state;
+    checkOGrad = grad_og * state;
   }
 #ifndef __NVCC__
 #ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
@@ -105,32 +109,32 @@ class lstm {
   // Only float support AVX optimization
   static const bool avx = std::is_same<T, float>::value;
   HOSTDEVICE void operator()(
-      __m256 &valueIn, __m256 &valueIg, __m256 &valueFg, __m256 &valueOg,
-      __m256 &gradIn, __m256 &gradIg, __m256 &gradFg, __m256 &gradOg,
-      __m256 &prevState, __m256 &prevStateGrad, __m256 &state,
-      __m256 &stateGrad, __m256 &stateAtv, __m256 &outputGrad, __m256 &checkI,
-      __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, __m256 &checkFGrad,
-      __m256 &checkOGrad, activation_mode_t active_node,
+      __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og,
+      __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og,
+      __m256 &prev_state, __m256 &prev_state_grad, __m256 &state,
+      __m256 &state_grad, __m256 &state_atv, __m256 &output_grad,
+      __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad,
+      __m256 &checkFGrad, __m256 &checkOGrad, activation_mode_t active_node,
       activation_mode_t active_gate, activation_mode_t active_state) {
-    gradOg =
-        activation(_mm256_mul_ps(outputGrad, stateAtv), valueOg, active_gate);
-    stateGrad = _mm256_add_ps(
-        activation(_mm256_mul_ps(outputGrad, valueOg), stateAtv, active_state),
-        stateGrad);
-    stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad);
-    gradIn =
-        activation(_mm256_mul_ps(stateGrad, valueIg), valueIn, active_node);
-    gradIg =
-        activation(_mm256_mul_ps(stateGrad, valueIn), valueIg, active_gate);
-    gradFg =
-        activation(_mm256_mul_ps(stateGrad, prevState), valueFg, active_gate);
-    prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI),
-                                  _mm256_mul_ps(gradFg, checkF));
-    prevStateGrad =
-        _mm256_add_ps(_mm256_mul_ps(stateGrad, valueFg), prevStateGrad);
-    checkIGrad = _mm256_mul_ps(gradIg, prevState);
-    checkFGrad = _mm256_mul_ps(gradFg, prevState);
-    checkOGrad = _mm256_mul_ps(gradOg, state);
+    grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og,
+                         active_gate);
+    state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og),
+                                          state_atv, active_state),
+                               state_grad);
+    state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad);
+    grad_in =
+        activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node);
+    grad_ig =
+        activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate);
+    grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg,
+                         active_gate);
+    prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI),
+                                    _mm256_mul_ps(grad_fg, checkF));
+    prev_state_grad =
+        _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad);
+    checkIGrad = _mm256_mul_ps(grad_ig, prev_state);
+    checkFGrad = _mm256_mul_ps(grad_fg, prev_state);
+    checkOGrad = _mm256_mul_ps(grad_og, state);
   }
 #endif
 #endif
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc
new file mode 100644
index 0000000000..ae4e47b014
--- /dev/null
+++ b/paddle/operators/math/gru_compute.cc
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/detail/gru_cpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::CPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+#ifndef __NVCC__
+    if (value.prev_out_value) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
+          1, value.gate_value, frame_size * 3);
+    }
+
+    detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
+                                 frame_size, batch_size, active_gate);
+
+    if (value.prev_out_value) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, false, batch_size, frame_size, frame_size, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
+          1, value.gate_value + frame_size * 2, frame_size * 3);
+    }
+
+    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
+                                 frame_size, batch_size, active_node);
+#endif
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::CPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+#ifndef __NVCC__
+    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
+                                grad, frame_size, batch_size, active_node);
+
+    if (value.prev_out_value && grad.prev_out_grad) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, true, batch_size, frame_size, frame_size, 1,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
+          frame_size, 0, grad.reset_output_grad, frame_size);
+
+      if (grad.state_weight_grad) {
+        math::gemm<platform::CPUPlace, T>(
+            context, true, false, frame_size, frame_size, batch_size, 1,
+            value.reset_output_value, frame_size,
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
+      }
+    }
+
+    detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
+                                grad, frame_size, batch_size, active_gate);
+
+    if (grad.prev_out_grad && value.prev_out_value) {
+      math::gemm<platform::CPUPlace, T>(
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
+          grad.prev_out_grad, frame_size);
+
+      if (grad.gate_weight_grad) {
+        math::gemm<platform::CPUPlace, T>(
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
+            grad.gate_weight_grad, frame_size * 2);
+      }
+    }
+#endif
+  }
+};
+
+template struct GRUUnitFunctor<platform::CPUPlace, float>;
+template struct GRUUnitFunctor<platform::CPUPlace, double>;
+template struct GRUUnitGradFunctor<platform::CPUPlace, float>;
+template struct GRUUnitGradFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu
new file mode 100644
index 0000000000..0252bdbdb6
--- /dev/null
+++ b/paddle/operators/math/gru_compute.cu
@@ -0,0 +1,180 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/gru_gpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::GPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    dim3 threads;
+    dim3 grid;
+    if (batch_size == 1) {
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
+      threads = dim3(frame_per_block, 1);
+      grid = dim3(frame_blocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
+    }
+
+    if (value.prev_out_value) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
+          1, value.gate_value, frame_size * 3);
+    }
+
+    if (batch_size == 1) {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* is_batch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
+          value.reset_output_value, value.prev_out_value, frame_size,
+          batch_size, active_gate);
+    } else {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* is_batch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
+          value.reset_output_value, value.prev_out_value, frame_size,
+          batch_size, active_gate);
+    }
+
+    if (value.prev_out_value) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, false, batch_size, frame_size, frame_size, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
+          1, value.gate_value + frame_size * 2, frame_size * 3);
+    }
+
+    if (batch_size == 1) {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* is_batch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
+          active_node);
+    } else {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* is_batch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
+          active_node);
+    }
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::GPUPlace, T> {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate) {
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
+    dim3 threads;
+    dim3 grid;
+    if (batch_size == 1) {
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
+      threads = dim3(frame_per_block, 1);
+      grid = dim3(frame_blocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
+    }
+
+    if (batch_size == 1) {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.output_grad, frame_size, batch_size, active_node);
+    } else {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.output_grad, frame_size, batch_size, active_node);
+    }
+
+    if (value.prev_out_value && grad.prev_out_grad) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, true, batch_size, frame_size, frame_size, 1,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
+          frame_size, 0, grad.reset_output_grad, frame_size);
+
+      if (grad.state_weight_grad) {
+        math::gemm<platform::GPUPlace, T>(
+            context, true, false, frame_size, frame_size, batch_size, 1,
+            value.reset_output_value, frame_size,
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
+      }
+    }
+
+    if (batch_size == 1) {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
+    } else {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
+    }
+
+    if (grad.prev_out_grad && value.prev_out_value) {
+      math::gemm<platform::GPUPlace, T>(
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
+          grad.prev_out_grad, frame_size);
+
+      if (grad.gate_weight_grad) {
+        math::gemm<platform::GPUPlace, T>(
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
+            grad.gate_weight_grad, frame_size * 2);
+      }
+    }
+  }
+};
+
+template struct GRUUnitFunctor<platform::GPUPlace, float>;
+template struct GRUUnitFunctor<platform::GPUPlace, double>;
+template struct GRUUnitGradFunctor<platform::GPUPlace, float>;
+template struct GRUUnitGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h
new file mode 100644
index 0000000000..58ea59f68e
--- /dev/null
+++ b/paddle/operators/math/gru_compute.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// TODO(guosheng): refine code style in gru_compute
+template <typename T>
+struct hl_gru_value {
+  T *gate_weight;
+  T *state_weight;
+  T *gate_value;
+  T *reset_output_value;
+  T *output_value;
+  T *prev_out_value;
+};
+
+template <typename T>
+struct hl_gru_grad {
+  T *gate_weight_grad;
+  T *state_weight_grad;
+  T *gate_grad;
+  T *reset_output_grad;
+  T *output_grad;
+  T *prev_out_grad;
+};
+
+template <typename Place, typename T>
+struct GRUUnitFunctor {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate);
+};
+
+template <typename Place, typename T>
+struct GRUUnitGradFunctor {
+  static void compute(const platform::DeviceContext &context,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc
index 3b1b0bd71d..c10c44c520 100644
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
@@ -28,57 +28,55 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                     platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_up,
-                  int padding_down, int padding_left, int padding_right) {
+                  const framework::Tensor& im, const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col) {
     PADDLE_ENFORCE(im.dims().size() == 3);
-    PADDLE_ENFORCE(col.dims().size() == 5);
+    PADDLE_ENFORCE(col->dims().size() == 5);
 
-    int input_channels = im.dims()[0];
-    int input_height = im.dims()[1];
-    int input_width = im.dims()[2];
-    int filter_height = col.dims()[1];
-    int filter_width = col.dims()[2];
-    int output_height = col.dims()[3];
-    int output_width = col.dims()[4];
+    int im_channels = im.dims()[0];
+    int im_height = im.dims()[1];
+    int im_width = im.dims()[2];
+    int filter_height = col->dims()[1];
+    int filter_width = col->dims()[2];
+    int col_height = col->dims()[3];
+    int col_width = col->dims()[4];
 
-    PADDLE_ENFORCE_EQ(
-        (input_height + padding_up + padding_down - filter_height) /
-                stride_height +
-            1,
-        output_height,
-        "Output_height and padding(padding_up, padding_down) are "
-        "inconsistent.");
-    PADDLE_ENFORCE_EQ(
-        (input_width + padding_left + padding_right - filter_width) /
-                stride_width +
-            1,
-        output_width,
-        "output_width and padding(padding_left, padding_right) are "
-        "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       ((dilation[0] * (filter_height - 1) + 1))) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       ((dilation[1] * (filter_width - 1) + 1))) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
 
-    int channels_col = input_channels * filter_height * filter_width;
+    int channels_col = im_channels * filter_height * filter_width;
 
     const T* im_data = im.data<T>();
-    T* col_data = col.data<T>();
+    T* col_data = col->data<T>();
 
     for (int c = 0; c < channels_col; ++c) {
       int w_offset = c % filter_width;
       int h_offset = (c / filter_width) % filter_height;
       int c_im = c / filter_width / filter_height;
-      for (int h = 0; h < output_height; ++h) {
-        for (int w = 0; w < output_width; ++w) {
-          int im_row_idx = h * stride_height + h_offset - padding_up;
-          int im_col_idx = w * stride_width + w_offset - padding_left;
+      for (int h = 0; h < col_height; ++h) {
+        for (int w = 0; w < col_width; ++w) {
+          int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
+          int col_idx = (c * col_height + h) * col_width + w;
+          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
 
-          if (im_row_idx < 0 || im_row_idx >= input_height || im_col_idx < 0 ||
-              im_col_idx >= input_width) {
-            col_data[(c * output_height + h) * output_width + w] = T(0);
-          } else {
-            im_row_idx += c_im * input_height;
-            col_data[(c * output_height + h) * output_width + w] =
-                im_data[im_row_idx * input_width + im_col_idx];
-          }
+          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
+                               im_col_idx < 0 || im_col_idx >= im_width)
+                                  ? static_cast<T>(0)
+                                  : im_data[im_idx];
         }
       }
     }
@@ -94,54 +92,55 @@ template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                     platform::CPUPlace, T> {
  public:
-  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_up, int padding_down,
-                  int padding_left, int padding_right) {
-    PADDLE_ENFORCE(im.dims().size() == 3);
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im) {
+    PADDLE_ENFORCE(im->dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
-    int input_channels = im.dims()[0];
-    int input_height = im.dims()[1];
-    int input_width = im.dims()[2];
+    int im_channels = im->dims()[0];
+    int im_height = im->dims()[1];
+    int im_width = im->dims()[2];
     int filter_height = col.dims()[1];
     int filter_width = col.dims()[2];
-    int output_height = col.dims()[3];
-    int output_width = col.dims()[4];
+    int col_height = col.dims()[3];
+    int col_width = col.dims()[4];
 
-    PADDLE_ENFORCE_EQ(
-        (input_height + padding_up + padding_down - filter_height) /
-                stride_height +
-            1,
-        output_height,
-        "Output_height and padding(padding_up, padding_down) are "
-        "inconsistent.");
-    PADDLE_ENFORCE_EQ(
-        (input_width + padding_left + padding_right - filter_width) /
-                stride_width +
-            1,
-        output_width,
-        "output_width and padding(padding_left, padding_right) are "
-        "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       ((dilation[0] * (filter_height - 1) + 1))) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       ((dilation[1] * (filter_width - 1) + 1))) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
 
-    int channels_col = input_channels * filter_height * filter_width;
+    int channels_col = im_channels * filter_height * filter_width;
 
-    T* im_data = im.data<T>();
+    T* im_data = im->data<T>();
     const T* col_data = col.data<T>();
 
     for (int c = 0; c < channels_col; ++c) {
       int w_offset = c % filter_width;
       int h_offset = (c / filter_width) % filter_height;
       int c_im = c / filter_width / filter_height;
-      for (int h = 0; h < output_height; ++h) {
-        for (int w = 0; w < output_width; ++w) {
-          int im_row_idx = h * stride_height + h_offset - padding_up;
-          int im_col_idx = w * stride_width + w_offset - padding_left;
+      for (int h = 0; h < col_height; ++h) {
+        for (int w = 0; w < col_width; ++w) {
+          int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
 
-          if ((im_row_idx) >= 0 && (im_row_idx) < input_height &&
-              (im_col_idx) >= 0 && (im_col_idx) < input_width) {
-            im_row_idx += c_im * input_height;
-            im_data[im_row_idx * input_width + im_col_idx] +=
-                col_data[(c * output_height + h) * output_width + w];
+          if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
+              (im_col_idx) >= 0 && (im_col_idx) < im_width) {
+            im_row_idx += c_im * im_height;
+            im_data[im_row_idx * im_width + im_col_idx] +=
+                col_data[(c * col_height + h) * col_width + w];
           }
         }
       }
@@ -168,64 +167,59 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                     platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_up,
-                  int padding_down, int padding_left, int padding_right) {
+                  const framework::Tensor& im, const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col) {
     PADDLE_ENFORCE(im.dims().size() == 3);
-    PADDLE_ENFORCE(col.dims().size() == 5);
-    int input_channels = im.dims()[0];
-    int input_height = im.dims()[1];
-    int input_width = im.dims()[2];
-    int filter_height = col.dims()[3];
-    int filter_width = col.dims()[4];
-    int output_height = col.dims()[0];
-    int output_width = col.dims()[1];
+    PADDLE_ENFORCE(col->dims().size() == 5);
+    int im_channels = im.dims()[0];
+    int im_height = im.dims()[1];
+    int im_width = im.dims()[2];
+    int filter_height = col->dims()[3];
+    int filter_width = col->dims()[4];
+    int col_height = col->dims()[0];
+    int col_width = col->dims()[1];
 
     PADDLE_ENFORCE_EQ(
-        (input_height + padding_up + padding_down - filter_height) /
-                stride_height +
-            1,
-        output_height,
+        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
+        col_height,
         "Output_height and padding(padding_up, padding_down) are "
         "inconsistent.");
     PADDLE_ENFORCE_EQ(
-        (input_width + padding_left + padding_right - filter_width) /
-                stride_width +
-            1,
-        output_width,
-        "output_width and padding(padding_left, padding_right) are "
+        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
+        col_width,
+        "col_width and padding(padding_left, padding_right) are "
         "inconsistent.");
 
     const T* im_data = im.data<T>();
-    T* col_data = col.data<T>();
+    T* col_data = col->data<T>();
 
-    for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) {
-      for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) {
-        for (int channel = 0; channel < input_channels; ++channel) {
+    for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
+      for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
+        for (int channel = 0; channel < im_channels; ++channel) {
           for (int filter_row_idx = 0; filter_row_idx < filter_height;
                ++filter_row_idx) {
             for (int filter_col_idx = 0; filter_col_idx < filter_width;
                  ++filter_col_idx) {
               int im_row_offset =
-                  col_row_idx * stride_height + filter_row_idx - padding_up;
+                  col_row_idx * stride[0] + filter_row_idx - padding[0];
               int im_col_offset =
-                  col_col_idx * stride_width + filter_col_idx - padding_left;
-              int col_offset = ((((col_row_idx)*output_width + col_col_idx) *
-                                     input_channels +
-                                 channel) *
-                                    filter_height +
-                                filter_row_idx) *
-                                   filter_width +
-                               filter_col_idx;
-              if (im_row_offset < 0 || im_row_offset >= input_height ||
-                  im_col_offset < 0 || im_col_offset >= input_width) {
-                col_data[col_offset] = T(0);
-              } else {
-                int im_offset =
-                    (channel * input_height + im_row_offset) * input_width +
-                    im_col_offset;
-                col_data[col_offset] = im_data[im_offset];
-              }
+                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+              int col_offset =
+                  ((((col_row_idx)*col_width + col_col_idx) * im_channels +
+                    channel) *
+                       filter_height +
+                   filter_row_idx) *
+                      filter_width +
+                  filter_col_idx;
+
+              int im_offset = (channel * im_height + im_row_offset) * im_width +
+                              im_col_offset;
+              col_data[col_offset] =
+                  (im_row_offset < 0 || im_row_offset >= im_height ||
+                   im_col_offset < 0 || im_col_offset >= im_width)
+                      ? static_cast<T>(0)
+                      : im_data[im_offset];
             }
           }
         }
@@ -243,60 +237,57 @@ template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                     platform::CPUPlace, T> {
  public:
-  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_up, int padding_down,
-                  int padding_left, int padding_right) {
-    PADDLE_ENFORCE(im.dims().size() == 3);
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im) {
+    PADDLE_ENFORCE(im->dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
-    int input_channels = im.dims()[0];
-    int input_height = im.dims()[1];
-    int input_width = im.dims()[2];
+    int im_channels = im->dims()[0];
+    int im_height = im->dims()[1];
+    int im_width = im->dims()[2];
     int filter_height = col.dims()[3];
     int filter_width = col.dims()[4];
-    int output_height = col.dims()[0];
-    int output_width = col.dims()[1];
+    int col_height = col.dims()[0];
+    int col_width = col.dims()[1];
 
     PADDLE_ENFORCE_EQ(
-        (input_height + padding_up + padding_down - filter_height) /
-                stride_height +
-            1,
-        output_height,
+        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
+        col_height,
         "Output_height and padding(padding_up, padding_down) are "
         "inconsistent.");
     PADDLE_ENFORCE_EQ(
-        (input_width + padding_left + padding_right - filter_width) /
-                stride_width +
-            1,
-        output_width,
-        "output_width and padding(padding_left, padding_right) are "
+        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
+        col_width,
+        "col_width and padding(padding_left, padding_right) are "
         "inconsistent.");
 
-    T* im_data = im.data<T>();
+    T* im_data = im->data<T>();
     const T* col_data = col.data<T>();
 
-    for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) {
-      for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) {
-        for (int channel = 0; channel < input_channels; ++channel) {
+    for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
+      for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
+        for (int channel = 0; channel < im_channels; ++channel) {
           for (int filter_row_idx = 0; filter_row_idx < filter_height;
                ++filter_row_idx) {
             for (int filter_col_idx = 0; filter_col_idx < filter_width;
                  ++filter_col_idx) {
               int im_row_offset =
-                  col_row_idx * stride_height + filter_row_idx - padding_up;
+                  col_row_idx * stride[0] + filter_row_idx - padding[0];
               int im_col_offset =
-                  col_col_idx * stride_width + filter_col_idx - padding_left;
-              int col_offset = (((col_row_idx * output_width + col_col_idx) *
-                                     input_channels +
-                                 channel) *
-                                    filter_height +
-                                filter_row_idx) *
-                                   filter_width +
-                               filter_col_idx;
-              if (im_row_offset >= 0 && im_row_offset < input_height &&
-                  im_col_offset >= 0 && im_col_offset < input_width) {
+                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+              int col_offset =
+                  (((col_row_idx * col_width + col_col_idx) * im_channels +
+                    channel) *
+                       filter_height +
+                   filter_row_idx) *
+                      filter_width +
+                  filter_col_idx;
+              if (im_row_offset >= 0 && im_row_offset < im_height &&
+                  im_col_offset >= 0 && im_col_offset < im_width) {
                 int im_offset =
-                    (channel * input_height + im_row_offset) * input_width +
+                    (channel * im_height + im_row_offset) * im_width +
                     im_col_offset;
                 im_data[im_offset] += col_data[col_offset];
               }
diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu
index 7b201fdbf3..bf78942439 100644
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
@@ -20,36 +20,32 @@ namespace operators {
 namespace math {
 
 template <class T>
-__global__ void im2col(const T* data_im, int num_outs, int height, int width,
+__global__ void im2col(const T* data_im, int num_outs, int im_height,
+                       int im_width, int dilation_h, int dilation_w,
                        int filter_height, int filter_width, int stride_height,
                        int stride_width, int padding_height, int padding_width,
-                       int output_height, int output_width, T* data_col) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+                       int col_height, int col_width, T* data_col) {
+  const int index =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
   if (index < num_outs) {
-    int w_out = index % output_width;
-    index /= output_width;
-    int h_out = index % output_height;
-    int channel_in = index / output_height;
+    int w_out = index % col_width;
+    int h_out = (index / col_width) % col_height;
+    int channel_in = index / col_width / col_height;
     int channel_out = channel_in * filter_height * filter_width;
-    int h_in = h_out * stride_height;
-    int w_in = w_out * stride_width;
+    int h_in = h_out * stride_height - padding_height;
+    int w_in = w_out * stride_width - padding_width;
 
-    data_col += (channel_out * output_height + h_out) * output_width + w_out;
+    data_col += (channel_out * col_height + h_out) * col_width + w_out;
+    data_im += (channel_in * im_height + h_in) * im_width + w_in;
     for (int i = 0; i < filter_height; ++i) {
       for (int j = 0; j < filter_width; ++j) {
-        int rIdx = int(h_in + i);
-        int cIdx = int(w_in + j);
-        if ((rIdx - (int)padding_height) >= (int)height ||
-            (rIdx - (int)padding_height) < 0 ||
-            (cIdx - (int)padding_width) >= (int)width ||
-            (cIdx - (int)padding_width) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in * height - padding_height;
-          cIdx = cIdx - padding_width;
-          *data_col = data_im[rIdx * width + cIdx];
-        }
-        data_col += output_height * output_width;
+        int rIdx = h_in + i * dilation_h;
+        int cIdx = w_in + j * dilation_w;
+        *data_col =
+            (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0)
+                ? 0
+                : data_im[i * dilation_h * im_width + j * dilation_w];
+        data_col += col_height * col_width;
       }
     }
   }
@@ -65,30 +61,36 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                     platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_up,
-                  int padding_down, int padding_left, int padding_right) {
+                  const framework::Tensor& im, const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col) {
     PADDLE_ENFORCE(im.dims().size() == 3);
-    PADDLE_ENFORCE(col.dims().size() == 5);
-
-    int input_channels = im.dims()[0];
-    int input_height = im.dims()[1];
-    int input_width = im.dims()[2];
-    int filter_height = col.dims()[1];
-    int filter_width = col.dims()[2];
-    int output_height = col.dims()[3];
-    int output_width = col.dims()[4];
-
-    PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
-                           stride_height +
-                       1 ==
-                   output_height);
-    PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
-                           stride_width +
-                       1 ==
-                   output_width);
-
-    int num_outputs = input_channels * output_height * output_width;
+    PADDLE_ENFORCE(col->dims().size() == 5);
+
+    int im_channels = im.dims()[0];
+    int im_height = im.dims()[1];
+    int im_width = im.dims()[2];
+    int filter_height = col->dims()[1];
+    int filter_width = col->dims()[2];
+    int col_height = col->dims()[3];
+    int col_width = col->dims()[4];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       (dilation[0] * (filter_height - 1) + 1)) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       (dilation[1] * (filter_width - 1) + 1)) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "col_width and padding(padding_left, padding_right) are "
+                      "inconsistent.");
+
+    int num_outputs = im_channels * col_height * col_width;
     int blocks = (num_outputs + 1024 - 1) / 1024;
     int block_x = 512;
     int block_y = (blocks + 512 - 1) / 512;
@@ -97,56 +99,57 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     im2col<T><<<grid, threads, 0,
                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
                     .stream()>>>(
-        im.data<T>(), num_outputs, input_height, input_width, filter_height,
-        filter_width, stride_height, stride_width, padding_up, padding_left,
-        output_height, output_width, col.data<T>());
+        im.data<T>(), num_outputs, im_height, im_width, dilation[0],
+        dilation[1], filter_height, filter_width, stride[0], stride[1],
+        padding[0], padding[1], col_height, col_width, col->data<T>());
   }
 };
 
 template <class T>
-__global__ void col2im(size_t n, const T* data_col, size_t height, size_t width,
-                       size_t channels, size_t filter_height,
-                       size_t filter_width, size_t stride_height,
-                       size_t stride_width, size_t padding_height,
-                       size_t padding_width, size_t output_height,
-                       size_t output_width, T* data_im) {
-  size_t index =
+__global__ void col2im(int n, const T* data_col, int im_height, int im_width,
+                       int dilation_h, int dilation_w, int filter_height,
+                       int filter_width, int stride_height, int stride_width,
+                       int padding_height, int padding_width, int col_height,
+                       int col_width, T* data_im) {
+  const int index =
       (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  const int d_filter_height = dilation_h * (filter_height - 1) + 1;
+  const int d_filter_width = dilation_w * (filter_width - 1) + 1;
+
   if (index < n) {
     T val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    if ((w - (int)padding_width) >= 0 &&
-        (w - (int)padding_width) < (width - 2 * padding_width) &&
-        (h - (int)padding_height) >= 0 &&
-        (h - padding_height) < (height - 2 * padding_height)) {
-      // compute the start and end of the output
-      int w_col_start = (w < (int)filter_width)
-                            ? 0
-                            : (w - int(filter_width)) / (int)stride_width + 1;
-      int w_col_end =
-          min((int)(w / (int)stride_width + 1), (int)(output_width));
-      int h_col_start = (h < (int)filter_height)
-                            ? 0
-                            : (h - (int)filter_height) / (int)stride_height + 1;
-      int h_col_end = min(int(h / stride_height + 1), int(output_height));
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * filter_height * filter_width) +
-                      (h - h_col * (int)stride_height) * (int)filter_width +
-                      (w - w_col * (int)stride_width);
-          val +=
-              data_col[(c_col * output_height + h_col) * output_width + w_col];
+    int w = index % im_width + padding_width;
+    int h = (index / im_width) % im_height + padding_height;
+    int c = index / (im_width * im_height);
+
+    // compute the start and end of the output
+    int w_col_start =
+        (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1;
+    int w_col_end = min(w / stride_width + 1, col_width);
+    int h_col_start =
+        (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1;
+    int h_col_end = min(h / stride_height + 1, col_height);
+
+    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+        int h_off = (h - h_col * stride_height);
+        int w_off = (w - w_col * stride_width);
+        if (h_off % dilation_h == 0 && w_off % dilation_w == 0) {
+          h_off /= dilation_h;
+          w_off /= dilation_w;
+          int data_col_index =
+              (((c * filter_height + h_off) * filter_width + w_off) *
+                   col_height +
+               h_col) *
+                  col_width +
+              w_col;
+
+          val += data_col[data_col_index];
         }
       }
-      h -= padding_height;
-      w -= padding_width;
-      data_im[c * ((width - 2 * padding_width) *
-                   (height - 2 * padding_height)) +
-              h * (width - 2 * padding_width) + w] += val;
     }
+    data_im[index] = val;
   }
 }
 
@@ -159,33 +162,38 @@ template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                     platform::GPUPlace, T> {
  public:
-  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_up, int padding_down,
-                  int padding_left, int padding_right) {
-    PADDLE_ENFORCE(im.dims().size() == 3);
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im) {
+    PADDLE_ENFORCE(im->dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
 
-    int input_channels = im.dims()[0];
-    int input_height = im.dims()[1];
-    int input_width = im.dims()[2];
+    int im_channels = im->dims()[0];
+    int im_height = im->dims()[1];
+    int im_width = im->dims()[2];
     int filter_height = col.dims()[1];
     int filter_width = col.dims()[2];
-    int output_height = col.dims()[3];
-    int output_width = col.dims()[4];
-
-    PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
-                           stride_height +
-                       1 ==
-                   output_height);
-    PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
-                           stride_width +
-                       1 ==
-                   output_width);
-
-    size_t num_kernels = input_channels *
-                         (input_height + padding_up + padding_down) *
-                         (input_width + padding_left + padding_right);
+    int col_height = col.dims()[3];
+    int col_width = col.dims()[4];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       (dilation[0] * (filter_height - 1) + 1)) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       (dilation[1] * (filter_width - 1) + 1)) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "col_width and padding(padding_left, padding_right) are "
+                      "inconsistent.");
+
+    size_t num_kernels = im_channels * im_height * im_width;
 
     size_t blocks = (num_kernels + 1024 - 1) / 1024;
     size_t block_x = 512;
@@ -198,10 +206,9 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
     col2im<T><<<grid, threads, 0,
                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
                     .stream()>>>(
-        num_kernels, col.data<T>(), input_height + padding_up + padding_down,
-        input_width + padding_left + padding_left, input_channels,
-        filter_height, filter_width, stride_height, stride_width, padding_up,
-        padding_left, output_height, output_width, im.data<T>());
+        num_kernels, col.data<T>(), im_height, im_width, dilation[0],
+        dilation[1], filter_height, filter_width, stride[0], stride[1],
+        padding[0], padding[2], col_height, col_width, im->data<T>());
   }
 };
 
@@ -215,33 +222,32 @@ template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                              platform::GPUPlace, double>;
 
 template <class T>
-__global__ void im2colOCF(const T* im_data, T* col_data, int input_channels,
-                          int input_height, int input_width, int filter_height,
-                          int filter_width, int stride_height, int stride_width,
-                          int padding_height, int padding_width,
-                          int output_height, int output_width) {
+__global__ void im2colOCF(const T* im_data, int im_channels, int im_height,
+                          int im_width, int filter_height, int filter_width,
+                          int stride_height, int stride_width,
+                          int padding_height, int padding_width, int col_height,
+                          int col_width, T* col_data) {
   int swid = blockIdx.x;
   int shid = blockIdx.y;
-  for (int channelid = threadIdx.z; channelid < input_channels;
+  for (int channelid = threadIdx.z; channelid < im_channels;
        channelid += blockDim.z) {
     for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
       for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
         int width_offset = idx + swid * stride_width - padding_width;
         int height_offset = idy + shid * stride_height - padding_height;
-        int im_offset = width_offset + height_offset * input_width +
-                        channelid * input_height * input_width;
+        int im_offset = width_offset + height_offset * im_width +
+                        channelid * im_height * im_width;
 
         int col_offset = idx + idy * filter_width +
                          channelid * filter_height * filter_width +
-                         (shid * output_width + swid) *
-                             (input_channels * filter_height * filter_width);
-
-        if (height_offset >= input_height || height_offset < 0 ||
-            width_offset >= input_width || width_offset < 0) {
-          col_data[col_offset] = T(0);
-        } else {
-          col_data[col_offset] = im_data[im_offset];
-        }
+                         (shid * col_width + swid) *
+                             (im_channels * filter_height * filter_width);
+
+        col_data[col_offset] =
+            (height_offset >= im_height || height_offset < 0 ||
+             width_offset >= im_width || width_offset < 0)
+                ? T(0)
+                : im_data[im_offset];
       }
     }
   }
@@ -257,27 +263,33 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                     platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_up,
-                  int padding_down, int padding_left, int padding_right) {
+                  const framework::Tensor& im, const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col) {
     PADDLE_ENFORCE(im.dims().size() == 3);
-    PADDLE_ENFORCE(col.dims().size() == 5);
-    int input_channels = im.dims()[0];
-    int input_height = im.dims()[1];
-    int input_width = im.dims()[2];
-    int filter_height = col.dims()[3];
-    int filter_width = col.dims()[4];
-    int output_height = col.dims()[0];
-    int output_width = col.dims()[1];
-
-    PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
-                           stride_height +
-                       1 ==
-                   output_height);
-    PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
-                           stride_width +
-                       1 ==
-                   output_width);
+    PADDLE_ENFORCE(col->dims().size() == 5);
+    int im_channels = im.dims()[0];
+    int im_height = im.dims()[1];
+    int im_width = im.dims()[2];
+    int filter_height = col->dims()[3];
+    int filter_width = col->dims()[4];
+    int col_height = col->dims()[0];
+    int col_width = col->dims()[1];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       (dilation[0] * (filter_height - 1) + 1)) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       (dilation[1] * (filter_width - 1) + 1)) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "col_width and padding(padding_left, padding_right) are "
+                      "inconsistent.");
 
     int block_dim_x = 0;
     int block_dim_y = 0;
@@ -296,42 +308,41 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
     }
 
     int block_dim_z = 1024 / block_dim_x / block_dim_y;
-    dim3 threads(block_dim_x, block_dim_y,
-                 std::min(block_dim_z, input_channels));
-    dim3 grid(output_width, output_height);
+    dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
+    dim3 grid(col_width, col_height);
     im2colOCF<T><<<grid, threads, 0,
                    reinterpret_cast<const platform::CUDADeviceContext&>(context)
                        .stream()>>>(
-        im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
-        filter_height, filter_width, stride_height, stride_width, padding_up,
-        padding_left, output_height, output_width);
+        im.data<T>(), im_channels, im_height, im_width, filter_height,
+        filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
+        col_width, col->data<T>());
   }
 };
 
 template <class T>
-__global__ void col2imOCF(T* im_data, const T* col_data, int input_channels,
-                          int input_height, int input_width, int filter_height,
-                          int filter_width, int stride_height, int stride_width,
-                          int padding_height, int padding_width,
-                          int output_height, int output_width) {
+__global__ void col2imOCF(const T* col_data, int im_channels, int im_height,
+                          int im_width, int filter_height, int filter_width,
+                          int stride_height, int stride_width,
+                          int padding_height, int padding_width, int col_height,
+                          int col_width, T* im_data) {
   int swid = blockIdx.x;
   int shid = blockIdx.y;
-  for (int channelid = threadIdx.z; channelid < input_channels;
+  for (int channelid = threadIdx.z; channelid < im_channels;
        channelid += blockDim.z) {
     for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
       for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
         int width_offset = idx + swid * stride_width - padding_width;
         int height_offset = idy + shid * stride_height - padding_height;
-        int im_offset = width_offset + height_offset * input_width +
-                        channelid * input_height * input_width;
+        int im_offset = width_offset + height_offset * im_width +
+                        channelid * im_height * im_width;
 
         int col_offset = idx + idy * filter_width +
                          channelid * filter_height * filter_width +
-                         (shid * output_width + swid) *
-                             (input_channels * filter_height * filter_width);
+                         (shid * col_width + swid) *
+                             (im_channels * filter_height * filter_width);
 
-        if (height_offset >= 0 && height_offset < input_height &&
-            width_offset >= 0 && width_offset < input_width) {
+        if (height_offset >= 0 && height_offset < im_height &&
+            width_offset >= 0 && width_offset < im_width) {
           paddle::platform::CudaAtomicAdd(im_data + im_offset,
                                           col_data[col_offset]);
         }
@@ -349,28 +360,35 @@ template <class T>
 class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                     platform::GPUPlace, T> {
  public:
-  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_up, int padding_down,
-                  int padding_left, int padding_right) {
-    PADDLE_ENFORCE(im.dims().size() == 3);
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im) {
+    PADDLE_ENFORCE(im->dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
-    int input_channels = im.dims()[0];
-    int input_height = im.dims()[1];
-    int input_width = im.dims()[2];
+    int im_channels = im->dims()[0];
+    int im_height = im->dims()[1];
+    int im_width = im->dims()[2];
     int filter_height = col.dims()[3];
     int filter_width = col.dims()[4];
-    int output_height = col.dims()[0];
-    int output_width = col.dims()[1];
-
-    PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
-                           stride_height +
-                       1 ==
-                   output_height);
-    PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
-                           stride_width +
-                       1 ==
-                   output_width);
+    int col_height = col.dims()[0];
+    int col_width = col.dims()[1];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       (dilation[0] * (filter_height - 1) + 1)) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       (dilation[1] * (filter_width - 1) + 1)) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "col_width and padding(padding_left, padding_right) are "
+                      "inconsistent.");
 
     int block_dim_x = 0;
     int block_dim_y = 0;
@@ -389,15 +407,14 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
     }
 
     int block_dim_z = 1024 / block_dim_x / block_dim_y;
-    dim3 threads(block_dim_x, block_dim_y,
-                 std::min(block_dim_z, input_channels));
-    dim3 grid(output_width, output_height);
+    dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
+    dim3 grid(col_width, col_height);
     col2imOCF<T><<<grid, threads, 0,
                    reinterpret_cast<const platform::CUDADeviceContext&>(context)
                        .stream()>>>(
-        im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
-        filter_height, filter_width, stride_height, stride_width, padding_up,
-        padding_left, output_height, output_width);
+        col.data<T>(), im_channels, im_height, im_width, filter_height,
+        filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
+        col_width, im->data<T>());
   }
 };
 
diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h
index c736d4fa52..24fd9a06e9 100644
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
@@ -35,6 +36,15 @@ enum class ColFormat { kCFO = 0, kOCF = 1 };
  * \param colData  Column data.
  * \param colShape The shape of colData.
  *
+ * \param dilations    dilation data.
+ * \param 2-dimension  [dilation_height, dilation_width].
+ *
+ * \param strides      stride data.
+ * \param 2-dimension  [stride_height, stride_width].
+ *
+ * \param paddings     padding data.
+ * \param 4-dimension  [up_pad, left_pad, down_pad, right_pad].
+ *
  * If the template argument Format is kCFO, the shape of colData is:
  * [input_channels, filter_height, filter_width, output_height, output_width]
  * So, it is easy to reshape into a convolution matrix for convolution
@@ -73,18 +83,19 @@ template <ColFormat Format, typename Place, typename T>
 class Im2ColFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_up,
-                  int padding_down, int padding_left, int padding_right);
+                  const framework::Tensor& im, const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col);
 };
 
 template <ColFormat Format, typename Place, typename T>
 class Col2ImFunctor {
  public:
-  void operator()(const platform::DeviceContext& context, framework::Tensor& im,
-                  const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_up, int padding_down,
-                  int padding_left, int padding_right);
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
index 5763782c4e..ae197a97ed 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -45,10 +45,14 @@ void testIm2col() {
   int input_height = 2;
   int input_width = 3;
   int filter_size = 2;
-  int stride = 1;
-  int padding = 0;
-  int output_height = (input_height - filter_size + 2 * padding) / stride + 1;
-  int output_width = (input_width - filter_size + 2 * padding) / stride + 1;
+  std::vector<int> stride({1, 1});  // stride_y, stride_x
+  std::vector<int> padding(
+      {0, 0, 0, 0});                  // up_pad, left_pad, down_pad, right_pad
+  std::vector<int> dilation({1, 1});  // dilation_y, dilation_x
+  int output_height =
+      (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1;
+  int output_width =
+      (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1;
   float* input_ptr = input_tmp.mutable_data<float>(
       {1, input_height, input_width}, paddle::platform::CPUPlace());
   float arr[6] = {0, 1, 2, 3, 4, 5};
@@ -70,7 +74,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
   output_cfo.mutable_data<float>(
       {1, filter_size, filter_size, output_height, output_width}, *place);
@@ -85,10 +89,8 @@ void testIm2col() {
       paddle::operators::math::ColFormat::kOCF, Place, float>
       im2col_ocf;
 
-  im2col(*context, input, output_cfo, stride, stride, padding, padding, padding,
-         padding);
-  im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding,
-             padding, padding);
+  im2col(*context, input, dilation, stride, padding, &output_cfo);
+  im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
 
   float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
   float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
@@ -97,7 +99,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output_cfo.data<float>();
   } else {
-    output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context);
+    CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -108,7 +110,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_ocf_ptr = output_ocf.data<float>();
   } else {
-    output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context);
+    CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
     out_ocf_ptr = output_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -128,17 +130,16 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
 
-  col2im(*context, input, output_cfo, stride, stride, padding, padding, padding,
-         padding);
+  col2im(*context, output_cfo, dilation, stride, padding, &input);
 
   float* in_ptr;
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -150,16 +151,15 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
 
-  col2im_ocf(*context, input, output_ocf, stride, stride, padding, padding,
-             padding, padding);
+  col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
 
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc
index 0febf8e3b7..ad3a59bcdb 100644
--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/operators/math/lstm_compute.cc
@@ -30,12 +30,12 @@ struct LstmUnitFunctor<platform::CPUPlace, T> {
       detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
                                ActiveType(cand_act), ActiveType(gate_act),
                                ActiveType(cell_act));
-      value.gateValue += frame_size * 4;
-      value.stateValue += frame_size;
-      value.stateActiveValue += frame_size;
-      value.outputValue += frame_size;
-      if (value.prevStateValue) {
-        value.prevStateValue += frame_size;
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
       }
     }
   }
@@ -53,20 +53,20 @@ struct LstmUnitGradFunctor<platform::CPUPlace, T> {
                                 frame_size, ActiveType(cand_act),
                                 ActiveType(gate_act), ActiveType(cell_act));
 
-      value.gateValue += frame_size * 4;
-      value.stateValue += frame_size;
-      value.stateActiveValue += frame_size;
-      value.outputValue += frame_size;
-      if (value.prevStateValue) {
-        value.prevStateValue += frame_size;
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
       }
 
-      grad.gateGrad += frame_size * 4;
-      grad.stateGrad += frame_size;
-      grad.stateActiveGrad += frame_size;
-      grad.outputGrad += frame_size;
-      if (grad.prevStateGrad) {
-        grad.prevStateGrad += frame_size;
+      grad.gate_grad += frame_size * 4;
+      grad.state_grad += frame_size;
+      grad.state_active_grad += frame_size;
+      grad.output_grad += frame_size;
+      if (grad.prev_state_grad) {
+        grad.prev_state_grad += frame_size;
       }
     }
   }
diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h
index 28d2c6fd3b..9652399d4c 100644
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
@@ -31,26 +31,26 @@ typedef enum {
 
 template <class T>
 struct LstmMetaValue {
-  T *gateValue;
-  T *prevStateValue;
-  T *stateValue;
-  T *stateActiveValue;
-  T *outputValue;
-  T *checkIg;
-  T *checkFg;
-  T *checkOg;
+  T *gate_value;
+  T *prev_state_value;
+  T *state_value;
+  T *state_active_value;
+  T *output_value;
+  T *check_ig;
+  T *check_fg;
+  T *check_og;
 };
 
 template <class T>
 struct LstmMetaGrad {
-  T *gateGrad;
-  T *prevStateGrad;
-  T *stateGrad;
-  T *stateActiveGrad;
-  T *outputGrad;
-  T *checkIgGrad;
-  T *checkFgGrad;
-  T *checkOgGrad;
+  T *gate_grad;
+  T *prev_state_grad;
+  T *state_grad;
+  T *state_active_grad;
+  T *output_grad;
+  T *check_ig_grad;
+  T *check_fg_grad;
+  T *check_og_grad;
 };
 
 inline activation_mode_t ActiveType(const std::string &type) {
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index 2a9c09a0f1..2e333a8cde 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/math/math_function.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/operators/math/math_function_impl.h"
 
 namespace paddle {
 namespace operators {
@@ -231,7 +233,87 @@ void gemv<platform::CPUPlace, double>(const platform::DeviceContext& context,
   cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
 }
 
+template <>
+void axpy<platform::CPUPlace, float>(const platform::DeviceContext& context,
+                                     const int n, const float alpha,
+                                     const float* x, float* y) {
+  cblas_saxpy(n, alpha, x, 1, y, 1);
+}
+
+template <>
+void axpy<platform::CPUPlace, double>(const platform::DeviceContext& context,
+                                      const int n, const double alpha,
+                                      const double* x, double* y) {
+  cblas_daxpy(n, alpha, x, 1, y, 1);
+}
+
 template struct SetConstant<platform::CPUPlace, float>;
+template struct SetConstant<platform::CPUPlace, double>;
+template struct SetConstant<platform::CPUPlace, int>;
+template struct SetConstant<platform::CPUPlace, int64_t>;
+template struct SetConstant<platform::CPUPlace, bool>;
+
+#define DEFINE_CPU_TRANS(RANK)                                \
+  template struct Transpose<platform::CPUPlace, float, RANK>; \
+  template struct Transpose<platform::CPUPlace, double, RANK>;
+
+DEFINE_CPU_TRANS(1);
+DEFINE_CPU_TRANS(2);
+DEFINE_CPU_TRANS(3);
+DEFINE_CPU_TRANS(4);
+DEFINE_CPU_TRANS(5);
+DEFINE_CPU_TRANS(6);
+
+struct TensorSetConstantCPU {
+  TensorSetConstantCPU(framework::Tensor* tensor, float value)
+      : tensor_(tensor), value_(value) {}
+  template <typename T>
+  void operator()() const {
+    auto cpu = platform::CPUPlace();
+    auto* begin = tensor_->mutable_data<T>(cpu);
+    std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
+  }
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+template <>
+void set_constant_with_place<platform::CPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(framework::ToDataType(tensor->type()),
+                           TensorSetConstantCPU(tensor, value));
+}
+
+struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
+  TensorSetConstantWithPlace(const platform::DeviceContext& context,
+                             framework::Tensor* tensor, float value)
+      : context_(context), tensor_(tensor), value_(value) {}
+
+  template <typename Place>
+  void operator()(Place place) const {
+    set_constant_with_place<Place>(context_, tensor_, value_);
+  }
+
+  const platform::DeviceContext& context_;
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+void set_constant(const platform::DeviceContext& context,
+                  framework::Tensor* tensor, float value) {
+  TensorSetConstantWithPlace func(context, tensor, value);
+#ifdef PADDLE_WITH_CUDA
+  tensor->place().apply_visitor(func);
+#else
+  func(platform::CPUPlace());
+#endif
+}
+
+template struct RowwiseAdd<platform::CPUPlace, float>;
+template struct RowwiseAdd<platform::CPUPlace, double>;
+template struct ColwiseSum<platform::CPUPlace, float>;
+template struct ColwiseSum<platform::CPUPlace, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index e6fd8bf235..3018e50a4f 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#define EIGEN_USE_GPU
+#include "paddle/framework/data_type.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/math_function_impl.h"
 
 namespace paddle {
 namespace operators {
@@ -230,7 +233,89 @@ void gemv<platform::GPUPlace, double>(const platform::DeviceContext& context,
       cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1));
 }
 
+template <>
+void axpy<platform::GPUPlace, float>(const platform::DeviceContext& context,
+                                     const int n, const float alpha,
+                                     const float* x, float* y) {
+  PADDLE_ENFORCE(platform::dynload::cublasSaxpy(
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
+      n, &alpha, x, 1, y, 1));
+}
+
+template <>
+void axpy<platform::GPUPlace, double>(const platform::DeviceContext& context,
+                                      const int n, const double alpha,
+                                      const double* x, double* y) {
+  PADDLE_ENFORCE(platform::dynload::cublasDaxpy(
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
+      n, &alpha, x, 1, y, 1));
+}
+
 template struct SetConstant<platform::GPUPlace, float>;
+template struct SetConstant<platform::GPUPlace, double>;
+template struct SetConstant<platform::GPUPlace, int>;
+template struct SetConstant<platform::GPUPlace, int64_t>;
+template struct SetConstant<platform::GPUPlace, bool>;
+
+#define DEFINE_GPU_TRANS(RANK)                                \
+  template struct Transpose<platform::GPUPlace, float, RANK>; \
+  template struct Transpose<platform::GPUPlace, double, RANK>;
+
+DEFINE_GPU_TRANS(1);
+DEFINE_GPU_TRANS(2);
+DEFINE_GPU_TRANS(3);
+DEFINE_GPU_TRANS(4);
+DEFINE_GPU_TRANS(5);
+DEFINE_GPU_TRANS(6);
+
+struct TensorSetConstantGPU {
+  TensorSetConstantGPU(const platform::DeviceContext& context,
+                       framework::Tensor* tensor, float value)
+      : context_(context), tensor_(tensor), value_(value) {}
+
+  template <typename T>
+  void operator()() const {
+    SetConstant<platform::GPUPlace, T> functor;
+    functor(context_, tensor_, static_cast<T>(value_));
+  }
+
+  const platform::DeviceContext& context_;
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+template <>
+void set_constant_with_place<platform::GPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(framework::ToDataType(tensor->type()),
+                           TensorSetConstantGPU(context, tensor, value));
+}
+
+template struct RowwiseAdd<platform::GPUPlace, float>;
+template struct RowwiseAdd<platform::GPUPlace, double>;
+template struct ColwiseSum<platform::GPUPlace, float>;
+// template struct ColwiseSum<platform::GPUPlace, double>;
+// The ColwiseSum<platform::GPUPlace, double> failed in debug mode,
+// and only failed for this case. So reimplemented it.
+template <>
+void ColwiseSum<platform::GPUPlace, double>::operator()(
+    const platform::DeviceContext& context, const framework::Tensor& input,
+    framework::Tensor* vector) {
+  auto in_dims = input.dims();
+  auto size = input.numel() / in_dims[0];
+  PADDLE_ENFORCE_EQ(vector->numel(), size);
+  framework::Tensor one;
+  one.mutable_data<double>({in_dims[0]}, context.GetPlace());
+  SetConstant<platform::GPUPlace, double> set;
+  set(context, &one, static_cast<double>(1.0));
+  gemv<platform::GPUPlace, double>(context, true, static_cast<int>(in_dims[0]),
+                                   static_cast<int>(in_dims[1]), 1.0,
+                                   input.data<double>(), one.data<double>(),
+                                   0.0, vector->data<double>());
+}
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index 3bb5aa0332..5a42854f22 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -19,11 +19,6 @@ limitations under the License. */
 #include <mkl_vml_functions.h>
 #endif
 
-#ifdef PADDLE_USE_MKL
-#include <mkl.h>
-#include <mkl_lapacke.h>
-#endif
-
 #ifdef PADDLE_USE_ATLAS
 extern "C" {
 #include <cblas.h>
@@ -54,6 +49,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
 
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 
@@ -98,14 +94,41 @@ void gemv(const platform::DeviceContext& context, const bool trans_a,
           const int M, const int N, const T alpha, const T* A, const T* B,
           const T beta, T* C);
 
+template <typename Place, typename T>
+void axpy(const platform::DeviceContext& context, const int n, const T alpha,
+          const T* x, T* y);
+
+template <typename Place, typename T, int Rank>
+struct Transpose {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& in, framework::Tensor* out,
+                  const std::vector<int>& axis);
+};
+
 template <typename Place, typename T>
 struct SetConstant {
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor* tensor, T num) {
-    auto t = framework::EigenVector<T>::Flatten(*tensor);
-    t.device(*context.GetEigenDevice<Place>()) =
-        t.constant(static_cast<T>(num));
-  }
+                  framework::Tensor* tensor, T num);
+};
+
+template <typename Place>
+void set_constant_with_place(const platform::DeviceContext& context,
+                             framework::Tensor* tensor, float value);
+
+void set_constant(const platform::DeviceContext& context,
+                  framework::Tensor* tensor, float value);
+
+template <typename Place, typename T>
+struct RowwiseAdd {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, const framework::Tensor& vec,
+                  framework::Tensor* output);
+};
+
+template <typename Place, typename T>
+struct ColwiseSum {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* vec);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h
new file mode 100644
index 0000000000..4dc17a4e52
--- /dev/null
+++ b/paddle/operators/math/math_function_impl.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/data_type.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename Place, typename T>
+void SetConstant<Place, T>::operator()(const platform::DeviceContext& context,
+                                       framework::Tensor* tensor, T num) {
+  auto t = framework::EigenVector<T>::Flatten(*tensor);
+  t.device(*context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(num));
+}
+
+template <typename Place, typename T, int Rank>
+void Transpose<Place, T, Rank>::operator()(
+    const platform::DeviceContext& context, const framework::Tensor& in,
+    framework::Tensor* out, const std::vector<int>& axis) {
+  Eigen::array<int, Rank> permute;
+  for (int i = 0; i < Rank; i++) {
+    permute[i] = axis[i];
+  }
+  auto in_dim = in.dims();
+  auto out_dim = out->dims();
+
+  auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
+  auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
+  auto* dev = context.GetEigenDevice<Place>();
+  eigen_out.device(*dev) = eigen_in.shuffle(permute);
+}
+
+template <typename Place, typename T>
+void RowwiseAdd<Place, T>::operator()(const platform::DeviceContext& context,
+                                      const framework::Tensor& input,
+                                      const framework::Tensor& vector,
+                                      framework::Tensor* output) {
+  auto in_dims = input.dims();
+  auto size = input.numel() / in_dims[0];
+  PADDLE_ENFORCE_EQ(vector.numel(), size);
+  PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenMatrix<T>::From(vector);
+  auto out = framework::EigenMatrix<T>::From(*output);
+  Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
+  Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
+  out.device(*context.GetEigenDevice<Place>()) =
+      in + vec.reshape(shape).broadcast(bcast);
+}
+
+template <typename Place, typename T>
+void ColwiseSum<Place, T>::operator()(const platform::DeviceContext& context,
+                                      const framework::Tensor& input,
+                                      framework::Tensor* vector) {
+  auto in_dims = input.dims();
+  auto size = input.numel() / in_dims[0];
+  PADDLE_ENFORCE_EQ(vector->numel(), size);
+
+  auto vec = framework::EigenMatrix<T>::From(*vector);
+  auto in = framework::EigenMatrix<T>::From(input);
+  Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
+  vec.reshape(shape).device(*context.GetEigenDevice<Place>()) =
+      in.sum(Eigen::array<int, 1>({{0}})).reshape(shape);
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 7d84ad9aad..983c9fdcff 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -139,3 +139,15 @@ TEST(math_function, gemv) {
   GemvTest<float>(12, 7, true);
   GemvTest<double>(7, 9, true);
 }
+
+TEST(math_funciton, set_constant) {
+  paddle::framework::Tensor t;
+  t.Resize({10, 10});
+  t.mutable_data<int>(paddle::platform::CPUPlace());
+  auto* ctx = new paddle::platform::CPUDeviceContext();
+  paddle::operators::math::set_constant(*ctx, &t, 10);
+  for (int64_t i = 0; i < t.numel(); ++i) {
+    PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
+  }
+  delete ctx;
+}
diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu
index 780d17ffc6..d5d6f0c73b 100644
--- a/paddle/operators/math/math_function_test.cu
+++ b/paddle/operators/math/math_function_test.cu
@@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom(input1, *gpu_place, context);
-  input2_gpu.CopyFrom(input1, *gpu_place, context);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu);
 
   out_gpu.mutable_data<float>({2, 2}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
-  out.CopyFrom(out_gpu, *cpu_place, context);
+  paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom(input1, *gpu_place, context);
-  input2_gpu.CopyFrom(input1, *gpu_place, context);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu);
 
   out_gpu.mutable_data<float>({3, 3}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
-  out.CopyFrom(out_gpu, *cpu_place, context);
+  paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom(input1, *gpu_place, context);
-  input2_gpu.CopyFrom(input2, *gpu_place, context);
-  input3_gpu.CopyFrom(input3, *gpu_place, context);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu);
+  paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
@@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) {
   paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
 
-  input3.CopyFrom(input3_gpu, *cpu_place, context);
+  paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) {
   auto* gpu_place = new paddle::platform::GPUPlace(0);
   paddle::platform::CUDADeviceContext context(*gpu_place);
 
-  input1_gpu.CopyFrom(input1, *gpu_place, context);
-  input2_gpu.CopyFrom(input2, *gpu_place, context);
-  input3_gpu.CopyFrom(input3, *gpu_place, context);
+  paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu);
+  paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(*gpu_place);
@@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) {
   paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
       context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
 
-  input3.CopyFrom(input3_gpu, *cpu_place, context);
+  paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3);
   context.Wait();
 
   EXPECT_EQ(input3_ptr[0], 0);
@@ -205,14 +205,15 @@ void GemvTest(int m, int n, bool trans) {
   }
 
   paddle::platform::CUDADeviceContext context(*gpu_place);
-  g_mat_a.CopyFrom(mat_a, *gpu_place, context);
-  g_vec_b.CopyFrom(vec_b, *gpu_place, context);
+  paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a);
+  paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b);
 
   paddle::operators::math::gemv<paddle::platform::GPUPlace, T>(
       context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
       g_data_b, 0., g_data_c);
 
-  vec_c.CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context);
+  paddle::framework::CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context,
+                              &vec_c);
 
   if (!trans) {
     for (int i = 0; i < m; ++i) {
diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc
index f01fa18391..c9003962d3 100644
--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
@@ -18,86 +18,72 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-/*
- * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
- */
-template <typename MaxOutProcess, typename T>
-class MaxOutFunctor<platform::CPUPlace, MaxOutProcess, T> {
+// All tensors are in NCHW format, and the groups must be greater than 1
+template <typename T>
+class MaxOutFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  int groups, int num_channels, MaxOutProcess maxout_process) {
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = num_channels/groups;
-
+    const int output_channels = output->dims()[1];
     int fea_size = input_height * input_width;
+    // c_size means the output size of each sample
     int c_size = fea_size * output_channels;
-
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
 
-    for (int i = 0; i < batch_size; i++) {
-      int new_bindex =  c_size * i;
+    for (int i = 0; i < batch_size; ++i) {
+      int new_bindex = c_size * i;
       for (int c = 0; c < output_channels; ++c) {
         int new_cindex = fea_size * c;
-        for (int f = 0; f < fea_size; f++) {
-          T ele = maxout_process.initial();
+        for (int f = 0; f < fea_size; ++f) {
+          T ele = static_cast<T>(-FLT_MAX);
           for (int ph = 0; ph < groups; ++ph) {
-            maxout_process.compute(ele,
-              input_data[(new_bindex+new_cindex) * groups+ph*fea_size+f]);
+            T x = input_data[(new_bindex + new_cindex) * groups +
+                             ph * fea_size + f];
+            ele = ele > x ? ele : x;
           }
-          maxout_process.finalize(ele, (static_cast<T>(groups)));
-          output_data[(new_bindex+new_cindex+f)] = ele;
+          output_data[(new_bindex + new_cindex + f)] = ele;
         }
       }
     }
   }
 };
 
-
-
 template <class T>
 class MaxOutGradFunctor<platform::CPUPlace, T> {
-public:
+ public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor& input_grad,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  int groups, int num_channels) {
+                  const framework::Tensor& output_grad, int groups) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = num_channels / groups;
-
+    const int output_channels = output.dims()[1];
     int fea_size = input_height * input_width;
-
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
-    for (int i = 0; i < batch_size; i++) {
+    for (int i = 0; i < batch_size; ++i) {
       int blen = fea_size * output_channels * i;
       for (int c = 0; c < output_channels; ++c) {
         int clen = fea_size * c;
-        for (int f = 0; f < fea_size; f++) {
-          int input_idx = 0;
-          bool stop = false;
+        for (int f = 0; f < fea_size; ++f) {
+          int input_idx0 = (blen + clen) * groups + f;
+          bool continue_match = true;
           int output_idx = blen + clen + f;
-          for (int g = 0; g < groups && !stop; g++) {
-              input_idx = (blen + clen) * groups + fea_size * g + f;
-              input_grad_data[input_idx] = 0;
-              if (input_data[input_idx] == output_data[output_idx]) {
-                input_grad_data[input_idx] += output_grad_data[output_idx];
-                stop = true;
-              } else {
-                input_grad_data[input_idx] = 0;
-              }
+          for (int g = 0; g < groups && continue_match; ++g) {
+            int input_idx = input_idx0 + fea_size * g;
+            if (input_data[input_idx] == output_data[output_idx]) {
+              input_grad_data[input_idx] += output_grad_data[output_idx];
+              continue_match = false;
+            }
           }
         }
       }
@@ -107,10 +93,8 @@ public:
 
 template class MaxOutGradFunctor<platform::CPUPlace, float>;
 template class MaxOutGradFunctor<platform::CPUPlace, double>;
-template class MaxOutFunctor<platform::CPUPlace,
-                             paddle::operators::math::MaxOut<float>, float>;
-template class MaxOutFunctor<platform::CPUPlace,
-                             paddle::operators::math::MaxOut<double>, double>;
+template class MaxOutFunctor<platform::CPUPlace, float>;
+template class MaxOutFunctor<platform::CPUPlace, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu
index b1c0dd8fd4..c3fabcae08 100644
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
@@ -19,108 +19,102 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename MaxOutProcess, typename T>
+template <typename T>
 __global__ void KernelMaxOut(const int nthreads, const T* input_data,
-                             T* output_data, const int channels,
-                             const int input_height, const int input_width,
-                             int groups, MaxOutProcess maxout_process) {
-  int size = input_height * input_width * channels / groups;
-  int featLen = input_height * input_width;
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-              index += blockDim.x * gridDim.x) {
-    int batch_idx = index / size;
-    int i = index % size;
-    int channel_idx = i / featLen;
-    int feat_idx = i % featLen;
+                             const int channels, const int input_height,
+                             const int input_width, int groups,
+                             T* output_data) {
+  const int size = input_height * input_width * channels / groups;
+  const int feat_len = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int batch_idx = i / size;
+    int batch_offset = i % size;
+    int channel_idx = batch_offset / feat_len;
+    int feat_idx = batch_offset % feat_len;
     int data_idx =
-      (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
-    T ele = maxout_process.initial();
-    for (int g = 0; g < groups; g++) {
-      maxout_process.compute(ele, input_data[data_idx + g * featLen]);
+        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+    T ele = static_cast<T>(-FLT_MAX);
+    for (int g = 0; g < groups; ++g) {
+      T x = input_data[data_idx + g * feat_len];
+      ele = ele > x ? ele : x;
     }
-    maxout_process.finalize(ele, (static_cast<T>(groups)));
-    output_data[index] = ele;
+    output_data[i] = ele;
   }
 }
 template <typename T>
-__global__ void KernelMaxoutGrad(
-    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_height, const int input_width, int groups) {
-    int size = input_height * input_width * channels / groups;
-    int featLen = input_height * input_width;
-    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
-         index += blockDim.x * gridDim.x) {
-      int batch_idx = index / size;
-      int i = index % size;
-      int channel_idx = i / featLen;
-      int feat_idx = i % featLen;
-      int data_idx =
-        (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
-      int maxIndex = -1;
-      bool stop = false;
-      for (int g = 0; g < groups && !stop; g++) {
-        if (input_data[data_idx + g * featLen] == output_data[index]) {
-          maxIndex = data_idx + g * featLen;
-          stop = true;
-        }
-      }
-      if (maxIndex != -1) {
-        // atomic add
-        platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]);
+__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
+                                 const T* output_data, const T* output_grad,
+                                 T* input_grad, const int channels,
+                                 const int input_height, const int input_width,
+                                 int groups) {
+  const int size = input_height * input_width * channels / groups;
+  const int feat_len = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int batch_idx = i / size;
+    int batch_offset = i % size;
+    int channel_idx = batch_offset / feat_len;
+    int feat_idx = batch_offset % feat_len;
+    int data_idx =
+        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+    int max_index = -1;
+    bool continue_match = true;
+    for (int g = 0; g < groups && continue_match; ++g) {
+      if (input_data[data_idx + g * feat_len] == output_data[i]) {
+        max_index = data_idx + g * feat_len;
+        continue_match = false;
+        break;
       }
     }
+    if (max_index != -1) {
+      input_grad[max_index] += output_grad[index];
+    }
+  }
 }
 /*
  * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
  */
-template <typename MaxOutProcess, typename T>
-class MaxOutFunctor<platform::GPUPlace, MaxOutProcess, T> {
+template <typename T>
+class MaxOutFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  int groups, int num_channels,
-                  MaxOutProcess maxout_process) {
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = num_channels / groups;
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-
-    int nthreads = batch_size * output_channels * output_height * output_width;
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int nthreads = output->numel();
     int blocks = (nthreads + 1024 - 1) / 1024;
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
     KernelMaxOut<
-        MaxOutProcess,
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_data, output_data, input_channels,
-                              input_height, input_width, groups,
-                              maxout_process);
+                 .stream()>>>(nthreads, input_data, input_channels,
+                              input_height, input_width, groups, output_data);
   }
 };
 /*
  * All tensors are in NCHW format.
- * Ksize, strides, paddings are two elements. These two elements represent
- * height and width, respectively.
  */
 template <typename T>
 class MaxOutGradFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad,
-                  int groups, int num_channels) {
+                  const framework::Tensor& output_grad, int groups) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -132,9 +126,8 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
-
-    int nthreads = batch_size * output_channels * output_height * output_width;
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int nthreads = output.numel();
     int blocks = (nthreads + 1024 - 1) / 1024;
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
@@ -142,19 +135,17 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
     KernelMaxoutGrad<
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, groups);
+                 .stream()>>>(nthreads, input_data, output_data,
+                              output_grad_data, input_grad_data, input_channels,
+                              input_height, input_width, groups);
   }
 };
 
 template class MaxOutGradFunctor<platform::GPUPlace, float>;
 template class MaxOutGradFunctor<platform::GPUPlace, double>;
 
-template class MaxOutFunctor<platform::GPUPlace,
-                             paddle::operators::math::MaxOut<float>, float>;
-template class MaxOutFunctor<platform::GPUPlace,
-                             paddle::operators::math::MaxOut<double>, double>;
+template class MaxOutFunctor<platform::GPUPlace, float>;
+template class MaxOutFunctor<platform::GPUPlace, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h
index aeac084944..2d9069b0b3 100644
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/hostdevice.h"
@@ -22,78 +21,25 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-#define FLT_MAX \
-  __FLT_MAX__  // It might need to be placed in another file, but I'm still
-               // wondering where to put it.
+#define FLT_MAX __FLT_MAX__
 
-/*
- * \brief Extracting simple operations from pooling.
- *        Both MaxPool and AvgPool need "initial", "compute" and "finalize"
- * operation.
- *        MaxPool initializes temp variable to the negative maximum to find the
- * maximum value in the pooling field.
- *        AvgPool initializes temp variable to the zero to accumulate all values
- * in pool pooling, and finally takes the average.
- *        MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
- */
-template <class T>
-class MaxOut {
- public:
-  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
-  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
-  DEVICE inline void finalize(T& y, const T& group) {}
-};
+template <typename Place, typename T>
 
-template <class T>
-class MaxOutGrad {
- public:
-  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
-                             T scale) {
-    dx += dy * (x == y);
-  }
-};
-
-
-/*
- * \brief Getting pooling results, and calculating gradient.
- *
- * In pool2d, all tensors are in NCHW format. Where N is batch size, C is the
- * number of channels, H and W is the height and width of feature.
- * In pool3d, all tensors are in NCDHW format. Where N is batch size, C is the
- * number of channels, D, H and W is the depth, height and width of feature.
- *
- * In max pooling, it is possible that the pooling region has multiple maximum
- * elements. In this case, we should compute the gradient of the first maximum
- * element.
- * This is different from average pooling. So we rewrite the max_pool_grad:
- * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
- */
-template <typename Place, typename MaxOutProcess, typename T>
 class MaxOutFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  int groups, int num_channels, MaxOutProcess maxout_compute);
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups);
 };
 
-
 template <typename Place, class T>
 class MaxOutGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input,
-                  framework::Tensor& input_grad,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad, int groups,
-                  int num_channels);
+                  const framework::Tensor& output_grad, int groups);
 };
-
-
-
-
-
-
-
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc
index 50cfb88bb5..135984586a 100644
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
@@ -27,15 +27,15 @@ template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
     const int ksize_height = ksize[0];
     const int ksize_width = ksize[1];
     const int stride_height = strides[0];
@@ -47,7 +47,7 @@ class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
     const int output_stride = output_height * output_width;
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -87,11 +87,12 @@ template <typename PoolProcess, class T>
 class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process) {
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -110,7 +111,7 @@ class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -154,10 +155,11 @@ template <class T>
 class MaxPool2dGradFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -176,7 +178,7 @@ class MaxPool2dGradFunctor<platform::CPUPlace, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -240,17 +242,17 @@ template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
     const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
     const int ksize_depth = ksize[0];
     const int ksize_height = ksize[1];
     const int ksize_width = ksize[2];
@@ -265,7 +267,7 @@ class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
     const int output_stride = output_depth * output_height * output_width;
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -315,11 +317,12 @@ template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process) {
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -343,7 +346,7 @@ class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -398,10 +401,11 @@ template <class T>
 class MaxPool3dGradFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -425,7 +429,7 @@ class MaxPool3dGradFunctor<platform::CPUPlace, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -494,19 +498,19 @@ template class Pool3dGradFunctor<
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <typename T>
-class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexFunctor<platform::CPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
     const int ksize_height = ksize[0];
     const int ksize_width = ksize[1];
     const int stride_height = strides[0];
@@ -516,9 +520,9 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
     const int input_stride = input_height * input_width;
     const int output_stride = output_height * output_width;
 
-    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -531,7 +535,7 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
             int wend = std::min(wstart + ksize_width, input_width);
             wstart = std::max(wstart, 0);
 
-            T ele = static_cast<T>(-FLT_MAX);
+            T1 ele = static_cast<T1>(-FLT_MAX);
             int index = -1;
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
@@ -559,26 +563,26 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <typename T>
-class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_height = input_grad.dims()[2];
-    const int input_width = input_grad.dims()[3];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
     const int output_channels = output_grad.dims()[1];
     const int output_height = output_grad.dims()[2];
     const int output_width = output_grad.dims()[3];
     const int input_stride = input_height * input_width;
     const int output_stride = output_height * output_width;
 
-    const T* mask_data = mask.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
 
     for (int n = 0; n < batch_size; ++n) {
       for (int c = 0; c < output_channels; ++c) {
@@ -598,31 +602,31 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class MaxPool2dWithIndexFunctor<platform::CPUPlace, float>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, float>;
-template class MaxPool2dWithIndexFunctor<platform::CPUPlace, double>;
-template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double>;
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, float, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, float, int>;
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, double, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double, int>;
 
 /*
  * All tensors are in NCDHW format.
  * Ksize, strides, paddings are three elements. These three elements represent
  * depth, height and width, respectively.
  */
-template <typename T>
-class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexFunctor<platform::CPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
     const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
     const int ksize_depth = ksize[0];
     const int ksize_height = ksize[1];
     const int ksize_width = ksize[2];
@@ -635,9 +639,9 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
     const int input_stride = input_depth * input_height * input_width;
     const int output_stride = output_depth * output_height * output_width;
 
-    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -655,7 +659,7 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
               wstart = std::max(wstart, 0);
 
               int output_idx = (pd * output_height + ph) * output_width + pw;
-              T ele = static_cast<T>(-FLT_MAX);
+              T1 ele = static_cast<T1>(-FLT_MAX);
               int index = -1;
               for (int d = dstart; d < dend; ++d) {
                 for (int h = hstart; h < hend; ++h) {
@@ -687,18 +691,18 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
  * Ksize, strides, paddings are three elements. These three elements represent
  * depth, height and width, respectively.
  */
-template <typename T>
-class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_depth = input_grad.dims()[2];
-    const int input_height = input_grad.dims()[3];
-    const int input_width = input_grad.dims()[4];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_depth = input_grad->dims()[2];
+    const int input_height = input_grad->dims()[3];
+    const int input_width = input_grad->dims()[4];
     const int output_channels = output_grad.dims()[1];
     const int output_depth = output_grad.dims()[2];
     const int output_height = output_grad.dims()[3];
@@ -706,9 +710,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
     const int input_stride = input_depth * input_height * input_width;
     const int output_stride = output_depth * output_height * output_width;
 
-    const T* mask_data = mask.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
 
     for (int n = 0; n < batch_size; ++n) {
       for (int c = 0; c < output_channels; ++c) {
@@ -731,10 +735,10 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
   }
 };
 
-template class MaxPool3dWithIndexFunctor<platform::CPUPlace, float>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, float>;
-template class MaxPool3dWithIndexFunctor<platform::CPUPlace, double>;
-template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, double>;
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, float, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, float, int>;
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, double, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, double, int>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu
index 736327f4b7..ca3560f264 100644
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
@@ -21,13 +21,13 @@ namespace math {
 
 template <typename PoolProcess, typename T>
 __global__ void KernelPool2D(const int nthreads, const T* input_data,
-                             T* output_data, const int channels,
-                             const int input_height, const int input_width,
-                             const int output_height, const int output_width,
-                             const int ksize_height, const int ksize_width,
-                             const int stride_height, const int stride_width,
-                             const int padding_height, const int padding_width,
-                             PoolProcess pool_process) {
+                             const int channels, const int input_height,
+                             const int input_width, const int output_height,
+                             const int output_width, const int ksize_height,
+                             const int ksize_width, const int stride_height,
+                             const int stride_width, const int padding_height,
+                             const int padding_width, PoolProcess pool_process,
+                             T* output_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -59,11 +59,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
 template <typename PoolProcess, typename T>
 __global__ void KernelPool2DGrad(
     const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_height, const int input_width, const int output_height,
-    const int output_width, const int ksize_height, const int ksize_width,
-    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width, PoolProcess pool_process) {
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    PoolProcess pool_process, T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int offsetW = index % input_width + padding_width;
@@ -107,11 +107,11 @@ __global__ void KernelPool2DGrad(
 template <typename T>
 __global__ void KernelMaxPool2DGrad(
     const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_height, const int input_width, const int output_height,
-    const int output_width, const int ksize_height, const int ksize_width,
-    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width) {
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -158,16 +158,16 @@ template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
     const int ksize_height = ksize[0];
     const int ksize_width = ksize[1];
     const int stride_height = strides[0];
@@ -176,7 +176,7 @@ class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
     const int padding_width = paddings[1];
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -187,11 +187,10 @@ class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
         PoolProcess,
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_data, output_data, input_channels,
-                              input_height, input_width, output_height,
-                              output_width, ksize_height, ksize_width,
-                              stride_height, stride_width, padding_height,
-                              padding_width, pool_process);
+                 .stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, pool_process, output_data);
   }
 };
 
@@ -204,11 +203,11 @@ template <typename PoolProcess, typename T>
 class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process) {
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -225,7 +224,7 @@ class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * input_channels * input_height * input_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -237,10 +236,10 @@ class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, output_height, output_width,
-        ksize_height, ksize_width, stride_height, stride_width, padding_height,
-        padding_width, pool_process);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        pool_process, input_grad_data);
   }
 };
 
@@ -253,10 +252,11 @@ template <typename T>
 class MaxPool2dGradFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -274,7 +274,7 @@ class MaxPool2dGradFunctor<platform::GPUPlace, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -285,10 +285,10 @@ class MaxPool2dGradFunctor<platform::GPUPlace, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, output_height, output_width,
-        ksize_height, ksize_width, stride_height, stride_width, padding_height,
-        padding_width);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
   }
 };
 
@@ -313,14 +313,16 @@ template class Pool2dGradFunctor<
     platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
 
 template <typename PoolProcess, typename T>
-__global__ void KernelPool3D(
-    const int nthreads, const T* input_data, T* output_data, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    PoolProcess pool_process) {
+__global__ void KernelPool3D(const int nthreads, const T* input_data,
+                             const int channels, const int input_depth,
+                             const int input_height, const int input_width,
+                             const int output_depth, const int output_height,
+                             const int output_width, const int ksize_depth,
+                             const int ksize_height, const int ksize_width,
+                             const int stride_depth, const int stride_height,
+                             const int stride_width, const int padding_depth,
+                             const int padding_height, const int padding_width,
+                             PoolProcess pool_process, T* output_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -358,13 +360,13 @@ __global__ void KernelPool3D(
 template <typename PoolProcess, typename T>
 __global__ void KernelPool3DGrad(
     const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    PoolProcess pool_process) {
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, PoolProcess pool_process,
+    T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int offsetW = index % input_width + padding_width;
@@ -422,13 +424,12 @@ __global__ void KernelPool3DGrad(
 template <typename T>
 __global__ void KernelMaxPool3DGrad(
     const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height,
-    const int padding_width) {
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -480,18 +481,18 @@ template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
     const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
     const int ksize_depth = ksize[0];
     const int ksize_height = ksize[1];
     const int ksize_width = ksize[2];
@@ -503,7 +504,7 @@ class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
     const int padding_width = paddings[2];
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
@@ -516,11 +517,11 @@ class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, input_channels, input_depth,
-        input_height, input_width, output_depth, output_height, output_width,
-        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
-        stride_width, padding_depth, padding_height, padding_width,
-        pool_process);
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, pool_process,
+        output_data);
   }
 };
 
@@ -533,11 +534,11 @@ template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process) {
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -560,7 +561,7 @@ class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads =
         batch_size * input_channels * input_depth * input_height * input_width;
@@ -573,11 +574,11 @@ class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_depth, input_height, input_width, output_depth,
-        output_height, output_width, ksize_depth, ksize_height, ksize_width,
-        stride_depth, stride_height, stride_width, padding_depth,
-        padding_height, padding_width, pool_process);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, pool_process, input_grad_data);
   }
 };
 
@@ -590,10 +591,11 @@ template <class T>
 class MaxPool3dGradFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -616,7 +618,7 @@ class MaxPool3dGradFunctor<platform::GPUPlace, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
@@ -628,11 +630,11 @@ class MaxPool3dGradFunctor<platform::GPUPlace, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_depth, input_height, input_width, output_depth,
-        output_height, output_width, ksize_depth, ksize_height, ksize_width,
-        stride_depth, stride_height, stride_width, padding_depth,
-        padding_height, padding_width);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, input_grad_data);
   }
 };
 
@@ -656,13 +658,13 @@ template class Pool3dGradFunctor<
 template class Pool3dGradFunctor<
     platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
 
-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool2dWithIdx(
-    const int nthreads, const T* input_data, T* output_data, T* mask_data,
-    const int channels, const int input_height, const int input_width,
-    const int output_height, const int output_width, const int ksize_height,
-    const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width) {
+    const int nthreads, const T1* input_data, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width, T1* output_data, T2* mask_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -679,7 +681,7 @@ __global__ void KernelMaxPool2dWithIdx(
     wstart = max(wstart, 0);
 
     input_data += (batch_idx * channels + c) * input_height * input_width;
-    T ele = -FLT_MAX;
+    T1 ele = -FLT_MAX;
     int max_index = -1;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
@@ -695,13 +697,13 @@ __global__ void KernelMaxPool2dWithIdx(
   }
 }
 
-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool2DWithIdxGrad(
-    const int nthreads, T* input_grad, const T* output_grad, const T* mask_data,
+    const int nthreads, const T1* output_grad, const T2* mask_data,
     const int channels, const int input_height, const int input_width,
     const int output_height, const int output_width, const int ksize_height,
     const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width) {
+    const int padding_height, const int padding_width, T1* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset = index % input_width;
@@ -722,7 +724,7 @@ __global__ void KernelMaxPool2DWithIdxGrad(
     int pw_end =
         min((w_offset + padding_width) / stride_width + 1, output_width);
 
-    T gradient = 0;
+    T1 gradient = 0;
     int input_current_featuremap_idx = h_offset * input_width + w_offset;
     int output_idx =
         (batch_idx * channels + c_offset) * output_height * output_width;
@@ -744,20 +746,20 @@ __global__ void KernelMaxPool2DWithIdxGrad(
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <typename T>
-class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexFunctor<platform::GPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
     const int ksize_height = ksize[0];
     const int ksize_width = ksize[1];
     const int stride_height = strides[0];
@@ -765,9 +767,9 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
 
-    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -775,13 +777,12 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool2dWithIdx<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_data, output_data, mask_data,
-                              input_channels, input_height, input_width,
-                              output_height, output_width, ksize_height,
-                              ksize_width, stride_height, stride_width,
-                              padding_height, padding_width);
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, output_data, mask_data);
   }
 };
 
@@ -790,18 +791,18 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <typename T>
-class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_channels = input_grad.dims()[1];
-    const int input_height = input_grad.dims()[2];
-    const int input_width = input_grad.dims()[3];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_channels = input_grad->dims()[1];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
     const int output_height = output_grad.dims()[2];
     const int output_width = output_grad.dims()[3];
     const int ksize_height = ksize[0];
@@ -811,9 +812,9 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
 
-    const T* mask_data = mask.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
 
     int nthreads = batch_size * input_channels * input_height * input_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -821,30 +822,30 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool2DWithIdxGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_grad_data, output_grad_data,
-                              mask_data, input_channels, input_height,
-                              input_width, output_height, output_width,
-                              ksize_height, ksize_width, stride_height,
-                              stride_width, padding_height, padding_width);
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
+        nthreads, output_grad_data, mask_data, input_channels, input_height,
+        input_width, output_height, output_width, ksize_height, ksize_width,
+        stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
   }
 };
 
-template class MaxPool2dWithIndexFunctor<platform::GPUPlace, float>;
-template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, float>;
-template class MaxPool2dWithIndexFunctor<platform::GPUPlace, double>;
-template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double>;
+template class MaxPool2dWithIndexFunctor<platform::GPUPlace, float, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, float, int>;
+template class MaxPool2dWithIndexFunctor<platform::GPUPlace, double, int>;
+template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double, int>;
 
-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool3DWithIdx(
-    const int nthreads, const T* input_data, T* output_data, T* mask_data,
-    const int channels, const int input_depth, const int input_height,
-    const int input_width, const int output_depth, const int output_height,
-    const int output_width, const int ksize_depth, const int ksize_height,
-    const int ksize_width, const int stride_depth, const int stride_height,
-    const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width) {
+    const int nthreads, const T1* input_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    T1* output_data, T2* mask_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -864,7 +865,7 @@ __global__ void KernelMaxPool3DWithIdx(
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
 
-    T ele = -FLT_MAX;
+    T1 ele = -FLT_MAX;
     int max_index = -1;
     input_data +=
         (batch_idx * channels + c) * input_depth * input_height * input_width;
@@ -884,15 +885,15 @@ __global__ void KernelMaxPool3DWithIdx(
   }
 }
 
-template <typename T>
+template <typename T1, typename T2>
 __global__ void KernelMaxPool3DWithIdxGrad(
-    const int nthreads, T* input_grad, const T* output_grad, const T* mask,
+    const int nthreads, const T1* output_grad, const T2* mask,
     const int channels, const int input_depth, const int input_height,
     const int input_width, const int output_depth, const int output_height,
     const int output_width, const int ksize_depth, const int ksize_height,
     const int ksize_width, const int stride_depth, const int stride_height,
     const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width) {
+    const int padding_width, T1* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset = index % input_width;
@@ -921,7 +922,7 @@ __global__ void KernelMaxPool3DWithIdxGrad(
     int pw_end =
         min((w_offset + padding_width) / stride_width + 1, output_width);
 
-    T gradient = 0;
+    T1 gradient = 0;
     int input_current_feature_map_idx =
         (d_offset * input_height + h_offset) * input_width + w_offset;
     int output_idx = (batch_idx * channels + c_offset) * output_depth *
@@ -948,22 +949,22 @@ __global__ void KernelMaxPool3DWithIdxGrad(
  * Ksize, strides, paddings are three elements. These three elements represent
  * depth, height and width, respectively.
  */
-template <typename T>
-class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexFunctor<platform::GPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
     const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
     const int ksize_depth = ksize[0];
     const int ksize_height = ksize[1];
     const int ksize_width = ksize[2];
@@ -974,9 +975,9 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
     const int padding_height = paddings[1];
     const int padding_width = paddings[2];
 
-    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
@@ -985,14 +986,13 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool3DWithIdx<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
-        nthreads, input_data, output_data, mask_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width);
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, output_data, mask_data);
   }
 };
 
@@ -1001,19 +1001,19 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
  * Ksize, strides, paddings are three elements. These three elements represent
  * depth, height and width, respectively.
  */
-template <typename T>
-class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
+template <typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T1, T2> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_channels = input_grad.dims()[1];
-    const int input_depth = input_grad.dims()[2];
-    const int input_height = input_grad.dims()[3];
-    const int input_width = input_grad.dims()[4];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_channels = input_grad->dims()[1];
+    const int input_depth = input_grad->dims()[2];
+    const int input_height = input_grad->dims()[3];
+    const int input_width = input_grad->dims()[4];
     const int output_depth = output_grad.dims()[2];
     const int output_height = output_grad.dims()[3];
     const int output_width = output_grad.dims()[4];
@@ -1027,9 +1027,9 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
     const int padding_height = paddings[1];
     const int padding_width = paddings[2];
 
-    const T* output_grad_data = output_grad.data<T>();
-    const T* mask_data = mask.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    const T1* output_grad_data = output_grad.data<T1>();
+    const T2* mask_data = mask.data<T2>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
 
     int nthreads =
         batch_size * input_channels * input_depth * input_height * input_width;
@@ -1038,21 +1038,21 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
     dim3 grid(blocks, 1);
 
     KernelMaxPool3DWithIdxGrad<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(
-        nthreads, input_grad_data, output_grad_data, mask_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width);
+        T1, T2><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(
+        nthreads, output_grad_data, mask_data, input_channels, input_depth,
+        input_height, input_width, output_depth, output_height, output_width,
+        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        input_grad_data);
   }
 };
 
-template class MaxPool3dWithIndexFunctor<platform::GPUPlace, float>;
-template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, float>;
-template class MaxPool3dWithIndexFunctor<platform::GPUPlace, double>;
-template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, double>;
+template class MaxPool3dWithIndexFunctor<platform::GPUPlace, float, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, float, int>;
+template class MaxPool3dWithIndexFunctor<platform::GPUPlace, double, int>;
+template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, double, int>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h
index c50c57b5c5..19fbd8b4bb 100644
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
@@ -88,60 +88,62 @@ template <typename Place, typename PoolProcess, typename T>
 class Pool2dFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* output);
 };
 
 template <typename Place, typename PoolProcess, typename T>
 class Pool2dGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute);
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
 };
 
 template <typename Place, class T>
 class MaxPool2dGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };
 
 template <typename Place, typename PoolProcess, typename T>
 class Pool3dFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* output);
 };
 
 template <typename Place, typename PoolProcess, typename T>
 class Pool3dGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute);
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
 };
 
 template <typename Place, class T>
 class MaxPool3dGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };
 
 /*
@@ -151,42 +153,42 @@ class MaxPool3dGradFunctor {
  * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
  * NCDHW format.
  */
-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool2dWithIndexFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask);
 };
 
-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool2dWithIndexGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };
 
-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool3dWithIndexFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask);
 };
 
-template <typename Place, typename T>
+template <typename Place, typename T1, typename T2>
 class MaxPool3dWithIndexGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc
index 075196b47e..514f2adef2 100644
--- a/paddle/operators/math/selected_rows_functor.cc
+++ b/paddle/operators/math/selected_rows_functor.cc
@@ -145,6 +145,8 @@ struct SelectedRowsAddTo<platform::CPUPlace, T> {
 
 template struct SelectedRowsAddTo<platform::CPUPlace, float>;
 template struct SelectedRowsAddTo<platform::CPUPlace, double>;
+template struct SelectedRowsAddTo<platform::CPUPlace, int>;
+template struct SelectedRowsAddTo<platform::CPUPlace, int64_t>;
 
 template <typename T>
 struct SelectedRowsAddToTensor<platform::CPUPlace, T> {
@@ -175,6 +177,8 @@ struct SelectedRowsAddToTensor<platform::CPUPlace, T> {
 
 template struct SelectedRowsAddToTensor<platform::CPUPlace, float>;
 template struct SelectedRowsAddToTensor<platform::CPUPlace, double>;
+template struct SelectedRowsAddToTensor<platform::CPUPlace, int>;
+template struct SelectedRowsAddToTensor<platform::CPUPlace, int64_t>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu
index 47fe3b44a5..c1dd323ba2 100644
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -173,6 +173,8 @@ struct SelectedRowsAddTo<platform::GPUPlace, T> {
 
 template struct SelectedRowsAddTo<platform::GPUPlace, float>;
 template struct SelectedRowsAddTo<platform::GPUPlace, double>;
+template struct SelectedRowsAddTo<platform::GPUPlace, int>;
+template struct SelectedRowsAddTo<platform::GPUPlace, int64_t>;
 
 namespace {
 template <typename T, int block_size>
@@ -223,7 +225,8 @@ struct SelectedRowsAddToTensor<platform::GPUPlace, T> {
 
 template struct SelectedRowsAddToTensor<platform::GPUPlace, float>;
 template struct SelectedRowsAddToTensor<platform::GPUPlace, double>;
-
+template struct SelectedRowsAddToTensor<platform::GPUPlace, int>;
+template struct SelectedRowsAddToTensor<platform::GPUPlace, int64_t>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu
index 09de9dc53a..7de9291c17 100644
--- a/paddle/operators/math/selected_rows_functor_test.cu
+++ b/paddle/operators/math/selected_rows_functor_test.cu
@@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_rows[6], 9);
 
   Tensor out_cpu;
-  out_cpu.CopyFrom(*out_value, cpu_place, ctx);
+  CopyFrom(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
 
   auto* out_cpu_data = out_cpu.data<float>();
@@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) {
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
 
   Tensor tensor2_cpu;
-  tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx);
+  CopyFrom(*tensor2, cpu_place, ctx, &tensor2_cpu);
   ctx.Wait();
 
   auto* tensor2_cpu_data = tensor2_cpu.data<float>();
@@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   EXPECT_EQ(out_rows[6], 9);
 
   Tensor out_cpu;
-  out_cpu.CopyFrom(*out_value, cpu_place, ctx);
+  CopyFrom(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
 
   auto* out_cpu_data = out_cpu.data<float>();
@@ -191,7 +191,7 @@ TEST(selected_rows_functor, gpu_add_to) {
   add_to_tensor_functor(ctx, *output, tensor1.get());
 
   Tensor tensor1_cpu;
-  tensor1_cpu.CopyFrom(*tensor1, cpu_place, ctx);
+  CopyFrom(*tensor1, cpu_place, ctx, &tensor1_cpu);
   ctx.Wait();
 
   auto* tensor1_cpu_data = tensor1_cpu.data<float>();
diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc
index 10c6e105b9..5b3bde02fb 100644
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
@@ -22,8 +22,8 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::LoDTensor& src, const size_t* index,
-                  framework::LoDTensor& dst, bool is_src_index) {
+                  const framework::Tensor& src, const size_t* index,
+                  framework::Tensor& dst, bool is_src_index) {
     auto src_dims = src.dims();
     auto dst_dims = dst.dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu
index 4f34994678..c5d968aeb2 100644
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#define EIGEN_USE_GPU
 #include "paddle/operators/math/sequence2batch.h"
 
 namespace paddle {
@@ -41,8 +42,8 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::LoDTensor& src, const size_t* index,
-                  framework::LoDTensor& dst, bool is_src_index) {
+                  const framework::Tensor& src, const size_t* index,
+                  framework::Tensor& dst, bool is_src_index) {
     auto src_dims = src.dims();
     auto dst_dims = dst.dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h
index b1ba35a6d4..73295ddbcb 100644
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
@@ -21,6 +22,10 @@ namespace paddle {
 namespace operators {
 namespace math {
 
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
 class CopyMatrixRowsFunctor {
  public:
@@ -30,8 +35,8 @@ class CopyMatrixRowsFunctor {
   // copy the input src to the indexed rows of output dst.
   // The indexed rows are based on the input index.
   void operator()(const platform::DeviceContext& context,
-                  const framework::LoDTensor& src, const size_t* index,
-                  framework::LoDTensor& dst, bool is_src_index);
+                  const framework::Tensor& src, const size_t* index,
+                  framework::Tensor& dst, bool is_src_index);
 };
 
 template <typename Place, typename T>
@@ -57,7 +62,7 @@ class LoDTensor2BatchFunctor {
                   bool is_reverse = false) const {
     if (!is_cal_batch_lod) {
       auto lods = batch.lod();
-      PADDLE_ENFORCE_EQ(lods.size(), 2UL);
+      PADDLE_ENFORCE_GT(lods.size(), 2UL);
       PADDLE_ENFORCE_EQ(lods[1].size(),
                         static_cast<size_t>(lod_tensor.dims()[0]));
       CopyMatrixRowsFunctor<Place, T> to_batch;
@@ -66,8 +71,8 @@ class LoDTensor2BatchFunctor {
     }
 
     auto lods = lod_tensor.lod();
-    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
     auto lod = lods[0];
+    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
 
     std::vector<SeqInfo> seq_info;
     for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
@@ -78,8 +83,7 @@ class LoDTensor2BatchFunctor {
     std::sort(seq_info.begin(), seq_info.end(),
               [](SeqInfo a, SeqInfo b) { return a.length > b.length; });
 
-    // calculate the start position of each batch
-    // (numBatch equal the maxLength of sequences)
+    // Calculate the start position of each batch.
     // example:  sequences = {s0, s1, s2}
     //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
     //           num_batch = 5,
@@ -95,19 +99,25 @@ class LoDTensor2BatchFunctor {
     //                                6, 2, 11,
     //                                7, 3,
     //                                8}
-    // The batch number represents batch size after rearranging the
+    //           seq_order = {1, 0, 2}, the sort order.
+    //               where 1 is the second sequence,
+    //                     0 is the first sequence,
+    //                     2 is the third sequence.
+    // The num_batch represents batch size after rearranging the
     // input LodTensor. It is also the maximum length of input sequence.
 
     paddle::framework::LoD batch_lods;
     batch_lods.emplace_back(std::vector<size_t>{0});
     batch_lods.emplace_back(std::vector<size_t>{0});
+    batch_lods.emplace_back(std::vector<size_t>{0});
 
     // batch_lods[0] is the start positions for batch LoDTensor
     int num_batch = seq_info[0].length;
     batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
     // batch_lods[1] is the raw index in the input LoDTensor
-    auto dims = lod_tensor.dims();
-    batch_lods[1].resize(static_cast<size_t>(dims[0]));
+    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
+    // batch_lods[2] is the sort order for the input LoDTensor.
+    batch_lods[2].resize(seq_info.size());
 
     size_t* batch_starts = batch_lods[0].data();
     size_t* seq2batch_idx = batch_lods[1].data();
@@ -127,6 +137,10 @@ class LoDTensor2BatchFunctor {
       }
       batch_starts[n + 1] = static_cast<size_t>(batch_id);
     }
+    size_t* seq_order = batch_lods[2].data();
+    for (size_t i = 0; i < seq_info.size(); ++i) {
+      seq_order[i] = seq_info[i].seq_idx;
+    }
     batch.set_lod(batch_lods);
 
     CopyMatrixRowsFunctor<Place, T> to_batch;
@@ -141,8 +155,7 @@ class Batch2LoDTensorFunctor {
                   const framework::LoDTensor& batch,
                   framework::LoDTensor& lod_tensor) const {
     auto in_lod = batch.lod();
-    PADDLE_ENFORCE_EQ(in_lod.size(), 2UL,
-                      "The LoD size of input `batch` should be 2.");
+    PADDLE_ENFORCE_GT(in_lod.size(), 2UL);
     PADDLE_ENFORCE_EQ(in_lod[1].size(),
                       static_cast<size_t>(lod_tensor.dims()[0]));
     CopyMatrixRowsFunctor<Place, T> to_seq;
diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc
new file mode 100644
index 0000000000..5913c99fdb
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_pooling.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t k = 0; k < dim; ++k) {
+        out_data[i * dim + k] = in_data[starts[i] * dim + k];
+        max_index[i * dim + k] = starts[i];
+      }
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+            max_index[i * dim + k] = j;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto ig_dims = in_grad->dims();
+    auto idx_dims = index.dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), 1);
+    PADDLE_ENFORCE_GT(ig_dims.size(), 1);
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::CPUPlace, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t j = 0; j < dim; ++j) {
+        int step_id = max_index[i * dim + j];
+        ig_data[step_id * dim + j] = og_data[i * dim + j];
+      }
+    }
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::CPUPlace, float>;
+template class MaxSeqPoolFunctor<platform::CPUPlace, double>;
+template class MaxSeqPoolGradFunctor<platform::CPUPlace, float>;
+template class MaxSeqPoolGradFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu
new file mode 100644
index 0000000000..5ed951402f
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.cu
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
+template <typename T>
+__global__ void KeMaxSequencePool(const T* input, const size_t* starts,
+                                  T* output, int* index, int64_t num_seq,
+                                  int64_t dim) {
+  int dim_idx = threadIdx.x;
+  int seq_id = blockIdx.x;
+  if (seq_id >= num_seq) return;
+  size_t start = starts[seq_id];
+  size_t end = starts[seq_id + 1];
+
+  for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
+    T max_val = static_cast<T>(-FLT_MAX);
+    int max_id = -1;
+    for (size_t step_id = start; step_id < end; step_id++) {
+      if (max_val < input[step_id * dim + i]) {
+        max_val = input[step_id * dim + i];
+        max_id = step_id;
+      }
+    }
+    output[seq_id * dim + i] = max_val;
+    index[seq_id * dim + i] = max_id;
+  }
+}
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+
+    dim3 threads(256, 1);
+    dim3 grid(num_seq, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
+        in_data, starts.data(), out_data, max_index, num_seq, dim);
+  }
+};
+
+template <typename T>
+__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
+                                      T* in_grad, int64_t num_seq,
+                                      int64_t dim) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int col_idx = idx % dim;
+  if (idx < num_seq * dim) {
+    int step_id = max_index[idx];
+    in_grad[step_id * dim + col_idx] = out_grad[idx];
+  }
+}
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto idx_dims = index.dims();
+    auto ig_dims = in_grad->dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1));
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::GPUPlace, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+
+    unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
+    dim3 threads(128, 1);
+    dim3 grid(blocks, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
+        og_data, max_index, ig_data, num_seq, dim);
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::GPUPlace, float>;
+template class MaxSeqPoolFunctor<platform::GPUPlace, double>;
+template class MaxSeqPoolGradFunctor<platform::GPUPlace, float>;
+template class MaxSeqPoolGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/fill_constant_op.h b/paddle/operators/math/sequence_pooling.h
similarity index 52%
rename from paddle/operators/fill_constant_op.h
rename to paddle/operators/math/sequence_pooling.h
index 3668f42f1c..35dfe26de1 100644
--- a/paddle/operators/fill_constant_op.h
+++ b/paddle/operators/math/sequence_pooling.h
@@ -13,25 +13,33 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
 
 template <typename Place, typename T>
-class FillConstantOpKernel : public framework::OpKernel<T> {
+class MaxSeqPoolFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index);
+};
+
+template <typename Place, class T>
+class MaxSeqPoolGradFunctor {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    auto value = ctx.Attr<float>("value");
-
-    auto out_eigen = framework::EigenVector<T>::Flatten(*out);
-    auto place = ctx.GetEigenDevice<Place>();
-    out_eigen.device(place) = out_eigen.constant(static_cast<T>(value));
-  }
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad);
 };
 
+}  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc
index 0ba8197ab8..3e2f15d6c2 100644
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/math/softmax.h"
+#include "paddle/operators/math/softmax_impl.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
 template class SoftmaxFunctor<platform::CPUPlace, float>;
+template class SoftmaxFunctor<platform::CPUPlace, double>;
 template class SoftmaxGradFunctor<platform::CPUPlace, float>;
+template class SoftmaxGradFunctor<platform::CPUPlace, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu
index 99f988d51e..4dbab51d46 100644
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
@@ -15,13 +15,16 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 
 #include "paddle/operators/math/softmax.h"
+#include "paddle/operators/math/softmax_impl.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
 template class SoftmaxFunctor<platform::GPUPlace, float>;
+template class SoftmaxFunctor<platform::GPUPlace, double>;
 template class SoftmaxGradFunctor<platform::GPUPlace, float>;
+template class SoftmaxGradFunctor<platform::GPUPlace, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h
index b7f627eee7..fe10746502 100644
--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
@@ -13,60 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/operator.h"
 #include "paddle/framework/tensor.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-struct ValueClip {
-  HOSTDEVICE T operator()(const T& x) const {
-    const T kThreshold = -64.;
-    return x < kThreshold ? kThreshold : x;
-  }
-};
-
 template <typename Place, typename T>
 class SoftmaxFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor* X, framework::Tensor* Y) {
-    auto logits = EigenMatrix<T>::From(*X);
-    auto softmax = EigenMatrix<T>::From(*Y);
-
-    const int kBatchDim = 0;
-    const int kClassDim = 1;
-
-    const int batch_size = logits.dimension(kBatchDim);
-    const int num_classes = logits.dimension(kClassDim);
-
-    Eigen::DSizes<int, 1> along_class(kClassDim);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
-
-    auto shifted_logits = (logits -
-                           logits.maximum(along_class)
-                               .eval()
-                               .reshape(batch_by_one)
-                               .broadcast(one_by_class))
-                              .unaryExpr(ValueClip<T>());
-
-    softmax.device(*context.GetEigenDevice<Place>()) = shifted_logits.exp();
-    softmax.device(*context.GetEigenDevice<Place>()) =
-        (softmax *
-         softmax.sum(along_class)
-             .inverse()
-             .eval()
-             .reshape(batch_by_one)
-             .broadcast(one_by_class));
-  }
+                  const framework::Tensor* X, framework::Tensor* Y);
 };
 
 template <typename Place, typename T>
@@ -74,29 +31,7 @@ class SoftmaxGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor* y, const framework::Tensor* y_grad,
-                  framework::Tensor* x_grad) {
-    auto softmax = EigenMatrix<T>::From(*y);
-    auto softmax_grad = EigenMatrix<T>::From(*y_grad);
-    auto logits_grad = EigenMatrix<T>::From(*x_grad);
-
-    const int kBatchDim = 0;
-    const int kClassDim = 1;
-
-    const int batch_size = softmax.dimension(kBatchDim);
-    const int num_classes = softmax.dimension(kClassDim);
-
-    Eigen::DSizes<int, 1> along_class(kClassDim);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
-
-    auto dot = (softmax * softmax_grad)
-                   .sum(along_class)
-                   .eval()
-                   .reshape(batch_by_one)
-                   .broadcast(one_by_class);
-    logits_grad.device(*context.GetEigenDevice<Place>()) =
-        (softmax_grad - dot) * softmax;
-  }
+                  framework::Tensor* x_grad);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/softmax_impl.h b/paddle/operators/math/softmax_impl.h
new file mode 100644
index 0000000000..05793eeb3e
--- /dev/null
+++ b/paddle/operators/math/softmax_impl.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+struct ValueClip {
+  HOSTDEVICE T operator()(const T& x) const {
+    const T kThreshold = -64.;
+    return x < kThreshold ? kThreshold : x;
+  }
+};
+
+template <typename Place, typename T>
+void SoftmaxFunctor<Place, T>::operator()(
+    const platform::DeviceContext& context, const framework::Tensor* X,
+    framework::Tensor* Y) {
+  auto logits = EigenMatrix<T>::From(*X);
+  auto softmax = EigenMatrix<T>::From(*Y);
+
+  const int kBatchDim = 0;
+  const int kClassDim = 1;
+
+  const int batch_size = logits.dimension(kBatchDim);
+  const int num_classes = logits.dimension(kClassDim);
+
+  Eigen::DSizes<int, 1> along_class(kClassDim);
+  Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+  Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
+  auto shifted_logits = (logits -
+                         logits.maximum(along_class)
+                             .eval()
+                             .reshape(batch_by_one)
+                             .broadcast(one_by_class))
+                            .unaryExpr(ValueClip<T>());
+
+  softmax.device(*context.GetEigenDevice<Place>()) = shifted_logits.exp();
+  softmax.device(*context.GetEigenDevice<Place>()) =
+      (softmax *
+       softmax.sum(along_class)
+           .inverse()
+           .eval()
+           .reshape(batch_by_one)
+           .broadcast(one_by_class));
+}
+
+template <typename Place, typename T>
+void SoftmaxGradFunctor<Place, T>::operator()(
+    const platform::DeviceContext& context, const framework::Tensor* y,
+    const framework::Tensor* y_grad, framework::Tensor* x_grad) {
+  auto softmax = EigenMatrix<T>::From(*y);
+  auto softmax_grad = EigenMatrix<T>::From(*y_grad);
+  auto logits_grad = EigenMatrix<T>::From(*x_grad);
+
+  const int kBatchDim = 0;
+  const int kClassDim = 1;
+
+  const int batch_size = softmax.dimension(kBatchDim);
+  const int num_classes = softmax.dimension(kClassDim);
+
+  Eigen::DSizes<int, 1> along_class(kClassDim);
+  Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+  Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
+  auto dot = (softmax * softmax_grad)
+                 .sum(along_class)
+                 .eval()
+                 .reshape(batch_by_one)
+                 .broadcast(one_by_class);
+  logits_grad.device(*context.GetEigenDevice<Place>()) =
+      (softmax_grad - dot) * softmax;
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc
new file mode 100644
index 0000000000..b57d3dc141
--- /dev/null
+++ b/paddle/operators/math/unpooling.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+class Unpool2dMaxFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          output_data[index] = input_data[i];
+        }
+        input_data += input_feasize;
+        indices_data += input_feasize;
+        output_data += output_feasize;
+      }
+    }
+  }
+};
+template <class T>
+class Unpool2dMaxGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const int* indices_data = indices.data<int>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          input_grad_data[i] = output_grad_data[index];
+        }
+        input_grad_data += input_feasize;
+        indices_data += input_feasize;
+        output_grad_data += output_feasize;
+      }
+    }
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu
new file mode 100644
index 0000000000..37c3c8b689
--- /dev/null
+++ b/paddle/operators/math/unpooling.cu
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
+                                  const int* indices_data,
+                                  const int input_height, const int input_width,
+                                  const int channels, T* output_data,
+                                  const int output_height,
+                                  const int output_width) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    output_data[out_offset + out_index] = input_data[i];
+  }
+}
+template <typename T>
+__global__ void KernelUnpool2dMaxGrad(
+    const int nthreads, const T* input_data, const int* indices_data,
+    const int input_height, const int input_width, const int channels,
+    const T* output_data, const T* output_grad, const int output_height,
+    const int output_width, T* input_grad) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    input_grad[i] = output_grad[out_offset + out_index];
+  }
+}
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMax<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_height, output_width);
+  }
+};
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMaxGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_grad_data, output_height,
+                              output_width, input_grad_data);
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h
new file mode 100644
index 0000000000..7077d7c227
--- /dev/null
+++ b/paddle/operators/math/unpooling.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename Place, typename T>
+class Unpool2dMaxFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output);
+};
+template <typename Place, class T>
+class Unpool2dMaxGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc
index e9718a0473..99eb7fd46d 100644
--- a/paddle/operators/math/vol2col.cc
+++ b/paddle/operators/math/vol2col.cc
@@ -28,28 +28,51 @@ template <class T>
 class Vol2ColFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& vol, framework::Tensor& col,
-                  int stride_depth, int stride_height, int stride_width,
-                  int padding_depth, int padding_height,
-                  int padding_width) const {
+                  const framework::Tensor& vol,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* col) const {
     PADDLE_ENFORCE(vol.dims().size() == 4);
-    PADDLE_ENFORCE(col.dims().size() == 7);
+    PADDLE_ENFORCE(col->dims().size() == 7);
 
     int input_channels = vol.dims()[0];
     int input_depth = vol.dims()[1];
     int input_height = vol.dims()[2];
     int input_width = vol.dims()[3];
-    int filter_depth = col.dims()[1];
-    int filter_height = col.dims()[2];
-    int filter_width = col.dims()[3];
-    int output_depth = col.dims()[4];
-    int output_height = col.dims()[5];
-    int output_width = col.dims()[6];
+    int filter_depth = col->dims()[1];
+    int filter_height = col->dims()[2];
+    int filter_width = col->dims()[3];
+    int output_depth = col->dims()[4];
+    int output_height = col->dims()[5];
+    int output_width = col->dims()[6];
     int channels_col =
         input_channels * filter_depth * filter_height * filter_width;
 
+    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                       ((dilations[0] * (filter_depth - 1) + 1))) /
+                              strides[0] +
+                          1,
+                      output_depth,
+                      "input_depth and output_depth are "
+                      "mismatching.");
+    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                       ((dilations[1] * (filter_height - 1) + 1))) /
+                              strides[1] +
+                          1,
+                      output_height,
+                      "input_height and output_height are "
+                      "mismatching.");
+    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                       ((dilations[2] * (filter_width - 1) + 1))) /
+                              strides[2] +
+                          1,
+                      output_width,
+                      "input_width and output_width are "
+                      "mismatching.");
+
     const T* vol_data = vol.data<T>();
-    T* col_data = col.data<T>();
+    T* col_data = col->data<T>();
 
     for (int c = 0; c < channels_col; ++c) {
       int w_offset = c % filter_width;
@@ -57,24 +80,23 @@ class Vol2ColFunctor<platform::CPUPlace, T> {
       int d_offset = (c / filter_width / filter_height) % filter_depth;
       int c_in = c / filter_width / filter_height / filter_depth;
       for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * stride_depth - padding_depth + d_offset;
+        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
         for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * stride_height - padding_height + h_offset;
+          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
           for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * stride_width - padding_width + w_offset;
+            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
 
             int col_idx =
                 ((c * output_depth + d) * output_height + h) * output_width + w;
-            if (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
-                w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) {
-              col_data[col_idx] = static_cast<T>(0);
-            } else {
-              int vol_idx =
-                  ((c_in * input_depth + d_pad) * input_height + h_pad) *
-                      input_width +
-                  w_pad;
-              col_data[col_idx] = vol_data[vol_idx];
-            }
+            int vol_idx =
+                ((c_in * input_depth + d_pad) * input_height + h_pad) *
+                    input_width +
+                w_pad;
+            col_data[col_idx] =
+                (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
+                 w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
+                    ? static_cast<T>(0)
+                    : vol_data[vol_idx];
           }
         }
       }
@@ -92,17 +114,18 @@ template <class T>
 class Col2VolFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& vol, const framework::Tensor& col,
-                  int stride_depth, int stride_height, int stride_width,
-                  int padding_depth, int padding_height,
-                  int padding_width) const {
-    PADDLE_ENFORCE(vol.dims().size() == 4);
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* vol) const {
+    PADDLE_ENFORCE(vol->dims().size() == 4);
     PADDLE_ENFORCE(col.dims().size() == 7);
 
-    int input_channels = vol.dims()[0];
-    int input_depth = vol.dims()[1];
-    int input_height = vol.dims()[2];
-    int input_width = vol.dims()[3];
+    int input_channels = vol->dims()[0];
+    int input_depth = vol->dims()[1];
+    int input_height = vol->dims()[2];
+    int input_width = vol->dims()[3];
     int filter_depth = col.dims()[1];
     int filter_height = col.dims()[2];
     int filter_width = col.dims()[3];
@@ -112,7 +135,28 @@ class Col2VolFunctor<platform::CPUPlace, T> {
     int channels_col =
         input_channels * filter_depth * filter_height * filter_width;
 
-    T* vol_data = vol.data<T>();
+    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                       ((dilations[0] * (filter_depth - 1) + 1))) /
+                              strides[0] +
+                          1,
+                      output_depth,
+                      "input_depth and output_depth are "
+                      "mismatching.");
+    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                       ((dilations[1] * (filter_height - 1) + 1))) /
+                              strides[1] +
+                          1,
+                      output_height,
+                      "input_height and output_height are "
+                      "mismatching.");
+    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                       ((dilations[2] * (filter_width - 1) + 1))) /
+                              strides[2] +
+                          1,
+                      output_width,
+                      "input_width and output_width are "
+                      "mismatching.");
+    T* vol_data = vol->data<T>();
     const T* col_data = col.data<T>();
 
     for (int c = 0; c < channels_col; ++c) {
@@ -121,11 +165,11 @@ class Col2VolFunctor<platform::CPUPlace, T> {
       int d_offset = (c / filter_width / filter_height) % filter_depth;
       int cIm = c / filter_width / filter_height / filter_depth;
       for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * stride_depth - padding_depth + d_offset;
+        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
         for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * stride_height - padding_height + h_offset;
+          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
           for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * stride_width - padding_width + w_offset;
+            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
 
             if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
                 w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
@@ -133,6 +177,7 @@ class Col2VolFunctor<platform::CPUPlace, T> {
                   ((cIm * input_depth + d_pad) * input_height + h_pad) *
                       input_width +
                   w_pad;
+
               int col_idx =
                   ((c * output_depth + d) * output_height + h) * output_width +
                   w;
diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu
index 27b11fb237..dae3be858e 100644
--- a/paddle/operators/math/vol2col.cu
+++ b/paddle/operators/math/vol2col.cu
@@ -21,11 +21,12 @@ namespace math {
 
 template <class T>
 __global__ void vol2col(int num_kernels, const T* data_vol, int depth,
-                        int height, int width, int filter_depth,
-                        int filter_height, int filter_width, int stride_depth,
-                        int stride_height, int stride_width, int padding_depth,
-                        int padding_height, int padding_width, int output_detph,
-                        int output_height, int output_width, T* data_col) {
+                        int height, int width, int dilation_d, int dilation_h,
+                        int dilation_w, int filter_depth, int filter_height,
+                        int filter_width, int stride_depth, int stride_height,
+                        int stride_width, int padding_depth, int padding_height,
+                        int padding_width, int output_detph, int output_height,
+                        int output_width, T* data_col) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
        index += blockDim.x * gridDim.x) {
     int w_out = index % output_width;
@@ -44,12 +45,14 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth,
     for (int k = 0; k < filter_depth; ++k) {
       for (int i = 0; i < filter_height; ++i) {
         for (int j = 0; j < filter_width; ++j) {
-          int d = d_in + k;
-          int h = h_in + i;
-          int w = w_in + j;
+          int d = d_in + k * dilation_d;
+          int h = h_in + i * dilation_h;
+          int w = w_in + j * dilation_w;
+          int col_idx = (k * dilation_d * height + i * dilation_h) * width +
+                        j * dilation_w;
           *data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
                        w < width)
-                          ? data_vol[(k * height + i) * width + j]
+                          ? data_vol[col_idx]
                           : 0;
           data_col += output_detph * output_height * output_width;
         }
@@ -68,23 +71,46 @@ template <class T>
 class Vol2ColFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& vol, framework::Tensor& col,
-                  int stride_depth, int stride_height, int stride_width,
-                  int padding_depth, int padding_height,
-                  int padding_width) const {
+                  const framework::Tensor& vol,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* col) const {
     PADDLE_ENFORCE(vol.dims().size() == 4);
-    PADDLE_ENFORCE(col.dims().size() == 7);
+    PADDLE_ENFORCE(col->dims().size() == 7);
 
     int input_channels = vol.dims()[0];
     int input_depth = vol.dims()[1];
     int input_height = vol.dims()[2];
     int input_width = vol.dims()[3];
-    int filter_depth = col.dims()[1];
-    int filter_height = col.dims()[2];
-    int filter_width = col.dims()[3];
-    int output_depth = col.dims()[4];
-    int output_height = col.dims()[5];
-    int output_width = col.dims()[6];
+    int filter_depth = col->dims()[1];
+    int filter_height = col->dims()[2];
+    int filter_width = col->dims()[3];
+    int output_depth = col->dims()[4];
+    int output_height = col->dims()[5];
+    int output_width = col->dims()[6];
+
+    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                       ((dilations[0] * (filter_depth - 1) + 1))) /
+                              strides[0] +
+                          1,
+                      output_depth,
+                      "input_depth and output_depth are "
+                      "Mismatching.");
+    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                       ((dilations[1] * (filter_height - 1) + 1))) /
+                              strides[1] +
+                          1,
+                      output_height,
+                      "input_height and output_height are "
+                      "Mismatching.");
+    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                       ((dilations[2] * (filter_width - 1) + 1))) /
+                              strides[2] +
+                          1,
+                      output_width,
+                      "input_width and output_width are "
+                      "Mismatching.");
 
     int num_outputs =
         input_channels * output_depth * output_height * output_width;
@@ -95,19 +121,25 @@ class Vol2ColFunctor<platform::GPUPlace, T> {
                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
                      .stream()>>>(
         num_outputs, vol.data<T>(), input_depth, input_height, input_width,
-        filter_depth, filter_height, filter_width, stride_depth, stride_height,
-        stride_width, padding_depth, padding_height, padding_width,
-        output_depth, output_height, output_width, col.data<T>());
+        dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
+        filter_width, strides[0], strides[1], strides[2], paddings[0],
+        paddings[1], paddings[2], output_depth, output_height, output_width,
+        col->data<T>());
   }
 };
 
 template <class T>
 __global__ void col2vol(int num_kernels, const T* data_col, int depth,
-                        int height, int width, int filter_depth,
-                        int filter_height, int filter_width, int stride_depth,
-                        int stride_height, int stride_width, int padding_depth,
-                        int padding_height, int padding_width, int output_detph,
-                        int output_height, int output_width, T* data_vol) {
+                        int height, int width, int dilation_d, int dilation_h,
+                        int dilation_w, int filter_depth, int filter_height,
+                        int filter_width, int stride_depth, int stride_height,
+                        int stride_width, int padding_depth, int padding_height,
+                        int padding_width, int output_detph, int output_height,
+                        int output_width, T* data_vol) {
+  const int d_filter_depth = dilation_d * (filter_depth - 1) + 1;
+  const int d_filter_height = dilation_h * (filter_height - 1) + 1;
+  const int d_filter_width = dilation_w * (filter_width - 1) + 1;
+
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
        index += blockDim.x * gridDim.x) {
     T src_val = 0;
@@ -115,35 +147,41 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth,
     int h = (index / width) % height + padding_height;
     int d = (index / width / height) % depth + padding_depth;
     int c = index / width / height / depth;
+
     // compute the start and end of the output
     int w_col_start =
-        (w < filter_width) ? 0 : (w - filter_width) / stride_width + 1;
+        (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1;
     int w_col_end = min(w / stride_width + 1, output_width);
     int h_col_start =
-        (h < filter_height) ? 0 : (h - filter_height) / stride_height + 1;
+        (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1;
     int h_col_end = min(h / stride_height + 1, output_height);
     int d_col_start =
-        (d < filter_depth) ? 0 : (d - filter_depth) / stride_depth + 1;
+        (d < d_filter_depth) ? 0 : (d - d_filter_depth) / stride_depth + 1;
     int d_col_end = min(d / stride_depth + 1, output_detph);
 
-    int offset = (c * filter_depth * filter_height * filter_width +
-                  d * filter_width * filter_height + h * filter_width + w) *
-                 output_detph * output_height * output_width;
-
-    int coeff_d_col =
-        (1 - stride_depth * filter_width * filter_height * output_detph) *
-        output_height * output_width;
-    int coeff_h_col =
-        (1 - stride_height * filter_width * output_detph * output_height) *
-        output_width;
-    int coeff_w_col =
-        (1 - stride_width * output_detph * output_height * output_width);
-
     for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
       for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
         for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          src_val += data_col[offset + d_col * coeff_d_col +
-                              h_col * coeff_h_col + w_col * coeff_w_col];
+          int d_off = (d - d_col * stride_depth);
+          int h_off = (h - h_col * stride_height);
+          int w_off = (w - w_col * stride_width);
+          if (d_off % dilation_d == 0 && h_off % dilation_h == 0 &&
+              w_off % dilation_w == 0) {
+            d_off /= dilation_d;
+            h_off /= dilation_h;
+            w_off /= dilation_w;
+
+            int data_col_index =
+                (((((c * filter_depth + d_off) * filter_height + h_off) *
+                       filter_width +
+                   w_off)));
+            data_col_index =
+                ((data_col_index * output_detph + d_col) * output_height +
+                 h_col) *
+                    output_width +
+                w_col;
+            src_val += data_col[data_col_index];
+          }
         }
       }
     }
@@ -161,17 +199,18 @@ template <class T>
 class Col2VolFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& vol, const framework::Tensor& col,
-                  int stride_depth, int stride_height, int stride_width,
-                  int padding_depth, int padding_height,
-                  int padding_width) const {
-    PADDLE_ENFORCE(vol.dims().size() == 4);
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* vol) const {
+    PADDLE_ENFORCE(vol->dims().size() == 4);
     PADDLE_ENFORCE(col.dims().size() == 7);
 
-    int input_channels = vol.dims()[0];
-    int input_depth = vol.dims()[1];
-    int input_height = vol.dims()[2];
-    int input_width = vol.dims()[3];
+    int input_channels = vol->dims()[0];
+    int input_depth = vol->dims()[1];
+    int input_height = vol->dims()[2];
+    int input_width = vol->dims()[3];
     int filter_depth = col.dims()[1];
     int filter_height = col.dims()[2];
     int filter_width = col.dims()[3];
@@ -179,6 +218,28 @@ class Col2VolFunctor<platform::GPUPlace, T> {
     int output_height = col.dims()[5];
     int output_width = col.dims()[6];
 
+    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                       ((dilations[0] * (filter_depth - 1) + 1))) /
+                              strides[0] +
+                          1,
+                      output_depth,
+                      "input_depth and output_depth are "
+                      "Mismatching.");
+    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                       ((dilations[1] * (filter_height - 1) + 1))) /
+                              strides[1] +
+                          1,
+                      output_height,
+                      "input_height and output_height are "
+                      "Mismatching.");
+    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                       ((dilations[2] * (filter_width - 1) + 1))) /
+                              strides[2] +
+                          1,
+                      output_width,
+                      "input_width and output_width are "
+                      "Mismatching.");
+
     int num_kernels = input_channels * input_depth * input_height * input_width;
 
     const int threads = 1024;
@@ -188,9 +249,10 @@ class Col2VolFunctor<platform::GPUPlace, T> {
                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
                      .stream()>>>(
         num_kernels, col.data<T>(), input_depth, input_height, input_width,
-        filter_depth, filter_height, filter_width, stride_depth, stride_height,
-        stride_width, padding_depth, padding_height, padding_width,
-        output_depth, output_height, output_width, vol.data<T>());
+        dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
+        filter_width, strides[0], strides[1], strides[2], paddings[0],
+        paddings[1], paddings[2], output_depth, output_height, output_width,
+        vol->data<T>());
   }
 };
 
diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h
index f022365a16..dc64d1d977 100644
--- a/paddle/operators/math/vol2col.h
+++ b/paddle/operators/math/vol2col.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
@@ -31,6 +32,15 @@ namespace math {
  * \param colData  Column data.
  * \param colShape The shape of colData.
  *
+ * \param dilations    dilation data.
+ * \param 3-dimension  [dilation_depth, dilation_height, dilation_width].
+ *
+ * \param strides      stride data.
+ * \param 3-dimension  [stride_depth, stride_height, stride_width].
+ *
+ * \param paddings     padding data.
+ * \param 3-dimension  [d_pad, h_pad, w_pad].
+ *
  * The shape of colData is:
  * [input_channels, filter_depth, filter_height, filter_width, output_depth,
  * output_height, output_width]
@@ -57,20 +67,22 @@ template <typename Place, typename T>
 class Vol2ColFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& vol, framework::Tensor& col,
-                  int stride_depth, int stride_height, int stride_width,
-                  int padding_depth, int padding_height,
-                  int padding_width) const;
+                  const framework::Tensor& vol,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* col) const;
 };
 
 template <typename Place, typename T>
 class Col2VolFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& vol, const framework::Tensor& col,
-                  int stride_depth, int stride_height, int stride_width,
-                  int padding_depth, int padding_height,
-                  int padding_width) const;
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* vol) const;
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
index 74590d17cd..62c3152304 100644
--- a/paddle/operators/math/vol2col_test.cc
+++ b/paddle/operators/math/vol2col_test.cc
@@ -62,11 +62,15 @@ void testVol2col() {
   int input_height = 2;
   int input_width = 3;
   int filter_size = 2;
-  int stride = 1;
-  int padding = 0;
-  int output_depth = (input_depth - filter_size + 2 * padding) / stride + 1;
-  int output_height = (input_height - filter_size + 2 * padding) / stride + 1;
-  int output_width = (input_width - filter_size + 2 * padding) / stride + 1;
+  std::vector<int> strides({1, 1, 1});
+  std::vector<int> paddings({0, 0, 0});
+  std::vector<int> dilations({1, 1, 1});
+  int output_depth =
+      (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
+  int output_height =
+      (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
+  int output_width =
+      (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
 
   // Vol2Col test
   float* input_ptr =
@@ -78,22 +82,21 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
   output.mutable_data<float>({1, filter_size, filter_size, filter_size,
                               output_depth, output_height, output_width},
                              *place);
 
   paddle::operators::math::Vol2ColFunctor<Place, float> vol2col;
-  vol2col(*context, input, output, stride, stride, stride, padding, padding,
-          padding);
+  vol2col(*context, input, dilations, strides, paddings, &output);
 
   float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
   float* out_cfo_ptr;
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output.data<float>();
   } else {
-    output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context);
+    CopyFrom(output, paddle::platform::CPUPlace(), *context, &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
 
@@ -107,18 +110,17 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom(input_tmp, *place, *context);
+    CopyFrom(input_tmp, *place, *context, &input);
   }
 
   paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
-  col2vol(*context, input, output, stride, stride, stride, padding, padding,
-          padding);
+  col2vol(*context, output, dilations, strides, paddings, &input);
 
   float* in_ptr;
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
 
diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc
index 5ecbee3b41..5a1a615420 100644
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -144,7 +144,10 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
         )DOC")
         .SetDefault(false);
     AddComment(R"DOC(
-The MatMul operator is used to perform (batched) matrix multiplication
+MatMul Operator.
+
+
+This operator is used to perform (batched) matrix multiplication
 over the last two dimensions of the input tensors `X` and `Y`.
 
 If a transpose flag is specified, the last two dimensions of the
@@ -166,7 +169,8 @@ The differences are:
 - We add `transpose_X` and `transpose_Y` flags.
 
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/matmul_op.cu b/paddle/operators/matmul_op.cu.cc
similarity index 100%
rename from paddle/operators/matmul_op.cu
rename to paddle/operators/matmul_op.cu.cc
diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h
index 5ce30740c9..1e4aa48b70 100644
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
@@ -15,8 +15,8 @@
 #pragma once
 
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/matmul.h"
-#include "paddle/operators/transpose_op.h"
 
 namespace paddle {
 namespace operators {
@@ -74,11 +74,13 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
   Tensor output;
   auto in_dims = input.dims();
   if (in_dims.size() == 3) {
-    output.Resize(in_dims);
+    output.Resize({in_dims[1], in_dims[0], in_dims[2]});
     output.mutable_data<T>(context.GetPlace());
-    EigenTranspose<Place, T, 3>(context, input, output, {1, 0, 2});
+    std::vector<int> axis = {1, 0, 2};
+    math::Transpose<Place, T, 3> trans;
+    trans(context.device_context(), input, &output, axis);
     std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
-    output.Resize(make_ddim(out_dims));
+    output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
   } else {
     output.ShareDataWith(input);
   }
diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc
new file mode 100644
index 0000000000..798022c9dd
--- /dev/null
+++ b/paddle/operators/max_sequence_len_op.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+class MaxSeqenceLenOp : public framework::OperatorBase {
+ public:
+  MaxSeqenceLenOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    int64_t *out_ptr = out->mutable_data<int64_t>({1}, platform::CPUPlace());
+    *out_ptr = rank_table.items()[0].length;
+  }
+};
+
+class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxSeqenceLenOpProtoMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("RankTable", "The lod_rank_table.");
+    AddOutput("Out", "The max sequence length.");
+    AddComment(
+        R"DOC(Calculate the max sequence length through lod_rank_table.)DOC");
+  }
+};
+
+class MaxSeqenceLenInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("RankTable"));
+    context->SetOutputDim("Out", {1});
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(max_sequence_len, paddle::operators::MaxSeqenceLenOp,
+                  paddle::operators::MaxSeqenceLenOpProtoMaker,
+                  paddle::operators::MaxSeqenceLenInferShape,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc
index 41b3860a86..44bf402e95 100644
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
@@ -12,104 +12,96 @@
  *     See the License for the specific language governing permissions and
  *     limitations under the License. */
 
-
 #include "paddle/operators/maxout_op.h"
 namespace paddle {
 namespace operators {
 
 using framework::Tensor;
 
-/********first define ProtoMaker类 ***************/
 class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-        "(Tensor) The input tensor of pooling operator. "
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of maxout operator. "
         "The format of input tensor is NCHW. Where N is batch size, C is the "
         "number of channels, H and W is the height and width of feature.");
     AddOutput("Out",
-        "(Tensor) The output tensor of pooling operator."
-        "The format of output tensor is also NCHW."
-        "Where N is batch size, C is "
-        "the number of channels, H and W is the height and "
-        "width of feature.");
-
+              "(Tensor) The output tensor of maxout operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
     AddAttr<int>(
         "groups",
-        R"DOC(The group number of input layer.
-        )DOC")
-        .SetDefault(2);
-    AddAttr<int>(
-        "num_channels",
-        R"DOC(The channel number of input layer.
-        )DOC")
-        .SetDefault(0);
-    AddComment(R"DOC(A layer to do max out on conv layer output.
-        - Input: output of a conv layer.
-        - Output: feature map size same as input. Channel is (input channel) / groups.
-        So groups should be larger than 1, and the num of channels should be able
-        to devided by groups.
+        R"DOC("Specifies how many groups the input tensor will be split"
+        "in the channel dimension. And the number of output channel is "
+        "the number of channels divided by groups.."
         )DOC");
+    AddComment(R"DOC(
+MaxOut Operator.
+
+Assumed the input shape is (N, Ci, H, W).
+The output shape is (N, Co, H, W).
+Then $Co = Ci / groups$ and the operator formula is as follows:
+
+$$
+y_{si+j} = \max_k x_{gsi + sk + j} \\
+g = groups \\
+s = \frac{input.size}{num\_channels} \\
+0 \le i < \frac{num\_channels}{groups} \\
+0 \le j < s \\
+0 \le k < groups
+$$
+
+Please refer to Paper:
+  - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+  - Multi-digit Number Recognition from Street View \
+    Imagery using Deep Convolutional Neural Networks: \
+    https://arxiv.org/pdf/1312.6082v4.pdf
+
+)DOC");
   }
 };
 
-/******************2nd **********************************/
-
 class MaxOutOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of maxoutOp"
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MaxoutOp"
                    "should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of maxoutOp should not be null.");
+                   "Output(Out) of MaxoutOp should not be null.");
     auto in_x_dims = ctx->GetInputDim("X");
     int groups = ctx->Attrs().Get<int>("groups");
-    int num_channels = ctx->Attrs().Get<int>("num_channels");
-
     // check groups > 1
-    PADDLE_ENFORCE_GT(
-        groups, 1,
-        "in maxoutop  groups should be larger than 1");
-    // check num_channels%groups=0
-    PADDLE_ENFORCE_EQ(num_channels % groups, 0,
-                      "the num of channels should be able"
-    "to devided by groups");
-
-    int out_num_channels = num_channels / groups;
-
-    std::vector<int64_t> output_shape({in_x_dims[0], out_num_channels});
+    PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop");
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
     output_shape.push_back(in_x_dims[2]);
     output_shape.push_back(in_x_dims[3]);
-
     ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
   }
 };
 
-
 class MaxOutOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-    "Input(X@GRAD) should not be null.");
+                   "Input(X@GRAD) should not be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
-}    // namespace operators
-}    // namespace paddle
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
-                        ops::MaxOutOpGrad);
-
-
-REGISTER_OP_CPU_KERNEL(maxout, ops::MaxOutKernel<paddle::platform::CPUPlace,
-                       float>);
-REGISTER_OP_CPU_KERNEL(maxout_grad,
-                       ops::MaxOutGradKernel<paddle::platform::CPUPlace,
-                       float>);
+            ops::MaxOutOpGrad);
+REGISTER_OP_CPU_KERNEL(maxout,
+                       ops::MaxOutKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/maxout_op.cu b/paddle/operators/maxout_op.cu.cc
similarity index 66%
rename from paddle/operators/maxout_op.cu
rename to paddle/operators/maxout_op.cu.cc
index 44a149b065..decd43913d 100644
--- a/paddle/operators/maxout_op.cu
+++ b/paddle/operators/maxout_op.cu.cc
@@ -12,12 +12,12 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/operators/maxout_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(maxout, ops::MaxOutKernel<paddle::platform::GPUPlace,
-                       float>);
-REGISTER_OP_GPU_KERNEL(maxout_grad,
-                       ops::MaxOutGradKernel<paddle::platform::GPUPlace,
-                       float>);
+REGISTER_OP_GPU_KERNEL(maxout,
+                       ops::MaxOutKernel<paddle::platform::GPUPlace, float>,
+                       ops::MaxOutKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    maxout_grad, ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>,
+    ops::MaxOutGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h
index 2321613512..44a0d073dd 100644
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/maxouting.h"
@@ -30,17 +29,10 @@ class MaxOutKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* in_x = context.Input<Tensor>("X");
     Tensor* out = context.Output<Tensor>("Out");
-
     int groups = context.template Attr<int>("groups");
-    int num_channels = context.template Attr<int>("num_channels");
-
 
-    paddle::operators::math::MaxOutFunctor<
-    Place, paddle::operators::math::MaxOut<T>, T>
-    maxout_forward;
-    paddle::operators::math::MaxOut<T> maxout_process;
-    maxout_forward(context.device_context(), *in_x, *out, groups, num_channels,
-    maxout_process);
+    math::MaxOutFunctor<Place, T> maxout_forward;
+    maxout_forward(context.device_context(), *in_x, out, groups);
   }
 };
 
@@ -53,22 +45,15 @@ class MaxOutGradKernel : public framework::OpKernel<T> {
     const Tensor* out_grad =
         context.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-
     int groups = context.template Attr<int>("groups");
-    int num_channels = context.template Attr<int>("num_channels");
-
-
-
+    auto& device_ctx = context.device_context();
+    math::SetConstant<Place, T> zero;
     if (in_x_grad) {
       in_x_grad->mutable_data<T>(context.GetPlace());
-      auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
-      temp.device(context.GetEigenDevice<Place>()) =
-      temp.constant(static_cast<T>(0));
-
-      paddle::operators::math::MaxOutGradFunctor<Place, T>
-      maxout_backward;
-      maxout_backward(context.device_context(), *in_x, *in_x_grad, *out,
-      *out_grad, groups, num_channels);
+      zero(device_ctx, in_x_grad, static_cast<T>(0.0));
+      math::MaxOutGradFunctor<Place, T> maxout_backward;
+      maxout_backward(context.device_context(), *in_x, in_x_grad, *out,
+                      *out_grad, groups);
     }
   }
 };
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index 7caa1c9d0c..dcc5b4286f 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -36,7 +36,11 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
     AddOutput("Out", "The output of mean op");
-    AddComment(R"DOC( Mean Operator
+    AddComment(R"DOC(
+Mean Operator.
+
+Out is a scalar which is the mean of all elements in X. 
+
 )DOC");
   }
 };
@@ -47,6 +51,7 @@ class MeanGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", framework::GradVarName("X"));
   }
 };
 
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
new file mode 100644
index 0000000000..adc688dbd5
--- /dev/null
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -0,0 +1,183 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+class MergeLoDTensorOp : public framework::OperatorBase {
+ public:
+  MergeLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
+    auto &in_false =
+        scope.FindVar(Input("InFalse"))->Get<framework::LoDTensor>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+
+    auto &mask_dim = mask.dims();
+
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+
+    int rank = in_true.dims().size();
+    platform::Place place = in_true.place();
+    std::type_index data_type = in_true.type();
+    framework::DDim in_true_dims =
+        framework::slice_ddim(in_true.dims(), 1, rank);
+
+    int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
+
+    auto in_true_dim_vec = framework::vectorize(in_true_dims);
+    in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size);
+
+    framework::DDim out_dims = framework::make_ddim(in_true_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+
+    // Build LoDTensor `out`
+
+    size_t in_true_idx = 0;
+    size_t in_false_idx = 0;
+    for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+      const framework::LoDTensor *input = nullptr;
+      size_t *in_idx = nullptr;
+      if (static_cast<int>(mask_data[i]) == 0) {
+        input = &in_false;
+        in_idx = &in_false_idx;
+      } else {
+        input = &in_true;
+        in_idx = &in_true_idx;
+      }
+      auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+          input->lod(), *in_idx, (*in_idx) + 1, 0);
+      auto &lod_length = lod_and_offset.first;
+
+      framework::AppendLoD(out_lod, lod_length);
+
+      size_t start_offset = lod_and_offset.second.first;
+      size_t end_offset = lod_and_offset.second.second;
+
+      PADDLE_ENFORCE_GE(end_offset, start_offset);
+      size_t len = end_offset - start_offset;
+      if (len == 0) {
+        continue;
+      }
+      auto slice = out->Slice(out_offset, out_offset + len);
+      framework::CopyFrom(input->Slice(start_offset, end_offset), place,
+                          dev_ctx, &slice);
+      out_offset += len;
+      (*in_idx) += 1;
+    }
+
+    for (size_t i = 0; i < level; i++) {
+      out_lod->insert(out_lod->begin(), x.lod()[i]);
+    }
+  }
+};
+
+class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MergeLoDTensorOpProtoMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input LoDTensor, contains complete lod information to "
+             "construct the output");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddInput("InTrue", "The True branch to be merged");
+    AddInput("InFalse", "The False branch to be merged");
+    AddOutput("Out", "The merged output LoDTensor");
+    AddAttr<int>("level", "(int) the specific lod level to rank.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Merge True and False branches of LoDTensor into a single Output,
+        with a mask at certain lod level. X is used to obtain complete
+        lod information. Please refer to SplitLoDTensorOp.)DOC");
+  }
+};
+
+class MergeLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "MergeLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "MergeLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasInput("InTrue"),
+                   "MergeLoDTensorOp must has input InTrue.");
+    PADDLE_ENFORCE(context->HasInput("InFalse"),
+                   "MergeLoDTensorOp must has input InFalse.");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "MergeLoDTensorOp must has output Out");
+
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("Out", context->GetInputDim("InTrue"));
+  }
+};
+
+class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("split_lod_tensor");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
+    grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp,
+                  ops::MergeLoDTensorOpProtoMaker,
+                  ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker);
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index f7943e99ac..4684c20208 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -52,14 +52,16 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Y", "The right tensor of minus operator.");
     AddOutput("Out", "The output tensor of minus operator.");
 
-    AddComment(R"DOC(Minus Operator
+    AddComment(R"DOC(
+Minus Operator.
 
 Equation:
 
-    Out = X - Y
+    $Out = X - Y$
 
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
index 7b9e952895..28528848af 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -43,27 +43,35 @@ class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
                            framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "The input tensor of modified huber loss op."
+             "The input tensor of modified huber loss op. "
              "X is 2-D tensor with shape [batch_size, 1].");
     AddInput("Y",
-             "The target labels of modified huber loss op."
-             "The shape of Y is same as X. Values of Y must be 0 or 1.");
+             "The target labels of modified huber loss op. "
+             "The shape of Y is the same as X. Values of Y must be 0 or 1.");
     AddOutput("IntermediateVal",
               "Variable to save intermediate result which will be reused in "
               "backward processing.")
         .AsIntermediate();
     AddOutput("Out", "Classification loss for X.");
     AddComment(R"DOC(
-Modified huber loss is used in binary classification problem. The shape of
-input X and target Y are both [N, 1] and so is the shape of output loss.
-Since target Y is not differentiable, cacluating gradient for Y is illegal.
-The formulation of modified huber loss is:
-
-L(y, f(x)) = max(0, 1 - yf(x))^2  for yf(x) >= -1,
-             -4yf(x)              otherwise.
-
-Make sure the values of target label Y are in {0, 1} here. The operator will
+Modified Huber Loss Operator.
+
+This operator is used in binary classification problem. The shape of
+input X and target Y are both [N, 1] and so is the shape of the output loss.
+Since target Y is not differentiable, calculating gradient for Y is illegal.
+The formula of modified huber loss is:
+
+$$
+L(y, f(x)) = 
+\begin{cases}
+(\max(0, 1 - yf(x)))^2,  \text{if} \  yf(x) >= -1    \\
+             -4yf(x),    \quad \text{otherwise}
+\end{cases}
+$$
+
+Make sure the values of target label Y are in {0, 1} here. This operator will
 scale values of Y to {-1, +1} when computing losses and gradients.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc
index 2d4d6f1372..2ab48fedec 100644
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
@@ -71,21 +71,31 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor, default Tensor<float>) "
              "Input learning rate");
 
-    AddOutput("ParamOut", "(Tensor) Output updated parameter");
-    AddOutput("VelocityOut", "(Tensor) Output updated velocity");
+    AddOutput("ParamOut",
+              "(Tensor) This output is updated parameter. "
+              "It shared memory with Input(Param).");
+    AddOutput("VelocityOut",
+              "(Tensor) This output is updated velocity. "
+              "It shared memory with Input(Velocity).");
 
     AddAttr<float>("mu", "(float) Momentum coefficient");
-    AddAttr<bool>("useNesterov", "(bool) Use Nesterov Momentum")
+    AddAttr<bool>("use_nesterov",
+                  "(bool, default false) "
+                  "Use Nesterov Momentum")
         .SetDefault(false);
     AddComment(R"DOC(
+Momentum Optimizer.
 
-Momentum Algorithm with a flag for Nestrov Moemntum (momentum).
+This optimizer has a flag for Nestrov Momentum.
+The update equations are as follows:
 
-velocity = mu * velocity + gradient
-if (use_nesterov):
-  param = param - gradient * learning_rate + mu * velocity * learning_rate
-else:
-  param = param - learning_rate * velocity
+$$
+velocity = mu * velocity + gradient \\
+if (use\_nesterov):   \\
+  param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
+else:   \\
+  param = param - learning\_rate * velocity. \\
+$$
 
 )DOC");
   }
@@ -95,5 +105,5 @@ else:
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    momentum, ops::MomentumOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel<float>,
+                       ops::MomentumOpKernel<double>);
diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu
index efc24e795e..be0c8ea071 100644
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/operators/momentum_op.cu
@@ -12,9 +12,67 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
-#include "paddle/operators/momentum_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void MomentumKernel(const T* p, const T* g, const T* v,
+                               const T* learning_rate, const T mu,
+                               const int64_t num, bool use_nesterov, T* p_out,
+                               T* v_out) {
+  T lr = learning_rate[0];
+  if (use_nesterov) {
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+         i += blockDim.x * gridDim.x) {
+      T g_val = g[i];
+      T v_new = v[i] * mu + g_val;
+      v_out[i] = v_new;
+      p_out[i] = p[i] - (g_val - v_new * mu) * lr;
+    }
+  } else {
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+         i += blockDim.x * gridDim.x) {
+      T v_new = v[i] * mu + g[i];
+      v_out[i] = v_new;
+      p_out[i] = p[i] - lr * v_new;
+    }
+  }
+}
+
+template <typename T>
+class MomentumOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
+    auto param = ctx.Input<framework::Tensor>("Param");
+    auto velocity = ctx.Input<framework::Tensor>("Velocity");
+    auto grad = ctx.Input<framework::Tensor>("Grad");
+    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
+    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
+
+    auto* p = param->data<T>();
+    auto* v = velocity->data<T>();
+    auto* g = grad->data<T>();
+    auto* lr = learning_rate->data<T>();
+
+    int block = 512;
+    int grid = (param->numel() + block - 1) / block;
+    MomentumKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+        p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    momentum, ops::MomentumOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(momentum, ops::MomentumOpCUDAKernel<float>,
+                       ops::MomentumOpCUDAKernel<double>);
diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h
index e6d6d1da3d..da69532ea5 100644
--- a/paddle/operators/momentum_op.h
+++ b/paddle/operators/momentum_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T>
+template <typename T>
 class MomentumOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -33,8 +33,8 @@ class MomentumOpKernel : public framework::OpKernel<T> {
     param_out->mutable_data<T>(ctx.GetPlace());
     velocity_out->mutable_data<T>(ctx.GetPlace());
 
-    float mu = ctx.Attr<float>("mu");
-    bool use_nesterov = ctx.Attr<bool>("useNesterov");
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
 
     auto p_out = framework::EigenVector<T>::Flatten(*param_out);
     auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
@@ -42,18 +42,13 @@ class MomentumOpKernel : public framework::OpKernel<T> {
     auto p = framework::EigenVector<T>::Flatten(*param);
     auto v = framework::EigenVector<T>::Flatten(*velocity);
     auto g = framework::EigenVector<T>::Flatten(*grad);
-    auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
+    auto* lr = learning_rate->data<T>();
 
-    auto place = ctx.GetEigenDevice<Place>();
-
-    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
-
-    v_out.device(place) = v * mu + g;
+    v_out = v * mu + g;
     if (use_nesterov) {
-      p_out.device(place) = p - g * lr.broadcast(grad_dsize) +
-                            v_out * mu * lr.broadcast(grad_dsize);
+      p_out = p - (g - v_out * mu) * lr[0];
     } else {
-      p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out;
+      p_out = p - lr[0] * v_out;
     }
   }
 };
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 90acf034d9..3c39ae10dc 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -78,6 +78,7 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "The output of mul op");
     AddAttr<int>(
         "x_num_col_dims",
+        "(int, default 1) "
         R"DOC(mul_op can take tensors with more than two dimensions as input `X`,
             in that case, tensors will be reshaped to a matrix. The matrix's first
             dimension(column length) will be the product of tensor's last
@@ -88,20 +89,24 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
         .EqualGreaterThan(1);
     AddAttr<int>(
         "y_num_col_dims",
+        "(int, default 1) "
         R"DOC(mul_op can take tensors with more than two dimensions as input `Y`,
              in that case, tensors will be reshaped to a matrix. Just like input `X`.
         )DOC")
         .SetDefault(1)
         .EqualGreaterThan(1);
     AddComment(R"DOC(
-Mul operator is used to perform matrix multiplication for input X and Y.
+Mul Operator. 
+
+This operator is used to perform matrix multiplication for input X and Y.
 
 The equation is:
 
-    Out = X * Y
+    $$Out = X * Y$$
 
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with input `X`.
+or not. But the output only shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu.cc
similarity index 97%
rename from paddle/operators/mul_op.cu
rename to paddle/operators/mul_op.cu.cc
index a81444dbe6..66dc3d6d10 100644
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu.cc
@@ -12,7 +12,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index bd1bdb4f81..0eb9df41e9 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -16,16 +16,12 @@
 
 #include "paddle/operators/math/math_function.h"
 
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
 class MulKernel : public framework::OpKernel<T> {
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 4d86769026..8e7f544e0d 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -51,9 +51,11 @@ class MultiplexOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
+        ctx.device_context());
   }
 };
 
@@ -66,7 +68,8 @@ class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "The candidate tensors of multiplex operator.")
         .AsDuplicable();
     AddOutput("Out", "The output tensor of multiplex operator.");
-    AddComment(R"DOC(Multiplex operator
+    AddComment(R"DOC(
+Multiplex Operator.
 
 Multiplex multiple tensors according to the index provided by the index tensor.
 
@@ -77,10 +80,11 @@ the (Ids[i])-th tensor.
 
 For i-th row of the output tensor:
 
-y[i] = x_{k}[i]
+$$y[i] = x_{k}[i]$$
 
-where y is the output tensor. `x_{k}` is the k-th input tensor
+where `y` is the output tensor, `x_{k}` is the k-th input tensor,
 and `k = Ids[i]`.
+
 )DOC");
   }
 };
@@ -95,19 +99,15 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
                    "Output(X@Grad) should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null.");
-    std::vector<framework::DDim> d_ins;
-    auto ins = ctx->GetInputsDim("X");
-    // No need to compute gradient for Input(Ids)
-    for (size_t i = 0; i < ins.size(); i++) {
-      d_ins.push_back(ins[i]);
-    }
-    ctx->SetOutputsDim(framework::GradVarName("X"), d_ins);
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 143a14fef5..10dff8d021 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -33,11 +33,9 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
+    CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      ctx.device_context())
-                      .stream();
+    auto stream = ctx.cuda_device_context().stream();
     Place place = boost::get<Place>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
       int32_t k = index[i];
@@ -70,12 +68,10 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
+    CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      ctx.device_context())
-                      .stream();
+    auto stream = ctx.cuda_device_context().stream();
     Place place = boost::get<Place>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
       size_t k = static_cast<size_t>(index[i]);
diff --git a/paddle/operators/name_convention.md b/paddle/operators/name_convention.md
index 5a21690795..b5cb176e00 100644
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
@@ -4,10 +4,10 @@ To make the operator document itself more clear, we recommend operator names obe
 
 ### OpProtoMaker names
 
-When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator. 
+When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator.
 
 - Input/Output.
-  - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words. 
+  - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words.
   - If an operator's Input/Output are tensors in math, not match to any meaningful words, input name should starts from `X`. e.g. `X`, `Y`, and output name should starts from `Out`. e.g. `Out`. This rule intends making operators which have few inputs/outputs unified.
 
 - Attribute.
@@ -15,7 +15,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
 
 - Comments.
   - Input/Output/Attr comment follow the format of **(type,default value) usage**, corresponding to which type it can be and how it will be used in the operator. e.g.  Attribute in Accumulator`"gamma" `,`(float, default 1.0) Accumulation multiplier`.
-  - Operator comment format of` R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment. e.g. `Out = X + Y`. 
+  - Operator comment format of` R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment. e.g. `Out = X + Y`.
 
 - Order.
   - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
@@ -24,7 +24,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
 
 Here we give some examples to show how these rules will be used.
 
-- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`. 
+- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`.
 
 - The operator has two input, one output. e.g. `rowwise_add`, inputs : `X`, `Y`, outputs : `Out`.
 
@@ -38,23 +38,27 @@ public:
   AccumulateOpMaker(framework::OpProto *proto,
                             framework::OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor. 
-    If the output size is not the same as input size, 
+    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
+    If the output size is not the same as input size,
     the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
     AddOutput("Out", "(Tensor) Accumulated output tensor");
     AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
     AddComment(R"DOC(
-Accumulate operator accumulates the input tensor to the output tensor. If the
+Accumulate Operator.
+
+This operator accumulates the input tensor to the output tensor. If the
 output tensor already has the right size, we add to it; otherwise, we first
 initialize the output tensor to all zeros, and then do accumulation. Any
 further calls to the operator, given that no one else fiddles with the output
 in the interim, will do simple accumulations.
-Accumulation is done as shown:
+
+Accumulation is done as follows:
 
 Out = 1*X + gamma*Out
 
 where X is the input tensor, Out is the output tensor and gamma is the multiplier
 argument.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index 5858cd4839..48e322f993 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -35,6 +35,7 @@ constexpr int kInvalidGPUId = -1;
 struct Communicator {
   std::vector<ncclComm_t> comms_;
   std::unordered_map<int, int> comm_id_map_;
+  bool inited_;
 
   Communicator() {}
 
@@ -42,17 +43,21 @@ struct Communicator {
 
   void InitAll(const std::vector<int>& gpus) {
     comms_.resize(gpus.size());
+    inited_ = false;
     for (size_t i = 0; i < gpus.size(); ++i) {
       comm_id_map_[gpus[i]] = i;
     }
     PADDLE_ENFORCE(
         dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data()));
+    inited_ = true;
   }
 
   ~Communicator() {
-    for (size_t i = 0; i < comms_.size(); ++i) {
-      // FIXME(dzh) : PADDLE_ENFORCE return void
-      dynload::ncclCommDestroy(comms_[i]);
+    if (inited_) {
+      for (size_t i = 0; i < comms_.size(); ++i) {
+        // FIXME(dzh) : PADDLE_ENFORCE return void
+        dynload::ncclCommDestroy(comms_[i]);
+      }
     }
   }
 
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
index d39cb2fcf9..22a37ff1bb 100644
--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -48,12 +48,17 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
-    AddAttr<std::vector<int>>("gpus", "gpu id lists");
-    AddAttr<int>("data_type", "output data type")
+    AddAttr<std::vector<int>>("gpus", "(vector<int>) GPU id lists");
+    AddAttr<int>("dtype",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
         .SetDefault(framework::DataType::FP32);
     AddComment(R"DOC(
-               create communicator.
-        )DOC");
+NCCLInit Operator.
+
+Create communicator.
+
+)DOC");
   }
 };
 
@@ -143,11 +148,15 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of AllReduce op");
     AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
                          "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
         .SetDefault("ncclSum");
     AddComment(R"DOC(
-            AllReduce the input tensors.
-        )DOC");
+NCCLAllReduce Operator.
+
+AllReduce the input tensors.
+
+)DOC");
   }
 };
 
@@ -161,14 +170,20 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Reduce op");
     AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
                          "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
         .SetDefault("ncclSum");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not "
-                 "set(platform::kInvalidGPUId). hashed by name.")
+                 "(int, default kInvalidGPUId) "
+                 "Root gpu of the parameter. If not, "
+                 "set(platform::kInvalidGPUId). Hashed by name.")
         .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
-            Reduce the tensors)DOC");
+NCCLReduce Operator.
+
+Reduce the tensors.
+
+)DOC");
   }
 };
 
@@ -182,12 +197,16 @@ class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Bcast");
     AddAttr<int>("root",
-                 "root gpu of the parameter. if not "
-                 "set(platform::kInvalidGPUId). hashed by name.")
+                 "(int, default kInvalidGPUId) "
+                 "Root gpu of the parameter. If not, "
+                 "set(platform::kInvalidGPUId). Hashed by name.")
         .SetDefault(platform::kInvalidGPUId);
     AddComment(R"DOC(
-            Bcast the tensors.
-        )DOC");
+NCCLBcast Operator.
+
+Bcast the tensors.
+
+)DOC");
   }
 };
 
diff --git a/paddle/operators/nccl_op.cu b/paddle/operators/nccl_op.cu.cc
similarity index 97%
rename from paddle/operators/nccl_op.cu
rename to paddle/operators/nccl_op.cu.cc
index 86dee8ee8e..4f0a2a79ed 100644
--- a/paddle/operators/nccl_op.cu
+++ b/paddle/operators/nccl_op.cu.cc
@@ -64,9 +64,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
 
     auto* comm = ctx.Input<Communicator>("Communicator");
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      ctx.device_context())
-                      .stream();
+    auto stream = ctx.cuda_device_context().stream();
 
     // device id
     int gpu_id = boost::get<platform::GPUPlace>(ctx.GetPlace()).GetDeviceId();
diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu.cc
similarity index 98%
rename from paddle/operators/nccl_op_test.cu
rename to paddle/operators/nccl_op_test.cu.cc
index e5927d56ae..bb7ae20286 100644
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -26,7 +26,6 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/var_desc.h"
-#include "paddle/operators/math/math_function.h"
 #include "paddle/operators/nccl/nccl_gpu_common.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
@@ -98,7 +97,7 @@ class NCCLTester : public ::testing::Test {
       send_tensor->mutable_data<T>(kDims, place);
 
       std::vector<T> send_vector(f::product(kDims), gpu_id);
-      send_tensor->CopyFromVector<T>(send_vector, *ctx);
+      paddle::framework::CopyFromVector<T>(send_vector, *ctx, send_tensor);
       ctx->Wait();
       VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
     }
diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc
new file mode 100644
index 0000000000..952da10434
--- /dev/null
+++ b/paddle/operators/nce_op.cc
@@ -0,0 +1,186 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/nce_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class NCEOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"));
+    PADDLE_ENFORCE(ctx->HasInput("Label"));
+    PADDLE_ENFORCE(ctx->HasInput("Weight"));
+    PADDLE_ENFORCE(ctx->HasOutput("Cost"));
+    PADDLE_ENFORCE(ctx->HasOutput("SampleLogits"));
+    PADDLE_ENFORCE(ctx->HasOutput("SampleLabels"));
+
+    auto x_dims = ctx->GetInputDim("Input");
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
+    int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0],
+                        ctx->GetInputDim("Bias")[0]);
+    }
+    auto num_neg_samples = ctx->Attrs().Get<int>("num_neg_samples");
+    auto num_total_classes = ctx->Attrs().Get<int>("num_total_classes");
+    std::vector<int> custom_neg_classes =
+        ctx->Attrs().Get<std::vector<int>>("custom_neg_classes");
+    PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]);
+    if (custom_neg_classes.size() > 0) {
+      PADDLE_ENFORCE_EQ(custom_neg_classes.size(),
+                        static_cast<size_t>(num_neg_samples));
+    }
+    // set dims of output(Out)
+    std::vector<int64_t> out_dims;
+    out_dims.push_back(x_dims[0]);
+    out_dims.push_back(1);
+    ctx->SetOutputDim("Cost", framework::make_ddim(out_dims));
+
+    // set dims of output(SampleOut)
+    std::vector<int64_t> sample_out_dims;
+    sample_out_dims.push_back(x_dims[0]);
+    sample_out_dims.push_back(num_neg_samples + num_true_classes);
+    ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims));
+    ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
+    AddInput(
+        "Label",
+        "(Tensor) A tensor of shape [batch_size, num_true_class]. "
+        "'num_true_class' is the number of target classes in each sample."
+        "The number of target classes per sample should be same. "
+        "If you have a variable number of target classes, "
+        "you can pad them out to a constant number by either repeating them"
+        " or by padding with an otherwise unused class.)");
+    AddInput("Weight",
+             "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the "
+             "total number of class.");
+    AddInput(
+        "Bias",
+        "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total "
+        "number of class. It is a dispensable input.")
+        .AsDispensable();
+    AddInput("SampleWeight",
+             "(Tensor) A tensor of shape [batch_size, 1] storing a weight for "
+             "each sample. And it is a dispensable input. The default value of "
+             "sample is 1.")
+        .AsDispensable();
+    AddOutput("Cost",
+              "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples.");
+    AddOutput("SampleLogits",
+              "An intermediate tensor of shape[batch_size, num_neg_samples + "
+              "num_pos_samples]."
+              "This tensor is output of forward kernel and used in backward "
+              "kernel to compute grads."
+              "Given X is  the dot product of input tensor and sampled labels' "
+              "weights."
+              "Then 'SampleLogits' is sigmoid(X).")
+        .AsIntermediate();
+    AddOutput("SampleLabels",
+              "An intermediate tensor of shape[batch_size, num_neg_samples + "
+              "num_pos_samples]."
+              "This tensor is output of forward kernel and used in backward "
+              "kernel to compute grads."
+              "")
+        .AsIntermediate();
+    AddAttr<int>("num_total_classes",
+                 "Total number of classes in all samples.");
+    AddAttr<int>("num_neg_samples",
+                 "The number of negative classes. The default value is 10.")
+        .SetDefault(10);
+    AddAttr<std::vector<int>>("custom_neg_classes",
+                              "This attribute only be used in unitest. Classes "
+                              "in this list wiil be used as negative classes "
+                              "for every samples. Under normal conditions, "
+                              "user should avoid setting this attribute.");
+    AddComment(R"DOC(
+Compute and return the noise-contrastive estimation training loss.
+See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+By default this operator uses a uniform distribution for sampling.
+)DOC");
+  }
+};
+
+class NCEOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"));
+    PADDLE_ENFORCE(ctx->HasInput("Weight"));
+    PADDLE_ENFORCE(ctx->HasInput("Cost"));
+    PADDLE_ENFORCE(ctx->HasInput("SampleLogits"));
+    PADDLE_ENFORCE(ctx->HasInput("SampleLabels"));
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")),
+                   "The input(Out@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("Input");
+    auto x_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+
+    auto w_dims = ctx->GetInputDim("Weight");
+    auto w_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(w_grad_name)) {
+      ctx->SetOutputDim(w_grad_name, w_dims);
+    }
+
+    auto bias_grad_name = framework::GradVarName("Bias");
+    if (ctx->HasOutput(bias_grad_name)) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
+REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
+                       ops::NCEKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(nce_grad,
+                       ops::NCEGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::NCEGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h
new file mode 100644
index 0000000000..0a8a95de5f
--- /dev/null
+++ b/paddle/operators/nce_op.h
@@ -0,0 +1,211 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <math.h>
+#include <random>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+void PrepareSamples(const framework::ExecutionContext& context) {
+  auto label = context.Input<Tensor>("Label");
+  const int64_t* label_data = label->data<int64_t>();
+  auto label_dims = label->dims();
+  int num_total_classes = context.Attr<int>("num_total_classes");
+  // for unitest
+  std::vector<int> custom_neg_classes =
+      context.Attr<std::vector<int>>("custom_neg_classes");
+  // random machine
+  std::random_device rd;
+  std::mt19937 rng(rd());
+  std::uniform_int_distribution<int> rand(0, num_total_classes - 1);
+
+  auto sample_labels = context.Output<Tensor>("SampleLabels");
+  auto sample_labels_dims = sample_labels->dims();
+  int64_t* sample_labels_data =
+      sample_labels->mutable_data<int64_t>(context.GetPlace());
+
+  int num_label = label_dims.size() == 2 ? label_dims[1] : 1;
+  int index = 0;
+  for (int64_t i = 0; i < label_dims[0]; ++i) {
+    int j = 0;
+    for (; j < num_label; ++j) {
+      sample_labels_data[index++] = label_data[i * num_label + j];
+    }
+    if (custom_neg_classes.size() > 0) {
+      for (auto label : custom_neg_classes) {
+        sample_labels_data[index++] = label;
+      }
+    } else {
+      for (; j < sample_labels_dims[1]; ++j) {
+        // TODO(wanghaoshuang): support more distribution sampling
+        sample_labels_data[index++] = rand(rng);
+      }
+    }
+  }
+}
+
+template <typename Place, typename T>
+class NCEKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PrepareSamples<Place, T>(context);
+    auto sample_labels = context.Output<Tensor>("SampleLabels");
+    const int64_t* sample_labels_data = sample_labels->data<int64_t>();
+    auto sample_out = context.Output<Tensor>("SampleLogits");
+    T* sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
+    auto label = context.Input<Tensor>("Label");
+    auto sample_weight = context.Input<Tensor>("SampleWeight");
+    const T* sample_weight_data = nullptr;
+    if (sample_weight != nullptr) {
+      sample_weight_data = sample_weight->data<T>();
+    }
+    auto out = context.Output<Tensor>("Cost");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    int num_neg_samples = context.Attr<int>("num_neg_samples");
+    int num_total_classes = context.Attr<int>("num_total_classes");
+    int64_t num_true_class = 1;
+    if (label != nullptr) {
+      num_true_class = label->dims()[1];
+    }
+    T b = 1. / num_total_classes * num_neg_samples;
+    // forward bias
+    auto bias = context.Input<Tensor>("Bias");
+    if (bias != nullptr) {
+      const T* bias_data = bias->data<T>();
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        sample_out_data[i] = bias_data[sample_labels_data[i]];
+      }
+    } else {
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        sample_out_data[i] = 0;
+      }
+    }
+    // forward mul
+    auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
+    auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
+    for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+      Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
+          (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) *
+           weight_mat.chip(sample_labels_data[i], 0))
+              .sum();
+      sample_out_data[i] += result(0);
+      sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
+    }
+    // forward cost
+    for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) {
+      int64_t j = 0;
+      out_data[i] = 0;
+      T w = sample_weight == nullptr ? 1. : sample_weight_data[i];
+      // for true classes
+      for (; j < num_true_class; ++j) {
+        T o = sample_out_data[i * sample_out->dims()[1] + j];
+        T cost = -log(o / (o + b));
+        out_data[i] += w * cost;
+      }
+      // for sampled neg classes
+      for (; j < sample_labels->dims()[1]; ++j) {
+        T o = sample_out_data[i * sample_out->dims()[1] + j];
+        T cost = -log(b / (o + b));
+        out_data[i] += w * cost;
+      }
+    }
+  }
+};
+
+template <typename Place, typename T>
+class NCEGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto d_out = context.Input<Tensor>(framework::GradVarName("Cost"));
+    const T* d_out_data = d_out->data<T>();
+    auto label = context.Input<Tensor>("Label");
+    auto sample_out = context.Input<Tensor>("SampleLogits");
+    const T* sample_out_data = sample_out->data<T>();
+    auto sample_labels = context.Input<Tensor>("SampleLabels");
+    const int64_t* sample_labels_data = sample_labels->data<int64_t>();
+    auto sample_weight = context.Input<Tensor>("SampleWeight");
+    const T* sample_weight_data = nullptr;
+    if (sample_weight != nullptr) {
+      sample_weight_data = sample_weight->data<T>();
+    }
+    int num_neg_samples = context.Attr<int>("num_neg_samples");
+    int num_total_classes = context.Attr<int>("num_total_classes");
+    int num_true_class = 1;
+    if (label != nullptr) {
+      num_true_class = label->dims()[1];
+    }
+    T b = 1. / num_total_classes * num_neg_samples;
+    Tensor sample_grad;  // tmp tensor
+    T* sample_grad_data =
+        sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace());
+    // backward cost
+    for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+      T o = sample_out_data[i];
+      T w = sample_weight == nullptr
+                ? 1
+                : sample_weight_data[i / sample_labels->dims()[1]];
+      sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class
+                                ? w * (b / (o + b)) * (o - 1)
+                                : w * (o * (1 - o) / (o + b));
+      sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]];
+    }
+    // get d_bias
+    auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
+    if (d_bias != nullptr) {
+      T* d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
+      std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
+      }
+    }
+    // get d_w
+    auto d_w = context.Output<Tensor>(framework::GradVarName("Weight"));
+    if (d_w != nullptr) {
+      auto d_w_data = d_w->mutable_data<T>(context.GetPlace());
+      std::fill(d_w_data, d_w_data + d_w->numel(), 0.0);
+      auto d_w_matrix = EigenMatrix<T>::From(*d_w);
+      auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        d_w_matrix.chip(sample_labels_data[i], 0) +=
+            x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) *
+            sample_grad_data[i];
+      }
+    }
+    // get d_x
+    auto d_x = context.Output<Tensor>(framework::GradVarName("Input"));
+    if (d_x != nullptr) {
+      d_x->mutable_data<T>(context.GetPlace());
+      auto d_x_matrix = EigenMatrix<T>::From(*d_x);
+      auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) +=
+            w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index 73a0b8baff..adb75df6ef 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -54,41 +54,44 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input of pad op. "
              "The input should be a k-D tensor(k > 0 and k < 7)");
     AddOutput("Out",
-              "The output of pad op."
+              "The output of pad op. "
               "A tensor with the same shape as X.");
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector<int>) "
+        "A list<int> to describe the padding rules for each dimension. "
+        "For 2-D image tensor, paddings=[0, 1, 2, 3] means "
+        "padding 0 row to top, 1 row to bottom, 2 columns to left "
+        "and 3 columns to right. Size of paddings should be equal to "
+        "2 * dimension size of the input tensor.");
+    AddAttr<float>("pad_value",
+                   "(float, default 0.0) "
+                   "The value to fill the padded areas.")
+        .SetDefault(0.0f);
     AddComment(R"DOC(
-Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example:
+Pad Operator.
+
+Pad input into output, as specified by paddings and pad_value. 
+The input should be a k-D tensor(k > 0 and k < 7). As an example:
 
 Given:
 
 X = [[1, 2],
-   [3, 4]]
-
-and
+     [3, 4]],
 
-paddings = [0, 1, 1, 2]
+paddings = [0, 1, 1, 2],
 
 and
 
-pad_value = 0
+pad_value = 0,
 
-then we get
+we have:
 
 Out = [[0, 1, 2, 0, 0]
        [0, 3, 4, 0, 0]
        [0, 0, 0, 0, 0]]
+
 )DOC");
-    AddAttr<std::vector<int>>(
-        "paddings",
-        "A list<int> to describes padding rules for each dimension."
-        " For 2-D image tensor, paddings=[0, 1, 2, 3] means"
-        " padding 0 row to top, 1 row to bottom, 2 columns to left"
-        " and 3 columns to right.Size of paddings should be equal to"
-        " 2 * dimension size of input tensor.");
-    AddAttr<float>("pad_value",
-                   "(float) default to 0; "
-                   "The value to fill padded areas.")
-        .SetDefault(0.0f);
   }
 };
 
diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc
index f962d9e3e6..be9fcc5661 100644
--- a/paddle/operators/pool_cudnn_op.cc
+++ b/paddle/operators/pool_cudnn_op.cc
@@ -20,6 +20,18 @@ REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad,
             ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(pool2d_cudnn,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>)
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>)
+
+REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(pool3d_cudnn,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(pool3d_cudnn_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>)
diff --git a/paddle/operators/pool_cudnn_op.cu b/paddle/operators/pool_cudnn_op.cu.cc
similarity index 84%
rename from paddle/operators/pool_cudnn_op.cu
rename to paddle/operators/pool_cudnn_op.cu.cc
index 8d0741dccc..66dd194ccd 100644
--- a/paddle/operators/pool_cudnn_op.cu
+++ b/paddle/operators/pool_cudnn_op.cu.cc
@@ -37,11 +37,11 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> {
     const T *input_data = input->data<T>();
     T *output_data = output->mutable_data<T>(ctx.GetPlace());
 
-    std::string pooling_type = ctx.Attr<std::string>("poolingType");
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    if (ctx.Attr<bool>("globalPooling")) {
+    if (ctx.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(input->dims()[i + 2]);
@@ -52,7 +52,13 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> {
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_desc;
     ScopedPoolingDescriptor pool_desc;
-    DataLayout layout = DataLayout::kNCHW;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
@@ -92,12 +98,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
         ctx.Input<Tensor>(framework::GradVarName("Out"));
     Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-    std::string pooling_type = ctx.Attr<std::string>("poolingType");
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
 
-    if (ctx.Attr<bool>("globalPooling")) {
+    if (ctx.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(input->dims()[i + 2]);
@@ -112,7 +118,13 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_desc;
     ScopedPoolingDescriptor pool_desc;
-    DataLayout layout = DataLayout::kNCHW;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()));
@@ -135,8 +147,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
 
     if (input_grad) {
       T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      math::SetConstant<paddle::platform::GPUPlace, T> set_zero;
-      set_zero(ctx.device_context(), input_grad, static_cast<T>(0));
+      // Because beta is zero, it is unnecessary to reset input_grad.
 
       PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
           handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
@@ -151,5 +162,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel<float>);
-REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel<float>,
+                       ops::PoolCudnnOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>,
+                       ops::PoolCudnnGradOpKernel<double>);
+
+REGISTER_OP_GPU_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel<float>,
+                       ops::PoolCudnnOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>,
+                       ops::PoolCudnnGradOpKernel<double>);
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
index 4d75c11bc8..e26ffd86e5 100644
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -29,7 +29,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
 
   auto in_x_dims = ctx->GetInputDim("X");
 
-  std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
+  std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
   std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
@@ -37,7 +37,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
   PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                  "Pooling intput should be 4-D or 5-D tensor.");
 
-  if (ctx->Attrs().Get<bool>("globalPooling")) {
+  if (ctx->Attrs().Get<bool>("global_pooling")) {
     ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
     for (size_t i = 0; i < ksize.size(); ++i) {
       paddings[i] = 0;
@@ -73,125 +73,139 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
   AddInput(
       "X",
       "(Tensor) The input tensor of pooling operator. "
-      "The format of input tensor is NCHW. Where N is batch size, C is the "
-      "number of channels, H and W is the height and width of feature.");
+      "The format of input tensor is NCHW, where N is batch size, C is the "
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
   AddOutput("Out",
-            "(Tensor) The output tensor of pooling operator."
-            "The format of output tensor is also NCHW."
-            "Where N is batch size, C is "
-            "the number of channels, H and W is the height and "
-            "width of feature.");
+            "(Tensor) The output tensor of pooling operator. "
+            "The format of output tensor is also NCHW, "
+            "where N is batch size, C is the number of channels, "
+            "H is the height of the feature, "
+            "and W is the width of the feature.");
 
-  AddAttr<std::string>("poolingType",
+  AddAttr<std::string>("pooling_type",
                        "(string), pooling type, can be \"max\" for max-pooling "
                        "and \"avg\" for average-pooling.")
       .InEnum({"max", "avg"});
   AddAttr<std::vector<int>>("ksize",
-                            "(vector ), the pooling window size(height, width) "
-                            "of pooling operator."
-                            "If globalPooling = true, ksize and paddings will "
+                            "(vector<int>) The pooling window "
+                            "size(height, width) of the pooling operator. "
+                            "If global_pooling = true, ksize and paddings will "
                             "be ignored.");  // TODO(Chengduo): Add checker.
                                              // (Currently,
   // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>("globalPooling",
-                "(bool default: false), whether to use the global pooling."
-                "If globalPooling = true, ksize and paddings will be ignored.")
+  AddAttr<bool>("global_pooling",
+                "(bool, default false) Whether to use the global pooling. "
+                "If global_pooling = true, ksize and paddings will be ignored.")
       .SetDefault(false);
-  AddAttr<std::vector<int>>(
-      "strides",
-      "(vector, default:{1, 1}), strides(height, width) of pooling operator.")
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int>, default {1, 1}), strides(height, "
+                            "width) of pooling operator.")
       .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
   // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector defalut:{0,0}), paddings(height, width) of pooling operator."
-      "If globalPooling = true, paddings and ksize will be ignored.")
+      "(vector<int>, default {0,0}), paddings(height, width) of pooling "
+      "operator."
+      "If global_pooling = true, paddings and ksize will be ignored.")
       .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
   // TypedAttrChecker don't support vector type.)
 
   AddComment(R"DOC(
+Pool2d Operator.
+
 The pooling2d operation calculates the output based on
-the input, poolingType and ksize, strides, paddings parameters.
-Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the
-number of channels, H and W is the height and width of feature.
+the input, pooling_type and ksize, strides, paddings parameters.
+Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
+number of channels, H is the height of the feature, and W is the width of the feature.
 Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
-Example:
+Example:   
   Input:
-       X shape: (N, C, H_in, W_in)
+       X shape: $(N, C, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, H_out, W_out)
-  where
-       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       Out shape: $(N, C, H_{out}, W_{out})$
+  Where
+       $$ 
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+       $$
+
 )DOC");
 }
 
 Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
                              framework::OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
-  AddInput(
-      "X",
-      "(Tensor) The input tensor of pooling operator. "
-      "The format of input tensor is NCDHW. Where N is batch size, C is "
-      "the number of channels, D, H and W is the depth, height and width of "
-      "feature.");
+  AddInput("X",
+           "(Tensor) The input tensor of pooling operator. "
+           "The format of input tensor is NCDHW, where N is batch size, C is "
+           "the number of channels, and D, H and W is the depth, height and "
+           "width of "
+           "the feature, respectively.");
   AddOutput("Out",
             "(Tensor) The output tensor of pooling operator."
-            "The format of output tensor is also NCDHW."
-            "Where N is batch size, C is "
-            "the number of channels, D, H and W is the depth, height and "
-            "width of feature.");
+            "The format of output tensor is also NCDHW, "
+            "where N is batch size, C is "
+            "the number of channels, and D, H and W is the depth, height and "
+            "width of the feature, respectively.");
 
-  AddAttr<std::string>("poolingType",
-                       "(string), pooling type, can be \"max\" for max-pooling "
+  AddAttr<std::string>("pooling_type",
+                       "(string) Pooling type, can be \"max\" for max-pooling "
                        "and \"avg\" for average-pooling.")
       .InEnum({"max", "avg"});
-  AddAttr<std::vector<int>>("ksize",
-                            "(vector ), the pooling window size(depth, height, "
-                            "width) of pooling "
-                            "operator."
-                            "If globalPooling = true, ksize and paddings wille "
-                            "be ignored.");  // TODO(Chengduo): Add checker.
-                                             // (Currently,
+  AddAttr<std::vector<int>>(
+      "ksize",
+      "(vector<int>) The pooling window size(depth, height, "
+      "width) of pooling operator. "
+      "If global_pooling = true, ksize and paddings will "
+      "be ignored.");  // TODO(Chengduo): Add checker.
+                       // (Currently,
   // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>("globalPooling",
-                "(bool default: false), whether to use the global pooling."
-                "If globalPooling = true, ksize and paddings wille be ignored.")
+  AddAttr<bool>(
+      "global_pooling",
+      "(bool, default false) Whether to use the global pooling. "
+      "If global_pooling = true, ksize and paddings wille be ignored.")
       .SetDefault(false);
-  AddAttr<std::vector<int>>("strides",
-                            "(vector, default:{1,1,1}), strides(depth, height, "
-                            "width) of pooling operator.")
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector<int>, default {1,1,1}) Strides(depth, height, "
+      "width) of the pooling operator.")
       .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
                                // TypedAttrChecker don't support vector type.)
   AddAttr<std::vector<int>>(
       "paddings",
-      "(vector defalut:{0,0,0}), paddings(depth, height, "
-      "width) of pooling operator."
-      "If globalPooling = true, ksize and paddings wille be ignored.")
+      "(vector<int>, default {0,0,0}), paddings(depth, height, "
+      "width) of pooling operator. "
+      "If global_pooling = true, ksize and paddings will be ignored.")
       .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
                                // TypedAttrChecker don't support vector type.)
 
   AddComment(R"DOC(
+Pool3d Operator.
+
 The pooling3d operation calculates the output based on
-the input, poolingType and ksize, strides, paddings parameters.
-Input(X) and output(Out) are in NCDHW format. Where N is batch
-size, C is the number of channels, D, H and W is the depth, height and
-width of feature. Parameters(ksize, strides, paddings) are three elements.
-These three elements represent depth, height and width, respectively.
-The input(X) size and output(Out) size may be different.
+the input, pooling_type, ksize, strides, and paddings parameters.
+Input(X) and output(Out) are in NCDHW format, where N is batch
+size, C is the number of channels, and D, H and W are the depth, height and
+width of the feature, respectively. Parameters(ksize, strides, paddings) 
+are three elements. These three elements represent depth, height and 
+width, respectively. The input(X) size and output(Out) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, D_in, H_in, W_in)
+       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, D_out, H_out, W_out)
-  where
-       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
+       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
+  $$
+
 )DOC");
 }
 }  // namespace operators
@@ -203,14 +217,18 @@ REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
             ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(pool2d,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(pool2d_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>)
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>)
 
 REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
             ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(pool3d,
-                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(pool3d_grad,
-                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>);
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/pool_op.cu b/paddle/operators/pool_op.cu.cc
similarity index 74%
rename from paddle/operators/pool_op.cu
rename to paddle/operators/pool_op.cu.cc
index 0e3b80868f..1010cb7622 100644
--- a/paddle/operators/pool_op.cu
+++ b/paddle/operators/pool_op.cu.cc
@@ -17,11 +17,15 @@ limitations under the License. */
 namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(pool2d,
-                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(pool2d_grad,
-                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, double>);
 
 REGISTER_OP_GPU_KERNEL(pool3d,
-                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::PoolKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(pool3d_grad,
-                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
index d9d445f6a6..63492a89e8 100644
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -57,11 +57,11 @@ class PoolKernel : public framework::OpKernel<T> {
     const Tensor* in_x = context.Input<Tensor>("X");
     Tensor* out = context.Output<Tensor>("Out");
 
-    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
@@ -75,16 +75,16 @@ class PoolKernel : public framework::OpKernel<T> {
               Place, paddle::operators::math::MaxPool<T>, T>
               pool2d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
 
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool2dFunctor<
               Place, paddle::operators::math::AvgPool<T>, T>
               pool2d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
         }
       } break;
       case 3: {
@@ -93,15 +93,15 @@ class PoolKernel : public framework::OpKernel<T> {
               Place, paddle::operators::math::MaxPool<T>, T>
               pool3d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool3dFunctor<
               Place, paddle::operators::math::AvgPool<T>, T>
               pool3d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
         }
       } break;
       default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
@@ -119,12 +119,12 @@ class PoolGradKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
 
-    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
 
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
@@ -142,30 +142,30 @@ class PoolGradKernel : public framework::OpKernel<T> {
           if (pooling_type == "max") {
             paddle::operators::math::MaxPool2dGradFunctor<Place, T>
                 pool2d_backward;
-            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings);
+            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, in_x_grad);
           } else if (pooling_type == "avg") {
             paddle::operators::math::Pool2dGradFunctor<
                 Place, paddle::operators::math::AvgPoolGrad<T>, T>
                 pool2d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings, pool_process);
+            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, pool_process, in_x_grad);
           }
         } break;
         case 3: {
           if (pooling_type == "max") {
             paddle::operators::math::MaxPool3dGradFunctor<Place, T>
                 pool3d_backward;
-            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings);
+            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, in_x_grad);
           } else if (pooling_type == "avg") {
             paddle::operators::math::Pool3dGradFunctor<
                 Place, paddle::operators::math::AvgPoolGrad<T>, T>
                 pool3d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings, pool_process);
+            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, pool_process, in_x_grad);
           }
         } break;
         default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
index 95e896e7cc..b9c42a6912 100644
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -29,11 +29,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "X(Input) of Pooling should not be null.");
+                   "Input(X) of Pooling should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Out(Output) of Pooling should not be null.");
+                   "Output(Out) of Pooling should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Mask"),
-                   "Mask(Output) of Pooling should not be null.");
+                   "Output(Mask) of Pooling should not be null.");
 
     auto in_x_dims = ctx->GetInputDim("X");
 
@@ -44,7 +44,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                    "Pooling intput should be 4-D or 5-D tensor.");
 
-    if (ctx->Attrs().Get<bool>("globalPooling")) {
+    if (ctx->Attrs().Get<bool>("global_pooling")) {
       ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
@@ -67,6 +67,14 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
     ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
   }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
@@ -80,6 +88,14 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
                    "Input(X@GRAD) should not be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -89,64 +105,73 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
-        "(Tensor), the input tensor of pooling operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of image.");
+        "(Tensor) The input tensor of pooling operator. "
+        "The format of input tensor is NCHW, where N is batch size, C is the "
+        "number of channels, H is the height of the image, "
+        "and W is the width of the image.");
     AddOutput("Out",
-              "(Tensor), the output tensor of pooling operator."
-              "The format of output tensor is also NCHW."
-              "Where N is batch size, C is "
-              "the number of channels, H and W is the height and "
-              "width of image.");
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCHW, "
+              "where N is batch size, C is "
+              "the number of channels, H is the height of the image "
+              "and W is the width of the image.");
     AddOutput("Mask",
-              "(Tensor), the Mask tensor of pooling operator."
-              "The format of output tensor is also NCHW."
-              "Where N is batch size, C is the number of channels, H and W "
-              "is the height and width of image."
-              "The value in it is the index in current feature map");
+              "(Tensor) The Mask tensor of pooling operator."
+              "The format of output tensor is also NCHW, "
+              "where N is batch size, C is the number of channels, "
+              "H is the height of the image, "
+              "and W is the width of the image. "
+              "It represents the index in the current feature map.");
 
     AddAttr<std::vector<int>>("ksize",
-                              "(vector ), the pooling window size(height, "
-                              "width) of pooling operator."
-                              "If globalPooling = true, ksize and paddings "
+                              "(vector<int>) The pooling window size(height, "
+                              "width) of pooling operator. "
+                              "If global_pooling = true, ksize and paddings "
                               "will be ignored.");  // TODO(Chengduo): Add
                                                     // checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
-        "globalPooling",
-        "(bool default: false), whether to use the global pooling."
-        "If globalPooling = true, ksize and paddings will be ignored.")
+        "global_pooling",
+        "(bool, default:false) Whether to use the global pooling. "
+        "If global_pooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
-    AddAttr<std::vector<int>>(
-        "strides",
-        "(vector, default:{1, 1}), strides(height, width) of pooling operator.")
+    AddAttr<std::vector<int>>("strides",
+                              "(vector<int>, default {1, 1}), strides(height, "
+                              "width) of pooling operator.")
         .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector defalut:{0, 0}), paddings(height, width) of pooling operator."
-        "If globalPooling = true, paddings and will be ignored.")
+        "(vector<int>, default:{0, 0}), paddings(height, width) of pooling "
+        "operator. "
+        "If global_pooling = true, paddings and will be ignored.")
         .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
 
     AddComment(R"DOC(
+MaxPool2d Operator.
+
 The maxPooling2d with index operation calculates the output and the mask
-based on the input and ksize, strides, paddings parameters. Input(X) and
-output(Out, Mask) are in NCHW format. Where N is batch size, C is the
-number of channels, H and W is the height and width of feature.
+based on the input, ksize, strides, and paddings parameters. Input(X) and
+output(Out, Mask) are in NCHW format, where N is batch size, C is the
+number of channels, H is the height of the feature, 
+and W is the width of the feature.
 Parameters(ksize, strides, paddings) are two elements.
 These two elements represent height and width, respectively.
 The input(X) size and output(Out, Mask) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, H_in, W_in)
+       X shape: $(N, C, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, H_out, W_out)
-       Mask shape: (N, C, H_out, W_out)
-  where
-       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       Out shape: $(N, C, H_{out}, W_{out})$
+       Mask shape: $(N, C, H_{out}, W_{out})$
+  Where
+       $$
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+       $$
+
 )DOC");
   }
 };
@@ -156,70 +181,76 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
   MaxPool3dWithIndexOpMaker(framework::OpProto *proto,
                             framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "(Tensor), the input tensor of pooling operator. "
-        "The format of input tensor is NCDHW. Where N is batch size, C is "
-        "the number of channels, D, H and W is the depth, height and width of "
-        "image.");
+    AddInput("X",
+             "(Tensor) The input tensor of pooling operator. "
+             "The format of input tensor is NCDHW, where N is batch size, C is "
+             "the number of channels, and D, H and W are the depth, height and "
+             "width of "
+             "the image, respectively");
     AddOutput("Out",
-              "(Tensor), the output tensor of pooling operator."
-              "The format of output tensor is also NCDHW."
-              "Where N is batch size, C is "
-              "the number of channels, D, H and W is the depth, height and "
-              "width of image.");
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCDHW, "
+              "where N is the batch size, C is the number of channels, "
+              "and D, H and W are the depth, height and "
+              "width of the image, respectively.");
     AddOutput("Mask",
-              "(Tensor), the Mask tensor of pooling operator."
-              "The format of output tensor is also NCDHW."
-              "Where N is batch size, C is the number of channels, D, H and W "
-              "is the depth, height and width of image."
-              "The value in it is the index in current feature map");
+              "(Tensor) The Mask tensor of pooling operator. "
+              "The format of output tensor is also NCDHW, "
+              "where N is the batch size, C is the number of channels, and "
+              "D, H and W are the depth, height and width "
+              "of the image, respectively. "
+              "It represents the index in the current feature map.");
 
     AddAttr<std::vector<int>>("ksize",
-                              "(vector), the pooling window size(depth, "
-                              "height, width) of pooling "
-                              "operator."
-                              "If globalPooling = true, ksize and paddings "
+                              "(vector<int>) The pooling window size(depth, "
+                              "height, width) of pooling operator. "
+                              "If global_pooling = true, ksize and paddings "
                               "will be ignored.");  // TODO(Chengduo): Add
                                                     // checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<bool>(
-        "globalPooling",
-        "(bool default: false), whether to use the global pooling."
-        "If globalPooling = true, ksize and paddings will be ignored.")
+        "global_pooling",
+        "(bool, default false) Whether to use the global pooling. "
+        "If global_pooling = true, ksize and paddings will be ignored.")
         .SetDefault(false);
     AddAttr<std::vector<int>>("strides",
-                              "(vector, default:{1,1,1}), strides(depth, "
+                              "(vector<int>, default {1,1,1}), strides(depth, "
                               "height, width) of pooling operator.")
         .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
     AddAttr<std::vector<int>>(
         "paddings",
-        "(vector defalut:{0,0,0}), paddings(depth, "
-        "height, width) of pooling operator."
-        "If globalPooling = true, paddings and ksize will be ignored.")
+        "(vector, default {0,0,0}), paddings(depth, "
+        "height, width) of pooling operator. "
+        "If global_pooling = true, paddings and ksize will be ignored.")
         .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
     // TypedAttrChecker don't support vector type.)
 
     AddComment(R"DOC(
+MaxPool3d Operator.
+
 The maxpooling3d with index operation calculates the output and the mask
 based on the input and ksize, strides, paddings parameters.
-Input(X) and output(Out, Mask) are in NCDHW format. Where N is batch
-size, C is the number of channels, D, H and W is the depth, height and
-width of feature. Parameters(ksize, strides, paddings) are three elements.
+Input(X) and output(Out, Mask) are in NCDHW format, where N is batch
+size, C is the number of channels, and D, H and W are the depth, height and
+width of the feature, respectively. 
+Parameters(ksize, strides, paddings) are three elements.
 These three elements represent depth, height and width, respectively.
 The input(X) size and output(Out, Mask) size may be different.
 
 Example:
   Input:
-       X shape: (N, C, D_in, H_in, W_in)
+       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
   Output:
-       Out shape: (N, C, D_out, H_out, W_out)
-       Mask shape: (N, C, D_out, H_out, W_out)
-  where
-       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
+       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
+       Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
+  Where
+       $$
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
+       $$
+
 )DOC");
   }
 };
@@ -235,10 +266,12 @@ REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
 
 REGISTER_OP_CPU_KERNEL(
     max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double, int>);
 REGISTER_OP_CPU_KERNEL(
     max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double, int>)
 
 REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
             ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
@@ -246,7 +279,9 @@ REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
 
 REGISTER_OP_CPU_KERNEL(
     max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, double, int>);
 REGISTER_OP_CPU_KERNEL(
     max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, double, int>)
diff --git a/paddle/operators/pool_with_index_op.cu b/paddle/operators/pool_with_index_op.cu.cc
similarity index 76%
rename from paddle/operators/pool_with_index_op.cu
rename to paddle/operators/pool_with_index_op.cu.cc
index 287657d4b1..335064a7ee 100644
--- a/paddle/operators/pool_with_index_op.cu
+++ b/paddle/operators/pool_with_index_op.cu.cc
@@ -18,14 +18,18 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     max_pool2d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double, int>);
 REGISTER_OP_GPU_KERNEL(
     max_pool2d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double, int>)
 
 REGISTER_OP_GPU_KERNEL(
     max_pool3d_with_index,
-    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>);
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, double, int>);
 REGISTER_OP_GPU_KERNEL(
     max_pool3d_with_index_grad,
-    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>)
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float, int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, double, int>)
diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h
index 4862774043..40766c7e82 100644
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
@@ -24,8 +24,8 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
-class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
+template <typename Place, typename T1, typename T2>
+class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* in_x = context.Input<Tensor>("X");
@@ -35,7 +35,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
@@ -44,24 +44,24 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
 
     switch (ksize.size()) {
       case 2: {
-        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
+        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T1, T2>
             pool2d_forward;
-        pool2d_forward(context.device_context(), *in_x, *out, *mask, ksize,
-                       strides, paddings);
+        pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                       paddings, out, mask);
       } break;
       case 3: {
-        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
+        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T1, T2>
             pool3d_forward;
-        pool3d_forward(context.device_context(), *in_x, *out, *mask, ksize,
-                       strides, paddings);
+        pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                       paddings, out, mask);
       } break;
       default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
     }
   }
 };
 
-template <typename Place, typename T>
-class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
+template <typename Place, typename T1, typename T2>
+class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* mask = context.Input<Tensor>("Mask");
@@ -72,7 +72,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
         ksize[i] = static_cast<int>(in_x_grad->dims()[i + 2]);
@@ -80,23 +80,22 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
     }
 
     if (in_x_grad) {
-      in_x_grad->mutable_data<T>(context.GetPlace());
-      auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
-      temp.device(context.GetEigenDevice<Place>()) =
-          temp.constant(static_cast<T>(0));
+      in_x_grad->mutable_data<T1>(context.GetPlace());
+      auto& device_ctx = context.device_context();
+      math::set_constant(device_ctx, in_x_grad, 0);
 
       switch (ksize.size()) {
         case 2: {
-          paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
+          paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T1, T2>
               pool2d_backward;
-          pool2d_backward(context.device_context(), *in_x_grad, *out_grad,
-                          *mask, ksize, strides, paddings);
+          pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
+                          paddings, in_x_grad);
         } break;
         case 3: {
-          paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
+          paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T1, T2>
               pool3d_backward;
-          pool3d_backward(context.device_context(), *in_x_grad, *out_grad,
-                          *mask, ksize, strides, paddings);
+          pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
+                          paddings, in_x_grad);
         } break;
         default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
       }
diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/operators/positive_negative_pair_op.cc
new file mode 100644
index 0000000000..4ba40a62ec
--- /dev/null
+++ b/paddle/operators/positive_negative_pair_op.cc
@@ -0,0 +1,179 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/positive_negative_pair_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PositiveNegativePairOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("Score"),
+        "Input(Score) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Label"),
+        "Input(Label) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("QueryID"),
+        "Input(QueryID) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PositivePair"),
+        "Output(PositivePair) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NegativePair"),
+        "Output(NegativePair) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NeutralPair"),
+        "Output(NeutralPair) of PositiveNegativePairOp should not be null.");
+    auto scalar_dim = framework::make_ddim({1});
+    if (ctx->HasInput("AccumulatePositivePair") ||
+        ctx->HasInput("AccumulateNegativePair") ||
+        ctx->HasInput("AccumulateNeutralPair")) {
+      PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") &&
+                         ctx->HasInput("AccumulateNegativePair") &&
+                         ctx->HasInput("AccumulateNeutralPair"),
+                     "All optional inputs(AccumulatePositivePair, "
+                     "AccumulateNegativePair, AccumulateNeutralPair) of "
+                     "PositiveNegativePairOp are required if one of them is "
+                     "specified.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulatePositivePair"), scalar_dim,
+                        "Shape of AccumulatePositivePair should be {1}.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNegativePair"), scalar_dim,
+                        "Shape of AccumulateNegativePair should be {1}.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim,
+                        "Shape of AccumulateNeutralPair should be {1}.");
+    }
+
+    auto score_dim = ctx->GetInputDim("Score");
+    auto label_dim = ctx->GetInputDim("Label");
+    auto query_dim = ctx->GetInputDim("QueryID");
+    PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        label_dim[0], score_dim[0],
+        "Tensor Score and Label should have the same height (batch size).");
+    PADDLE_ENFORCE_EQ(label_dim[1], 1,
+                      "The width of Label should be 1, i.e. each item should "
+                      "have a scalar label.");
+    PADDLE_ENFORCE(query_dim == label_dim,
+                   "QueryID should have the same shape as Label.");
+    if (ctx->HasInput("Weight")) {
+      PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
+                     "Weight should have the same shape as Label.");
+    }
+    int column = ctx->Attrs().Get<int>("column");
+    auto depth = score_dim[1];
+    PADDLE_ENFORCE(column < depth && column >= -depth,
+                   "Attribute column should be in the range of [-%l, %l)",
+                   depth, depth);
+
+    ctx->SetOutputDim("PositivePair", scalar_dim);
+    ctx->SetOutputDim("NegativePair", scalar_dim);
+    ctx->SetOutputDim("NeutralPair", scalar_dim);
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Score")->type()),
+        ctx.device_context());
+  }
+};
+
+class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PositiveNegativePairOpMaker(framework::OpProto *proto,
+                              framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Score",
+             "(Tensor, float) Model Score on an item (with "
+             "respect to QueryID). It's a 2-D tensor with shape [batch_size, "
+             "depth], where the column specified by the attribute \"column\" "
+             "is used as item score.");
+    AddInput("Label",
+             "(Tensor, float) Label of an item (with repsect to "
+             "QueryId). It's a 2-D tensor with shape [batch_size, 1].");
+    AddInput("QueryID",
+             "(Tensor, int64) Query ID that indicates the context. Its shape "
+             "should be the same as Label.");
+    AddInput(
+        "AccumulatePositivePair",
+        "(float) Optional. The accumulated number of positive pairs over a "
+        "stream of data. If provided, the output PositivePair will be "
+        "initialized with this number rather than 0. it won't be modified "
+        "in place.")
+        .AsDispensable();
+    AddInput(
+        "AccumulateNegativePair",
+        "(float) Optional. The accumulated number of negative pairs over a "
+        "stream of data. If provided, the output NegativePair will be "
+        "initialized with this number rather than 0. it won't be modified "
+        "in place.")
+        .AsDispensable();
+    AddInput("AccumulateNeutralPair",
+             "(float) Optional. The accumulated number of neutral pairs over a "
+             "stream of data. If provided, the output NeutralPair will be "
+             "initialized with this number rather than 0. it won't be modified "
+             "in place.")
+        .AsDispensable();
+    AddInput("Weight",
+             "(float) Optional. Weight of current item. If specified, its "
+             "shape should be the same as Label, and the meaning of the output "
+             "changes from numbers of pairs to the total sum of pairs' "
+             "weights. Weight of a pair of items is the average of their "
+             "weights.")
+        .AsDispensable();
+    AddOutput("PositivePair",
+              "(float) Number of positive pairs, i.e. the pairs of "
+              "items that are ranked correctly.");
+    AddOutput("NegativePair",
+              "(float) Number of negative pairs, i.e. the pairs of "
+              "items that are ranked incorrectly.");
+    AddOutput("NeutralPair",
+              "(float) Number of neutral pairs, i.e. the pairs of items "
+              "that have the same score.")
+        .AsDispensable();
+    AddAttr<int>(
+        "column",
+        "(int, default -1) The column position of Score used to rank items in "
+        "descending order. It must be in the range of [-rank(Score), "
+        "rank(Score)). "
+        "If `dim < 0`, the dim to reduce is `rank + dim`. "
+        "Noting that reducing on the first dim will make the LoD info lost.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+        PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) 
+        model performance. 
+        Within some context, e.g. the "query", a LTR model generates scores
+        for a list of items, which gives a partial order of the items.
+        PositiveNegativePairOp takes a list of reference rank order 
+        (Input("Label")) and the model generated scores (Input(Score)) as 
+        inputs and counts the pairs that ranked correctly and incorrectly.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair,
+                             ops::PositiveNegativePairOp,
+                             ops::PositiveNegativePairOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    positive_negative_pair,
+    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, float>,
+    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/positive_negative_pair_op.h b/paddle/operators/positive_negative_pair_op.h
new file mode 100644
index 0000000000..2efd3777e0
--- /dev/null
+++ b/paddle/operators/positive_negative_pair_op.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename Place, typename T>
+class PositiveNegativePairKernel : public framework::OpKernel<T> {
+ public:
+  struct PredictionResult {
+    PredictionResult(T score, T label, T weight)
+        : score(score), label(label), weight(weight) {}
+    T score;
+    T label;
+    T weight;
+  };
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto score_t = context.Input<Tensor>("Score");
+    auto label_t = context.Input<Tensor>("Label");
+    auto query_t = context.Input<Tensor>("QueryID");
+    auto acc_positive_t = context.Input<Tensor>("AccumulatePositivePair");
+    auto acc_negative_t = context.Input<Tensor>("AccumulateNegativePair");
+    auto acc_neutral_t = context.Input<Tensor>("AccumulateNeutralPair");
+    auto positive_t = context.Output<Tensor>("PositivePair");
+    auto negative_t = context.Output<Tensor>("NegativePair");
+    auto neutral_t = context.Output<Tensor>("NeutralPair");
+    auto weight_t = context.Input<Tensor>("Weight");
+
+    auto score = score_t->data<T>();
+    auto label = label_t->data<T>();
+    auto query = query_t->data<int64_t>();
+    const T* weight = nullptr;
+    if (weight_t != nullptr) {
+      weight = weight_t->data<T>();
+    }
+    T* positive = positive_t->mutable_data<T>(context.GetPlace());
+    T* negative = negative_t->mutable_data<T>(context.GetPlace());
+    T* neutral = neutral_t->mutable_data<T>(context.GetPlace());
+
+    auto score_dim = score_t->dims();
+    auto batch_size = score_dim[0];
+    auto width = score_dim[1];
+    auto column = context.Attr<int32_t>("column");
+    if (column < 0) {
+      column += width;
+    }
+
+    // construct document instances for each query: Query => List[<score#0,
+    // label#0, weight#0>, ...]
+    std::unordered_map<int64_t, std::vector<PredictionResult>> predictions;
+    for (auto i = 0; i < batch_size; ++i) {
+      if (predictions.find(query[i]) == predictions.end()) {
+        predictions.emplace(
+            std::make_pair(query[i], std::vector<PredictionResult>()));
+      }
+      predictions[query[i]].emplace_back(score[i * width + column], label[i],
+                                         weight_t != nullptr ? weight[i] : 1.0);
+    }
+
+    // for each query, accumulate pair counts
+    T pos = 0, neg = 0, neu = 0;
+    if (acc_positive_t != nullptr && acc_negative_t != nullptr &&
+        acc_neutral_t != nullptr) {
+      pos = acc_positive_t->data<T>()[0];
+      neg = acc_negative_t->data<T>()[0];
+      neu = acc_neutral_t->data<T>()[0];
+    }
+    auto evaluate_one_list = [&pos, &neg,
+                              &neu](std::vector<PredictionResult> vec) {
+      for (auto ite1 = vec.begin(); ite1 != vec.end(); ++ite1) {
+        for (auto ite2 = ite1 + 1; ite2 != vec.end(); ++ite2) {
+          if (ite1->label == ite2->label) {  // labels are equal, ignore.
+            continue;
+          }
+          T w = (ite1->weight + ite2->weight) * 0.5;
+          if (ite1->score == ite2->score) {
+            neu += w;
+          }
+          (ite1->score - ite2->score) * (ite1->label - ite2->label) > 0.0
+              ? pos += w
+              : neg += w;
+        }
+      }
+    };
+    for (auto prediction : predictions) {
+      evaluate_one_list(prediction.second);
+    }
+    *positive = pos;
+    *negative = neg;
+    *neutral = neu;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc
index 39da1e0bf8..1ace4f2a59 100644
--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
@@ -80,9 +80,11 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type()),
+        ctx.device_context());
   }
 };
 
@@ -92,76 +94,78 @@ class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
                          framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("MaxProbs",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each row contains the max probability "
              "of an instance which computed by the previous top_k (k=1) "
              "operator.");
     AddInput("Indices",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each row contains the corresponding "
              "index which computed by the previous top_k (k=1) operator.");
     AddInput("Labels",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each element is a label and the "
              "value should be in [0, class_number - 1].");
     AddInput("Weights",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x 1, "
+             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. This input is optional. If provided, "
              "weight of instance would be considered when computing metrics.")
         .AsDispensable();
     AddInput("StatesInfo",
-             "(Tensor, default Tensor<int>), a 2-D tensor with shape D x 4, "
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape D x 4, "
              "where D is the number of classes. This input is optional. If "
              "provided, current state will be accumulated to this state and "
-             "the accumulation state will be as the output state.")
+             "the accumulation state will be the output state.")
         .AsDispensable();
     AddOutput("BatchMetrics",
-              "(Tensor, default Tensor<float>), a 1-D tensor with shape {6}."
-              "This output tensor contains metrics for current batch data."
+              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
+              "This output tensor contains metrics for current batch data. "
               "The layout is [macro average precision, macro average recall, "
               "macro f1 score, micro average precision, micro average recall, "
-              "micro f1 score]");
+              "micro f1 score].");
     AddOutput("AccumMetrics",
-              "(Tensor, default Tensor<float>), a 1-D tensor with shape {6}."
-              "This output tensor contains metrics for accumulated data."
+              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
+              "This output tensor contains metrics for accumulated data. "
               "The layout is [macro average precision, macro average recall, "
               "macro f1 score, micro average precision, micro average recall, "
-              "micro f1 score]");
+              "micro f1 score].");
     AddOutput("AccumStatesInfo",
-              "(Tensor, default Tensor<float>), a 2-D tensor with shape D x 4, "
+              "(Tensor, default Tensor<float>) A 2-D tensor with shape D x 4, "
               "where D is equal to class number. This output tensor contains "
               "accumulated state variables used to compute metrics. The layout "
               "for each class is [true positives, false positives, "
               "true negatives, false negatives].");
-    AddAttr<int>("class_number", "Number of classes to be evaluated.");
+    AddAttr<int>("class_number", "(int) Number of classes to be evaluated.");
     AddComment(R"DOC(
-When given 'Input(Indices)' and 'Input(Labels)', this operator can be used
+Precision Recall Operator.
+
+When given Input(Indices) and Input(Labels), this operator can be used
 to compute various metrics including:
-  - macro average precision
-  - macro average recall
-  - macro f1 score
-  - micro average precision
-  - micro average recall
-  - micro f1 score
+1. macro average precision
+2. macro average recall
+3. macro f1 score
+4. micro average precision
+5. micro average recall
+6. micro f1 score
 
 To compute the above metrics, we need to do statistics for true positives,
-false positives and false negatives. Here count of true negatives is not
+false positives and false negatives. Here the count of true negatives is not
 necessary, but counting it may provide potential usage and the cost is
-trivial, so the operator also provides count of true negatives.
+trivial, so the operator also provides the count of true negatives.
 
 We define state as a 2-D tensor with shape [class_number, 4]. Each row of a
 state contains statistic variables for corresponding class. Layout of each row
 is: TP(true positives), FP(false positives), TN(true negatives),
-FN(false negatives). If 'Input(Weights)' provided, TP, FP, TN, FN will be
-calculated by given weight instead of instance count.
+FN(false negatives). If Input(Weights) is provided, TP, FP, TN, FN will be
+calculated by given weight instead of the instance count.
 
 This operator also supports metrics computing for cross-batch situation. To
-achieve this, 'Input(StatesInfo)' should be provided. State of current batch
-data will be accumulated to 'Input(StatesInfo)' and 'Output(AccumStatesInfo)'
+achieve this, Input(StatesInfo) should be provided. State of current batch
+data will be accumulated to Input(StatesInfo) and Output(AccumStatesInfo)
 is the accumulation state.
 
-'Output(BatchMetrics)' is metrics of current batch data while
-'Output(AccumStatesInfo)' is metrics of accumulation data.
+Output(BatchMetrics) is metrics of current batch data while
+Output(AccumStatesInfo) is metrics of accumulation data.
 
 )DOC");
   }
diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc
index eef2e34eaa..055c471b45 100644
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -41,17 +41,24 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
   PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of prelu operator.");
-    AddInput("Alpha", "The alpha weight of PRelu operator.");
-    AddOutput("Out", "The output tensor of PRelu operator.");
-    AddComment(R"DOC(PRelu operator
+    AddInput("Alpha", "The alpha weight of prelu operator.");
+    AddOutput("Out", "The output tensor of prelu operator.");
+    AddComment(R"DOC(
+PRelu Operator.
 
 The equation is:
 
-  f(x) = alpha * x , for x < 0
-  f(x) = x         , for x >= 0
+$$
+f(x) =
+\begin{cases}
+\alpha * x, \quad  \text{if} \ x < 0 \\
+x,         \qquad  \text{if} \ x >= 0
+\end{cases}
+$$
 
 The input `X` can carry the LoD (Level of Details) information,
-or not. And the output shares the LoD with input `X`.
+or not. And the output shares the LoD information with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc
index 39fbf80003..36e460103a 100644
--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
@@ -83,22 +83,26 @@ class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
                    "L1 regularization strength.")
         .SetDefault(0.0f);
     AddAttr<float>("l2",
-                   "(float, default 0.0)"
+                   "(float, default 0.0) "
                    "L2 regularization strength.")
         .SetDefault(0.0f);
     AddComment(R"DOC(
+Proximal Adagrad Optimizer.
 
-Optimizer that implements the proximal adagrad algorithm.
+Optimizer that implements the proximal adagrad algorithm:
 
-moment = moment + grad * grad
-prox_param = param - learning_rate * grad * (1 / sqrt(moment))
-param = sign(prox_param) / (1 + learning_rate * l2) *
-        max { |prox_param| - learning_rate * l1 , 0 }
+$$
+moment = moment + grad * grad \\
+prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\
+param = sign(prox\_param) / (1 + learning\_rate * l2) *
+        \max(|prox\_param| - learning\_rate * l1 , 0)
+$$
 
 The paper that proposed Proximal GD: 
 (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
 Here, we use the adagrad learning rate as specified here: 
 (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+
 )DOC");
   }
 };
diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc
index e4b014b9f5..5693d0ec9e 100644
--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
@@ -67,19 +67,23 @@ class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
                    "L1 regularization strength.")
         .SetDefault(0.0f);
     AddAttr<float>("l2",
-                   "(float, default 0.0)"
+                   "(float, default 0.0) "
                    "L2 regularization strength.")
         .SetDefault(0.0f);
     AddComment(R"DOC(
+ProximalGD Operator.
 
-Optimizer that implements the proximal gradient descent algorithm.
+Optimizer that implements the proximal gradient descent algorithm:
 
-prox_param = param - learning_rate * grad
-param = sign(prox_param) / (1 + learning_rate * l2) *
-        max { |prox_param| - learning_rate * l1 , 0 }
+$$
+prox\_param = param - learning\_rate * grad \\
+param = sign(prox\_param) / (1 + learning\_rate * l2) *
+        \max(|prox\_param| - learning\_rate * l1, 0)
+$$        
 
 The paper that proposed Proximal Gradient Descent:
 (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
+
 )DOC");
   }
 };
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc
index 17ef2b1d01..912f88f455 100644
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
@@ -26,18 +26,19 @@ class RankLossOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     // input check
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null");
-    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null");
-    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");
 
     auto label_dims = ctx->GetInputDim("Label");
     auto left_dims = ctx->GetInputDim("Left");
     auto right_dims = ctx->GetInputDim("Right");
 
     PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims),
-                   "All inputs must have the same size");
-    PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1),
-                   "All inputs must be row vector with size batch_size x 1.");
+                   "All inputs must have the same size.");
+    PADDLE_ENFORCE(
+        (label_dims.size() == 2) && (label_dims[1] == 1),
+        "All inputs must be 2-D tensors with shape [batch_size x 1].");
     ctx->SetOutputDim("Out", label_dims);
   }
 };
@@ -48,34 +49,42 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
                   framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Label",
-             "The label indicating A ranked higher than B or not, row vector.");
-    AddInput("Left", "The output of RankNet for doc A, vector.");
-    AddInput("Right", "The output of RankNet for doc B, vetor");
-    AddOutput("Out", "The output loss of RankLoss operator, vector.");
-    AddComment(R"DOC(RankLoss operator
-
-Rank loss operator for RankNet[1]. RankNet is a pairwise ranking model with
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The label indicating A ranked higher than B or not.");
+    AddInput("Left",
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The output of RankNet for doc A.");
+    AddInput("Right",
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The output of RankNet for doc B.");
+    AddOutput("Out",
+              "(2-D Tensor with shape [batch_size x 1]) "
+              "The output loss of RankLoss operator.");
+    AddComment(R"DOC(
+RankLoss Operator.
+
+RankLoss operator for RankNet
+(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). 
+RankNet is a pairwise ranking model with
 one training sample consisting of a pair of doc A and B, and the label P
 indicating that A is ranked higher than B or not:
 
 P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of
 the input pair.
 
-The RankLoss operator contains three inputs: Left (o_i), Right (o_j) and Label
-(P_{i,j}), which represent the output of RankNet for two docs and the label
-respectively, and yields the rank loss C_{i,j} by following the expression
+The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
+(P_{i,j}), which represent the output score of RankNet for the two docs and 
+the label respectively, and yields the rank loss C_{i,j} using the following 
+equation:
 
-\f[
-  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
+$$
+  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
   o_{i,j} =  o_i - o_j  \\
   \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
-\f]
+$$
 
-The operator can take inputs of one sample or in batch.
+The operator can take batch inputs with size batch_size (batch_size >= 1).
 
-[1]. Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
-     Rank using Gradient Descent.
-     http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf
 )DOC");
   }
 };
diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu
index 779588ff36..5382e3a629 100644
--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/operators/rank_loss_op.cu
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h
index f184d6efcb..703c77a0b2 100644
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 9eb2d79b4f..8b60b9c912 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -284,7 +284,8 @@ class RecurrentOp : public RecurrentBase {
             auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
             // Explicit copy output since the local RNN scope can be destroyed
             // early.
-            dst_out.CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx);
+            framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx,
+                                &dst_out);
           });
 
       scopes.Next();
@@ -365,7 +366,8 @@ class RecurrentGradOp : public RecurrentBase {
           auto *cur_grad_var = cur_scope.Var(cur_grad);
           auto cur_grad_tensor =
               cur_grad_var->GetMutable<framework::LoDTensor>();
-          cur_grad_tensor->CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx);
+          framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx,
+                              cur_grad_tensor);
         }
       }
 
@@ -387,8 +389,8 @@ class RecurrentGradOp : public RecurrentBase {
         auto &p_names = Inputs(kParameters);
         PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
 
-        for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) {
-          auto inside_grad_name = framework::GradVarName(p_names[prog_id]);
+        for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
+          auto inside_grad_name = framework::GradVarName(p_names[param_id]);
 
           // If does not compute gradient of that variable inside rnn, just
           // continue
@@ -401,32 +403,24 @@ class RecurrentGradOp : public RecurrentBase {
             auto &inside_tensor = cur_scope.FindVar(inside_grad_name)
                                       ->Get<framework::LoDTensor>();
             framework::AttributeMap attrs;
-            attrs["data_type"] = framework::ToDataType(inside_tensor.type());
+            attrs["dtype"] = framework::ToDataType(inside_tensor.type());
             attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
             attrs["value"] = 0.0f;
 
             auto zero_op = framework::OpRegistry::CreateOp(
-                "fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs);
+                "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs);
             zero_op->Run(scope, dev_ctx);
           }
 
+          auto new_inside_name = cur_scope.Rename(inside_grad_name);
           // sum gradient
-          auto *outside_var = scope.FindVar(pg_names[prog_id]);
-          PADDLE_ENFORCE(outside_var != nullptr);
-          auto &outside_tensor =
-              *outside_var->GetMutable<framework::LoDTensor>();
-
-          std::string result_var_name;
-          auto *local_result_var = cur_scope.Var(&result_var_name);
-          auto &local_result_tensor =
-              *local_result_var->GetMutable<framework::LoDTensor>();
-
-          local_result_tensor.ShareDataWith(outside_tensor);
 
           auto sum_op = framework::OpRegistry::CreateOp(
-              "sum", {{"X", {result_var_name, inside_grad_name}}},
-              {{"Out", {result_var_name}}}, {});
+              "sum", {{"X", {pg_names[param_id], new_inside_name}}},
+              {{"Out", {pg_names[param_id]}}}, {});
           sum_op->Run(cur_scope, dev_ctx);
+
+          cur_scope.Rename(new_inside_name, inside_grad_name);
         }
       }
       VLOG(5) << "Accumulate Parameter finished ";
@@ -446,7 +440,7 @@ class RecurrentGradOp : public RecurrentBase {
             }
 
             auto dst = outside->Slice(seq_offset, seq_offset + 1);
-            dst.CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx);
+            framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst);
           });
       VLOG(5) << "Link outside gradient finished ";
 
@@ -459,7 +453,7 @@ class RecurrentGradOp : public RecurrentBase {
                 framework::LoDTensor *outside) {
               outside->Resize(inside.dims());
               outside->mutable_data(dev_ctx.GetPlace(), inside.type());
-              outside->CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx);
+              framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside);
             });
         VLOG(5) << "Link initialize state gradient finished ";
       }
@@ -509,14 +503,14 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddInput(kInitialStates, "rnn initial states").AsDuplicable();
     AddInput(kParameters,
              "Parameters are used by step block as its input. However, the "
-             "inputs is not a sequence tensor. Every time step, each operator "
-             "in step block just use the parameter directly")
+             "input is not a sequence tensor. Every time step, each operator "
+             "in step block just use the parameter directly.")
         .AsDuplicable();
     AddOutput(kOutputs,
-              "The output sequence of RNN. The sequence length must be same")
+              "The output sequence of RNN. The sequence length must be same.")
         .AsDuplicable();
     AddOutput(kStepScopes,
-              "StepScopes contains all local variables in each time step.");
+              "StepScopes contain all local variables in each time step.");
     AddAttr<std::vector<std::string>>(kExStates,
                                       string::Sprintf(
                                           R"DOC(The ex-state variable names.
@@ -556,10 +550,12 @@ if reverse is True
       o          o          o         o
 )DOC").SetDefault(false);
     AddAttr<bool>(kIsTrain, "").SetDefault(true);
-    AddComment(R"DOC(Static Length Recurrent Operator
+    AddComment(R"DOC(
+Static Length Recurrent Operator.
+
+The static length recurrent operator can only operate on fixed size sequence
+data, i.e. in each mini-batch, the sequence length of all inputs are the same.
 
-The static length recurrent operator can only operate on fix sized sequence
-data, i.e. in each mini-batch, the sequence length of all inputs are same.
 )DOC");
   }
 };
@@ -603,7 +599,9 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
     std::vector<std::string> output{kOutputs};
     for (auto &s : input) {
       PADDLE_ENFORCE(ctx->HasInputs(s));
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)));
+      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
+                     "Cannot find the gradient variable %s",
+                     framework::GradVarName(s));
     }
     for (auto &s : output) {
       PADDLE_ENFORCE(ctx->HasInputs(s));
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
new file mode 100644
index 0000000000..c69e416e10
--- /dev/null
+++ b/paddle/operators/recv_op.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <ostream>
+#include <thread>
+
+#include <unistd.h>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/executor.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/send_recv_impl.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+
+void RunServer(Server **rpc_server,
+               std::shared_ptr<detail::SendRecvServerImpl> service,
+               const std::string &server_address) {
+  ServerBuilder builder;
+  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
+  builder.RegisterService(service.get());
+  std::unique_ptr<Server> server(builder.BuildAndStart());
+  *rpc_server = server.get();
+  LOG(INFO) << "Server listening on " << server_address << std::endl;
+  server->Wait();
+}
+
+class RecvOp : public framework::OperatorBase {
+ public:
+  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    if (!rpc_service_) {
+      rpc_service_.reset(new detail::SendRecvServerImpl());
+      std::string endpoint = Attr<std::string>("endpoint");
+      server_thread_.reset(
+          new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint));
+    }
+  }
+
+  virtual ~RecvOp() {
+    rpc_server_->Shutdown();
+    server_thread_->join();
+  }
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    // blocking get one var from client.
+    const framework::LoDTensor &t = rpc_service_->Get();
+    framework::Scope &recv_scope = scope.NewScope();
+    // set graph input var
+    auto *var = recv_scope.Var(Input("RX"));
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    // FIXME(typhoonzero): do not copy
+    framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
+
+    auto *block = Attr<framework::BlockDescBind *>("OptimizeBlock");
+    auto *program = block->Program();
+    framework::Executor executor(dev_ctx);
+    // Run sub graph to get optimized tensor
+    executor.Run(*program, &recv_scope, block->ID(),
+                 false /*create_local_scope*/);
+
+    auto *out_var = recv_scope.FindVar("Out");
+    // push back
+    rpc_service_->Push(out_var->Get<framework::LoDTensor>());
+  }
+
+ protected:
+  // grpc server instance to track status and gracefully shutdown.
+  // borrow an pointer from server thread.
+  Server *rpc_server_{nullptr};
+  // grpc send/recv service implement to register.
+  std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
+  std::shared_ptr<std::thread> server_thread_;
+};
+
+class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("RX", "(Tensor) Input tensor to be saved");
+    AddComment(R"DOC(
+Recv operator
+
+This operator will recv tensor from send_op
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164)"
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    AddAttr<framework::BlockDescBind *>("OptimizeBlock", "type BlockDescBind*",
+                                        "optimize network run in server");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index 0599daa768..2589a54cfc 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -80,24 +80,27 @@ class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor. Tensors with rank at most 6 are supported");
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 6 are "
+             "supported.");
     AddOutput("Out", "(Tensor) The result tensor.");
     AddAttr<int>(
         "dim",
-        "(int, default 1) The dimension to reduce. "
+        "(int, default 0) The dimension to reduce. "
         "Must be in the range [-rank(input), rank(input)). "
         "If `dim < 0`, the dim to reduce is `rank + dim`. "
-        "Noting that reducing on the first dim will make the LoD info lost.")
+        "Note that reducing on the first dim will make the LoD info lost.")
         .SetDefault(0);
     AddAttr<bool>("keep_dim",
                   "(bool, default false) "
                   "If true, retain the reduced dimension with length 1.")
         .SetDefault(false);
     comment_ = R"DOC(
-{ReduceOP} operator computes the {reduce} of input tensor along the given dimension. 
-The result tensor has 1 fewer dimension than the input unless `keep_dim` is true.
+{ReduceOp} Operator.
+
+This operator computes the {reduce} of input tensor along the given dimension. 
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+
 )DOC";
     AddComment(comment_);
   }
diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h
index 45043c440b..dd6547542d 100644
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "glog/logging.h"
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 
@@ -26,6 +27,10 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+
 struct SumFunctor {
   template <typename Place, typename X, typename Y, typename Dim>
   void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
@@ -133,10 +138,17 @@ class ReduceKernel : public framework::OpKernel<T> {
       dims_vector.erase(dims_vector.begin() + dim);
       dims = framework::make_ddim(dims_vector);
     }
-    auto out = EigenTensor < T, D == 1 ? 1 : (D - 1) > ::From(*output, dims);
+
     auto& place = context.GetEigenDevice<Place>();
     Functor functor;
-    functor(place, x, out, reduce_dim);
+
+    if (D == 1) {
+      auto out = EigenScalar<T>::From(*output);
+      functor(place, x, out, reduce_dim);
+    } else {
+      auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
+      functor(place, x, out, reduce_dim);
+    }
   }
 };
 
@@ -186,13 +198,13 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
     auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);
 
-    Eigen::array<int, D> braodcast_dim;
-    for (size_t i = 0; i < D; ++i) braodcast_dim[i] = 1;
-    braodcast_dim[dim] = input0->dims()[dim];
+    Eigen::array<int, D> broadcast_dim;
+    for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
+    broadcast_dim[dim] = input0->dims()[dim];
     auto& place = context.GetEigenDevice<Place>();
     Functor functor;
-    functor(place, x, x_reduce, x_grad, x_reduce_grad, braodcast_dim,
-            braodcast_dim[dim]);
+    functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+            broadcast_dim[dim]);
   }
 };
 
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index 9213cc7a85..39bf2118d6 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -1,11 +1,10 @@
-
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
@@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel {
     // TODO(qiao) change batch_size
     for (size_t i = 1; i < shape.size(); ++i) {
       PADDLE_ENFORCE(shape[i] > 0,
-                     "Each dimension of shape "
-                     "must be positiv except the first.");
+                     "Each dimension of Attr(shape) "
+                     "must be positive except the first one.");
     }
     if (shape[0] < 0) {
       shape[0] = x_dims[0];
@@ -71,8 +70,11 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor of reshape operator.");
     AddOutput("Out", "The output tensor of reshape operator.");
-    AddAttr<std::vector<int>>("shape", "Target shape of reshape operator.");
-    AddComment(R"DOC(Reshape operator
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) "
+                              "Target shape of reshape operator.");
+    AddComment(R"DOC(
+Reshape Operator.
 
 Reshape Input(X) into the shape specified by Attr(shape).
 
@@ -81,7 +83,7 @@ Given a 2-D tensor X with 2 rows and 2 columns
 
     [[1, 2], [3, 4]]
 
-with target shape = [1, 4], the reshape operator will transform
+and target shape = [1, 4], the reshape operator will transform
 the tensor X into a 1-D tensor:
 
     [1, 2, 3, 4]
diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu
index 23dbe089d3..dca6c15007 100644
--- a/paddle/operators/reshape_op.cu
+++ b/paddle/operators/reshape_op.cu
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h
index beb951713a..73fd1da642 100644
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
@@ -4,7 +4,7 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+       http://www.apache.org/licenses/LICENSE-2.0
 
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
@@ -28,7 +28,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto out_dims = out->dims();
     out->mutable_data<T>(ctx.GetPlace());
-    out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context());
+    framework::CopyFrom(*in, ctx.GetPlace(), ctx.device_context(), out);
     out->Resize(out_dims);
   }
 };
@@ -42,7 +42,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
     d_x->mutable_data<T>(ctx.GetPlace());
 
     auto in_dims = d_x->dims();
-    d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context());
+    framework::CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
     d_x->Resize(in_dims);
   }
 };
diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc
index fd5567a365..a9c45f639c 100644
--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
@@ -68,22 +68,22 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
-             "Input parameter value that has to be updated");
+             "Input parameter value that has to be updated.");
     AddInput("MeanSquare",
              "(Tensor, default Tensor<float>)"
-             " The mean square value that gets updated");
+             " The mean square value that gets updated.");
     AddInput("LearningRate",
              "(Tensor, default Tensor<float>) "
-             "The learning rate should be a tensor of size 1");
+             "The learning rate should be a tensor of size 1.");
     AddInput("Grad",
              "(Tensor, default Tensor<float>) "
-             "Input gradient of the parameter");
+             "Input gradient of the parameter.");
     AddInput("Moment",
-             "(Tensor, default Tensor<float>) The moment that gets updated");
+             "(Tensor, default Tensor<float>) The moment that gets updated.");
 
-    AddOutput("ParamOut", "(Tensor) Output updated parameter value");
-    AddOutput("MomentOut", "(Tensor) Output updated moment");
-    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value");
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
+    AddOutput("MomentOut", "(Tensor) Output updated moment.");
+    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
 
     AddAttr<float>("epsilon",
                    "(float, default 1e-10) Constant "
@@ -93,18 +93,19 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
                    "(float, default 0.9) "
                    "Discounting factor for coming gradient.")
         .SetDefault(0.9f);
-    AddAttr<float>("momentum", "(float, default 0.0) Constant value")
+    AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
         .SetDefault(0.0f);
     AddComment(R"DOC(
+Rmsprop Optimizer. 
 
-RMSprop
-
-MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad
+$$
+MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\
 MomentOut = momentum * Moment +
-            LearningRate * Grad / sqrt(MeanSquareOut + epsilon)
+            \frac{LearningRate * Grad}{\sqrt{MeanSquareOut + epsilon}} \\
 ParamOut = Param -  MomentOut
+$$
 
-The original slides that proposed RMSprop: Slide 29 of
+The original slides that proposed Rmsprop: Slide 29 of
 http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
 
 )DOC");
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
deleted file mode 100644
index ee61ea300c..0000000000
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/rnn/recurrent_op_utils.h"
-
-namespace paddle {
-namespace operators {
-namespace rnn {
-
-namespace f = paddle::framework;
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-void SegmentInputs(const std::vector<Scope*>& step_scopes,
-                   const std::vector<std::string>& inlinks,
-                   const size_t seq_len) {
-  PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
-  for (size_t i = 0; i < inlinks.size(); ++i) {
-    // global inputs
-    auto input_var = step_scopes[0]->parent().FindVar(inlinks[i]);
-    PADDLE_ENFORCE_NOT_NULL(input_var, "input link [%s] is not in scope.",
-                            inlinks[i]);
-
-    LoDTensor* input = input_var->GetMutable<LoDTensor>();
-    f::DDim dims = input->dims();
-    PADDLE_ENFORCE_EQ(static_cast<size_t>(dims[0]), seq_len,
-                      "all the inputs be the same length");
-    f::DDim step_dims = slice_ddim(dims, 1, dims.size());
-    for (size_t j = 0; j < seq_len; j++) {
-      Tensor* step_input =
-          step_scopes[j]->Var(inlinks[i])->GetMutable<Tensor>();
-      // The input of operators of each step is Tensor here.
-      // Maybe need to modify Slice function.
-      *step_input = input->Slice(j, j + 1);
-      step_input->Resize(step_dims);
-    }
-  }
-}
-
-void ConcatOutputs(const std::vector<Scope*>& step_scopes,
-                   const std::vector<std::string>& outlinks,
-                   const size_t seq_len, const platform::DeviceContext& ctx) {
-  for (size_t i = 0; i < outlinks.size(); i++) {
-    auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
-    PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.",
-                            outlinks[i]);
-    LoDTensor* output = output_var->GetMutable<LoDTensor>();
-
-    auto* step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
-    PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
-    f::DDim step_dims =
-        step_scope_var->template GetMutable<LoDTensor>()->dims();
-    std::vector<int64_t> dims_vec = vectorize(step_dims);
-    dims_vec.insert(dims_vec.begin(), seq_len);
-    output->Resize(f::make_ddim(dims_vec));
-    output->mutable_data<float>(platform::CPUPlace());
-    for (size_t j = 0; j < seq_len; j++) {
-      LoDTensor* step_output =
-          step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
-      // TODO(luotao02) data type and platform::DeviceContext() should set
-      // correctly
-      (output->Slice(j, j + 1))
-          .CopyFrom(*step_output, platform::CPUPlace(), ctx);
-    }
-  }
-}
-
-void LinkMemories(const std::vector<Scope*>& scopes,
-                  const std::vector<rnn::StateAttr>& memories,
-                  const size_t step_id, const int offset) {
-  PADDLE_ENFORCE_LT(step_id, scopes.size(),
-                    "step [%d] is out of range of step scopes' size [%d]",
-                    step_id, scopes.size());
-  PADDLE_ENFORCE_GE(static_cast<int>(step_id) + offset, 0,
-                    "offset [%d] must be large than -[%d]", offset, step_id);
-  PADDLE_ENFORCE_LT(
-      step_id + offset, scopes.size(),
-      "offset [%d] is out of range, it must be less than (%d - %d)", offset,
-      scopes.size(), step_id);
-  auto* scope = scopes[step_id];
-  auto* linked_scope = scopes[step_id + offset];
-  for (auto& attr : memories) {
-    auto* mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
-    auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
-    mem->Resize(linked_mem->dims());
-    mem->ShareDataWith(*linked_mem);
-  }
-}
-
-void InitArgument(const ArgumentName& name, Argument* arg,
-                  const framework::OperatorBase& op, bool is_grad) {
-  arg->step_scopes =
-      is_grad ? op.Input(name.step_scopes) : op.Output(name.step_scopes);
-  arg->inlinks = op.Inputs(name.inlinks);
-  arg->outlinks = op.Outputs(name.outlinks);
-
-  auto& boot_memories = is_grad ? op.Outputs(name.initial_states)
-                                : op.Inputs(name.initial_states);
-  // attributes
-  auto& memories = op.Attr<std::vector<std::string>>(name.states);
-  auto& pre_memories = op.Attr<std::vector<std::string>>(name.ex_states);
-
-  PADDLE_ENFORCE(memories.size() == boot_memories.size(),
-                 "the size of states, initial_states don't match:%d,%d",
-                 memories.size(), boot_memories.size());
-  PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
-                 "the size of ex_states, initial_states don't match:%d,%d",
-                 pre_memories.size(), boot_memories.size());
-  PADDLE_ENFORCE(memories.size() > 0, "more than 1 states should be set");
-
-  for (size_t i = 0; i < memories.size(); ++i) {
-    rnn::StateAttr mem_attr;
-    mem_attr.var = memories[i];
-    mem_attr.pre_var = pre_memories[i];
-    mem_attr.boot_var = boot_memories[i];
-    (arg->states).push_back(mem_attr);
-  }
-}
-
-}  // namespace rnn
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
deleted file mode 100644
index fb0e158e07..0000000000
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include <string>
-
-#include "paddle/framework/operator.h"
-
-namespace paddle {
-namespace operators {
-namespace rnn {
-
-using Scope = framework::Scope;
-
-/**
- * Memory of a RNN (same as the role of `Momory` in PaddlePaddle).
- *
- * Memory attributes cached by this op, dims will be infered from
- * boot memories in father scope. Other attributes are copied from Op's proto
- * attributes.
- */
-struct StateAttr {
-  // name of current state variable
-  std::string var;
-  // name of previous step's state variable
-  std::string pre_var;
-  // name of the variables to init this memory (same role of `boot_layer` in
-  // PaddlePaddle), which is store in father's scope.
-  std::string boot_var;
-};
-
-struct Argument {
-  std::string step_net;
-  std::string step_scopes;
-  std::vector<std::string> inlinks;
-  std::vector<std::string> outlinks;
-  std::vector<rnn::StateAttr> states;
-};
-
-struct ArgumentName {
-  std::string step_net;
-  std::string step_scopes;
-  std::string inlinks;
-  std::string outlinks;
-  std::string states;          // the memory name
-  std::string ex_states;       // the previous memory name
-  std::string initial_states;  // the boot memory name
-};
-
-/**
- * Prepare inputs for each step net.
- */
-void SegmentInputs(const std::vector<Scope*>& step_scopes,
-                   const std::vector<std::string>& inlinks,
-                   const size_t seq_len);
-
-/**
- * Process outputs of step nets and merge to variables.
- */
-void ConcatOutputs(const std::vector<Scope*>& step_scopes,
-                   const std::vector<std::string>& outlinks,
-                   const size_t seq_len, const platform::DeviceContext& ctx);
-
-void LinkMemories(const std::vector<Scope*>& step_scopes,
-                  const std::vector<StateAttr>& memories, const size_t step_id,
-                  const int offset);
-
-void InitArgument(const ArgumentName& name, Argument* arg,
-                  const framework::OperatorBase& op, bool is_grad = false);
-
-}  // namespace rnn
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/operators/rnn_memory_helper_op.cc
index b621c7f1ba..3a035f0b9a 100644
--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
@@ -62,7 +62,7 @@ class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "");
     AddOutput("Out", "");
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::DataType::FP32);
@@ -95,7 +95,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
       auto &in_var_tensor = in_var->Get<framework::LoDTensor>();
 
       framework::AttributeMap attrs;
-      attrs["data_type"] = framework::ToDataType(in_var_tensor.type());
+      attrs["dtype"] = framework::ToDataType(in_var_tensor.type());
       attrs["shape"] = framework::vectorize2int(in_var_tensor.dims());
       attrs["value"] = 0.0f;
 
@@ -121,7 +121,7 @@ class RNNMemoryHelperGradOpInfoMaker
     AddInput("X", "");
     AddInput("Out", "");
     AddOutput(framework::GradVarName("X"), "");
-    AddAttr<int>("data_type",
+    AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::DataType::FP32);
diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc
new file mode 100644
index 0000000000..2b5e66c96b
--- /dev/null
+++ b/paddle/operators/roi_pool_op.cc
@@ -0,0 +1,165 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/roi_pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static constexpr int kROISize = 5;
+
+class ROIPoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
+                   "Input(ROIs) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Argmax"),
+                   "Output(Argmax) of ROIPoolOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+    auto rois_dims = ctx->GetInputDim("ROIs");
+
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of input tensor is NCHW.");
+    PADDLE_ENFORCE(rois_dims.size() == 2,
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
+                   "given as [[batch_id, x1, y1, x2, y2], …].");
+    PADDLE_ENFORCE(rois_dims[1] == kROISize,
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
+                   "given as [[batch_id, x1, y1, x2, y2], …].");
+
+    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
+    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
+    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+
+    PADDLE_ENFORCE_GT(pooled_height, 0,
+                      "The pooled output height must greater than 0");
+    PADDLE_ENFORCE_GT(pooled_width, 0,
+                      "The pooled output width must greater than 0");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must greater than 0");
+
+    auto out_dims = input_dims;
+    out_dims[0] = rois_dims[0];
+    out_dims[1] = input_dims[1];
+    out_dims[2] = pooled_height;
+    out_dims[3] = pooled_width;
+
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->SetOutputDim("Argmax", out_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIPoolGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ROIPoolOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor), "
+             "the input of ROIPoolOp. "
+             "The format of input tensor is NCHW. Where N is batch size, "
+             "C is the number of input channels, "
+             "H is the height of the feature, and "
+             "W is the width of the feature.");
+    AddInput("ROIs",
+             "(Tensor), "
+             "ROIs (Regions of Interest) to pool over. "
+             "should be a 2-D tensor of shape (num_rois, 5)"
+             "given as [[batch_id, x1, y1, x2, y2], …]. "
+             "Where batch_id is the id of the data, "
+             "(x1, y1) is the top left coordinates, and "
+             "(x2, y2) is the bottom right coordinates.");
+    AddOutput("Out",
+              "(Tensor), "
+              "The output of ROIPoolOp is a 4-D tensor with shape "
+              "(num_rois, channels, pooled_h, pooled_w).");
+    AddOutput("Argmax",
+              "(Tensor), "
+              "Argmaxes corresponding to indices in X used "
+              "for gradient computation. Only output "
+              "if arg “is_test” is false.")
+        .AsIntermediate();
+    AddAttr<float>("spatial_scale",
+                   "(float, default 1.0), "
+                   "Multiplicative spatial scale factor "
+                   "to translate ROI coords from their input scale "
+                   "to the scale used when pooling.")
+        .SetDefault(1.0);
+    AddAttr<int>("pooled_height",
+                 "(int, default 1), "
+                 "The pooled output height.")
+        .SetDefault(1);
+    AddAttr<int>("pooled_width",
+                 "(int, default 1), "
+                 "The pooled output width.")
+        .SetDefault(1);
+    AddComment(R"DOC(
+ROIPool operator
+
+ROI Pooling for Faster-RCNN. The link below is a further introduction: 
+https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
+            ops::ROIPoolGradOp);
+REGISTER_OP_CPU_KERNEL(
+    roi_pool, ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    roi_pool_grad,
+    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUPlace, float>,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu
new file mode 100644
index 0000000000..9a4c8ca752
--- /dev/null
+++ b/paddle/operators/roi_pool_op.cu
@@ -0,0 +1,208 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/roi_pool_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+static constexpr int kROISize = 5;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+template <typename T>
+__global__ void GPUROIPoolForward(const int nthreads, const T* input_data,
+                                  const int64_t* input_rois,
+                                  const float spatial_scale, const int channels,
+                                  const int height, const int width,
+                                  const int pooled_height,
+                                  const int pooled_width, T* output_data,
+                                  int64_t* argmax_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (size_t i = index; i < nthreads; i += offset) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];
+    int roi_start_w = round(offset_input_rois[1] * spatial_scale);
+    int roi_start_h = round(offset_input_rois[2] * spatial_scale);
+    int roi_end_w = round(offset_input_rois[3] * spatial_scale);
+    int roi_end_h = round(offset_input_rois[4] * spatial_scale);
+
+    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
+    int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
+    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
+    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
+
+    hstart = min(max(hstart + roi_start_h, 0), height);
+    hend = min(max(hend + roi_start_h, 0), height);
+    wstart = min(max(wstart + roi_start_w, 0), width);
+    wend = min(max(wend + roi_start_w, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
+    int maxidx = -1;
+    const T* offset_input_data =
+        input_data + (roi_batch_ind * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int input_data_index = h * width + w;
+        if (offset_input_data[input_data_index] > maxval) {
+          maxval = offset_input_data[input_data_index];
+          maxidx = input_data_index;
+        }
+      }
+    }
+    output_data[index] = maxval;
+    if (argmax_data) {
+      argmax_data[index] = maxidx;
+    }
+  }
+}
+
+template <typename T>
+__global__ void GPUROIPoolBackward(
+    const int nthreads, const int64_t* input_rois, const T* output_grad,
+    const int64_t* argmax_data, const int num_rois, const float spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, T* input_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];
+    int input_offset = (roi_batch_ind * channels + c) * height * width;
+    int output_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_output_grad = output_grad + output_offset;
+    T* offset_input_grad = input_grad + input_offset;
+    const int64_t* offset_argmax_data = argmax_data + output_offset;
+
+    int argmax = offset_argmax_data[ph * pooled_width + pw];
+    if (argmax != -1) {
+      platform::CudaAtomicAdd(
+          offset_input_grad + argmax,
+          static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
+    }
+  }
+}
+
+template <typename Place, typename T>
+class GPUROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* out = ctx.Output<Tensor>("Out");
+    auto* argmax = ctx.Output<Tensor>("Argmax");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    auto in_stride = framework::stride(in_dims);
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+
+    size_t rois_num = rois->dims()[0];
+    if (rois_num == 0) return;
+
+    int output_size = out->numel();
+    int blocks = NumBlocks(output_size);
+    int threads = kNumCUDAThreads;
+
+    GPUROIPoolForward<
+        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+        output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
+        channels, height, width, pooled_height, pooled_width,
+        out->mutable_data<T>(ctx.GetPlace()),
+        argmax->mutable_data<int64_t>(ctx.GetPlace()));
+  }
+};
+
+template <typename Place, typename T>
+class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* argmax = ctx.Input<Tensor>("Argmax");
+
+    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    size_t rois_num = rois->dims()[0];
+    int channels = in->dims()[1];
+    int height = in->dims()[2];
+    int width = in->dims()[3];
+
+    if (x_grad) {
+      x_grad->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+
+      int output_grad_size = out_grad->numel();
+      int blocks = NumBlocks(output_grad_size);
+      int threads = kNumCUDAThreads;
+
+      if (output_grad_size > 0) {
+        GPUROIPoolBackward<
+            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+            output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
+            argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
+            width, pooled_height, pooled_width,
+            x_grad->mutable_data<T>(ctx.GetPlace()));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    roi_pool, ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
+    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    roi_pool_grad,
+    ops::GPUROIPoolGradOpKernel<paddle::platform::GPUPlace, float>,
+    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h
new file mode 100644
index 0000000000..3812c66c65
--- /dev/null
+++ b/paddle/operators/roi_pool_op.h
@@ -0,0 +1,183 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class CPUROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* argmax = ctx.Output<framework::Tensor>("Argmax");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int rois_num = rois->dims()[0];
+
+    auto in_stride = framework::stride(in_dims);
+    auto argmax_stride = framework::stride(argmax->dims());
+    auto roi_stride = framework::stride(rois->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    const T* input_data = in->data<T>();
+    const int64_t* rois_data = rois->data<int64_t>();
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
+
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_id = rois_data[0];
+      PADDLE_ENFORCE_GE(roi_batch_id, 0);
+      PADDLE_ENFORCE_LT(roi_batch_id, batch_size);
+      rois_data += roi_stride[0];
+    }
+
+    rois_data = rois->data<int64_t>();
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_id = rois_data[0];
+      int roi_start_w = round(rois_data[1] * spatial_scale);
+      int roi_start_h = round(rois_data[2] * spatial_scale);
+      int roi_end_w = round(rois_data[3] * spatial_scale);
+      int roi_end_h = round(rois_data[4] * spatial_scale);
+
+      // Force malformed ROIs to be 1x1
+      int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
+      int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
+
+      const float bin_size_h =
+          static_cast<float>(roi_height) / static_cast<float>(pooled_height);
+      const float bin_size_w =
+          static_cast<float>(roi_width) / static_cast<float>(pooled_width);
+
+      const T* batch_data = input_data + roi_batch_id * in_stride[0];
+
+      for (int c = 0; c < channels; ++c) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            //  Compute pooling region for this output unit:
+            //  start (included) = floor(ph * roi_height / pooled_height_)
+            //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
+            int hstart =
+                static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
+            int wstart =
+                static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
+            int hend =
+                static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
+            int wend =
+                static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
+
+            hstart = std::min(std::max(hstart + roi_start_h, 0), height);
+            hend = std::min(std::max(hend + roi_start_h, 0), height);
+            wstart = std::min(std::max(wstart + roi_start_w, 0), width);
+            wend = std::min(std::max(wend + roi_start_w, 0), width);
+
+            const int pool_index = ph * pooled_width + pw;
+
+            // Define an empty pooling region to be zero
+            bool is_empty = (hend <= hstart) || (wend <= wstart);
+            output_data[pool_index] =
+                is_empty ? 0 : -std::numeric_limits<T>::max();
+            argmax_data[pool_index] = -1;
+
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                const int index = h * width + w;
+                if (batch_data[index] > output_data[pool_index]) {
+                  output_data[pool_index] = batch_data[index];
+                  argmax_data[pool_index] = index;
+                }
+              }
+            }
+          }
+        }
+
+        batch_data += in_stride[1];
+        output_data += out_stride[1];
+        argmax_data += argmax_stride[1];
+      }
+      // Increment ROI data pointer
+      rois_data += roi_stride[0];
+    }
+    return;
+  }
+};
+
+template <typename Place, typename T>
+class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* argmax = ctx.Input<framework::Tensor>("Argmax");
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+
+    if (in_grad) {
+      const int64_t* rois_data = rois->data<int64_t>();
+      const T* out_grad_data = out_grad->data<T>();
+      const int64_t* argmax_data = argmax->data<int64_t>();
+      T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.device_context(), in_grad, static_cast<T>(0));
+
+      auto in_stride = framework::stride(in->dims());
+      auto argmax_stride = framework::stride(argmax->dims());
+      auto roi_stride = framework::stride(rois->dims());
+      auto out_stride = framework::stride(out_grad->dims());
+
+      int rois_num = rois->dims()[0];
+      int channels = in->dims()[1];
+
+      for (int n = 0; n < rois_num; ++n) {
+        int roi_batch_idx = rois_data[0];
+        T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
+        for (int c = 0; c < channels; ++c) {
+          for (int ph = 0; ph < pooled_height; ++ph) {
+            for (int pw = 0; pw < pooled_width; ++pw) {
+              int pool_index = ph * pooled_width + pw;
+              if (argmax_data[pool_index] >= 0) {
+                auto index = argmax_data[pool_index];
+                batch_grad_data[index] += out_grad_data[pool_index];
+              }
+            }
+          }
+          batch_grad_data += in_stride[1];
+          out_grad_data += out_stride[1];
+          argmax_data += argmax_stride[1];
+        }
+        rois_data += roi_stride[0];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
index 56909fb65f..d4921cb80c 100644
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -88,73 +88,7 @@ class SaveOp : public framework::OperatorBase {
                    "SaveOp only support LoDTensor, %s has wrong type", iname);
 
     auto &tensor = var->Get<framework::LoDTensor>();
-
-    {  // the 1st field, uint32_t version
-      constexpr uint32_t version = 0;
-      fout.write(reinterpret_cast<const char *>(&version), sizeof(version));
-    }
-    {  // the 2nd field, tensor description
-       // int32_t  size
-       // void*    protobuf message
-      framework::TensorDesc desc;
-      desc.set_data_type(framework::ToDataType(tensor.type()));
-      auto dims = framework::vectorize(tensor.dims());
-      auto *pb_dims = desc.mutable_dims();
-      pb_dims->Resize(static_cast<int>(dims.size()), 0);
-      std::copy(dims.begin(), dims.end(), pb_dims->begin());
-      int32_t size = desc.ByteSize();
-      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-      auto out = desc.SerializeAsString();
-      fout.write(out.data(), size);
-    }
-    {  // the 3rd field, tensor data
-      uint64_t size = tensor.memory_size();
-      auto *data_ptr = tensor.data<void>();
-      PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                     "Index overflow when writing tensor");
-      if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-        constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-        std::unique_ptr<char[]> buf(new char[kBufSize]);
-        auto &gpu_dev_ctx =
-            static_cast<const platform::CUDADeviceContext &>(dev_ctx);
-        platform::CPUPlace cpu;
-        uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-        while (size != 0) {
-          size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-          memory::Copy(cpu, buf.get(),
-                       boost::get<platform::GPUPlace>(tensor.place()),
-                       reinterpret_cast<const void *>(data), size_to_write,
-                       gpu_dev_ctx.stream());
-          gpu_dev_ctx.Wait();
-          fout.write(buf.get(), size_to_write);
-          data += size_to_write;
-          size -= size_to_write;
-        }
-#else
-        PADDLE_THROW("Unexpected branch");
-#endif
-      } else {
-        fout.write(static_cast<const char *>(data_ptr),
-                   static_cast<std::streamsize>(size));
-      }
-    }
-    {  // the 4th field, lod information
-       // uint64_t lod_level
-       // uint64_t lod_level_1 size in byte.
-       // int*     lod_level_1 data
-       // ...
-      auto lod = tensor.lod();
-      uint64_t size = lod.size();
-      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-
-      for (auto &each : lod) {
-        size = each.size() * sizeof(framework::LoD::value_type::value_type);
-        fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
-        fout.write(reinterpret_cast<const char *>(each.data()),
-                   static_cast<std::streamsize>(size));
-      }
-    }
+    framework::SerializeToStream(fout, tensor, dev_ctx);
   }
 };
 
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index 5745580504..e5c10fec4d 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
                   ops::ScaleGradMaker);
 REGISTER_OP_CPU_KERNEL(scale,
                        ops::ScaleKernel<paddle::platform::CPUPlace, float>,
-                       ops::ScaleKernel<paddle::platform::CPUPlace, double>);
+                       ops::ScaleKernel<paddle::platform::CPUPlace, double>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu
index 820fd4e685..0d70775159 100644
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
@@ -16,4 +16,6 @@
 
 REGISTER_OP_GPU_KERNEL(
     scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
-    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>);
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int64_t>);
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index 62e6c70b45..573bbcd187 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -49,9 +49,11 @@ class ScatterOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        ctx.device_context());
   }
 };
 
@@ -66,9 +68,11 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        ctx.device_context());
   }
 };
 
@@ -83,10 +87,15 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Updates", "The updated value of updates op");
     AddOutput("Out", "The output of add op");
     AddComment(R"DOC(
-Scatter Operator by selecting from the first axis,
+Scatter Operator.
 
-Out = Ref
+This operator obtains output by updating the input on selected indices on the first axis:
+
+$$
+Out = Ref \\
 Out[Index] = Ref[Index] + Updates
+$$
+
 )DOC");
   }
 };
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
new file mode 100644
index 0000000000..a3059847f2
--- /dev/null
+++ b/paddle/operators/send_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <ostream>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+
+#include "paddle/operators/detail/send_recv_impl.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(typhoonzero): this is a simple implementation which only send
+// one tensor
+class SendOp : public framework::OperatorBase {
+ public:
+  SendOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    // init client when the operator is created at runtime.
+    if (!client_) {
+      std::string endpoint = Attr<std::string>("endpoint");
+      client_.reset(new detail::RPCClient(
+          grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials())));
+      // TODO(typhoonzero): how to call InitVariables
+    }
+  }
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto iname = Input("X");
+    auto oname = Output("Out");
+    // TODO(typhoonzero): currently it's non-blocking,
+    // should block until server responds.
+    bool ret = client_->SendVariable(scope, iname, oname);
+    if (!ret) {
+      LOG(ERROR) << "send variable error";
+    }
+  }
+
+ protected:
+  std::shared_ptr<detail::RPCClient> client_{nullptr};
+};
+
+class SendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor to be saved");
+    AddOutput("Out", "(Tensor) Output fetched from server");
+    AddComment(R"DOC(
+Recv operator
+
+This operator will recv tensor from send_op
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164)"
+                         "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker);
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc
new file mode 100644
index 0000000000..ac03eb3752
--- /dev/null
+++ b/paddle/operators/send_recv_op_test.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+// TODO(typhoonzero): add python bindings for this test as
+// a RemoteOptimizer.
+
+#include <unistd.h>
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
+USE_NO_KERNEL_OP(send);
+USE_NO_KERNEL_OP(recv);
+USE_OP(sum);
+
+// global for simplicity.
+std::unique_ptr<paddle::framework::OperatorBase> recv_op;
+
+void InitTensorsInScope(paddle::framework::Scope &scope,
+                        paddle::platform::CPUPlace &place) {
+  paddle::platform::CPUDeviceContext ctx(place);
+  auto var = scope.Var("X");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({10, 10});
+  float *expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(i);
+  }
+
+  auto out_var = scope.Var("Out");
+  auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
+  out_tensor->Resize({10, 10});
+  tensor->mutable_data<float>(place);  // allocate
+}
+
+void AddOp(const std::string &type,
+           const paddle::framework::VariableNameMap &inputs,
+           const paddle::framework::VariableNameMap &outputs,
+           paddle::framework::AttributeMap attrs,
+           paddle::framework::BlockDescBind *block) {
+  // insert output
+  for (auto kv : outputs) {
+    for (auto v : kv.second) {
+      auto var = block->Var(v);
+      var->SetDataType(paddle::framework::DataType::FP32);
+    }
+  }
+
+  // insert op
+  auto op = block->AppendOp();
+  op->SetType(type);
+  for (auto &kv : inputs) {
+    op->SetInput(kv.first, kv.second);
+  }
+  for (auto &kv : outputs) {
+    op->SetOutput(kv.first, kv.second);
+  }
+  op->SetAttrMap(attrs);
+}
+
+void StartServerNet() {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  InitTensorsInScope(scope, place);
+
+  // sub program run in recv_op, for simple test we use sum
+  paddle::framework::ProgramDescBind program;
+  paddle::framework::BlockDescBind *block = program.MutableBlock(0);
+  // X for server side tensors, RX for received tensers, must be of same shape.
+  AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block);
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+  attrs.insert({"OptimizeBlock", block});
+  recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}},
+                                                    {{"Out", {"Out"}}}, attrs);
+  paddle::platform::CPUDeviceContext ctx(place);
+  recv_op->Run(scope, ctx);
+}
+
+TEST(SendRecvOp, CPU) {
+  std::thread server_thread(StartServerNet);
+  sleep(5);  // wait server to start
+  // local net
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  InitTensorsInScope(scope, place);
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+
+  auto send_op = paddle::framework::OpRegistry::CreateOp(
+      "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
+  paddle::platform::CPUDeviceContext ctx(place);
+  send_op->Run(scope, ctx);
+
+  auto in_var = scope.Var("X");
+  auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
+  float *expected = tensor->data<float>();
+
+  auto out_var = scope.Var("Out");
+  auto target = out_var->GetMutable<paddle::framework::LoDTensor>();
+  // send fail cause output is none.
+  EXPECT_NE(target->memory_size(), size_t(0));
+  float *actual = target->data<float>();
+  for (int64_t i = 0; i < target->numel(); ++i) {
+    EXPECT_EQ(expected[i] * 2, actual[i]);
+  }
+  recv_op.reset();  // dtor can shutdown and join server thread.
+  server_thread.join();
+}
diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc
index 08fda9b445..b862056ad4 100644
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
@@ -53,8 +53,10 @@ class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
               "(LodTensor)The output of seq_expand op."
               "The lod of output will be as same as input(Y)'s lod.");
     AddComment(R"DOC(
-Expand input(X) according to LOD of input(Y).
+Seq Expand Operator.
 
+This operator expands input(X) according to LOD of input(Y).
+Following are cases to better explain how this works:
 Case 1:
 
 Given 2-level a LoDTensor input(X)
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
index ec4ad50dab..d1de0b4447 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -47,7 +47,7 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
                         framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "(vector<LoDTensor>) Input is a vector of LoDTensor, "
+             "(LodTensorArray) Input is a vector of LoDTensor, "
              "each of which is a variable-length sequence or nested sequence.")
         .AsDuplicable();
     AddOutput("Out",
@@ -68,34 +68,40 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
                  "The level should be less than the level number of inputs.")
         .SetDefault(0);
     AddComment(R"DOC(
-Sequence Concat operator
-
-The sequence_concat operator concatenates multiple LoDTensors.
-It only supports sequence (LoD Tensor with level number is 1)
+The sequence_concat operator concatenates multiple LoDTensors. 
+It only supports sequence (LoD Tensor with level number is 1) 
 or a nested sequence (LoD tensor with level number is 2) as its input.
 - Case1:
   If the axis is other than 0(here, axis is 1 and level is 1),
-  each input should have the same LoD information and the LoD
+  each input should have the same LoD information and the LoD 
   information of the output keeps the same as the input.
 
-    LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
-    LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4)
-    LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4)
+  LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
 
 - Case2:
-  If the axis is 0(here, leve is 0), the inputs are concatenated along
+  If the axis is 0(here, leve is 0), the inputs are concatenated along 
   time steps, the LoD information of the output need to re-compute.
+  The LoD information of level-1 should be same.
 
-    LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
-    LoD(x1) = {{0,3,5}, {0,1,2,3,5}}; Dims(x1) = (5,3,4)
-    LoD(Out) = {{0,5,9}, {0,1,2,3,4,5,6,7,9}}; Dims(Out) = (9,3,4)
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,2,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,2,4}, {0,2,5,8,11}}; Dims(Out) = (11,3,4)
 
 - Case3:
   If the axis is 0(here, level is 1).
 
-    LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
-    LoD(x1) = {{0,3,5}, {0,1,3,4,5}}; Dims(x1) = (5,3,4)
-    LoD(Out) = {{0,5,9}, {0,2,5,7,9}}; Dims(Out) = (9,3,4)
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,3,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,5,8}, {0,1,2,3,5,7,8,9,11}}; Dims(Out) = (11,3,4)
+
+- Case4:
+  If the LoD number is 1, axis is 0, level is 0
+
+  LoD(x0) = {{0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,2,5,8,11}}; Dims(Out) = (11,3,4)
 
 NOTE: The levels of all the inputs should be the same.
     )DOC");
diff --git a/paddle/operators/sequence_concat_op.cu b/paddle/operators/sequence_concat_op.cu.cc
similarity index 97%
rename from paddle/operators/sequence_concat_op.cu
rename to paddle/operators/sequence_concat_op.cu.cc
index 8dc4764785..9ca99c2258 100644
--- a/paddle/operators/sequence_concat_op.cu
+++ b/paddle/operators/sequence_concat_op.cu.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
-
 #include "paddle/operators/sequence_concat_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h
index 6adf96120c..09212070aa 100644
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/operators/sequence_concat_op.h
@@ -24,28 +24,38 @@ using LoDTensor = framework::LoDTensor;
 using LoD = framework::LoD;
 
 template <typename T>
-LoD concatLoD(const std::vector<const T*> ins, const size_t axis,
-              const size_t level) {
+LoD ConcatLoD(const std::vector<const T*> ins, const size_t level) {
   auto out_lod = ins[0]->lod();
+  auto numLevels = ins[0]->NumLevels();
   const size_t n = ins.size();
-  if (axis == 0UL) {
-    for (size_t i = 1; i < n; ++i) {
-      for (size_t j = 0; j < ins[i]->lod()[0].size(); ++j) {
-        out_lod[0][j] += ins[i]->lod()[0][j];
-      }
+  const size_t level_idx = ins[0]->NumLevels() - 1 - level;
+  for (size_t i = 1; i < n; ++i) {
+    for (size_t j = 0; j < ins[i]->lod()[level_idx].size(); ++j) {
+      out_lod[level_idx][j] += ins[i]->lod()[level_idx][j];
+    }
+  }
 
-      if (ins[0]->NumLevels() == 2) {
-        for (size_t j = 1; j < ins[i]->lod()[1].size(); ++j) {
-          if (level == 0UL) {
-            out_lod[1].push_back(out_lod[1].back() + ins[i]->lod()[1][j] -
-                                 ins[i]->lod()[1][j - 1]);
-          } else if (level == 1UL) {
-            out_lod[1][j] += ins[1]->lod()[1][j];
-          }
+  for (size_t i = level_idx; i < numLevels - 1; ++i) {
+    size_t lod_len = 1;
+    for (size_t j = 0; j < n; ++j) {
+      lod_len += ins[j]->lod()[i + 1].size() - 1;
+    }
+    out_lod[i + 1].clear();
+    out_lod[i + 1].resize(lod_len);
+
+    size_t idx = 1;
+    for (size_t j = 0; j < ins[0]->lod()[i].size() - 1; ++j) {
+      for (size_t k = 0; k < n; ++k) {
+        for (size_t m = ins[k]->lod()[i][j]; m < ins[k]->lod()[i][j + 1]; ++m) {
+          out_lod[i + 1][idx] = out_lod[i + 1][idx - 1] +
+                                ins[k]->lod()[i + 1][m + 1] -
+                                ins[k]->lod()[i + 1][m];
+          idx++;
         }
       }
     }
   }
+
   return out_lod;
 }
 
@@ -82,18 +92,21 @@ class SequenceConcatOpKernel : public framework::OpKernel<T> {
                       "should be greater than the specify level");
 
     out->mutable_data<T>(ctx.GetPlace());
-    auto out_lod = concatLoD<LoDTensor>(ins, axis, level);
+    auto out_lod = ins[0]->lod();
+    if (axis == 0) {
+      out_lod = ConcatLoD<LoDTensor>(ins, level);
+    }
     out->set_lod(out_lod);
 
-    auto out_lod_level = out_lod[level];
+    const size_t level_idx = out_lod.size() - level - 1;
+    auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx];
     for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
       Tensor out_t = out->Slice(static_cast<int>(out_lod_level[i]),
                                 static_cast<int>(out_lod_level[i + 1]));
       auto out_stride = framework::stride(out_t.dims());
       size_t offset = 0;
-
       for (size_t j = 0; j < n; ++j) {
-        auto in_lod_level = ins[j]->lod()[level];
+        auto in_lod_level = framework::ToAbsOffset(ins[j]->lod())[level_idx];
         auto in_stride = framework::stride(ins[j]->dims());
         Tensor in_t = ins[j]->Slice(static_cast<int>(in_lod_level[i]),
                                     static_cast<int>(in_lod_level[i + 1]));
@@ -124,9 +137,12 @@ class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
       x_grads[i]->set_lod(ins[i]->lod());
       x_grads[i]->mutable_data<T>(ctx.GetPlace());
     }
-
-    auto out_lod = concatLoD<LoDTensor>(ins, axis, level);
-    auto out_lod_level = out_lod[level];
+    auto out_lod = ins[0]->lod();
+    if (axis == 0UL) {
+      out_lod = ConcatLoD<LoDTensor>(ins, level);
+    }
+    const size_t level_idx = out_lod.size() - level - 1;
+    auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx];
 
     for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
       Tensor out_grad_t =
@@ -136,7 +152,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
       size_t offset = 0;
 
       for (size_t j = 0; j < n; ++j) {
-        auto x_grad_lod_level = x_grads[j]->lod()[level];
+        auto x_grad_lod_level =
+            framework::ToAbsOffset(x_grads[j]->lod())[level_idx];
         auto x_grad_stride = framework::stride(x_grads[j]->dims());
         Tensor x_grad_t =
             x_grads[j]->Slice(static_cast<int>(x_grad_lod_level[i]),
diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc
index a3f2ed1443..c5533732d4 100644
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
@@ -105,10 +105,10 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
-        "(LoDTensor) the input(X) is a LodTensor, which support "
+        "(LoDTensor) the input(X) is a LodTensor, which supports "
         "variable-time length input sequence. The underlying tensor in "
-        "this LoDTensor is a matrix with shape (T, N), where, T is the "
-        "total time steps in this mini-batch, N is the input_hidden_size.");
+        "this LoDTensor is a matrix with shape (T, N), where T is the "
+        "total time steps in this mini-batch and N is the input_hidden_size.");
     AddInput("PaddingData",
              "(Tensor, optional) the input(PaddingData) is an optional "
              "parameter, and it is learnable. "
@@ -157,14 +157,16 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
         .GreaterThan(0);
 
     AddComment(R"DOC(
-    SequenceConvOp performs convolution operation on features of
-    contextLength time-steps of each instance.
-    The convolution operation calculates the output based on the input, filter
-    and strides, paddings parameters. The size of each dimension of the
-    parameters is checked in the infer-shape. In order to ensure the equal
-    length of sequence before and after convolution, it is necessary to fill
-    the top and bottom of each sequence according to context_length,
-    context_stride and context_start.
+Sequence Conv Operator.
+
+SequenceConvOp performs convolution operation on features of contextLength
+time-steps of each instance. The convolution operation calculates the output
+based on the input, filter, strides and paddings parameters.
+The size of each dimension of the parameters is checked during infer-shape.
+In order to ensure the equal length of sequence before and after convolution,
+it is necessary to fill the top and bottom of each sequence based on
+context_length, context_stride and context_start.
+
     )DOC");
   }
 };
@@ -177,7 +179,9 @@ REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
             sequence_conv_grad, ops::SequenceConvGradOp);
 
 REGISTER_OP_CPU_KERNEL(
-    sequence_conv, ops::SequenceConvKernel<paddle::platform::CPUPlace, float>);
+    sequence_conv, ops::SequenceConvKernel<paddle::platform::CPUPlace, float>,
+    ops::SequenceConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, float>);
+    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::SequenceConvGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/sequence_conv_op.cu b/paddle/operators/sequence_conv_op.cu.cc
similarity index 85%
rename from paddle/operators/sequence_conv_op.cu
rename to paddle/operators/sequence_conv_op.cu.cc
index 4c0c673a51..c8136dbcb3 100644
--- a/paddle/operators/sequence_conv_op.cu
+++ b/paddle/operators/sequence_conv_op.cu.cc
@@ -12,13 +12,13 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
-
 #include "paddle/operators/sequence_conv_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    sequence_conv, ops::SequenceConvKernel<paddle::platform::GPUPlace, float>);
+    sequence_conv, ops::SequenceConvKernel<paddle::platform::GPUPlace, float>,
+    ops::SequenceConvKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     sequence_conv_grad,
-    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, float>);
+    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, float>,
+    ops::SequenceConvGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h
index a57e1752bb..b8fbe2647c 100644
--- a/paddle/operators/sequence_conv_op.h
+++ b/paddle/operators/sequence_conv_op.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/context_project.h"
 #include "paddle/operators/math/math_function.h"
@@ -62,9 +61,9 @@ class SequenceConvKernel : public framework::OpKernel<T> {
 
     math::ContextProjectFunctor<Place, T> seq_project_functor;
 
-    seq_project_functor(context.device_context(), *in, *padding_data, col,
+    seq_project_functor(context.device_context(), *in, *padding_data,
                         padding_trainable, context_start, context_length,
-                        context_stride, up_pad, down_pad);
+                        context_stride, up_pad, down_pad, &col);
 
     math::matmul<Place, T>(context.device_context(), col, false, filter, false,
                            static_cast<T>(1.0), out, static_cast<T>(0.0));
@@ -117,10 +116,10 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
       in_g->set_lod(in->lod());
       set_zero(context.device_context(), in_g, static_cast<T>(0));
 
-      seq_project_grad_functor(context.device_context(), *in_g, *padding_data_g,
-                               col, padding_trainable, context_start,
-                               context_length, context_stride, up_pad, down_pad,
-                               true, false);
+      seq_project_grad_functor(context.device_context(), *in_g,
+                               padding_trainable, context_start, context_length,
+                               context_stride, up_pad, down_pad, false, true,
+                               padding_data_g, &col);
     }
 
     if (padding_trainable && padding_data_g) {
@@ -129,9 +128,9 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
 
       LoDTensor* input = const_cast<LoDTensor*>(in);
       seq_project_grad_functor(context.device_context(), *input,
-                               *padding_data_g, col, padding_trainable,
-                               context_start, context_length, context_stride,
-                               up_pad, down_pad, false, true);
+                               padding_trainable, context_start, context_length,
+                               context_stride, up_pad, down_pad, true, false,
+                               padding_data_g, &col);
     }
 
     if (filter_g) {
@@ -146,9 +145,9 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
         padding_data = context.Input<Tensor>("PaddingData");
       }
 
-      seq_project_functor(context.device_context(), *in, *padding_data, col,
+      seq_project_functor(context.device_context(), *in, *padding_data,
                           padding_trainable, context_start, context_length,
-                          context_stride, up_pad, down_pad);
+                          context_stride, up_pad, down_pad, &col);
 
       math::matmul<Place, T>(context.device_context(), col, true, out_grad,
                              false, T(1.0), &filter_grad, T(1.0));
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index dfe8de4985..bfda8649cd 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -27,6 +27,11 @@ class SequencePoolOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SequencePoolOp should not be null.");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    if (ctx->Attrs().Get<std::string>("pooltype") == "MAX") {
+      PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"),
+                     "Output(MaxIndex) of SequencePoolOp should not be null.");
+      ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X"));
+    }
   }
 };
 
@@ -35,43 +40,50 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
   SequencePoolOpMaker(framework::OpProto* proto,
                       framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor), the variable-length input of SequencePoolOp");
+    AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
     AddOutput("Out",
-              "(Tensor), output of SequencePoolOp, which does not contain LoD "
+              "(Tensor) The output of SequencePoolOp does not contain LoD "
               "infomation.");
+    AddOutput("MaxIndex",
+              "(Tensor<int>) This tensor is used for the sequence max-pooling "
+              "to record the max indexes.")
+        .AsIntermediate();
     AddAttr<std::string>(
         "pooltype",
         "(int, default AVERAGE) the pooling pooltype of SequencePoolOp.")
         .SetDefault("AVERAGE")
         .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"});
     AddComment(R"DOC(
-    SequencePoolOp pools features of all time-steps of each instance.
-
-    It supports six pooling pooltype:
-    - AVERAGE: Out[i] = average_{for each instance in i-th sequence}{X[i]}
-    - SUM:     Out[i] = sum_{for each instance in i-th sequence}{X[i]}
-    - SQRT:    Out[i] = sum_{for each instance in i-th sequence}{X[i]} 
-                        / sqrt(i-th sequence length)
-    - LAST:    Out[i] = last instance in i-th sequence X[i]
-    - FIRST:   Out[i] = first instance in i-th sequence X[i]
-    - MAX:     Out[i] = max_{for each instance in i-th sequence}{X[i]}
-
-    For a mini-batch of 3 variable-length sentences, containing 2, 3, and 2 time-steps:
-
-    Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
-    Besides, for the sake of simplicity, we assume M=1 and N=1,
-    and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
-
-    Thus, Out is a [3,1,1] Tensor without LoD infomation.
-    And for different pooltype, the value of Out is as follows:
-
-    - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
-    - SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
-    - SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
+Sequence Pool Operator.
+
+The SequencePoolOp pools features of all time-steps of each instance.
+It supports six pooling types:
+1. AVERAGE: $$Out[i] = \frac{\sum_i X_i}{N}$$
+2. SUM:     $$Out[i] = \sum_jX_{ij}$$
+3. SQRT:    $$Out[i] = \frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$
+4. LAST:    Out[i] = last instance in i-th sequence X[i]
+5. FIRST:   Out[i] = first instance in i-th sequence X[i]
+6. MAX:     $$Out[i] = max(X_i)$$
+
+The following example explains how this works:
+For a mini-batch of 3 variable-length sentences,
+containing 2, 3, and 2 time-steps:
+
+Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
+Besides, for the sake of simplicity, we assume M=1 and N=1,
+and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
+
+Thus, Out is a [3,1,1] Tensor without LoD infomation.
+And for different pooltype, the value of Out is as follows:
+
+- AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+- SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+- SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
            6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
-    - MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
-    - LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
-    - FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+- MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+- LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+- FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+
     )DOC");
   }
 };
@@ -92,6 +104,15 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index e0e0493fe0..7f136d8cf0 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"
 
 namespace paddle {
 namespace operators {
@@ -34,7 +35,7 @@ class SequencePoolKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
+    auto* out = context.Output<Tensor>("Out");
     std::string pooltype = context.Attr<std::string>("pooltype");
 
     auto dims = in->dims();
@@ -53,6 +54,16 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto lod_level_0 = lod[0];
 
     out->mutable_data<T>(context.GetPlace());
+
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolFunctor<Place, T> max_pool;
+      auto* index = context.Output<Tensor>("MaxIndex");
+      index->Resize({dims});
+      index->mutable_data<int>(context.GetPlace());
+      max_pool(context.device_context(), *in, out, index);
+      return;
+    }
+
     auto place = context.GetEigenDevice<Place>();
     for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
       Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
@@ -69,8 +80,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
       } else if (pooltype == "SQRT") {
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                               std::sqrt(static_cast<T>(h));
-      } else if (pooltype == "MAX") {
-        out_e.device(place) = in_e.maximum(Eigen::array<int, 1>({{0}}));
       } else if (pooltype == "LAST") {
         out_e.device(place) = in_e.chip(h - 1, 0);
       } else if (pooltype == "FIRST") {
@@ -87,8 +96,8 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
+    auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
     auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
     std::string pooltype = context.Attr<std::string>("pooltype");
 
     auto dims = in->dims();
@@ -96,6 +105,14 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
     int64_t w = in->numel() / dims[0];
 
     in_g->mutable_data<T>(context.GetPlace());
+
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolGradFunctor<Place, T> max_pool_grad;
+      auto* index = context.Input<Tensor>("MaxIndex");
+      max_pool_grad(context.device_context(), *out_g, *index, in_g);
+      return;
+    }
+
     if (pooltype == "LAST" || pooltype == "FIRST") {
       // set X@Grad be zero at first when pooltype is LAST/FIRST
       math::SetConstant<Place, T> functor;
@@ -109,6 +126,7 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
       int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
       auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
       auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
+      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
       Eigen::DSizes<int, 2> bcast(h, 1);
 
       if (pooltype == "AVERAGE") {
@@ -118,24 +136,10 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
       } else if (pooltype == "SQRT") {
         in_g_e.device(place) =
             (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
-      } else if (pooltype == "MAX") {
-        auto in_t =
-            in->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
-        Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
-            in_t_map(in_t.data<T>(), h, w);
-        int row_id;
-        Eigen::array<int, 2> extents{{1, 1}};
-        for (int col_id = 0; col_id < w; col_id++) {
-          in_t_map.col(col_id).maxCoeff(&row_id);
-          Eigen::array<int, 2> in_offsets{{row_id, col_id}};
-          Eigen::array<int, 2> out_offsets{{0, col_id}};
-          in_g_e.slice(in_offsets, extents).device(place) =
-              out_g_e.slice(out_offsets, extents);
-        }
       } else if (pooltype == "LAST") {
-        in_g_e.chip(h - 1, 0).device(place) = out_g_e;
+        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
       } else if (pooltype == "FIRST") {
-        in_g_e.chip(0, 0).device(place) = out_g_e;
+        in_g_e.chip(0, 0).device(place) = out_g_e_v;
       } else {
         PADDLE_THROW("unsupported pooling pooltype");
       }
diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
new file mode 100644
index 0000000000..255683a572
--- /dev/null
+++ b/paddle/operators/sequence_slice_op.cc
@@ -0,0 +1,131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_slice_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceSliceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Offset"),
+                   "Input(Offset) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Length"),
+                   "Input(Length) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceSliceOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+
+    auto offset_dim = ctx->GetInputDim("Offset");
+    auto length_dim = ctx->GetInputDim("Length");
+
+    PADDLE_ENFORCE_EQ(
+        offset_dim.size(), 2UL,
+        "Only support one level sequence now, The rank of offset must be 2.");
+    PADDLE_ENFORCE_EQ(
+        length_dim.size(), 2UL,
+        "Only support one level sequence now, The rank of Length must be 2.");
+
+    // Initialize the output's dims to maximum,
+    // and re-set to real dims by the value of Offset and Length at kernel
+    ctx->SetOutputDim("Out", input_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class SequenceSliceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceSliceOpMaker(framework::OpProto* proto,
+                       framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor), "
+             "the input of SequenceSliceOp.");
+    AddInput("Offset",
+             "(Tensor), "
+             "a vector<int> to describe the offset of every input sequence for "
+             "sub sequence item.");
+    AddInput("Length",
+             "(Tensor), "
+             "a vector<int> to describe the length of every input sequence for "
+             "sub sequence item.");
+    AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp.");
+    AddComment(R"DOC(
+Sequence slice operator
+
+The operator crops a subsequence from given sequence with given start offset and subsequence length.
+It only supports sequence (LoD Tensor with level number is 1).
+- Case:
+    X = [[a1, a2;
+        b1, b2;
+        c1, c2]
+       [d1, d2;
+        e1, e2]]
+    LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 2)
+    Offset = [[0], [1]]; Length = [[2], [1]]
+
+    Out = [[a1, a2;
+            b1, b2]
+            [e1, e2]]
+    LoD(Out) = {{0, 2, 3}}; Dims(Out) = (3, 2)
+NOTE: The first dimension size of input, the size of offset and Length, should be equal. The offset start from 0.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
+            sequence_slice_grad, ops::SequenceSliceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_slice,
+    ops::SequenceSliceOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_slice_grad,
+    ops::SequenceSliceGradOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu
new file mode 100755
index 0000000000..a9f59dadba
--- /dev/null
+++ b/paddle/operators/sequence_slice_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_slice_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    sequence_slice,
+    ops::SequenceSliceOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sequence_slice_grad,
+    ops::SequenceSliceGradOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h
new file mode 100644
index 0000000000..428ef556da
--- /dev/null
+++ b/paddle/operators/sequence_slice_op.h
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using LoD = framework::LoD;
+
+template <typename T>
+inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data,
+                            const int64_t* length_data) {
+  auto out_lod = in.lod();
+  size_t lod_offset = 0;
+
+  auto n = in.lod()[0].size() - 1;
+  out_lod[0][0] = 0;
+  for (size_t i = 0; i < n; ++i) {
+    lod_offset += length_data[i];
+    out_lod[0][i + 1] = lod_offset;
+  }
+  return out_lod;
+}
+
+template <typename Place, typename T>
+class SequenceSliceOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    auto* offset = ctx.Input<Tensor>("Offset");
+    auto* length = ctx.Input<Tensor>("Length");
+    auto* out = ctx.Output<LoDTensor>("Out");
+
+    auto lod = in->lod();
+    auto n = lod[0].size() - 1;
+
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(
+        n, static_cast<size_t>(length->dims()[0]),
+        "The size of input-sequence and length-array should be the same");
+    PADDLE_ENFORCE_EQ(
+        n, static_cast<size_t>(offset->dims()[0]),
+        "The size of input-sequence and offset-array should be the same");
+
+    const int64_t* offset_data = offset->data<int64_t>();
+    const int64_t* length_data = length->data<int64_t>();
+    framework::Tensor offset_cpu;
+    framework::Tensor length_cpu;
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
+      framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(),
+                          &offset_cpu);
+      offset_data = offset_cpu.data<int64_t>();
+
+      length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
+      framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(),
+                          &length_cpu);
+      length_data = length_cpu.data<int64_t>();
+    }
+
+    for (size_t i = 0; i < n; ++i) {
+      PADDLE_ENFORCE_LT(0, offset_data[i],
+                        "The offset[%d] must greater than zero.", i);
+      PADDLE_ENFORCE_LT(0, length_data[i],
+                        "The length[%d] must greater than zero.", i);
+      PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
+                        lod[0][i + 1], "The target tensor's length overflow.");
+    }
+
+    out->mutable_data<T>(ctx.GetPlace());
+    auto out_lod = SequenceSliceLoD(*in, offset_data, length_data);
+    auto out_dims = in->dims();
+    out_dims[0] = out_lod[0][out_lod[0].size() - 1];
+    out->Resize(out_dims);
+    out->set_lod(out_lod);
+
+    auto in_stride = framework::stride(in->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    size_t out_offset = 0;
+    for (size_t i = 0; i < n; ++i) {
+      Tensor in_t = in->Slice(
+          static_cast<int>(lod[0][i] + offset_data[i]),
+          static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
+
+      StridedMemcpy<T>(ctx.device_context(), in_t.data<T>(), in_stride,
+                       in_t.dims(), out_stride, out->data<T>() + out_offset);
+      out_offset += length_data[i] * in_stride[0];
+    }
+  }
+};
+
+template <typename Place, typename T>
+class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    auto* offset = ctx.Input<Tensor>("Offset");
+    auto* length = ctx.Input<Tensor>("Length");
+    auto* out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+
+    const int64_t* offset_data = offset->data<int64_t>();
+    const int64_t* length_data = length->data<int64_t>();
+    framework::Tensor offset_cpu;
+    framework::Tensor length_cpu;
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
+      framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(),
+                          &offset_cpu);
+      offset_data = offset_cpu.data<int64_t>();
+
+      length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
+      framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(),
+                          &length_cpu);
+      length_data = length_cpu.data<int64_t>();
+    }
+
+    auto lod = in->lod();
+    auto out_lod = out_grad->lod();
+
+    if (x_grad) {
+      x_grad->mutable_data<T>(ctx.GetPlace());
+      x_grad->set_lod(in->lod());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+
+      auto out_grad_stride = framework::stride(out_grad->dims());
+
+      for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
+        Tensor out_grad_t =
+            out_grad->Slice(static_cast<int>(out_lod[0][i]),
+                            static_cast<int>(out_lod[0][i + 1]));
+        auto out_grad_stride = framework::stride(out_grad_t.dims());
+
+        auto x_grad_stride = framework::stride(x_grad->dims());
+
+        Tensor x_grad_t = x_grad->Slice(
+            static_cast<int>(lod[0][i] + offset_data[i]),
+            static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
+
+        StridedMemcpy<T>(ctx.device_context(), out_grad_t.data<T>(),
+                         out_grad_stride, out_grad_t.dims(), x_grad_stride,
+                         x_grad_t.data<T>());
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc
index c891ab1fdc..32c1502566 100644
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -43,20 +43,24 @@ class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
               "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
               "of length 1.");
     AddComment(R"DOC(
-SequenceSoftmaxOp computes softmax activation among all time-steps for each
+Sequence Softmax Operator.
+
+SequenceSoftmaxOp computes the softmax activation among all time-steps for each
 sequence. The dimension of each time-step should be 1. Thus, the shape of
-input Tensor can be either [N, 1] or [N], where N is the sum of all sequences'
-lengths.
+input Tensor can be either [N, 1] or [N], where N is the sum of the length
+of all sequences.
 
-Equation:
+The algorithm works as follows:
     for i-th sequence in a mini-batch:
-        Out(X[lod[i]:lod[i+1]], :) =
-            exp(X[lod[i]:lod[i+1], :]) / sum(exp(X[lod[i]:lod[i+1], :]))
+        $$Out(X[lod[i]:lod[i+1]], :) =
+            \frac{\exp(X[lod[i]:lod[i+1], :])}
+            {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$
 
 For example, for a mini-batch of 3 sequences with variable-length,
 each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
 then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :]
 and N turns out to be 7.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/sequence_softmax_op.cu b/paddle/operators/sequence_softmax_op.cu.cc
similarity index 97%
rename from paddle/operators/sequence_softmax_op.cu
rename to paddle/operators/sequence_softmax_op.cu.cc
index f2a1e3d5e3..7023795a3b 100644
--- a/paddle/operators/sequence_softmax_op.cu
+++ b/paddle/operators/sequence_softmax_op.cu.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
-
 #include "paddle/operators/sequence_softmax_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h
index 3eb1e2844d..1b68dd0662 100644
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/softmax.h"
 
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 72f4e4d5cb..5576d7b8be 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -55,7 +55,7 @@ SGD operator
 
 This operator implements one step of the stochastic gradient descent algorithm.
 
-$$param_out = param - learning_rate * grad$$
+$$param\_out = param - learning\_rate * grad$$
 
 )DOC");
   }
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index 2f41c7fc12..7b6c5ec306 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -20,11 +20,11 @@ namespace paddle {
 namespace operators {
 
 namespace {
-template <typename T>
+template <typename T, int block_size>
 __global__ void SparseSGDFunctorKernel(const T* selected_rows,
                                        const int64_t* rows,
                                        const T* learning_rate, T* tensor_out,
-                                       int64_t row_numel, int block_size) {
+                                       int64_t row_numel) {
   const int ty = blockIdx.y;
   int tid = threadIdx.x;
 
@@ -59,14 +59,15 @@ struct SparseSGDFunctor<platform::GPUPlace, T> {
     auto* in_data = in_value.data<T>();
     auto* out_data = output->data<T>();
 
-    int block_size = 256;
+    const int block_size = 256;
     dim3 threads(block_size, 1);
     dim3 grid(1, in_rows.size());
     SparseSGDFunctorKernel<
-        T><<<grid, threads, 0,
-             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(in_data, in_rows.data(), learning_rate.data<T>(),
-                              out_data, in_row_numel, block_size);
+        T, 256><<<grid, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(in_data, in_rows.data(),
+                                   learning_rate.data<T>(), out_data,
+                                   in_row_numel);
   }
 };
 
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
new file mode 100644
index 0000000000..c380e60686
--- /dev/null
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/operators/array_operator.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+class ShrinkRNNMemoryOp : public ArrayOp {
+ public:
+  ShrinkRNNMemoryOp(const std::string &type,
+                    const framework::VariableNameMap &inputs,
+                    const framework::VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *x_var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
+    auto &x_tensor = x_var->Get<framework::LoDTensor>();
+    size_t offset = this->GetOffset(scope, dev_ctx);
+    auto *rank_table_var = scope.FindVar(Input("RankTable"));
+    PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set");
+    auto &rank_table = rank_table_var->Get<framework::LoDRankTable>();
+
+    auto &rank_items = rank_table.items();
+    int dst_num_rows =
+        std::lower_bound(rank_items.begin(), rank_items.end(), offset,
+                         [](const framework::LoDRankTable::TableItem &a,
+                            size_t b) { return a.length > b; }) -
+        rank_items.begin();
+
+    auto *out_var = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(out_var != nullptr, "Output Out must be set");
+    auto &out_tensor = *out_var->GetMutable<framework::LoDTensor>();
+    if (dst_num_rows != 0) {
+      out_tensor.ShareDataWith(x_tensor.Slice(0, dst_num_rows));
+    }
+  }
+};
+
+class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto,
+                              framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
+    AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
+    AddInput("I",
+             "(LoDTensor) The step index. The RNN step memory 'X' will be "
+             "shrinked to match the size of the input of the index'th step.");
+    AddOutput("Out", "(LoDTensor) The shrinked RNN step memory.");
+    AddComment(
+        R"DOC(
+        In dynamic RNN, we are able to handle sequences of different lengths. 
+        Because of the multiple lengths, the size of each step input can be 
+        different, which may lead to a mismatching between the input of
+        the current step and the memory generated by the previous one. This 
+        operator shrinks memory according to the size of the next step input, 
+        to make sure that they can match each other.
+        )DOC");
+  }
+};
+
+class ShrinkRNNMemoryInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"));
+    PADDLE_ENFORCE(context->HasInput("I"));
+    PADDLE_ENFORCE(context->HasInput("RankTable"));
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+};
+
+class ShrinkRNNMemoryGradOp : public ArrayOp {
+ public:
+  ShrinkRNNMemoryGradOp(const std::string &type,
+                        const framework::VariableNameMap &inputs,
+                        const framework::VariableNameMap &outputs,
+                        const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
+    auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
+    PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
+    auto *x_var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x_var != nullptr);
+
+    auto &x_tensor = x_var->Get<framework::LoDTensor>();
+    auto &dx_tensor = *dx_var->GetMutable<framework::LoDTensor>();
+    dx_tensor.Resize(x_tensor.dims());
+    dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
+
+    if (dout_var == nullptr) {  // dx_tensor fill zero
+      math::set_constant(dev_ctx, &dx_tensor, 0.0f);
+    } else {
+      auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
+      auto height = dout_tensor.dims()[0];
+      auto slice = dx_tensor.Slice(0, static_cast<int>(height));
+      framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
+      if (dx_tensor.dims()[0] < height) {
+        auto rest_tensor = dx_tensor.Slice(
+            static_cast<int>(height), static_cast<int>(dout_tensor.dims()[0]));
+        math::set_constant(dev_ctx, &rest_tensor, 0.0f);
+      }
+    }
+  }
+};
+
+class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"));
+    PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X")));
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+  }
+};
+
+class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *op = new framework::OpDescBind();
+    op->SetType("shrink_rnn_memory_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(shrink_rnn_memory, ops::ShrinkRNNMemoryOp,
+                  ops::ShrinkRNNMemoryInferShape,
+                  ops::ShrinkRNNMemoryOpProtoMaker, ops::ShrinkRNNGradOpMaker);
+REGISTER_OPERATOR(shrink_rnn_memory_grad, ops::ShrinkRNNMemoryGradOp,
+                  ops::ShrinkRNNMemoryGradInferShape);
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
index e781c8db20..782f4c7936 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -25,20 +25,19 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
 
     auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Labels");
+    auto labels_dims = ctx->GetInputDim("Label");
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
-                      "Input(Labels)'s rank should be 2.");
+                      "Input(Label)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
     PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
-                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "The 2nd dimension of Input(X) and Input(Label) should "
                       "be equal.");
 
     ctx->SetOutputDim("Out", x_dims);
@@ -53,26 +52,25 @@ class SigmoidCrossEntropyWithLogitsGradOp
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) shoudl be not null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Output(X@GRAD) should be not null.");
 
     auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Labels");
+    auto labels_dims = ctx->GetInputDim("Label");
     auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
-                      "Input(Labels)'s rank should be 2.");
+                      "Input(Label)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(dout_dims.size(), 2,
                       "Input(Out@Grad)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
     PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
-                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "The 2nd dimension of Input(X) and Input(Label) should "
                       "be equal.");
     PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0],
                       "The 1st dimension of Input(X) and Input(Out@Grad) "
@@ -97,7 +95,7 @@ class SigmoidCrossEntropyWithLogitsOpMaker
              "This input is a tensor of logits computed by the previous "
              " operator. Logits are unscaled log probabilities given as "
              "log(p/(1-p)).");
-    AddInput("Labels",
+    AddInput("Label",
              "(Tensor, default Tensor<float>), a 2-D tensor of the same type "
              "and shape as X. This input is a tensor of probabalistic labels "
              "for each logit");
@@ -107,26 +105,28 @@ class SigmoidCrossEntropyWithLogitsOpMaker
     AddComment(R"DOC(
 SigmoidCrossEntropyWithLogits Operator.
 
-This measures the elementwise probability error in discrete classification tasks
+This measures the element-wise probability error in classification tasks
 in which each class is independent. This can be thought of as predicting labels
-for a data-point that are not mutually exclusive. For example, a news article
-can be about politics, technology or sports at the same time or none of these.
+for a data-point, where labels are not mutually exclusive.
+For example, a news article can be about politics, technology or sports
+at the same time or none of these.
 
 The logistic loss is given as follows:
 
-       loss = -Labels * log(sigmoid(X)) - (1 - Labels) * log(1 - sigmoid(X))
+       $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$
 
-We know that sigmoid(X) = (1 / (1 + exp(-X))). By substituting this we get
+We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get:
 
-       loss = X - X * Labels + log(1 + exp(-X))
+       $$loss = X - X * Labels + \log(1 + \exp(-X))$$
 
-For stability and to prevent overflow of exp(-X) when X < 0,
-we can reformulate the loss as follows:
+For stability and to prevent overflow of $$\exp(-X)$$ when X < 0,
+we reformulate the loss as follows:
 
-       loss = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
+       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$
 
 Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
 However the output only shares the LoD with input `X`.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
index 41c619f181..2a9d9bbc77 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -25,8 +25,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
     const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels =
-        context.Input<framework::Tensor>("Labels");
+    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
     framework::Tensor *Out = context.Output<framework::Tensor>("Out");
     Out->mutable_data<T>(context.GetPlace());
 
@@ -52,8 +51,7 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
     const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels =
-        context.Input<framework::Tensor>("Labels");
+    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
     const framework::Tensor *dOut =
         context.Input<framework::Tensor>(framework::GradVarName("Out"));
     framework::Tensor *dX =
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc
index 758481943d..50543fcc14 100644
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
 
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
+    PADDLE_ENFORCE_EQ(x_dims, y_dims);
     PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "The tensor rank of X must be at least 2.");
+                      "The tensor rank of Input(X) should not be less than 2.");
     if (ctx->HasInput("InsideWeight")) {
       PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
                      "If weights are provided, must specify both "
                      "inside and outside weights.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims,
-                        "The shape of InsideWeight must be same as X.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims,
-                        "The shape of OutsideWeight must be same as X.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
     }
 
     ctx->SetOutputDim("Diff", x_dims);
@@ -53,38 +51,53 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
                       framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "The input tensor of smooth l1 loss op."
-             "The rank should be greater or equal to 2 with shape "
-             "[batch_size, value_dim1, value_dim2, ..., value_dimN]");
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "The input value of smooth l1 loss op with shape "
+             "[batch_size, dim1, ..., dimN].");
     AddInput("Y",
-             "The target tensor of smooth l1 loss op "
-             "with the same shape as X.");
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "The target value of smooth l1 loss op with same shape as X.");
     AddInput("InsideWeight",
-             "Optional input tensor of smooth l1 loss op with the same shape "
-             "as X. If provided, the result of (X - Y) will be multiplied "
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "This input is optional and should have same shape with X. "
+             "If provided, the result of (X - Y) will be multiplied "
              "by this tensor element by element.")
         .AsDispensable();
     AddInput("OutsideWeight",
-             "Optinal input of smooth l1 loss op with the same shape as X."
-             "If provided, the output smooth l1 loss will be multiplied by "
-             "this tensor element by element.")
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "This input is optional and should have same shape with X. "
+             "If provided, the out smooth l1 loss will be multiplied by this "
+             "tensor element by element.")
         .AsDispensable();
-    AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).")
+    AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
         .AsIntermediate();
-    AddOutput("Out", "Smooth l1 loss.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank be 2. "
+              "The output smooth l1 loss with shape [batch_size, 1].");
     AddAttr<AttrType>("sigma",
                       "Hyper parameter of smooth l1 loss op."
                       "A float scalar with default value 3.0.")
         .SetDefault(3.0);
     AddComment(R"DOC(
-Compute smooth l1 loss for input and target. The operator take the 1st
-dimension of input as batch size. For each instance, it will compute
-smooth l1 loss element by element first and sum all losses to one value.
-So the output shape is [batch_size, 1].
+Smooth L1 Loss Operator.
+
+This operator computes the smooth l1 loss for X and Y.
+The operator takes the first dimension of X and Y as batch size.
+For each instance, it computes the smooth l1 loss element by element first
+and then sums all the losses. So the shape of Out is [batch_size, 1].
 
 The equation is:
-loss = 0.5 * (sigma * (x-y))^2    if abs(x - y) < 1 / sigma^2
-       abs(x - y) - 0.5 / sigma^2 otherwise
+$$
+Out_{\sigma}(X, Y)_i = \begin{cases}
+0.5 * (\sigma * (X_i - Y_i)) ^ 2
+\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\
+\frac{|X_i - Y_i| - 0.5}{{\sigma}^2},
+\quad otherwise
+\end{cases}
+$$
+
+In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
+element of Out, X and Y.
 
 )DOC");
   }
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 00fd0b32a9..93e0525bad 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -44,20 +44,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
              "2-D with shape [batch_size, input_feature_dimensions].");
     AddOutput("Y", "The normalized values with the same shape as X.");
     AddComment(R"DOC(
-The input of softmax operator is a 2-D tensor with shape N x K (N is the
+Softmax Operator.
+
+The input of the softmax operator is a 2-D tensor with shape N x K (N is the
 batch_size, K is the dimension of input feature). The output tensor has the
 same shape as the input tensor.
 
 For each row of the input tensor, the softmax operator squashes the
 K-dimensional vector of arbitrary real values to a K-dimensional vector of real
-values in the range [0, 1] that add up to 1. Specifically, it computes the
-exponential of the given dimension and the sum of exponential values of all
-the other dimensions in the K-dimensional vector input. Then the ratio of the
-exponential of the given dimension and the sum of exponential values of all
-the other dimensions is the output of the softmax operator.
-
-For each row `i` and each column `j` in input X, we have:
-    Y[i, j] = exp(X[i, j]) / sum_j(exp(X[i, j]))
+values in the range [0, 1] that add up to 1.
+It computes the exponential of the given dimension and the sum of exponential
+values of all the other dimensions in the K-dimensional vector input.
+Then the ratio of the exponential of the given dimension and the sum of
+exponential values of all the other dimensions is the output of the softmax
+operator.
+
+For each row $i$ and each column $j$ in Input(X), we have:
+    $$Y[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$
 
 )DOC");
   }
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu.cc
similarity index 97%
rename from paddle/operators/softmax_op.cu
rename to paddle/operators/softmax_op.cu.cc
index 2e99a89699..013ace19ae 100644
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu.cc
@@ -12,7 +12,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/operators/softmax_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 2c08853f4f..44d1e63f1b 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/softmax.h"
 
@@ -21,9 +20,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
 class SoftmaxKernel : public framework::OpKernel<T> {
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
index 50497da1b7..0c30228863 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -4,17 +4,15 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/softmax_with_cross_entropy_op.h"
-#include <paddle/function/TensorType.h>
-#include <iostream>
 
 namespace paddle {
 namespace operators {
@@ -30,12 +28,10 @@ class SoftmaxWithCrossEntropyOpMaker
              "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
              "and K is the class number.");
     AddInput("Label",
-             "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
-             "tensor. "
-             "If softLabel is set to false, Label is a Tensor<int> with shape "
-             "[N x 1]."
-             "If softLabel is set to true, Label is a Tensor<float/double> "
-             "with shape [N x K].");
+             "(Tensor) The ground truth which is a 2-D tensor. If soft_label "
+             "is set to false, Label is a Tensor<int64> with shape [N x 1]. If "
+             "soft_label is set to true, Label is a Tensor<float/double> with "
+             "shape [N x K].");
     AddOutput(
         "Softmax",
         "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
@@ -51,32 +47,34 @@ class SoftmaxWithCrossEntropyOpMaker
         "the given labels as soft labels.")
         .SetDefault(false);
     AddComment(R"DOC(
-Cross entropy loss with softmax are used as the output layer extensively. This
+Softmax With Cross Entropy Operator.
+
+Cross entropy loss with softmax is used as the output layer extensively. This
 operator computes the softmax normalized values for each row of the input
-tensor, after which cross-entropy loss is then computed. This provides a more
+tensor, after which cross-entropy loss is computed. This provides a more
 numerically stable gradient.
 
-Because this operators performs a softmax on logits internally, it expects
-unscaled logits. Please do not call this op with the output of softmax operator,
-which will produce incorrect results.
+Because this operator performs a softmax on logits internally, it expects
+unscaled logits. This operator should not be used with the output of
+softmax operator since that would produce incorrect results.
 
-When the attribute softLabel is set false, this operators expects mutually
-exclusive hard labels, each sample in a batch is in exactly one class with
-probabilities 1. Each sample in the batch with one and only one label.
+When the attribute soft_label is set false, this operators expects mutually
+exclusive hard labels, each sample in a batch is in exactly one class with a
+probability of 1.0. Each sample in the batch will have a single label.
 
-Equation:
+The equation is as follows:
 
-1) hard label (one-hot label)
+1) Hard label (one-hot label, so every sample has exactly one class)
 
-Loss_j = \f$ -\text{Logit}_{Label_j} +
+$$Loss_j =  -\text{Logit}_{Label_j} +
 \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right),
-j = 1, ..., K $\f
+j = 1,..., K$$
 
-2) soft label (a distribution over all classes)
+2) Soft label (each sample can have a distribution over all classes)
 
-Loss_j = \f$ -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i -
+$$Loss_j =  -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i -
 \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right),
-j = 1,...,K $\f
+j = 1,...,K$$
 
 )DOC");
   }
@@ -121,9 +119,11 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Logits")->type());
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
+        ctx.device_context());
   }
 };
 
@@ -160,10 +160,12 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(
-        ctx.Input<Tensor>(framework::GradVarName("Loss"))->type());
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<Tensor>(framework::GradVarName("Loss"))->type()),
+        ctx.device_context());
   }
 };
 
@@ -196,6 +198,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
 REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
                   ops::SoftmaxWithCrossEntropyOpGrad);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyKernel<float>);
+                       ops::SoftmaxWithCrossEntropyKernel<float>,
+                       ops::SoftmaxWithCrossEntropyKernel<double>);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradKernel<float>);
+                       ops::SoftmaxWithCrossEntropyGradKernel<float>,
+                       ops::SoftmaxWithCrossEntropyGradKernel<double>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
index 7602918bb3..b1faddac3f 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -4,13 +4,13 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
@@ -24,7 +24,7 @@ using Tensor = framework::Tensor;
 namespace {
 template <typename T>
 __global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
-                                 const int* labels, const int batch_size,
+                                 const int64_t* labels, const int batch_size,
                                  const int class_num) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int sample_idx = tid / class_num;
@@ -50,7 +50,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
   int ids = blockIdx.x * blockDim.x + threadIdx.x;
   if (ids < batch_size * class_num) {
     int row_ids = ids / class_num;
-    logit_grad[ids] = logit_grad[ids] * (loss_grad[row_ids] - labels[ids]);
+    logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
   }
 }
 }  // namespace
@@ -104,7 +104,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
                               .stream()>>>(logit_grad_data, loss_grad_data,
                                            label_data, batch_size, class_num);
     } else {
-      const int* label_data = labels->data<int>();
+      const int64_t* label_data = labels->data<int64_t>();
       CrossEntropyGrad<T><<<
           grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               context.device_context())
@@ -119,6 +119,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy,
-                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>);
+                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
+                       ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
 REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad,
-                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>);
+                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
+                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
index 7f3f9e23aa..c4ab3f74b4 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -4,13 +4,13 @@
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -60,25 +60,25 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
     logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
 
     const int class_num = logit_grad->dims()[1];
+    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
+    auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
+
     if (context.Attr<bool>("soft_label")) {
-      auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-      auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
       auto lbl_mat = EigenMatrix<T>::From(*labels);
-
       logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
-          logit_grad_mat *
-          (out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
-           lbl_mat);
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
+          (logit_grad_mat - lbl_mat);
     } else {
+      logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
+          logit_grad_mat *
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
+
       const int batch_size = logit_grad->dims()[0];
-      const int* label_data = labels->data<int>();
-      const T* out_grad_data = out_grad->data<T>();
+      const int64_t* label_data = labels->data<int64_t>();
       T* logit_grad_data = logit_grad->data<T>();
-
+      const T* out_grad_data = out_grad->data<T>();
       for (int i = 0; i < batch_size; ++i) {
-        int index = i * class_num + label_data[i];
-        logit_grad_data[index] =
-            out_grad_data[i] * (logit_grad_data[index] - 1.);
+        logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i];
       }
     }
   }
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc
new file mode 100644
index 0000000000..f164a47711
--- /dev/null
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -0,0 +1,187 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+struct CopyRange {
+  size_t begin;
+  size_t end;
+};
+
+using LoD = framework::LoD;
+
+class SplitLoDTensorOp : public framework::OperatorBase {
+ public:
+  SplitLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto *out_true =
+        scope.FindVar(Output("OutTrue"))->GetMutable<framework::LoDTensor>();
+    auto *out_false =
+        scope.FindVar(Output("OutFalse"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+    auto &x_lod = x.lod();
+    auto &mask_dim = mask.dims();
+
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+
+    std::vector<std::vector<CopyRange>> copy_ranges(mask_dim[0]);
+
+    // set out_true/out_false lod
+    for (size_t t = 0; t < 2; t++) {
+      LoD *lod = nullptr;
+      if (t == 0) {
+        lod = out_false->mutable_lod();
+      } else {
+        lod = out_true->mutable_lod();
+      }
+      lod->clear();
+      for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+        if (static_cast<size_t>(mask_data[i]) == t) {
+          size_t start_idx = i;
+          auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+              x_lod, start_idx, start_idx + 1, level);
+
+          auto &lod_length = lod_and_offset.first;
+          framework::AppendLoD(lod, lod_length);
+
+          size_t start_offset = lod_and_offset.second.first;
+          size_t end_offset = lod_and_offset.second.second;
+          copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
+        }
+      }
+    }
+
+    for (size_t t = 0; t < 2; ++t) {
+      framework::LoDTensor *out;
+      if (t == 0) {
+        out = out_false;
+      } else {
+        out = out_true;
+      }
+      auto &ranges = copy_ranges[t];
+      size_t height = std::accumulate(
+          ranges.begin(), ranges.end(), 0UL,
+          [](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
+      auto x_dim = x.dims();
+      x_dim[0] = static_cast<int64_t>(height);
+      out->Resize(x_dim);
+      out->mutable_data(x.place(), x.type());
+      size_t offset = 0;
+      for (auto &each_range : ranges) {
+        size_t len = each_range.end - each_range.begin;
+        if (len == 0) {
+          continue;
+        }
+        // out[offset: offset+len] = x[each_range.begin: each_range.end]
+        auto slice = out->Slice(static_cast<int>(offset),
+                                static_cast<int>(offset + len));
+        framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
+                                    static_cast<int>(each_range.end)),
+                            x.place(), dev_ctx, &slice);
+        offset += len;
+      }
+    }
+  }
+};
+
+class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitLoDTensorOpProtoMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input LoDTensor");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddOutput("OutTrue", "True branch of input LoDTensor");
+    AddOutput("OutFalse", "False branch of input LoDTensor");
+    AddAttr<int>("level", "(int) the specific lod level to split.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Split a LoDTensor with a Mask at certain level. The input LoDTensor
+        has 3 sequence at certain lod level. The Mask is a bool column vector,
+        such as [0, 1, 0] at the same level. The first and third sequence will
+        be send to False Output LoDTensor; whereas the second sequence will
+        be send to True Output LoDTensor. Please refer to MergeLoDTensorOp.)DOC");
+  }
+};
+
+class SplitLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "SplitLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "SplitLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasOutput("OutTrue"),
+                   "SplitLoDTensorOp must has output OutTrue.");
+    PADDLE_ENFORCE(context->HasOutput("OutFalse"),
+                   "SplitLoDTensorOp must has output OutFalse.");
+
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("OutTrue", context->GetInputDim("X"));
+    context->SetOutputDim("OutFalse", context->GetInputDim("X"));
+  }
+};
+
+class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("merge_lod_tensor");
+    grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
+    grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp,
+                  ops::SplitLoDTensorOpProtoMaker,
+                  ops::SplitLoDTensorInferShape,
+                  ops::SplitLoDTensorArrayGradMaker);
diff --git a/paddle/operators/split_op.cu b/paddle/operators/split_op.cu.cc
similarity index 100%
rename from paddle/operators/split_op.cu
rename to paddle/operators/split_op.cu.cc
diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h
index c8d37ac40c..48d7b1c2d5 100644
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/operators/squared_l2_norm_op.h
@@ -29,7 +29,7 @@ class SquaredL2NormKernel : public framework::OpKernel<T> {
     Out->mutable_data<T>(context.GetPlace());
 
     auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto out = framework::EigenScalar<T>::From(*Out);
     auto place = context.GetEigenDevice<Place>();
 
     out.device(place) = x.square().sum();
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index d9d3dd6e37..744b2fe3f2 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -12,7 +12,7 @@ limitations under the License. */
 #include "paddle/operators/sum_op.h"
 #include <vector>
 #include "paddle/framework/var_type_inference.h"
-#include "paddle/operators/net_op.h"
+#include "paddle/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
@@ -24,21 +24,76 @@ class SumOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
-    auto x_dims = ctx->GetInputsDim("X");
+
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SumOp should not be null.");
+    if (ctx->IsRuntime() &&
+        ctx->GetOutputsVarType("Out")[0] ==
+            framework::VarDesc::LOD_TENSOR_ARRAY) {
+      return;  // skip runtime infershape when is tensor array;
+    }
 
+    auto x_dims = ctx->GetInputsDim("X");
     size_t N = x_dims.size();
     PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
 
-    auto in_dim = x_dims[0];
-    for (size_t i = 1; i < N; i++) {
-      auto dim = x_dims[i];
-      PADDLE_ENFORCE_EQ(in_dim, dim, "Input tensors must have same shape");
+    framework::DDim in_dim({0});
+    for (auto& x_dim : x_dims) {
+      if (framework::product(x_dim) == 0) {
+        continue;
+      }
+      if (framework::product(in_dim) == 0) {
+        in_dim = x_dim;
+      } else {
+        PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape");
+      }
     }
     ctx->SetOutputDim("Out", in_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto x_vars = ctx.MultiInputVar("X");
+    if (x_vars[0]->IsType<framework::LoDTensor>()) {
+      int dtype = -1;
+      for (auto& x_var : x_vars) {
+        auto& lod_tensor = x_var->Get<framework::LoDTensor>();
+        if (lod_tensor.numel() == 0) {
+          continue;
+        }
+        if (dtype == -1) {
+          dtype = framework::ToDataType(lod_tensor.type());
+        } else {
+          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type()));
+        }
+      }
+      PADDLE_ENFORCE_NE(dtype, -1,
+                        "Sum operator should have at least one tensor");
+
+      return framework::OpKernelType(static_cast<framework::DataType>(dtype),
+                                     ctx.device_context());
+    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
+      return framework::OpKernelType(
+          framework::ToDataType(
+              x_vars[0]->Get<framework::SelectedRows>().value().type()),
+          ctx.device_context());
+    } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
+      for (auto& x_var : x_vars) {
+        auto& array = x_var->Get<framework::LoDTensorArray>();
+        for (auto& each : array) {
+          if (each.numel() != 0) {
+            return framework::OpKernelType(framework::ToDataType(each.type()),
+                                           ctx.device_context());
+          }
+        }
+      }
+      PADDLE_THROW("Cannot find the input data type by all input data");
+    }
+    PADDLE_THROW("Unexpected branch. Input type is %s",
+                 x_vars[0]->Type().name());
+  }
 };
 
 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -63,18 +118,50 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
   void operator()(const framework::OpDescBind& op_desc,
                   framework::BlockDescBind* block) const override {
     auto& inputs = op_desc.Input("X");
-    auto default_var_type = framework::VarDesc::SELECTED_ROWS;
+    auto var_type = framework::VarDesc::SELECTED_ROWS;
+
+    for (auto& name : op_desc.Input("X")) {
+      VLOG(10) << name << " "
+               << block->FindRecursiveOrCreateVar(name)->GetType();
+    }
 
     bool any_input_is_lod_tensor = std::any_of(
         inputs.begin(), inputs.end(), [block](const std::string& name) {
-          return block->Var(name)->GetType() == framework::VarDesc::LOD_TENSOR;
+          return block->FindRecursiveOrCreateVar(name)->GetType() ==
+                 framework::VarDesc::LOD_TENSOR;
         });
-    if (any_input_is_lod_tensor) {
-      default_var_type = framework::VarDesc::LOD_TENSOR;
+
+    auto is_tensor_array = [block](const std::string& name) {
+      return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() ==
+             framework::VarDesc::LOD_TENSOR_ARRAY;
+    };
+
+    bool any_input_is_tensor_array =
+        std::any_of(inputs.begin(), inputs.end(), is_tensor_array);
+    bool all_inputs_are_tensor_array =
+        std::all_of(inputs.begin(), inputs.end(), is_tensor_array);
+
+    if (any_input_is_tensor_array) {
+      if (!all_inputs_are_tensor_array) {
+        std::ostringstream os;
+        for (auto& each : inputs) {
+          os << "    " << each << " type is "
+             << detail::Ref(block->FindRecursiveOrCreateVar(each)).GetType()
+             << "\n";
+        }
+        PADDLE_ENFORCE(all_inputs_are_tensor_array,
+                       "Not all inputs are tensor array:\n%s", os.str());
+      }
+      var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
+    } else if (any_input_is_lod_tensor) {
+      var_type = framework::VarDesc::LOD_TENSOR;
     }
 
     auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    auto& out_var = detail::Ref(block->FindRecursiveOrCreateVar(out_var_name));
+    out_var.SetType(var_type);
+    auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
+    out_var.SetDataType(in_var.GetDataType());
   }
 };
 
@@ -109,4 +196,6 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
                   ops::SumOpVarTypeInference);
 REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>,
-                       ops::SumKernel<paddle::platform::CPUPlace, double>);
+                       ops::SumKernel<paddle::platform::CPUPlace, double>,
+                       ops::SumKernel<paddle::platform::CPUPlace, int>,
+                       ops::SumKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu
index 5cf05b876b..5c30dd4d47 100644
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
@@ -14,4 +14,6 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>,
-                       ops::SumKernel<paddle::platform::GPUPlace, double>);
+                       ops::SumKernel<paddle::platform::GPUPlace, double>,
+                       ops::SumKernel<paddle::platform::GPUPlace, int>,
+                       ops::SumKernel<paddle::platform::GPUPlace, int64_t>);
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index ad441a5980..ed6c80ce60 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/selected_rows_functor.h"
@@ -28,7 +29,7 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename Place, typename T>
 class SumKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext &context) const override {
     auto in_vars = context.MultiInputVar("X");
     int N = in_vars.size();
     auto out_var = context.OutputVar("Out");
@@ -36,7 +37,7 @@ class SumKernel : public framework::OpKernel<T> {
     bool in_place = out_var == in_vars[0];
 
     if (out_var->IsType<framework::LoDTensor>()) {
-      auto* out = context.Output<Tensor>("Out");
+      auto *out = context.Output<Tensor>("Out");
       out->mutable_data<T>(context.GetPlace());
 
       auto result = EigenVector<T>::Flatten(*out);
@@ -51,11 +52,14 @@ class SumKernel : public framework::OpKernel<T> {
       // If in_place, just skip the first tensor
       for (int i = in_place ? 1 : 0; i < N; i++) {
         if (in_vars[i]->IsType<framework::LoDTensor>()) {
-          auto& in_t = in_vars[i]->Get<framework::LoDTensor>();
+          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
+          if (in_t.numel() == 0) {
+            continue;
+          }
           auto in = EigenVector<T>::Flatten(in_t);
           result.device(place) = result + in;
         } else if (in_vars[i]->IsType<framework::SelectedRows>()) {
-          auto& in_t = in_vars[i]->Get<framework::SelectedRows>();
+          auto &in_t = in_vars[i]->Get<framework::SelectedRows>();
           functor(context.device_context(), in_t, out);
         } else {
           PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
@@ -63,8 +67,8 @@ class SumKernel : public framework::OpKernel<T> {
       }
     } else if (out_var->IsType<framework::SelectedRows>()) {
       PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now");
-      auto* out = context.Output<SelectedRows>("Out");
-      auto* out_value = out->mutable_value();
+      auto *out = context.Output<SelectedRows>("Out");
+      auto *out_value = out->mutable_value();
 
       // Runtime InferShape
       size_t first_dim = 0;
@@ -83,14 +87,41 @@ class SumKernel : public framework::OpKernel<T> {
       int64_t offset = 0;
       for (int i = 0; i < N; i++) {
         PADDLE_ENFORCE_EQ(out->height(),
-                          in_vars[i]->Get<SelectedRows>().height())
+                          in_vars[i]->Get<SelectedRows>().height());
         functor(context.device_context(), in_vars[i]->Get<SelectedRows>(),
                 offset, out);
         offset += in_vars[i]->Get<SelectedRows>().value().numel();
       }
+    } else if (out_var->IsType<framework::LoDTensorArray>()) {
+      auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
+      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
+                       "Only support all inputs are TensorArray");
+        auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
+
+        for (size_t i = 0; i < in_array.size(); ++i) {
+          if (in_array[i].numel() != 0) {
+            if (i >= out_array.size()) {
+              out_array.resize(i + 1);
+            }
+            if (out_array[i].numel() == 0) {
+              framework::CopyFrom(in_array[i], in_array[i].place(),
+                                  context.device_context(), &out_array[i]);
+              out_array[i].set_lod(in_array[i].lod());
+            } else {
+              PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
+              auto in = EigenVector<T>::Flatten(in_array[i]);
+              auto result = EigenVector<T>::Flatten(out_array[i]);
+              result.device(context.GetEigenDevice<Place>()) = result + in;
+            }
+          }
+        }
+      }
+    } else {
+      PADDLE_THROW("Unexpected branch, output variable type is %s",
+                   out_var->Type().name());
     }
   }
 };
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/tensor.save b/paddle/operators/tensor.save
new file mode 100644
index 0000000000..c24308a7d0
Binary files /dev/null and b/paddle/operators/tensor.save differ
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
new file mode 100644
index 0000000000..2835b84f75
--- /dev/null
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -0,0 +1,216 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/array_operator.h"
+#include "paddle/operators/detail/safe_ref.h"
+namespace paddle {
+namespace operators {
+
+class WriteToArrayOp : public ArrayOp {
+ public:
+  WriteToArrayOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *x = scope.FindVar(Input("X"));
+    if (x == nullptr) return;
+    auto &x_tensor = x->Get<framework::LoDTensor>();
+    size_t offset = GetOffset(scope, dev_ctx);
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
+    if (offset >= out->size()) {
+      VLOG(10) << "Resize " << Output("Out") << " from " << out->size()
+               << " to " << offset + 1;
+      out->resize(offset + 1);
+    }
+    if (x_tensor.memory_size() > 0) {
+      auto *out_tensor = &out->at(offset);
+      CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor);
+      out_tensor->set_lod(x_tensor.lod());
+    } else {
+      VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
+                  "nothing has been written to output array["
+               << offset << "].";
+    }
+  }
+};
+
+class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  WriteToArrayOpProtoMaker(framework::OpProto *proto,
+                           framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
+    AddInput(
+        "I",
+        "(Tensor) the subscript index in tensor array. The number of element "
+        "should be 1");
+    AddOutput("Out", "(TensorArray) the tensor array will be written");
+    AddComment(R"DOC(
+WriteToArray Operator.
+
+This operator writes a LoDTensor to a LoDTensor array.
+
+Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The
+equation is
+
+$$A[i] = T$$
+
+)DOC");
+  }
+};
+
+class WriteToArrayInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
+    PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
+                      "The number of element of subscript index must be 1");
+    if (!context->HasInput("X")) {
+      return;
+    }
+    PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError());
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+
+ protected:
+  virtual const char *NotHasXError() const { return "Must set the lod tensor"; }
+
+  virtual const char *NotHasOutError() const {
+    return "Must set the lod tensor array";
+  }
+};
+
+class WriteToArrayInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDescBind &op_desc,
+                  framework::BlockDescBind *block) const override {
+    auto x_name = op_desc.Input("X")[0];
+    auto out_name = op_desc.Output("Out")[0];
+    VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
+    auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
+                            "Cannot found %s", out_name);
+    out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+    auto *x = block->FindVarRecursive(x_name);
+    if (x != nullptr) {
+      out.SetDataType(x->GetDataType());
+    }
+  }
+};
+
+class ReadFromArrayOp : public ArrayOp {
+ public:
+  ReadFromArrayOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *x = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x != nullptr, "X must be set");
+    auto &x_array = x->Get<framework::LoDTensorArray>();
+    auto *out = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(out != nullptr, "Out must be set");
+    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
+    size_t offset = GetOffset(scope, dev_ctx);
+    if (offset < x_array.size()) {
+      framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx,
+                          out_tensor);
+      out_tensor->set_lod(x_array[offset].lod());
+    } else {
+      VLOG(10) << "offset " << offset << " >= " << x_array.size();
+    }
+  }
+};
+
+class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReadFromArrayProtoMaker(framework::OpProto *proto,
+                          framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(TensorArray) the array will be read from.");
+    AddInput("I",
+             "(Tensor) the subscript index in tensor array. The number of "
+             "element should be 1");
+    AddOutput("Out", "(LoDTensor) the tensor will be read from.");
+    AddComment(R"DOC(
+ReadFromArray Operator.
+
+Read a LoDTensor from a LoDTensor Array.
+
+Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The
+equation is
+
+$$T = A[i]$$
+
+)DOC");
+  }
+};
+
+class ReadFromArrayInferShape : public WriteToArrayInferShape {
+ protected:
+  const char *NotHasXError() const override {
+    return "The input array X must be set";
+  }
+  const char *NotHasOutError() const override {
+    return "The output tensor out must be set";
+  }
+};
+
+class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("read_from_array");
+    grad_op->SetInput("I", Input("I"));
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("write_to_array");
+    grad_op->SetInput("I", Input("I"));
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(write_to_array, ops::WriteToArrayOp,
+                  ops::WriteToArrayInferShape, ops::WriteToArrayOpProtoMaker,
+                  ops::WriteToArrayGradMaker, ops::WriteToArrayInferVarType);
+REGISTER_OPERATOR(read_from_array, ops::ReadFromArrayOp,
+                  ops::ReadFromArrayInferShape, ops::ReadFromArrayProtoMaker,
+                  ops::ReadFromArrayGradMaker);
diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc
index d785e57c83..94de3d5069 100644
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -32,7 +32,7 @@ class TransposeOp : public framework::OperatorWithKernel {
     size_t axis_size = axis.size();
 
     PADDLE_ENFORCE_EQ(x_rank, axis_size,
-                      "the input tensor's rank(%d) "
+                      "The input tensor's rank(%d) "
                       "should be equal to the axis's size(%d)",
                       x_rank, axis_size);
 
@@ -64,12 +64,14 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor)The output tensor");
     AddAttr<std::vector<int>>(
         "axis",
-        "(vector<int>)a list of values, and the size of the list should be "
+        "(vector<int>)A list of values, and the size of the list should be "
         "the same with the input tensor rank, the tensor will "
         "permute the axes according the the values given");
     AddComment(R"DOC(
-The Tensor will be permuted according to the axis values given.
-The op is very much like the numpy.transpose function in python
+Transpose Operator.
+
+The input tensor will be permuted according to the axis values given.
+The op functions similar to how numpy.transpose works in python.
 For example:
  >> input = numpy.arange(6).reshape((2,3))
  >> input
@@ -83,6 +85,7 @@ For example:
 		[2, 5]])
 So, given a input tensor of shape(N, C, H, W) and the axis is {0, 2, 3, 1},
 the output tensor shape will be (N, H, W, C)
+
 )DOC");
   }
 };
diff --git a/paddle/operators/transpose_op.cu b/paddle/operators/transpose_op.cu.cc
similarity index 100%
rename from paddle/operators/transpose_op.cu
rename to paddle/operators/transpose_op.cu.cc
diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h
index aaa3f47ab5..e296032f41 100644
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
@@ -14,27 +14,44 @@
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T, int Rank>
-void EigenTranspose(const framework::ExecutionContext& context,
-                    const framework::Tensor& in, framework::Tensor& out,
-                    std::vector<int> axis) {
-  Eigen::array<int, Rank> permute;
-  for (int i = 0; i < Rank; i++) {
-    permute[i] = axis[i];
+template <typename Place, typename T>
+inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx,
+                         const framework::Tensor& in, framework::Tensor* out,
+                         const std::vector<int>& axis) {
+  switch (dim) {
+    case 1:
+      math::Transpose<Place, T, 1> trans1;
+      trans1(dev_ctx, in, out, axis);
+      break;
+    case 2:
+      math::Transpose<Place, T, 2> trans2;
+      trans2(dev_ctx, in, out, axis);
+      break;
+    case 3:
+      math::Transpose<Place, T, 3> trans3;
+      trans3(dev_ctx, in, out, axis);
+      break;
+    case 4:
+      math::Transpose<Place, T, 4> trans4;
+      trans4(dev_ctx, in, out, axis);
+      break;
+    case 5:
+      math::Transpose<Place, T, 5> trans5;
+      trans5(dev_ctx, in, out, axis);
+      break;
+    case 6:
+      math::Transpose<Place, T, 6> trans6;
+      trans6(dev_ctx, in, out, axis);
+      break;
+    default:
+      PADDLE_THROW("Tensors with rank at most 6 are supported");
   }
-  auto in_dim = in.dims();
-  auto out_dim = out.dims();
-
-  auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
-  auto eigen_out = framework::EigenTensor<T, Rank>::From(out);
-  auto& dev = context.GetEigenDevice<Place>();
-  eigen_out.device(dev) = eigen_in.shuffle(permute);
 }
 
 template <typename Place, typename T>
@@ -47,28 +64,8 @@ class TransposeKernel : public framework::OpKernel<T> {
 
     std::vector<int> axis = context.Attr<std::vector<int>>("axis");
     int ndims = axis.size();
-    switch (ndims) {
-      case 1:
-        EigenTranspose<Place, T, 1>(context, *x, *out, axis);
-        break;
-      case 2:
-        EigenTranspose<Place, T, 2>(context, *x, *out, axis);
-        break;
-      case 3:
-        EigenTranspose<Place, T, 3>(context, *x, *out, axis);
-        break;
-      case 4:
-        EigenTranspose<Place, T, 4>(context, *x, *out, axis);
-        break;
-      case 5:
-        EigenTranspose<Place, T, 5>(context, *x, *out, axis);
-        break;
-      case 6:
-        EigenTranspose<Place, T, 6>(context, *x, *out, axis);
-        break;
-      default:
-        PADDLE_THROW("Tensors with rank at most 6 are supported");
-    }
+    auto& dev_ctx = context.device_context();
+    TransCompute<Place, T>(ndims, dev_ctx, *x, out, axis);
   }
 };
 
@@ -80,47 +77,19 @@ class TransposeGradKernel : public framework::OpKernel<T> {
         context.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* x_grad =
         context.Output<framework::Tensor>(framework::GradVarName("X"));
-    if (x_grad) {
-      x_grad->mutable_data<T>(context.GetPlace());
-
-      std::vector<int> axis = context.Attr<std::vector<int>>("axis");
-      std::vector<int> reversed_axis(axis);
+    if (!x_grad) return;
 
-      for (size_t i = 0; i < axis.size(); i++) {
-        reversed_axis[axis[i]] = i;
-      }
-
-      int ndims = axis.size();
+    x_grad->mutable_data<T>(context.GetPlace());
+    std::vector<int> axis = context.Attr<std::vector<int>>("axis");
+    std::vector<int> reversed_axis(axis);
 
-      switch (ndims) {
-        case 1:
-          EigenTranspose<Place, T, 1>(context, *out_grad, *x_grad,
-                                      reversed_axis);
-          break;
-        case 2:
-          EigenTranspose<Place, T, 2>(context, *out_grad, *x_grad,
-                                      reversed_axis);
-          break;
-        case 3:
-          EigenTranspose<Place, T, 3>(context, *out_grad, *x_grad,
-                                      reversed_axis);
-          break;
-        case 4:
-          EigenTranspose<Place, T, 4>(context, *out_grad, *x_grad,
-                                      reversed_axis);
-          break;
-        case 5:
-          EigenTranspose<Place, T, 5>(context, *out_grad, *x_grad,
-                                      reversed_axis);
-          break;
-        case 6:
-          EigenTranspose<Place, T, 6>(context, *out_grad, *x_grad,
-                                      reversed_axis);
-          break;
-        default:
-          PADDLE_THROW("Tensors with rank at most 6 are supported");
-      }
+    for (size_t i = 0; i < axis.size(); i++) {
+      reversed_axis[axis[i]] = i;
     }
+
+    int ndims = axis.size();
+    auto& dev_ctx = context.device_context();
+    TransCompute<Place, T>(ndims, dev_ctx, *out_grad, x_grad, reversed_axis);
   }
 };
 
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index cd22c561ac..fff1dc7ccd 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -63,9 +63,11 @@ class UniformRandomOp : public framework::OperatorWithKernel {
   }
 
  protected:
-  framework::DataType IndicateDataType(
+  framework::OpKernelType GetKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+    return framework::OpKernelType(
+        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        ctx.device_context());
   }
 };
 
@@ -97,7 +99,7 @@ uniform distribution.
                  "Random seed used for generating samples. "
                  "0 means use a seed generated by the system.")
         .SetDefault(0);
-    AddAttr<int>("data_type", "(int, default 5(FP32)) Output tensor data type")
+    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
         .SetDefault(framework::DataType::FP32);
   }
 };
diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc
new file mode 100644
index 0000000000..89c48e071c
--- /dev/null
+++ b/paddle/operators/unpool_op.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+namespace paddle {
+namespace operators {
+
+class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Unpool2dOpMaker(framework::OpProto* proto,
+                  framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of unpool operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput(
+        "Indices",
+        "(Tensor) The input tensor of the indices given out by MaxPool2d. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of unpool operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "(vector), the unpooling window size(height, width) "
+        "of unpooling operator.");
+    AddAttr<std::vector<int>>("strides",
+                              "(vector, default:{1, 1}), "
+                              "strides (height, width) of unpooling operator.")
+        .SetDefault({1, 1});
+    AddAttr<std::vector<int>>("paddings",
+                              "(vector defalut:{0,0}), "
+                              "paddings (height, width) of unpooling operator.")
+        .SetDefault({0, 0});
+    AddAttr<std::string>(
+        "unpooling_type",
+        "(string), unpooling type, can be \"max\" for max-unpooling ")
+        .InEnum({"max"});
+    AddComment(R"DOC(
+        "Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(N, C_{out}, H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
+            W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
+          $$
+        Paper: http://www.matthewzeiler.com/wp-content/uploads/2017
+        /07/iccv2011.pdf
+        )DOC");
+  }
+};
+
+int OutputSize(int input_size, int ksize, int padding, int stride) {
+  int output_size = (input_size - 1) * stride - 2 * padding + ksize;
+  return output_size;
+}
+
+class UnpoolOp : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Indices"),
+                   "Input(Indices) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UnpoolOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    auto in_y_dims = ctx->GetInputDim("Indices");
+    std::string unpooling_type =
+        ctx->Attrs().Get<std::string>("unpooling_type");
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Unpooling intput must be of 4-dimensional.");
+    PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(
+          OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class UnpoolOpGrad : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
+            ops::UnpoolOpGrad);
+REGISTER_OP_CPU_KERNEL(unpool,
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc
new file mode 100644
index 0000000000..18aafb7dc7
--- /dev/null
+++ b/paddle/operators/unpool_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(unpool,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h
new file mode 100644
index 0000000000..243eb7e532
--- /dev/null
+++ b/paddle/operators/unpool_op.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/unpooling.h"
+
+namespace paddle {
+namespace operators {
+template <typename Place, typename T>
+class UnpoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    auto* out = context.Output<framework::Tensor>("Out");
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    T* output_data = out->mutable_data<T>(context.GetPlace());
+    if (output_data) {
+      math::SetConstant<Place, T> set_zero;
+      set_zero(context.device_context(), out, static_cast<T>(0));
+    }
+    math::Unpool2dMaxFunctor<Place, T> unpool2d_max_forward;
+    unpool2d_max_forward(context.device_context(), *in_x, *in_y, out);
+  }
+};
+template <typename Place, typename T>
+class UnpoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    auto& device_ctx = context.device_context();
+    math::SetConstant<Place, T> zero;
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, in_x_grad, static_cast<T>(0));
+    }
+    math::Unpool2dMaxGradFunctor<Place, T> unpool2d_max_backward;
+    unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out,
+                          *out_grad, in_x_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
new file mode 100644
index 0000000000..9b3f21cf94
--- /dev/null
+++ b/paddle/operators/while_op.cc
@@ -0,0 +1,344 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <vector>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+using StepScopeVar = std::vector<framework::Scope *>;
+using LoDTensor = framework::LoDTensor;
+
+constexpr char kStepBlock[] = "step_block";
+constexpr char kCondition[] = "Condition";
+constexpr char kStepScopes[] = "StepScopes";
+constexpr char kParameters[] = "X";
+constexpr char kParamGrads[] = "X@GRAD";
+constexpr char kOutputs[] = "Out";
+
+class WhileOp : public framework::OperatorBase {
+ public:
+  WhileOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+    auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+    framework::Executor executor(dev_ctx);
+    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    auto *program = block->Program();
+
+    auto step_scopes =
+        scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
+
+    while (cond.data<bool>()[0]) {
+      auto &current_scope = scope.NewScope();
+      step_scopes->push_back(&current_scope);
+
+      executor.Run(*program, &current_scope, block->ID(),
+                   false /*create_local_scope*/);
+    }
+  }
+};
+
+class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  WhileOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kParameters,
+             "A set of variables, which are required by operators inside the "
+             "block of While Op.")
+        .AsDuplicable();
+    AddInput(
+        kCondition,
+        "(Bool) An scalar. When it's False, the While Op will be terminated.")
+        .AsDuplicable();
+    AddOutput(kOutputs,
+              "A set of variables, which will be assigned with values "
+              "generated by the operators inside the block of While Op.")
+        .AsDuplicable();
+    AddOutput(kStepScopes,
+              "(StepScopeVar) A vector of local scope, which size equals the "
+              "step number of While Op. The i'th scope storages temporary "
+              "variables generated in the i'th step.");
+    AddAttr<framework::BlockDescBind *>(kStepBlock,
+                                        "The step block inside WhileOp");
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+
+class WhileGradOp : public framework::OperatorBase {
+ public:
+  WhileGradOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    framework::Executor executor(dev_ctx);
+    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    auto *program = block->Program();
+
+    auto *step_scopes =
+        scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
+
+    auto outside_og_names = Inputs(framework::GradVarName(kOutputs));
+    auto inside_og_names =
+        Attr<std::vector<std::string>>("original_output_grad");
+
+    PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size());
+
+    for (auto cur_scope_iter = step_scopes->rbegin();
+         cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
+      VLOG(3) << "Start backward at time_step "
+              << cur_scope_iter - step_scopes->rbegin();
+      framework::Scope &cur_scope = **cur_scope_iter;
+      // Link OG from outside to inside
+      for (size_t i = 0; i < outside_og_names.size(); ++i) {
+        auto outside_og_name = outside_og_names[i];
+        auto inside_og_name = inside_og_names[i];
+        VLOG(10) << "Linking outside " << outside_og_name << " --> inside "
+                 << inside_og_name;
+        auto &og_outside =
+            detail::Ref(scope.FindVar(outside_og_name),
+                        "Cannot find Outside Gradient %s", outside_og_name);
+        auto &og_inside =
+            detail::Ref(cur_scope.Var(inside_og_name),
+                        "Cannot find inside gradient %s", inside_og_name);
+        if (og_outside.Type().hash_code() ==
+            typeid(framework::LoDTensor).hash_code()) {
+          auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
+          auto &inside_tensor =
+              detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
+          inside_tensor.set_lod(outside_tensor.lod());
+          inside_tensor.ShareDataWith(outside_tensor);
+        } else if (og_outside.Type().hash_code() ==
+                   typeid(framework::LoDTensorArray).hash_code()) {
+          auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
+          auto &inside_array =
+              detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
+          VLOG(10) << outside_og_name << " size = " << outside_array.size();
+          inside_array.resize(outside_array.size());
+
+          for (size_t j = 0; j < inside_array.size(); ++j) {
+            VLOG(10) << j << " " << outside_array[j].numel();
+            if (outside_array[j].numel() != 0) {
+              inside_array[j].set_lod(outside_array[j].lod());
+              inside_array[j].ShareDataWith(outside_array[j]);
+            } else {
+              PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0);
+            }
+          }
+        }
+      }
+
+      executor.Run(*program, *cur_scope_iter, block->ID(), false);
+
+      auto &pg_names = Outputs(kParamGrads);
+      auto &p_names = Inputs(kParameters);
+      PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
+      for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
+        if (pg_names[param_id] == framework::kEmptyVarName) {
+          continue;  // parameter doesn't have gradient
+        }
+        auto inside_grad_name = framework::GradVarName(p_names[param_id]);
+
+        //  // TODO(tonyyang-svail): Not sure we need the following
+        //  // If does not compute gradient of that variable inside rnn,
+        //  just
+        //  // continue
+        //  if (local_var_names.find(inside_grad_name) ==
+        //  local_var_names.end()) {
+        //    continue;
+        //  }
+
+        // zero gradient variable in step 0
+        if (cur_scope_iter == step_scopes->rbegin()) {
+          auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
+          PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name);
+          if (var->IsType<LoDTensor>()) {
+            auto &inside_tensor = var->Get<framework::LoDTensor>();
+            framework::AttributeMap attrs;
+            attrs["dtype"] = framework::ToDataType(inside_tensor.type());
+            attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
+            attrs["value"] = 0.0f;
+
+            auto zero_op = framework::OpRegistry::CreateOp(
+                "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs);
+            zero_op->Run(scope, dev_ctx);
+          }
+        }
+
+        auto new_inside_name = cur_scope.Rename(inside_grad_name);
+        auto sum_op = framework::OpRegistry::CreateOp(
+            "sum", {{"X", {pg_names[param_id], new_inside_name}}},
+            {{"Out", {pg_names[param_id]}}}, {});
+        sum_op->Run(cur_scope, dev_ctx);
+        cur_scope.Rename(new_inside_name, inside_grad_name);
+      }
+    }
+  }
+};
+
+class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad = new framework::OpDescBind();
+    grad->SetType("while_grad");
+    grad->SetInput(kParameters, Input(kParameters));
+
+    // Not all of IGs will be generated by inner gradient operators of while op.
+    // Ignore IGs that is not generated by the inside block.
+    auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false);
+    std::unordered_set<std::string> all_outs;
+    for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
+      for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) {
+        all_outs.insert(oname);
+      }
+    }
+    for (auto &each_ig : igs) {
+      if (all_outs.find(each_ig) == all_outs.end()) {
+        VLOG(10) << "Ignore " << each_ig;
+        each_ig = framework::kEmptyVarName;
+      }
+    }
+
+    grad->SetOutput(framework::GradVarName(kParameters), igs);
+
+    grad->SetInput(kOutputs, Output(kOutputs));
+
+    // OG should be re-calculated by step blocks, since many outputs of while op
+    // do not need to calculate gradients.
+    std::unordered_set<std::string> block_ins;
+    auto *fwd_block = this->grad_block_[0]->ParentBlock();
+    {
+      for (auto &p : Input(kParameters)) {
+        block_ins.insert(p);
+      }
+      for (auto &o : Output(kOutputs)) {
+        block_ins.insert(o);
+      }
+    }
+    std::unordered_set<std::string> extra_inputs;
+    for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
+      for (auto &input_name : grad_block_[0]->Op(i)->InputArgumentNames()) {
+        if (block_ins.find(input_name) != block_ins.end()) {
+          continue;
+        }
+
+        // If the input of Op is generated by the forward block, do not make it
+        // as input again.
+        if (fwd_block->FindVar(input_name) != nullptr) {
+          continue;
+        }
+
+        extra_inputs.insert(input_name);
+      }
+
+      for (auto &output_name : grad_block_[0]->Op(i)->OutputArgumentNames()) {
+        block_ins.insert(output_name);
+      }
+    }
+
+    std::vector<std::string> extra_inputs_list;
+    extra_inputs_list.resize(extra_inputs.size());
+    std::copy(extra_inputs.begin(), extra_inputs.end(),
+              extra_inputs_list.begin());
+    grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list);
+    grad->SetInput(kStepScopes, Output(kStepScopes));
+    grad->SetAttrMap(this->Attrs());
+    grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
+    // record the original output gradient names, since the gradient name of
+    // while operator could be renamed.
+    grad->SetAttr("original_output_grad", extra_inputs_list);
+
+    return std::unique_ptr<framework::OpDescBind>(grad);
+  }
+};
+
+class WhileGradOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDescBind &op_desc,
+                  framework::BlockDescBind *block) const override {
+    auto p_names = op_desc.Input(kParameters);
+    auto pg_names = op_desc.Output(framework::GradVarName(kParameters));
+
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
+      auto *g_var = block->FindVarRecursive(pg_names[i]);
+      if (g_var != nullptr) {  // Gradient could be @EMPTY@
+        VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i]
+                << " type: " << p_var.GetType();
+        g_var->SetType(p_var.GetType());
+        g_var->SetDataType(p_var.GetDataType());
+      }
+    }
+  }
+};
+
+class WhileGradOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    ctx->HasInputs(kParameters);
+    ctx->HasOutputs(framework::GradVarName(kParameters));
+    ctx->HasInputs(kOutputs);
+    ctx->HasInputs(framework::GradVarName(kOutputs));
+
+    auto p_names = ctx->Inputs(kParameters);
+    auto pg_names = ctx->Outputs(kParamGrads);
+    auto var_types = ctx->GetInputsVarType(kParameters);
+    std::vector<std::string> names_to_set;
+    std::vector<framework::DDim> dims_to_set;
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      if (pg_names[i] == framework::kEmptyVarName) {
+        continue;
+      }
+      auto dims = ctx->GetInputsElementDim(kParameters, i);
+      if (var_types[i] == framework::VarDesc::LOD_TENSOR) {
+        names_to_set.push_back(pg_names[i]);
+        dims_to_set.push_back(dims);
+      } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) {
+        // not sure how to set the dim of LOD_TENSOR_ARRAY
+        names_to_set.push_back(pg_names[i]);
+        dims_to_set.push_back(dims);
+      }
+    }
+    ctx->SetDims(names_to_set, dims_to_set);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(while, paddle::operators::WhileOp,
+                  paddle::operators::WhileOpMaker,
+                  paddle::operators::WhileGradOpDescMaker);
+REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp,
+                  paddle::operators::WhileGradOpShapeInference,
+                  paddle::operators::WhileGradOpVarTypeInference);
diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
index 926fee47e1..25fc35311f 100644
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
@@ -1,5 +1,3 @@
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-
 set(OPITMIZER_SRCS
     adadelta_optimizer.cc
     adagrad_optimizer.cc
@@ -9,11 +7,6 @@ set(OPITMIZER_SRCS
     sgd_optimizer.cc
   )
 
-add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS})
-add_dependencies(paddle_optimizer paddle_proto ${external_project_dependencies})
-
-
-if(WITH_TESTING)
-  add_simple_unittest(serialization_test)
-  add_simple_unittest(parameter_optimizer_test)
-endif()
+cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
+cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
+cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
index 34913c4050..5cc7c47d44 100644
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "adadelta_optimizer.h"
 #include <algorithm>
 #include <cmath>
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index bc634ee46d..6aab1ad553 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once
 
 #include "parameter_optimizer.h"
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
index d915ffb870..c981996bab 100644
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include <cmath>
 
 #include "adagrad_optimizer.h"
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index b2935f8aff..447b7c7547 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once
 
 #include "parameter_optimizer.h"
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
index 18e5896a22..6dc2d74970 100644
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "adam_optimizer.h"
 #include <cmath>
 
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index d25cdc0731..37ab53afc3 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once
 
 #include "parameter_optimizer.h"
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
index a2af139d01..faa2376452 100644
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "optimizer.h"
 #include <glog/logging.h>
 #include <cstdlib>
@@ -6,8 +20,8 @@
 
 #include "parameter_optimizer.h"
 
-using namespace paddle;
-using namespace paddle::optimizer;
+using paddle::optimizer::ParameterOptimizer;
+using paddle::optimizer::Tensor;
 
 template <paddle_element_type VALUE>
 struct EnumToType {};
@@ -15,22 +29,21 @@ struct EnumToType {};
 template <class T>
 struct TypeToEnum {};
 
-#define MATCH_ENUM_TYPE(TYPE, ENUM)                  \
-  template <>                                        \
-  struct TypeToEnum<TYPE> {                          \
-    static paddle_element_type v() { return ENUM; }; \
-    static constexpr TYPE value = ENUM;              \
-  };                                                 \
-  template <>                                        \
-  struct EnumToType<ENUM> {                          \
-    typedef TYPE Type;                               \
+#define MATCH_ENUM_TYPE(TYPE, ENUM)                 \
+  template <>                                       \
+  struct TypeToEnum<TYPE> {                         \
+    static paddle_element_type v() { return ENUM; } \
+    static constexpr TYPE value = ENUM;             \
+  };                                                \
+  template <>                                       \
+  struct EnumToType<ENUM> {                         \
+    typedef TYPE Type;                              \
   }
 
 MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
 MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
 MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
 MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
-// TODO(zhihong): only implement below type, need to fix
 MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
 MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
 
diff --git a/paddle/optimizer/optimizer.h b/paddle/optimizer/optimizer.h
index aabf7a458d..e6fa12a4d2 100644
--- a/paddle/optimizer/optimizer.h
+++ b/paddle/optimizer/optimizer.h
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once
 
 #include <stdbool.h>
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc
index db0714635f..da92c2d01c 100644
--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include <glog/logging.h>
 #include "adadelta_optimizer.h"
 #include "adagrad_optimizer.h"
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index 8319f84e1b..99d0416e75 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once
 
 #include <glog/logging.h>
diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cc
similarity index 96%
rename from paddle/optimizer/parameter_optimizer_test.cpp
rename to paddle/optimizer/parameter_optimizer_test.cc
index c99b2254ac..83757a3917 100644
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cc
@@ -110,7 +110,7 @@ public:
 
       int s = 0;
       float* newp = (float*)opts_[i]->get_weight(&s);
-      EXPECT_EQ(s, kSize);
+      EXPECT_EQ(static_cast<size_t>(s), kSize);
       for (size_t j = 0; j < kSize; ++j) {
         EXPECT_EQ(newp[j], (*p)[j]);
       }
@@ -127,8 +127,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
 TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
 
 TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/optimizer/serialization_test.cpp b/paddle/optimizer/serialization_test.cc
similarity index 92%
rename from paddle/optimizer/serialization_test.cpp
rename to paddle/optimizer/serialization_test.cc
index 4c416f55ee..940e941e90 100644
--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cc
@@ -46,8 +46,3 @@ TEST(TensorToProto, Case2) {
     EXPECT_EQ(t1[i], t[i]);
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/optimizer/sgd_optimizer.cc
index 1090419083..c150144ac2 100644
--- a/paddle/optimizer/sgd_optimizer.cc
+++ b/paddle/optimizer/sgd_optimizer.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "sgd_optimizer.h"
 #include "serialization.h"
 
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index 6e1a0f0d3f..0b1da0aa27 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #pragma once
 
 #include "parameter_optimizer.h"
@@ -15,7 +29,6 @@ public:
         nesterov_(n) {
     if (momentum_ != 0.0) {
       size_t size = parameter->size();
-      // TODO: fix it with align aware allocator bind to Tensor
       momentums_ = new Tensor(size);
     }
   }
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index f031109501..3b0f09cea6 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -200,7 +200,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
                                      false,
                                      useGpu_);
     }
-  } else if (matType == MAT_NORMAL_SHARED) {
+  }
+#ifndef PADDLE_MOBILE_INFERENCE
+  // NOLINTNEXTLINE
+  else if (matType == MAT_NORMAL_SHARED) {
     CHECK_EQ(height * width, bufs_[pType]->getSize());
     size_t blockNum = 0;
     CHECK(isGradShared(&blockNum));
@@ -259,7 +262,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
   } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
     CHECK(isGradSparseUpdate());
     mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
-  } else {
+  }
+#endif
+  // NOLINTNEXTLINE
+  else {
     LOG(FATAL) << "Unsupported mat type" << matType;
   }
 }
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index 8b3be062b6..1898598e49 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate,
                   const real* grad,
                   real* momentumVec) {
   decayRate *= learningRate;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_USE_MKLML
 #pragma omp parallel for
 #endif
   for (size_t i = 0; i < size; ++i) {
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index bd86a9fe26..88df28a966 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,15 +1,20 @@
-cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog)
+if(WITH_GPU)
+  cc_library(enforce SRCS enforce.cc DEPS nccl)
+else()
+  cc_library(enforce SRCS enforce.cc)
+endif()
+cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
+
+cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce)
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
 
-cc_library(place SRCS place.cc)
+cc_library(place SRCS place.cc DEPS enforce)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 add_subdirectory(dynload)
 
-cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece)
-
 IF(WITH_GPU)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
 ELSE()
diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h
new file mode 100644
index 0000000000..d9f49527dc
--- /dev/null
+++ b/paddle/platform/call_once.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <mutex>
+
+namespace paddle {
+namespace platform {
+
+/*
+ The current implementation of std::call_once has a bug described in
+ https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
+ This is likely caused by a deeper bug of pthread_once, which is discussed in
+ https://patchwork.ozlabs.org/patch/482350/
+
+ This wrap is a hack to avoid this bug.
+*/
+template <typename Callable, typename... Args>
+inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
+  bool good = false;
+  std::exception ex;
+  std::call_once(flag,
+                 [&](Args&&... args) {
+                   try {
+                     f(args...);
+                     good = true;
+                   } catch (const std::exception& e) {
+                     ex = e;
+                   } catch (...) {
+                     ex = std::runtime_error("excption caught in call_once");
+                   }
+                 },
+                 args...);
+  if (!good) {
+    throw std::exception(ex);
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cuda_helper.h b/paddle/platform/cuda_helper.h
index a7d99cde10..376bb0e688 100644
--- a/paddle/platform/cuda_helper.h
+++ b/paddle/platform/cuda_helper.h
@@ -31,6 +31,16 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512;
 
 // For atomicAdd.
 USE_CUDA_ATOMIC(Add, float);
+USE_CUDA_ATOMIC(Add, int);
+USE_CUDA_ATOMIC(Add, unsigned int);
+USE_CUDA_ATOMIC(Add, unsigned long long int);
+
+CUDA_ATOMIC_WRAPPER(Add, int64_t) {
+  static_assert(sizeof(int64_t) == sizeof(long long int),
+                "long long should be int64");
+  return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address),
+                       static_cast<unsigned long long int>(val));
+}
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
 USE_CUDA_ATOMIC(Add, double);
diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h
new file mode 100644
index 0000000000..b6311cb23d
--- /dev/null
+++ b/paddle/platform/cuda_profiler.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda_profiler_api.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace paddle {
+namespace platform {
+
+void CudaProfilerInit(std::string output_file, std::string output_mode,
+                      std::vector<std::string> config_flags) {
+  std::array<char, 128> buf;
+  std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
+  PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
+  memcpy(buf.data(), tmpl.data(), tmpl.size());
+  auto result = mktemp(buf.data());
+  PADDLE_ENFORCE(strlen(result) != 0);
+  std::string config_file = result;
+
+  {
+    std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
+    PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
+    for (const auto& line : config_flags) {
+      ofs << line << std::endl;
+    }
+  }
+
+  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
+  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
+  PADDLE_ENFORCE(
+      cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
+}
+
+void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
+
+void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h
index ce3421a3cb..80a4c9bb4b 100644
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
@@ -63,9 +63,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
     }                                                             \
   } while (false)
 
-enum class DataLayout {
+enum class DataLayout {  // Not use
   kNHWC,
   kNCHW,
+  kNCDHW,
   kNCHW_VECT_C,
 };
 
@@ -107,12 +108,15 @@ class CudnnDataType<double> {
   }
 };
 
-inline cudnnTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) {
+inline cudnnTensorFormat_t GetCudnnTensorFormat(
+    const DataLayout& order) {  // Not use
   switch (order) {
     case DataLayout::kNHWC:
       return CUDNN_TENSOR_NHWC;
     case DataLayout::kNCHW:
       return CUDNN_TENSOR_NCHW;
+    case DataLayout::kNCDHW:
+      return CUDNN_TENSOR_NCHW;  // NOTE: cudnn treat NdTensor as the same
     default:
       PADDLE_THROW("Unknown cudnn equivalent for order");
   }
@@ -139,7 +143,7 @@ class ScopedTensorDescriptor {
       strides[i] = dims[i + 1] * strides[i + 1];
     }
     // Update tensor descriptor dims setting if groups > 1
-    // FIXME(typhoonzero): Assume using NCHW order
+    // NOTE: Assume using NCHW or NCDHW order
     std::vector<int> dims_with_group(dims.begin(), dims.end());  // copy
     if (groups > 1) {
       dims_with_group[1] = dims_with_group[1] / groups;
@@ -176,12 +180,12 @@ class ScopedFilterDescriptor {
                                             const cudnnDataType_t type,
                                             const std::vector<int>& kernel,
                                             const int groups = 1) {
-    // filter layout: MCHW, where M is the number of
+    // filter layout: MCHW(MCDHW), where M is the number of
     // output image channels, C is the number of input image channels,
-    // H and W is height and width of filter.
+    // D is the depth of the filter, H is the height of the filter, and W is the
+    // width of the filter.
     std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
     if (groups > 1) {
-      // M /= groups
       kernel_with_group[0] /= groups;
       // NOTE: input filter(C) of the filter is already asserted to be C/groups.
     }
@@ -219,13 +223,15 @@ class ScopedConvolutionDescriptor {
     PADDLE_ENFORCE_EQ(pads.size(), strides.size());
     PADDLE_ENFORCE_EQ(pads.size(), dilations.size());
 
-#if CUDNN_VERSION < 6000
+#if !CUDNN_VERSION_MIN(6, 0, 0)
     // cudnn v5 does not support dilation conv, the argument is called upscale
     // instead of dilations and it is must be one.
     for (size_t i = 0; i < dilations.size(); ++i) {
       PADDLE_ENFORCE_EQ(
           dilations[i], 1,
-          "Dilations conv is not supported in this cuDNN version");
+          "Dilations conv is not supported in this cuDNN version(%d.%d.%d).",
+          CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100,
+          CUDNN_VERSION % 100);
     }
 #endif
 
diff --git a/paddle/platform/cudnn_helper_test.cc b/paddle/platform/cudnn_helper_test.cc
index 6bd85ae1ca..427359f697 100644
--- a/paddle/platform/cudnn_helper_test.cc
+++ b/paddle/platform/cudnn_helper_test.cc
@@ -38,6 +38,26 @@ TEST(CudnnHelper, ScopedTensorDescriptor) {
   EXPECT_EQ(strides[2], 6);
   EXPECT_EQ(strides[1], 36);
   EXPECT_EQ(strides[0], 144);
+
+  // test tensor5d: ScopedTensorDescriptor
+  ScopedTensorDescriptor tensor5d_desc;
+  std::vector<int> shape_5d = {2, 4, 6, 6, 6};
+  auto desc_5d = tensor5d_desc.descriptor<float>(DataLayout::kNCDHW, shape_5d);
+
+  std::vector<int> dims_5d(5);
+  std::vector<int> strides_5d(5);
+  paddle::platform::dynload::cudnnGetTensorNdDescriptor(
+      desc_5d, 5, &type, &nd, dims_5d.data(), strides_5d.data());
+
+  EXPECT_EQ(nd, 5);
+  for (size_t i = 0; i < dims_5d.size(); ++i) {
+    EXPECT_EQ(dims_5d[i], shape_5d[i]);
+  }
+  EXPECT_EQ(strides_5d[4], 1);
+  EXPECT_EQ(strides_5d[3], 6);
+  EXPECT_EQ(strides_5d[2], 36);
+  EXPECT_EQ(strides_5d[1], 216);
+  EXPECT_EQ(strides_5d[0], 864);
 }
 
 TEST(CudnnHelper, ScopedFilterDescriptor) {
@@ -60,6 +80,20 @@ TEST(CudnnHelper, ScopedFilterDescriptor) {
   for (size_t i = 0; i < shape.size(); ++i) {
     EXPECT_EQ(kernel[i], shape[i]);
   }
+
+  ScopedFilterDescriptor filter_desc_4d;
+  std::vector<int> shape_4d = {2, 3, 3, 3};
+  auto desc_4d = filter_desc.descriptor<float>(DataLayout::kNCDHW, shape_4d);
+
+  std::vector<int> kernel_4d(4);
+  paddle::platform::dynload::cudnnGetFilterNdDescriptor(
+      desc_4d, 4, &type, &format, &nd, kernel_4d.data());
+
+  EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format);
+  EXPECT_EQ(nd, 4);
+  for (size_t i = 0; i < shape_4d.size(); ++i) {
+    EXPECT_EQ(kernel_4d[i], shape_4d[i]);
+  }
 }
 
 TEST(CudnnHelper, ScopedConvolutionDescriptor) {
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 36450e9268..7afcdfce93 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -124,6 +124,11 @@ void CUDADeviceContext::Wait() const {
   PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
 }
 
+void CUDADeviceContext::Finish() const {
+  Wait();
+  PADDLE_ENFORCE(cudaGetLastError());
+}
+
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index ef5f19214d..526d089e35 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -46,6 +46,8 @@ class DeviceContext {
   DeviceType* GetEigenDevice() const;
 
   virtual void Wait() const {}
+
+  virtual void Finish() const {}
 };
 
 class CPUDeviceContext : public DeviceContext {
@@ -77,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Wait for all operations completion in the stream. */
   void Wait() const override;
 
+  /*! \brief  Check potential errors for the cuda kernel calls. */
+  void Finish() const override;
+
   /*! \brief  Return place in the device context. */
   Place GetPlace() const override;
 
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index bb3fec1be9..f4fda65907 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1,3 +1,3 @@
-cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
+cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
         DEPS dynamic_loader nccl)
diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h
index 6b64539b0a..61a22d9db3 100644
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
@@ -62,6 +62,8 @@ extern void *cublas_dso_handle;
   DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
 
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
+  __macro(cublasSaxpy_v2);                \
+  __macro(cublasDaxpy_v2);                \
   __macro(cublasSgemv_v2);                \
   __macro(cublasDgemv_v2);                \
   __macro(cublasSgemm_v2);                \
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc
index d3e4cb567d..76ec82e108 100644
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/platform/dynload/cudnn.h>
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
@@ -37,6 +38,25 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_R7
+CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
+#endif
+
+#ifdef PADDLE_USE_DSO
+bool HasCUDNN() {
+  std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle);
+  return cudnn_dso_handle != nullptr;
+}
+
+void EnforceCUDNNLoaded(const char* fn_name) {
+  PADDLE_ENFORCE(cudnn_dso_handle != nullptr,
+                 "Cannot load cudnn shared library. Cannot invoke method %s",
+                 fn_name);
+}
+#else
+bool HasCUDNN() { return true; }
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h
index b2d69da93b..8c937b37d7 100644
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
@@ -25,9 +25,11 @@ namespace dynload {
 
 extern std::once_flag cudnn_dso_flag;
 extern void* cudnn_dso_handle;
+extern bool HasCUDNN();
 
 #ifdef PADDLE_USE_DSO
 
+extern void EnforceCUDNNLoaded(const char* fn_name);
 #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                    \
   struct DynLoad__##__name {                                       \
     template <typename... Args>                                    \
@@ -36,6 +38,7 @@ extern void* cudnn_dso_handle;
       std::call_once(cudnn_dso_flag,                               \
                      paddle::platform::dynload::GetCudnnDsoHandle, \
                      &cudnn_dso_handle);                           \
+      EnforceCUDNNLoaded(#__name);                                 \
       void* p_##__name = dlsym(cudnn_dso_handle, #__name);         \
       return reinterpret_cast<cudnn_func>(p_##__name)(args...);    \
     }                                                              \
@@ -135,6 +138,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
+#if CUDNN_VERSION >= 7001
+#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
+  __macro(cudnnSetConvolutionGroupCount);
+CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc
index 6feba42c0d..7a82d06a0a 100644
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
@@ -78,12 +78,11 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
     *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
     if (nullptr == *dso_handle) {
       if (dso_path == "libcudnn.dylib") {
-        PADDLE_ENFORCE(true,
-                       "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
-                       "For instance, sudo tar -xzf "
-                       "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
-                       "chmod a+r /usr/local/cuda/include/cudnn.h "
-                       "/usr/local/cuda/lib/libcudnn*");
+        LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
+                        "For instance, sudo tar -xzf "
+                        "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
+                        "chmod a+r /usr/local/cuda/include/cudnn.h "
+                        "/usr/local/cuda/lib/libcudnn*";
       }
     }
   }
@@ -92,7 +91,8 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
 
 static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
                                               const std::string& dso_name,
-                                              void** dso_handle) {
+                                              void** dso_handle,
+                                              bool throw_on_error = true) {
   int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
   *dso_handle = nullptr;
 
@@ -111,15 +111,19 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
       GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
     }
   }
-  PADDLE_ENFORCE(nullptr != *dso_handle,
-                 "Failed to find dynamic library: %s ( %s ) \n Please specify "
-                 "its path correctly using following ways: \n Method. set "
-                 "environment variable LD_LIBRARY_PATH on Linux or "
-                 "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
-                 "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
-                 "using the DYLD_LIBRARY_PATH is impossible unless System "
-                 "Integrity Protection (SIP) is disabled.",
-                 dlPath, dlerror());
+  auto error_msg =
+      "Failed to find dynamic library: %s ( %s ) \n Please specify "
+      "its path correctly using following ways: \n Method. set "
+      "environment variable LD_LIBRARY_PATH on Linux or "
+      "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
+      "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
+      "using the DYLD_LIBRARY_PATH is impossible unless System "
+      "Integrity Protection (SIP) is disabled.";
+  if (throw_on_error) {
+    PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror());
+  } else if (nullptr == *dso_handle) {
+    LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror());
+  }
 }
 
 void GetCublasDsoHandle(void** dso_handle) {
@@ -132,9 +136,10 @@ void GetCublasDsoHandle(void** dso_handle) {
 
 void GetCudnnDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle,
+                             false);
 #else
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false);
 #endif
 }
 
diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
index 0618c7414f..981b2ab258 100644
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
@@ -17,6 +17,7 @@
 #include <dlfcn.h>
 #include <nccl.h>
 #include <mutex>
+#include "paddle/platform/call_once.h"
 #include "paddle/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -27,18 +28,18 @@ extern std::once_flag nccl_dso_flag;
 extern void* nccl_dso_handle;
 
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                    \
-  struct DynLoad__##__name {                                      \
-    template <typename... Args>                                   \
-    auto operator()(Args... args) -> decltype(__name(args...)) {  \
-      using nccl_func = decltype(__name(args...)) (*)(Args...);   \
-      std::call_once(nccl_dso_flag,                               \
-                     paddle::platform::dynload::GetNCCLDsoHandle, \
-                     &nccl_dso_handle);                           \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);    \
-    }                                                             \
-  };                                                              \
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                         \
+  struct DynLoad__##__name {                                           \
+    template <typename... Args>                                        \
+    auto operator()(Args... args) -> decltype(__name(args...)) {       \
+      using nccl_func = decltype(__name(args...)) (*)(Args...);        \
+      platform::call_once(nccl_dso_flag,                               \
+                          paddle::platform::dynload::GetNCCLDsoHandle, \
+                          &nccl_dso_handle);                           \
+      void* p_##__name = dlsym(nccl_dso_handle, #__name);              \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);         \
+    }                                                                  \
+  };                                                                   \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
diff --git a/paddle/platform/enforce.cc b/paddle/platform/enforce.cc
new file mode 100644
index 0000000000..e8d31bc782
--- /dev/null
+++ b/paddle/platform/enforce.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace platform {}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index bfe708748a..5abd4d4a34 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -49,7 +49,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-namespace {
 #ifdef __GNUC__
 inline std::string demangle(std::string name) {
   int status = -4;  // some arbitrary value to eliminate the compiler warning
@@ -60,7 +59,6 @@ inline std::string demangle(std::string name) {
 #else
 inline std::string demangle(std::string name) { return name; }
 #endif
-}
 
 struct EnforceNotMet : public std::exception {
   std::exception_ptr exp_;
@@ -236,16 +234,24 @@ inline void throw_on_error(T e) {
   __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
 #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
   __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
-#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                            \
-  PADDLE_ENFORCE(nullptr != (__VAL), #__VAL " should not be null\n%s", \
-                 paddle::string::Sprintf("" __VA_ARGS__));
-
-#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)        \
-  PADDLE_ENFORCE(__VAL0 __CMP __VAL1,                                         \
-                 "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \
-                 #__VAL0, #__VAL1, paddle::string::to_string(__VAL0),         \
-                 paddle::string::to_string(__VAL1),                           \
-                 paddle::string::Sprintf("" __VA_ARGS__));
+#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                  \
+  do {                                                       \
+    if (UNLIKELY(nullptr == (__VAL))) {                      \
+      PADDLE_THROW(#__VAL " should not be null\n%s",         \
+                   paddle::string::Sprintf("" __VA_ARGS__)); \
+    }                                                        \
+  } while (0)
+
+#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)  \
+  do {                                                                  \
+    if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) {                           \
+      PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP    \
+                   " %s\n%s",                                           \
+                   #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \
+                   paddle::string::to_string(__VAL1),                   \
+                   paddle::string::Sprintf("" __VA_ARGS__));            \
+    }                                                                   \
+  } while (0)
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index f3455a8733..4fa2eaed31 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -18,8 +18,8 @@ limitations under the License. */
 
 #include "paddle/platform/enforce.h"
 
-DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
-              "Default use 95% of GPU memory for PaddlePaddle,"
+DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+              "Default use 92% of GPU memory for PaddlePaddle,"
               "reserve the rest for page tables, etc");
 
 namespace paddle {
@@ -75,15 +75,19 @@ size_t GpuMaxChunkSize() {
   GpuMemoryUsage(available, total);
 
   // Reserving the rest memory for page tables, etc.
-  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
+  size_t reserving = 0.05 * total;
 
   // If available less than minimum chunk size, no usable memory exists.
-  available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();
+  available =
+      std::max(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
+               reserving) -
+      reserving;
 
-  // If available less than reserving, no usable memory exists.
-  size_t usable = std::max(available, reserving) - reserving;
+  size_t allocating = FLAGS_fraction_of_gpu_memory_to_use * total;
 
-  return usable;
+  PADDLE_ENFORCE_LT(allocating, available);
+
+  return allocating;
 }
 
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
@@ -109,5 +113,10 @@ void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
       cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
       "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
 }
+
+void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
+                 "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
+}
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 37665b97d7..db961f3838 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -60,6 +60,9 @@ void GpuMemcpySync(void *dst, const void *src, size_t count,
 void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
                    size_t count, cudaStream_t stream);
 
+//! Set memory dst with value count size asynchronously
+void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
+
 }  // namespace platform
 }  // namespace paddle
 
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
index f196868c72..bb9d59ec0a 100644
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@@ -49,8 +49,6 @@ struct Transform<platform::CPUPlace> {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
   void operator()(const DeviceContext& context, InputIter first, InputIter last,
                   OutputIter result, UnaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
     std::transform(first, last, result, op);
   }
 
@@ -59,8 +57,6 @@ struct Transform<platform::CPUPlace> {
   void operator()(const DeviceContext& context, InputIter1 first1,
                   InputIter1 last1, InputIter2 first2, OutputIter result,
                   BinaryOperation op) {
-    auto place = context.GetPlace();
-    PADDLE_ENFORCE(is_cpu_place(place), "It must use CPU place.");
     std::transform(first1, last1, first2, result, op);
   }
 };
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index ccfc0e7602..f75475a88f 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -49,7 +49,7 @@ if(WITH_TESTING)
   add_subdirectory(test)
 endif()
 
-if(NOT WITH_C_API)
+if(NOT MOBILE_INFERENCE)
   add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
   link_paddle_exe(paddle_pserver_main)
 
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index a9bcc47438..fd55f410d3 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -1,8 +1,10 @@
 if(WITH_PYTHON)
   cc_library(paddle_pybind SHARED
     SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward proto_desc tensor_array paddle_memory executor prune
+    DEPS pybind python backward proto_desc paddle_memory executor prune
     ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
 
-cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB} tensor_array)
+if(WITH_DOC)
+  cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
+endif(WITH_DOC)
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index d3fc544ec7..6c8f06cccb 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -97,6 +97,15 @@ namespace pybind {
 
 using namespace paddle::framework;  // NOLINT
 
+template <typename T>
+static py::bytes SerializeMessage(T &self) {
+  // Check IsInitialized in Python
+  std::string retv;
+  PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv),
+                 "Cannot serialize message");
+  return retv;
+}
+
 // Bind Methods
 void BindProgramDesc(py::module &m) {
   py::class_<ProgramDescBind>(m, "ProgramDesc", "")
@@ -132,17 +141,7 @@ void BindProgramDesc(py::module &m) {
       .def("block", &ProgramDescBind::MutableBlock,
            py::return_value_policy::reference)
       .def("num_blocks", &ProgramDescBind::Size)
-      .def("serialize_to_string",
-           [](ProgramDescBind &program_desc) -> py::bytes {
-             const ProgramDesc *desc = program_desc.Proto();
-             PADDLE_ENFORCE(desc->IsInitialized(),
-                            "ProgramDesc has not been initialized.");
-             std::string res;
-             PADDLE_ENFORCE(
-                 desc->SerializeToString(&res),
-                 "Serialize ProgramDesc Error. This could be a bug of Paddle.");
-             return res;
-           })
+      .def("serialize_to_string", SerializeMessage<ProgramDescBind>)
       .def("parse_from_string",
            [](ProgramDescBind &program_desc, const std::string &data) {
              ProgramDesc *desc = program_desc.Proto();
@@ -181,16 +180,7 @@ void BindBlockDesc(py::module &m) {
            py::return_value_policy::reference)
       .def("op_size", &BlockDescBind::OpSize)
       .def("op", &BlockDescBind::Op, py::return_value_policy::reference)
-      .def("serialize_to_string", [](BlockDescBind &block_desc) -> py::bytes {
-        const BlockDesc *desc = block_desc.Proto();
-        PADDLE_ENFORCE(desc->IsInitialized(),
-                       "BlockDesc has not been initialized.");
-        std::string res;
-        PADDLE_ENFORCE(
-            desc->SerializeToString(&res),
-            "Serialize BlockDesc Error. This could be a bug of Paddle.");
-        return res;
-      });
+      .def("serialize_to_string", SerializeMessage<BlockDescBind>);
 }
 
 void BindVarDsec(py::module &m) {
@@ -212,24 +202,14 @@ void BindVarDsec(py::module &m) {
            },
            py::return_value_policy::reference)
       .def("set_shape", &VarDescBind::SetShape)
-      .def("set_data_type", &VarDescBind::SetDataType)
+      .def("set_dtype", &VarDescBind::SetDataType)
       .def("shape", &VarDescBind::Shape, py::return_value_policy::reference)
-      .def("data_type", &VarDescBind::GetDataType)
+      .def("dtype", &VarDescBind::GetDataType)
       .def("lod_level", &VarDescBind::GetLodLevel)
       .def("set_lod_level", &VarDescBind::SetLoDLevel)
       .def("type", &VarDescBind::GetType)
       .def("set_type", &VarDescBind::SetType)
-      .def("serialize_to_string",
-           [](VarDescBind &var_desc) -> py::bytes {
-             const VarDesc *desc = var_desc.Proto();
-             PADDLE_ENFORCE(desc->IsInitialized(),
-                            "VarDesc has not been initialized.");
-             std::string res;
-             PADDLE_ENFORCE(
-                 desc->SerializeToString(&res),
-                 "Serialize VarDesc Error. This could be a bug of Paddle.");
-             return res;
-           })
+      .def("serialize_to_string", SerializeMessage<VarDescBind>)
       .def("persistable", &VarDescBind::Persistable)
       .def("set_persistable", &VarDescBind::SetPersistable);
 
@@ -239,7 +219,8 @@ void BindVarDsec(py::module &m) {
       .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH)
       .value("FETCH_LIST", VarDesc::FETCH_LIST)
       .value("STEP_SCOPES", VarDesc::STEP_SCOPES)
-      .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE);
+      .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE)
+      .value("LOD_TENSOR_ARRAY", VarDesc::LOD_TENSOR_ARRAY);
 }
 
 void BindOpDesc(py::module &m) {
@@ -273,16 +254,7 @@ void BindOpDesc(py::module &m) {
       .def("check_attrs", &OpDescBind::CheckAttrs)
       .def("infer_shape", &OpDescBind::InferShape)
       .def("infer_var_type", &OpDescBind::InferVarType)
-      .def("serialize_to_string", [](OpDescBind &op_desc) -> py::bytes {
-        const OpDesc *desc = op_desc.Proto();
-        PADDLE_ENFORCE(desc->IsInitialized(),
-                       "OpDesc has not been initialized.");
-        std::string res;
-        PADDLE_ENFORCE(
-            desc->SerializeToString(&res),
-            "Serialize OpDesc Error. This could be a bug of Paddle.");
-        return res;
-      });
+      .def("serialize_to_string", SerializeMessage<OpDescBind>);
 }
 
 }  // namespace pybind
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 78dc7943b3..c16d3e0cbe 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -23,11 +23,10 @@ limitations under the License. */
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/prune.h"
 #include "paddle/framework/selected_rows.h"
-#include "paddle/framework/tensor_array.h"
 #include "paddle/operators/cond_op.h"
-#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
@@ -38,9 +37,13 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/cuda_profiler.h"
 #include "paddle/platform/gpu_info.h"
 #endif
 
+// disable auto conversion to list in Python
+PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
+
 namespace paddle {
 namespace pybind {
 static size_t UniqueIntegerGenerator(const std::string &prefix) {
@@ -112,11 +115,13 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
       .def("set", PyCPUTensorSetFromArray<int64_t>)
+      .def("set", PyCPUTensorSetFromArray<bool>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
       .def("set", PyCUDATensorSetFromArray<int64_t>)
+      .def("set", PyCUDATensorSetFromArray<bool>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement<float>)
@@ -233,6 +238,9 @@ All parameter, weight, gradient are variables in Paddle.
              return self.GetMutable<SelectedRows>();
            },
            py::return_value_policy::reference)
+      .def("get_lod_tensor_array",
+           [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
+           py::return_value_policy::reference)
 #ifdef PADDLE_WITH_CUDA
       .def("get_communicator",
            [](Variable &self) -> platform::Communicator * {
@@ -284,6 +292,11 @@ All parameter, weight, gradient are variables in Paddle.
     Prune(*prog_with_targets.Proto(), &pruned_desc);
     return new ProgramDescBind(pruned_desc);
   });
+  m.def("inference_optimize", [](ProgramDescBind &origin) {
+    ProgramDesc pruned_desc;
+    InferenceOptimize(*(origin.Proto()), &pruned_desc);
+    return new ProgramDescBind(pruned_desc);
+  });
   m.def_submodule(
        "var_names",
        "The module will return special predefined variable name in Paddle")
@@ -381,83 +394,6 @@ All parameter, weight, gradient are variables in Paddle.
         self->CompleteAddOp();
       });
 
-  py::class_<framework::TensorArray>(m, "TensorArray")
-      .def("__init__",
-           [](TensorArray &instance) { new (&instance) TensorArray(); })
-      .def("read",
-           [](TensorArray &self, size_t index) { return self.Read(index); })
-      .def("write", [](TensorArray &self, size_t index,
-                       LoDTensor &value) { self.Write(index, value); })
-      .def("write_shared",
-           [](TensorArray &self, size_t index, const LoDTensor &value) {
-             self.WriteShared(index, value);
-           })
-      .def("size", [](TensorArray &self) { return self.size(); })
-      .def("pack",
-           [](TensorArray &self, size_t level,
-              const std::vector<std::vector<size_t>> &meta_info,
-              const std::vector<std::vector<size_t>> &lod) {
-             std::vector<DySeqMeta> meta;
-             for (auto &info : meta_info) {
-               PADDLE_ENFORCE_EQ(info.size(), 3UL);
-               meta.emplace_back(info[0], info[1], info[2]);
-             }
-#ifndef PADDLE_WITH_CUDA
-             return self.Pack(level, meta, lod);
-#else
-             LoD new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             return self.Pack(level, meta, new_lod);
-#endif
-           })
-      .def("unpack",
-           [](TensorArray &self, const LoDTensor &source, int level,
-              bool length_descend) {
-             auto metas = self.Unpack(source, level, length_descend);
-             std::vector<std::vector<size_t>> meta_info;
-             for (auto meta : metas) {
-               meta_info.emplace_back(
-                   std::vector<size_t>({meta.begin, meta.end, meta.ori_idx}));
-             }
-             return meta_info;
-           })
-      .def("stack", [](TensorArray &self) { return self.Stack(); })
-      .def("unstack",
-           [](TensorArray &self, const LoDTensor &source) {
-             return self.Unstack(source);
-           })
-      .def("unstack_shared", [](TensorArray &self, const LoDTensor &source) {
-        return self.UnstackShared(source);
-      });
-
-  py::class_<operators::DynamicRecurrentOp, OperatorBase>(m,
-                                                          "DynamicRecurrentOp")
-      .def_static("create",
-                  [](py::bytes protobin) -> operators::DynamicRecurrentOp * {
-                    OpDesc desc;
-                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
-                                   "Cannot parse user input to OpDesc");
-                    PADDLE_ENFORCE(desc.IsInitialized(),
-                                   "User OpDesc is not initialized, reason %s",
-                                   desc.InitializationErrorString());
-                    auto rnn_op = OpRegistry::CreateOp(desc);
-                    return static_cast<operators::DynamicRecurrentOp *>(
-                        rnn_op.release());
-                  })
-      .def("set_step_unit",
-           [](operators::DynamicRecurrentOp &self, const operators::NetOp &net)
-               -> void { self.rnn.SetStepUnit(net.Clone()); })
-      .def("get_state",
-           [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.rnn.state(name); })
-      .def("get_step_input",
-           [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.rnn.step_input(name); })
-      .def("get_step_output",
-           [](operators::DynamicRecurrentOp &self, const std::string &name)
-               -> const TensorArray & { return self.rnn.step_output(name); });
-
   // cond_op
   py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
       .def_static("create",
@@ -505,9 +441,30 @@ All parameter, weight, gradient are variables in Paddle.
         return res;
       });
 
+  py::class_<LoDTensorArray>(m, "LoDTensorArray")
+      .def("__getitem__",
+           [](LoDTensorArray &self, size_t i) { return &self.at(i); },
+           py::return_value_policy::reference)
+      .def("__len__", [](LoDTensorArray &self) { return self.size(); })
+      .def("__setitem__",
+           [](LoDTensorArray &self, size_t i, const LoDTensor &t) {
+             PADDLE_ENFORCE_LT(i, self.size());
+             self[i].ShareDataWith(t);
+             self[i].set_lod(t.lod());
+           })
+      .def("append", [](LoDTensorArray &self, const LoDTensor &t) {
+        self.emplace_back();
+        self.back().ShareDataWith(t);
+        self.back().set_lod(t.lod());
+      });
+
   m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
   m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
+
+  m.def("nvprof_init", platform::CudaProfilerInit);
+  m.def("nvprof_start", platform::CudaProfilerStart);
+  m.def("nvprof_stop", platform::CudaProfilerStop);
 #endif
 
   return m.ptr();
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index f278e79af6..41fa658502 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -85,7 +85,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 }  // namespace details
 inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
   auto buffer_info =
-      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t>()(
+      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()(
           tensor);
   return buffer_info;
 }
diff --git a/paddle/scripts/check_env.sh b/paddle/scripts/check_env.sh
new file mode 100755
index 0000000000..af16b84ca8
--- /dev/null
+++ b/paddle/scripts/check_env.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+
+if [ "`uname -s`" != "Linux" ]; then
+  echo "Current scenario only support in Linux yet!"
+  exit 0
+fi
+
+echo "========================= Hardware Information ========================="
+sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l`
+cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l`
+ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs`
+physical_cores=$((sockets * cores_per_socket))
+virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
+numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs`
+echo "CPU Name               : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`"
+echo "CPU Family             : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`"
+echo "Socket Number          : $sockets"
+echo "Cores Per Socket       : $cores_per_socket"
+echo "Total Physical Cores   : $physical_cores"
+echo "Total Virtual Cores    : $virtual_cores"
+if [ $ht -eq 1 ]; then
+  echo "Hyper Threading        : OFF"
+  if [ $physical_cores -ne $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+else
+  echo "Hyper Threading        : ON"
+  if [ $physical_cores -ge $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+fi
+echo "NUMA Nodes             : $numa_nodes"
+if [ $numa_nodes -lt $sockets ]; then
+  echo "Warning: NUMA node is not enough for the best performance,\
+ at least $sockets"
+fi
+
+echo "-------------------------- Memory Information --------------------------"
+# dmidecode support start from 2.11
+dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs`
+if [ $dmi_ver -lt 2 ]; then
+  echo "Error: dmidecode unknown or version is too old"
+  exit 0
+fi
+if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then
+  echo "Error: need root to run dmidecode"
+  exit 0
+fi
+max_dimms=0
+num_dimms_installed=0
+for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do
+  num_refered=`dmidecode |grep -wc "$dimm_id"`
+  # the actual dimm id should be refered only once
+  if [ $num_refered -eq 1 ]; then
+    num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0};
+      /Unknown/ {f=1};
+      /Manufacturer/ {if (s==1) {print f; exit 0;}};'`
+    if [ $num_unknown -eq 0 ]; then
+      dimms_installed="$dimms_installed \n $dimm_id"
+      ((num_dimms_installed++))
+    else
+      dimms_uninstalled="$dimms_uninstalled \n $dimm_id"
+    fi
+    ((max_dimms++))
+  fi
+done
+echo "Installed DIMM number  : $num_dimms_installed"
+num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l`
+if [ $num_dimms_installed -ne $num_dimms_mapped ]; then
+  echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped"
+fi
+num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"`
+if [ $num_dimms_installed -ne $num_clock_configed ]; then
+  echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed"
+fi
+echo -e "Installed DIMMs Locator: $dimms_installed"
+echo -e "Not installed DIMMs    : $dimms_uninstalled"
+max_dimm_slots=`dmidecode | grep -c "Bank Locator"`
+echo "DIMMs max slots        : $max_dimm_slots"
+if [ $max_dimms -ne $max_dimm_slots ]; then
+  echo "Error: The max dimm slots do not match the max dimms: $max_dimms"
+fi
+free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'`
+free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $2}'`
+if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then
+  mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs`
+  swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs`
+  total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
+  mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB" 
+  swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB"
+  total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB"
+else
+  mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`
+  swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`
+  total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
+fi
+echo "Memory Size            : $mem_sz"
+echo "Swap Memory Size       : $swap_sz"
+echo "Total Memory Size      : $total_sz"
+echo "Max Memory Capacity    : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`"
+# DIMMs fequency
+clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs`
+echo "Configed Clock Speed   : $clock_speeds"
+num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l`
+if [ $num_clock_type -ne 1 ]; then
+  echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds"
+fi
+
+echo "-------------------------- Turbo Information  --------------------------"
+scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver`
+echo "Scaling Driver         : $scaling_drive"
+if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
+  turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo`
+  if [ $turbo -eq 1 ]; then
+    echo "Turbo Status           : OFF"
+  else
+    echo "Turbo Status           : ON"
+  fi
+else
+  echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS"
+  echo "Turbo Status           : Unknown"
+fi
+# cpu frequency
+num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l`
+num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l`
+if [ $num_max_freq -ne 1 ]; then
+  echo "Error: the max_frequency of all CPU should be equal"
+fi
+if [ $num_min_freq -ne 1 ]; then
+  echo "Error: the min_frequency of all CPU should be equal"
+fi
+max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz
+max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz
+min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz
+min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz
+echo "CPU Max Frequency      : $max_freq GHz"
+echo "CPU Min Frequency      : $min_freq GHz"
+# cpu governor
+num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l`
+if [ $num_governor -ne 1 ]; then
+  echo "Error: the governor of all CPU should be the same"
+fi
+governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq`
+echo "CPU Freq Governor      : $governor"
+
+
+echo "========================= Software Information ========================="
+echo "BIOS Release Date      : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`"
+echo "OS Version             : `cat /etc/redhat-release`"
+echo "Kernel Release Version : `uname -r`"
+echo "Kernel Patch Version   : `uname -v`"
+echo "GCC Version            :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
+if command -v cmake >/dev/null 2>&1; then 
+  cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
+else
+  cmake_ver=" Not installed"
+fi
+echo "CMake Version          :$cmake_ver"
+echo "------------------ Environment Variables Information -------------------"
+kmp_affinity=`env | grep KMP_AFFINITY`
+omp_dynamic=`env | grep OMP_DYNAMIC`
+omp_nested=`env | grep OMP_NESTED`
+omp_num_threads=`env | grep OMP_NUM_THREADS`
+mkl_num_threads=`env | grep MKL_NUM_THREADS`
+mkl_dynamic=`env | grep MKL_DYNAMIC`
+if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi
+if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi
+if [ ! $omp_nested ]; then omp_nested="unset"; fi
+if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi
+if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi
+if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi
+echo "KMP_AFFINITY           : $kmp_affinity"
+echo "OMP_DYNAMIC            : $omp_dynamic"
+echo "OMP_NESTED             : $omp_nested"
+echo "OMP_NUM_THREADS        : $omp_num_threads"
+echo "MKL_NUM_THREADS        : $mkl_num_threads"
+echo "MKL_DYNAMIC            : $mkl_dynamic"
+# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH
+for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do
+  mkldnn_found=`find $path -name "libmkldnn.so"`
+  if [ "$mkldnn_found" ]; then
+    echo "Found MKL-DNN          : $mkldnn_found"
+  fi
+  mklml_found=`find $path -name "libmklml_intel.so"`
+  if [ "$mklml_found" ]; then
+    echo "Found MKLML            : $mklml_found"
+  fi
+  iomp_found=`find $path -name "libiomp5.so"`
+  if [ "$iomp_found" ]; then
+    echo "Found IOMP             : $iomp_found"
+  fi
+done
+
+# dump all details for fully check
+lscpu > lscpu.dump
+dmidecode > dmidecode.dump
+
+# The expected result would be like:
+# ========================= Hardware Information =========================
+# CPU Name               : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
+# CPU Family             : 6
+# Socket Number          : 2
+# Cores Per Socket       : 20
+# Total Physical Cores   : 40
+# Total Virtual Cores    : 40
+# Hyper Threading        : OFF
+# NUMA Nodes             : 2
+# -------------------------- Memory Information --------------------------
+# Installed DIMM number  : 12
+# Installed DIMMs Locator:
+#  CPU1_DIMM_A1
+#  CPU1_DIMM_B1
+#  CPU1_DIMM_C1
+#  CPU1_DIMM_D1
+#  CPU1_DIMM_E1
+#  CPU1_DIMM_F1
+#  CPU2_DIMM_A1
+#  CPU2_DIMM_B1
+#  CPU2_DIMM_C1
+#  CPU2_DIMM_D1
+#  CPU2_DIMM_E1
+#  CPU2_DIMM_F1
+# Not installed DIMMs    :
+#  CPU1_DIMM_A2
+#  CPU1_DIMM_B2
+#  CPU1_DIMM_C2
+#  CPU1_DIMM_D2
+#  CPU1_DIMM_E2
+#  CPU1_DIMM_F2
+#  CPU2_DIMM_A2
+#  CPU2_DIMM_B2
+#  CPU2_DIMM_C2
+#  CPU2_DIMM_D2
+#  CPU2_DIMM_E2
+#  CPU2_DIMM_F2
+# DIMMs max slots        : 24
+# Memory Size            : 376G
+# Swap Memory Size       : 4.0G
+# Total Memory Size      : 380G
+# Max Memory Capacity    : 2304 GB
+# Configed Clock Speed   : 2666 MHz
+# -------------------------- Turbo Information  --------------------------
+# Scaling Driver         : intel_pstate
+# Turbo Status           : ON
+# CPU Max Frequency      : 3.70 GHz
+# CPU Min Frequency      : 1.00 GHz
+# CPU Freq Governor      : performance
+# ========================= Software Information =========================
+# BIOS Release Date      : 03/10/2017
+# OS Version             : CentOS Linux release 7.3.1611 (Core)
+# Kernel Release Version : 3.10.0-514.el7.x86_64
+# Kernel Patch Version   : #1 SMP Tue Nov 22 16:42:41 UTC 2016
+# GCC Version            : 4.8.5 20150623 (Red Hat 4.8.5-11)
+# CMake Version          : 3.5.2
+# ------------------ Environment Variables Information -------------------
+# KMP_AFFINITY           : unset
+# OMP_DYNAMIC            : unset
+# OMP_NESTED             : unset
+# OMP_NUM_THREADS        : unset
+# MKL_NUM_THREADS        : unset
+# MKL_DYNAMIC            : unset
diff --git a/paddle/scripts/deb/postinst b/paddle/scripts/deb/postinst
deleted file mode 100644
index 91620b1ee7..0000000000
--- a/paddle/scripts/deb/postinst
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-echo "Post install paddle debian package."
-echo "Install some python package used for paddle. You can run "
-echo "  pip install /usr/opt/paddle/share/wheels/*.whl to install them."
-find /usr/ -name '*paddle*.whl' | xargs pip install
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index 76bc30e59b..f3a6f1dba7 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -2,178 +2,197 @@
 
 ## Goals
 
-We want the building procedure generates Docker images so that we can run PaddlePaddle applications on Kubernetes clusters.
+We want to make the building procedures:
 
-We want to build .deb packages so that enterprise users can run PaddlePaddle applications without Docker.
+1. Static, can reproduce easily.
+1. Generate python `whl` packages that can be widely use cross many distributions.
+1. Build different binaries per release to satisfy different environments:
+    - Binaries for different CUDA and CUDNN versions, like CUDA 7.5, 8.0, 9.0
+    - Binaries containing only capi
+    - Binaries for python with wide unicode support or not.
+1. Build docker images with PaddlePaddle pre-installed, so that we can run
+PaddlePaddle applications directly in docker or on Kubernetes clusters.
 
-We want to minimize the size of generated Docker images and .deb packages so to reduce the download time.
+To achieve this, we created a repo: https://github.com/PaddlePaddle/buildtools
+which gives several docker images that are `manylinux1` sufficient. Then we
+can build PaddlePaddle using these images to generate corresponding `whl`
+binaries.
 
-We want to encapsulate building tools and dependencies in a *development* Docker image so to ease the tools installation for developers.
+## Run The Build
 
-Developers use various editors (emacs, vim, Eclipse, Jupyter Notebook), so the development Docker image contains only building tools, not editing tools, and developers are supposed to git clone source code into their development computers and map the code into the development container.
+### Build Evironments
 
-We want the procedure and tools also work with testing, continuous integration, and releasing.
+The pre-built build environment images are:
 
+| Image | Tag |
+| ----- | --- |
+| paddlepaddle/paddle_manylinux_devel | cuda7.5_cudnn5 |
+| paddlepaddle/paddle_manylinux_devel | cuda8.0_cudnn5 |
+| paddlepaddle/paddle_manylinux_devel | cuda7.5_cudnn7 |
+| paddlepaddle/paddle_manylinux_devel | cuda9.0_cudnn7 |
 
-## Docker Images
-
-So we need two Docker images for each version of PaddlePaddle:
-
-1. `paddle:<version>-dev`
-
-   This a development image contains only the development tools and standardizes the building procedure.  Users include:
+### Start Build
 
-   - developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer).
-   - release engineers -- use this to build the official release from certain branch/tag on Github.com.
-   - document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code.  We need tools to extract the information, typeset, and generate Web pages.
+Choose one docker image that suit your environment and run the following
+command to start a build:
 
-   Of course, developers can install building tools on their development computers.  But different versions of PaddlePaddle might require different set or version of building tools.  Also, it makes collaborative debugging easier if all developers use a unified development environment.
-
-  The development image should include the following tools:
-
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
+```bash
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=OFF" -e "RUN_TEST=OFF" -e "PYTHON_ABI=cp27-cp27mu" paddlepaddle/paddle_manylinux_devel /paddle/paddle/scripts/docker/build.sh
+```
 
-   Many developers work on a remote computer with GPU; they could ssh into the computer and  `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly.
+After the build finishes, you can get output `whl` package under
+`build/python/dist`.
 
-1. `paddle:<version>`
+This command mounts the source directory on the host into `/paddle` in the container, then run the build script `/paddle/paddle/scripts/docker/build.sh`
+in the container. When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
 
-   This is the production image, generated using the development image. This image might have multiple variants:
+### Build Options
 
-   - GPU/AVX   `paddle:<version>-gpu`
-   - GPU/no-AVX  `paddle:<version>-gpu-noavx`
-   - no-GPU/AVX  `paddle:<version>`
-   - no-GPU/no-AVX  `paddle:<version>-noavx`
+Users can specify the following Docker build arguments with either "ON" or "OFF" value:
 
-   We allow users to choose between GPU and no-GPU because the GPU version image is much larger than then the no-GPU version.
+| Option | Default | Description |
+| ------ | -------- | ----------- |
+| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
+| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
+| `WITH_TESTING` | ON | Build unit tests binaries. |
+| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
+| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
+| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
+| `WITH_C_API` | OFF | Build capi libraries for inference. |
+| `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
+| `WITH_STYLE_CHECK` | ON | Check the code style when building. |
+| `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
+| `RUN_TEST` | OFF | Run unit test immediently after the build. |
+| `WITH_DOC` | OFF | Build docs after build binaries. |
+| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
 
-   We allow users the choice between AVX and no-AVX, because some cloud providers don't provide AVX-enabled VMs.
 
+## Docker Images
 
-## Development Environment
+You can get the latest PaddlePaddle docker images by
+`docker pull paddlepaddle/paddle:<version>` or build one by yourself.
 
-Here we describe how to use above two images.  We start from considering our daily development environment.
+### Official Docker Releases
 
-Developers work on a computer, which is usually a laptop or desktop:
+Official docker images at
+[here](https://hub.docker.com/r/paddlepaddle/paddle/tags/),
+you can choose either latest or images with a release tag like `0.10.0`,
+Currently available tags are:
 
-<img src="doc/paddle-development-environment.png" width=500 />
+|   Tag  | Description |
+| ------ | --------------------- |
+| latest | latest CPU only image |
+| latest-gpu | latest binary with GPU support |
+| 0.10.0 | release 0.10.0 CPU only binary image |
+| 0.10.0-gpu | release 0.10.0 with GPU support |
 
-or, they might rely on a more sophisticated box (like with GPUs):
+### Build Your Own Image
 
-<img src="doc/paddle-development-environment-gpu.png" width=500 />
+Build PaddlePaddle docker images are quite simple since PaddlePaddle can
+be installed by just running `pip install`. A sample `Dockerfile` is:
 
-A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
+```dockerfile
+FROM nvidia/cuda:7.5-cudnn5-runtime-centos6
+RUN yum install -y centos-release-SCL
+RUN yum install -y python27
+# This whl package is generated by previous build steps.
+ADD python/dist/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl /
+RUN pip install /paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl && rm -f /*.whl
+```
 
+Then build the image by running `docker build -t [REPO]/paddle:[TAG] .` under
+the directory containing your own `Dockerfile`.
 
-## Usages
+- NOTE: note that you can choose different base images for your environment, you can find all the versions [here](https://hub.docker.com/r/nvidia/cuda/).
 
-### Build the Development Docker Image
+### Use Docker Images
 
-The following commands check out the source code to the host and build the development image `paddle:dev`:
+Suppose that you have written an application program `train.py` using
+PaddlePaddle, we can test and run it using docker:
 
 ```bash
-git clone https://github.com/PaddlePaddle/Paddle paddle
-cd paddle
-docker build -t paddle:dev .
+docker run --rm -it -v $PWD:/work paddlepaddle/paddle /work/a.py
 ```
 
-The `docker build` command assumes that `Dockerfile` is in the root source tree.  Note that in this design, this `Dockerfile` is this only one in our repo.
-
-Users can specify a Ubuntu mirror server for faster downloading:
-
-```bash
-docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com/mirrors.txt .
-```
+But this works only if all dependencies of `train.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs.
 
-### Build PaddlePaddle from Source Code
+### Run PaddlePaddle Book In Docker
 
-Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
+Our [book repo](https://github.com/paddlepaddle/book) also provide a docker
+image to start a jupiter notebook inside docker so that you can run this book
+using docker:
 
 ```bash
-docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=OFF" -e "RUN_TEST=OFF" paddle:dev
+docker run -d -p 8888:8888 paddlepaddle/book
 ```
 
-This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, could build the source code with possible local changes.  When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
-
-`build.sh` builds the following:
-
-- PaddlePaddle binaries,
-- `$PWD/build/paddle-<version>.deb` for production installation, and
-- `$PWD/build/Dockerfile`, which builds the production Docker image.
+Please refer to https://github.com/paddlepaddle/book if you want to build this
+docker image by your self.
 
-Users can specify the following Docker build arguments with either "ON" or "OFF" value:
-- `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries.
-- `WITH_AVX`: ***Required***. Set to "OFF" prevents from generating AVX instructions. If you don't know what is AVX, you might want to set "ON".
-- `WITH_TEST`: ***Optional, default OFF***. Build unit tests binaries. Once you've built the unit tests, you can run these test manually by the following command:
-  ```bash
-    docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall"
-  ```
-- `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run unit tests without building it.
+### Run Distributed Applications
 
-### Build the Production Docker Image
+In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API need to build a PaddlePaddle application into a Docker image as above and calls kubectl to run it on the cluster.  This API might need to generate a Dockerfile look like above and call `docker build`.
 
-The following command builds the production image:
+Of course, we can manually build an application image and launch the job using the kubectl tool:
 
 ```bash
-docker build -t paddle -f build/Dockerfile ./build
+docker build -f some/Dockerfile -t myapp .
+docker tag myapp me/myapp
+docker push
+kubectl ...
 ```
 
-This production image is minimal -- it includes binary `paddle`, the shared library `libpaddle.so`, and Python runtime.
+## Docker Images for Developers
 
-### Run PaddlePaddle Applications
+We have a special docker image for developers:
+`paddlepaddle/paddle:<version>-dev`. This image is also generated from
+https://github.com/PaddlePaddle/buildtools
 
-Again the development happens on the host.  Suppose that we have a simple application program in `a.py`, we can test and run it using the production image:
+This a development image contains only the
+development tools and standardizes the building procedure.  Users include:
 
-```bash
-docker run --rm -it -v $PWD:/work paddle /work/a.py
-```
+- developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer).
+- release engineers -- use this to build the official release from certain branch/tag on Github.com.
+- document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code.  We need tools to extract the information, typeset, and generate Web pages.
 
-But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs.
+Of course, developers can install building tools on their development computers.  But different versions of PaddlePaddle might require different set or version of building tools.  Also, it makes collaborative debugging easier if all developers use a unified development environment.
 
-### Build and Run PaddlePaddle Applications
+The development image contains the following tools:
 
-We need a Dockerfile in https://github.com/paddlepaddle/book that builds Docker image `paddlepaddle/book:<version>`, basing on the PaddlePaddle production image:
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
 
-```
-FROM paddlepaddle/paddle:<version>
-RUN pip install -U matplotlib jupyter ...
-COPY . /book
-EXPOSE 8080
-CMD ["jupyter"]
-```
+Many developers work on a remote computer with GPU; they could ssh into the computer and  `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly.
 
-The book image is an example of PaddlePaddle application image.  We can build it
 
-```bash
-git clone https://github.com/paddlepaddle/book
-cd book
-docker build -t book .
-```
+### Development Workflow
 
-### Build and Run Distributed Applications
+Here we describe how the workflow goes on.  We start from considering our daily development environment.
 
-In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API need to build a PaddlePaddle application into a Docker image as above and calls kubectl to run it on the cluster.  This API might need to generate a Dockerfile look like above and call `docker build`.
+Developers work on a computer, which is usually a laptop or desktop:
 
-Of course, we can manually build an application image and launch the job using the kubectl tool:
+<img src="doc/paddle-development-environment.png" width=500 />
 
-```bash
-docker build -f some/Dockerfile -t myapp .
-docker tag myapp me/myapp
-docker push
-kubectl ...
-```
+or, they might rely on a more sophisticated box (like with GPUs):
+
+<img src="doc/paddle-development-environment-gpu.png" width=500 />
+
+A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
 
 ### Reading source code with woboq codebrowser
+
 For developers who are interested in the C++ source code, please use -e "WOBOQ=ON" to enable the building of C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
 
 - The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
 
 ```bash
-docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddle:dev
+docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev
 ```
 
 - You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index a08716c5a5..3c6ec6faba 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -1,171 +1,210 @@
 #!/bin/bash
 
-set -xe
-
-# Set BASE_IMAGE according to env variables
-if [[ ${WITH_GPU} == "ON" ]]; then
-  BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
-else
-  BASE_IMAGE="ubuntu:16.04"
-fi
-
-DOCKERFILE_GPU_ENV=""
-DOCKERFILE_CUDNN_DSO=""
-if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-    DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
-    DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
-fi
-
-mkdir -p /paddle/build
-cd /paddle/build
-
-# build script will not fail if *.deb does not exist
-rm *.deb 2>/dev/null || true
-# delete previous built whl packages
-rm -rf /paddle/paddle/dist 2>/dev/null || true
-
-cat <<EOF
-========================================
-Configuring cmake in /paddle/build ...
-      -DCMAKE_BUILD_TYPE=Release
-      -DWITH_DOC=OFF
-      -DWITH_GPU=${WITH_GPU:-OFF}
-      -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
-      -DWITH_MKLML=${WITH_MKLML:-ON}
-      -DWITH_AVX=${WITH_AVX:-OFF}
-      -DWITH_GOLANG=${WITH_GOLANG:-ON}
-      -DWITH_SWIG_PY=ON
-      -DWITH_C_API=${WITH_C_API:-OFF}
-      -DWITH_PYTHON=${WITH_PYTHON:-ON}
-      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-      -DCUDNN_ROOT=/usr/
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-      -DWITH_TESTING=${WITH_TESTING:-ON}
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-========================================
-EOF
+function cmake_gen() {
+    mkdir -p /paddle/build
+    cd /paddle/build
+
+    # build script will not fail if *.deb does not exist
+    rm *.deb 2>/dev/null || true
+    # delete previous built whl packages
+    rm -rf /paddle/paddle/dist 2>/dev/null || true
+
+    # Support build for all python versions, currently
+    # including cp27-cp27m and cp27-cp27mu.
+    PYTHON_FLAGS=""
+    if [ "$1" != "" ]; then
+        echo "using python abi: $1"
+        if [ "$1" == "cp27-cp27m" ]; then
+            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+            export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
+            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
+        elif [ "$1" == "cp27-cp27mu" ]; then
+            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+            export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
+            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
+        fi
+    fi
 
-# Disable UNITTEST_USE_VIRTUALENV in docker because
-# docker environment is fully controlled by this script.
-# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
-cmake .. \
-      -DCMAKE_BUILD_TYPE=Release \
-      -DWITH_DOC=OFF \
-      -DWITH_GPU=${WITH_GPU:-OFF} \
-      -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
-      -DWITH_MKLML=${WITH_MKLML:-ON} \
-      -DWITH_AVX=${WITH_AVX:-OFF} \
-      -DWITH_GOLANG=${WITH_GOLANG:-ON} \
-      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-      -DWITH_C_API=${WITH_C_API:-OFF} \
-      -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-      -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-      -DWITH_TESTING=${WITH_TESTING:-ON} \
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-
-cat <<EOF
-============================================
-Building in /paddle/build ...
-============================================
+    cat <<EOF
+    ========================================
+    Configuring cmake in /paddle/build ...
+        -DCMAKE_BUILD_TYPE=Release
+        ${PYTHON_FLAGS}
+        -DWITH_DOC=OFF
+        -DWITH_GPU=${WITH_GPU:-OFF}
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
+        -DWITH_MKL=${WITH_MKL:-ON}
+        -DWITH_AVX=${WITH_AVX:-OFF}
+        -DWITH_GOLANG=${WITH_GOLANG:-ON}
+        -DWITH_SWIG_PY=ON
+        -DWITH_C_API=${WITH_C_API:-OFF}
+        -DWITH_PYTHON=${WITH_PYTHON:-ON}
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+        -DCUDNN_ROOT=/usr/
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
+        -DWITH_TESTING=${WITH_TESTING:-ON}
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+    ========================================
 EOF
-make -j `nproc`
-
-if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-cat <<EOF
-========================================
-Running unit tests ...
-========================================
+    # Disable UNITTEST_USE_VIRTUALENV in docker because
+    # docker environment is fully controlled by this script.
+    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=Release \
+        ${PYTHON_FLAGS} \
+        -DWITH_DOC=OFF \
+        -DWITH_GPU=${WITH_GPU:-OFF} \
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
+        -DWITH_MKL=${WITH_MKL:-ON} \
+        -DWITH_AVX=${WITH_AVX:-OFF} \
+        -DWITH_GOLANG=${WITH_GOLANG:-ON} \
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
+        -DWITH_C_API=${WITH_C_API:-OFF} \
+        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
+        -DCUDNN_ROOT=/usr/ \
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
+        -DWITH_TESTING=${WITH_TESTING:-ON} \
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+}
+
+function run_build() {
+    cat <<EOF
+    ============================================
+    Building in /paddle/build ...
+    ============================================
 EOF
-    ctest --output-on-failure
-    # make install should also be test when unittest
-    make install -j `nproc`
-    pip install /usr/local/opt/paddle/share/wheels/*.whl
-    paddle version
-fi
-
+    make -j `nproc`
+}
 
-if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
+function run_test() {
+    if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     cat <<EOF
-========================================
-Building documentation ...
-   In /paddle/build_doc
-========================================
+    ========================================
+    Running unit tests ...
+    ========================================
 EOF
-    mkdir -p /paddle/build_doc
-    pushd /paddle/build_doc
-    cmake .. \
-          -DWITH_DOC=ON \
-          -DWITH_GPU=OFF \
-          -DWITH_AVX=${WITH_AVX:-ON} \
-          -DWITH_SWIG_PY=ON \
-          -DWITH_STYLE_CHECK=OFF
-    make -j `nproc` gen_proto_py
-    make -j `nproc` paddle_docs paddle_docs_cn
-    popd
-fi
-
-
-if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
+        ctest --output-on-failure
+        # make install should also be test when unittest
+        make install -j `nproc`
+        pip install /usr/local/opt/paddle/share/wheels/*.whl
+        paddle version
+    fi
+}
+
+
+function gen_docs() {
+    if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
+        cat <<EOF
+    ========================================
+    Building documentation ...
+    In /paddle/build_doc
+    ========================================
+EOF
+        mkdir -p /paddle/build_doc
+        pushd /paddle/build_doc
+        cmake .. \
+            -DWITH_DOC=ON \
+            -DWITH_GPU=OFF \
+            -DWITH_AVX=${WITH_AVX:-ON} \
+            -DWITH_SWIG_PY=ON \
+            -DWITH_STYLE_CHECK=OFF
+        make -j `nproc` gen_proto_py
+        make -j `nproc` paddle_docs paddle_docs_cn
+        popd
+    fi
+
+
+    if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
+        cat <<EOF
+    ========================================
+    Converting C++ source code into HTML ...
+    ========================================
+EOF
+        export WOBOQ_OUT=/paddle/build/woboq_out
+        mkdir -p $WOBOQ_OUT
+        cp -rv /woboq/data $WOBOQ_OUT/../data
+        /woboq/generator/codebrowser_generator \
+            -b /paddle/build \
+            -a \
+            -o $WOBOQ_OUT \
+            -p paddle:/paddle
+        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+    fi
+}
+
+
+function gen_dockerfile() {
+    # Set BASE_IMAGE according to env variables
+    if [[ ${WITH_GPU} == "ON" ]]; then
+    BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
+    else
+    BASE_IMAGE="ubuntu:16.04"
+    fi
+
+    DOCKERFILE_GPU_ENV=""
+    DOCKERFILE_CUDNN_DSO=""
+    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
+        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
+        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
+    fi
+
     cat <<EOF
-========================================
-Converting C++ source code into HTML ...
-========================================
+    ========================================
+    Generate /paddle/build/Dockerfile ...
+    ========================================
 EOF
-    export WOBOQ_OUT=/paddle/build/woboq_out
-    mkdir -p $WOBOQ_OUT
-    cp -rv /woboq/data $WOBOQ_OUT/../data
-    /woboq/generator/codebrowser_generator \
-        -b /paddle/build \
-        -a \
-        -o $WOBOQ_OUT \
-        -p paddle:/paddle
-    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-fi
-
-cat <<EOF
-========================================
-Generate /paddle/build/Dockerfile ...
-========================================
+
+    cat > /paddle/build/Dockerfile <<EOF
+    FROM ${BASE_IMAGE}
+    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+    ENV HOME /root
 EOF
 
-cat > /paddle/build/Dockerfile <<EOF
-FROM ${BASE_IMAGE}
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-ENV HOME /root
+    if [[ ${WITH_GPU} == "ON"  ]]; then
+        NCCL_DEPS="apt-get install -y libnccl-dev &&"
+    else
+        NCCL_DEPS="" 
+    fi
+
+    cat >> /paddle/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update &&\
+        ${NCCL_DEPS}\
+        apt-get install -y wget python-pip dmidecode && pip install -U pip && \
+        pip install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        paddle version && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ADD go/cmd/pserver/pserver /usr/bin/
+    ADD go/cmd/master/master /usr/bin/
 EOF
 
-if [[ -n ${APT_MIRROR} ]]; then
-cat >> /paddle/build/Dockerfile <<EOF
-RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
+    if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then
+        cat >> /paddle/build/Dockerfile <<EOF
+        ADD paddle/pybind/print_operators_doc /usr/bin/
 EOF
-fi
-
-if [[ ${WITH_GPU} == "ON"  ]]; then
-  NCCL_DEPS="apt-get install -y libnccl-dev &&"
-else
-  NCCL_DEPS="" 
-fi
-
-cat >> /paddle/build/Dockerfile <<EOF
-ADD python/dist/*.whl /
-# run paddle version to install python packages first
-RUN apt-get update &&\
-    ${NCCL_DEPS}\
-    apt-get install -y wget python-pip && pip install -U pip && \
-    pip install /*.whl; apt-get install -f -y && \
-    apt-get clean -y && \
-    rm -f /*.whl && \
-    paddle version && \
-    ldconfig
-${DOCKERFILE_CUDNN_DSO}
-${DOCKERFILE_GPU_ENV}
-ADD go/cmd/pserver/pserver /usr/bin/
-ADD go/cmd/master/master /usr/bin/
-# default command shows the paddle version and exit
-CMD ["paddle", "version"]
+    fi
+    cat >> /paddle/build/Dockerfile <<EOF
+    # default command shows the paddle version and exit
+    CMD ["paddle", "version"]
 EOF
+}
+
+set -xe
+
+cmake_gen ${PYTHON_ABI:-""}
+run_build
+run_test
+gen_docs
+gen_dockerfile
 
-set +xe
 printf "If you need to install PaddlePaddle in develop docker image,"
 printf "please make install or pip install build/python/dist/*.whl.\n"
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
index 6ef45d33d8..cd13073a0c 100644
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -44,7 +44,7 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then
         -DHOST_C_COMPILER=/usr/bin/gcc \
         -DHOST_CXX_COMPILER=/usr/bin/g++ \
         -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_BUILD_TYPE=MinSizeRel \
         -DUSE_EIGEN_FOR_BLAS=ON \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
@@ -58,7 +58,7 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then
         -DHOST_C_COMPILER=/usr/bin/gcc \
         -DHOST_CXX_COMPILER=/usr/bin/g++ \
         -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_BUILD_TYPE=MinSizeRel \
         -DUSE_EIGEN_FOR_BLAS=OFF \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
@@ -72,7 +72,7 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
         -DHOST_C_COMPILER=/usr/bin/gcc \
         -DHOST_CXX_COMPILER=/usr/bin/g++ \
         -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_BUILD_TYPE=MinSizeRel \
         -DWITH_C_API=ON \
         -DWITH_SWIG_PY=OFF \
         -DWITH_STYLE_CHECK=OFF \
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 5c4b5a2495..d71cb84df3 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -18,8 +18,8 @@ function version(){
         echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
         echo "    with_avx: @WITH_AVX@"
         echo "    with_gpu: @WITH_GPU@"
+        echo "    with_mkl: @WITH_MKL@"
         echo "    with_mkldnn: @WITH_MKLDNN@"
-        echo "    with_mklml: @WITH_MKLML@"
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
         echo "    with_rdma: @WITH_RDMA@"
@@ -43,6 +43,54 @@ function ver2num() {
   set +e
 }
 
+function cpu_config() {
+  # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status
+  # only when MKL enabled
+  if [ "@WITH_MKL@" == "OFF" ]; then
+    return 0
+  fi
+  ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+  if [ $ht -eq 1 ]; then # HT is OFF
+    if [ -z "$KMP_AFFINITY" ]; then
+      export KMP_AFFINITY="granularity=fine,compact,0,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+      export OMP_DYNAMIC="FALSE"
+    fi
+  else # HT is ON
+    if [ -z "$KMP_AFFINITY" ]; then
+      export KMP_AFFINITY="granularity=fine,compact,1,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+      export OMP_DYNAMIC="True"
+    fi
+  fi
+}
+
+function threads_config() {
+  # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
+  # according to trainer_count and total processors
+  # only when MKL enabled
+  if [ "@WITH_MKL@" == "OFF" ]; then
+    return 0
+  fi
+  processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
+  trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
+  if [ -z $trainers ]; then
+    trainers=1
+  fi
+  threads=$((processors / trainers))
+  if [ $threads -eq 0 ]; then
+    threads=1
+  fi
+  if [ -z "$OMP_NUM_THREADS" ]; then
+    export OMP_NUM_THREADS=$threads
+  fi
+  if [ -z "$MKL_NUM_THREADS" ]; then
+    export MKL_NUM_THREADS=$threads
+  fi
+}
+
 PADDLE_CONF_HOME="$HOME/.config/paddle"
 mkdir -p ${PADDLE_CONF_HOME}
 
@@ -92,9 +140,13 @@ else:
   sys.exit(0)
 EOF
 
+cpu_config
+# echo $KMP_AFFINITY $OMP_DYNAMIC
 
 case "$1" in
     "train")
+        threads_config $@
+        # echo $OMP_NUM_THREADS $MKL_NUM_THREADS
         ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
         ;;
     "merge_model")
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 973b2736e5..ff0bac6a07 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -6,13 +6,17 @@ mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build
 
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make -j `nproc` gen_proto_py
+make -j `nproc` paddle_python
 make -j `nproc` paddle_docs paddle_docs_cn
+make -j `nproc` print_operators_doc
+paddle/pybind/print_operators_doc > doc/en/html/operators.json
 
 # check websites for broken links
-linkchecker doc/en/html/index.html
-linkchecker doc/cn/html/index.html
+# It will be failed now!
+#linkchecker doc/en/html/index.html
+#linkchecker doc/cn/html/index.html
 
 # Parse Github URL
 REPO=`git config remote.origin.url`
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 4245df5ab7..8132742749 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -5,4 +5,8 @@ if(WITH_TESTING)
   add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
   add_library(paddle_test_util STATIC TestUtil.cpp)
   add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
+  if(NOT MOBILE_INFERENCE)
+    add_library(paddle_gtest_main STATIC paddle_gtest_main.cc)
+    add_dependencies(paddle_gtest_main paddle_memory gtest gflags)
+  endif()
 endif()
diff --git a/paddle/testing/TestUtil.cpp b/paddle/testing/TestUtil.cpp
index c691fe2625..cfb8c713d9 100644
--- a/paddle/testing/TestUtil.cpp
+++ b/paddle/testing/TestUtil.cpp
@@ -33,6 +33,7 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
                                  bool withValue,
                                  bool useGpu,
                                  bool equalNnzPerSample) {
+#ifndef PADDLE_MOBILE_INFERENCE
   std::vector<int64_t> ids(height);
   std::vector<int64_t> indices(height + 1);
   indices[0] = 0;
@@ -84,6 +85,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
     }
     return mat;
   }
+#endif
+  return nullptr;
 }
 
 void generateSequenceStartPositions(size_t batchSize,
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
new file mode 100644
index 0000000000..a491322b7e
--- /dev/null
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstring>
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/memory/memory.h"
+
+int main(int argc, char** argv) {
+  std::vector<char*> new_argv;
+  std::string gflags_env;
+  new_argv.push_back(argv[0]);
+#ifdef PADDLE_WITH_CUDA
+  new_argv.push_back(
+      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
+#else
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+#endif
+  int new_argc = static_cast<int>(new_argv.size());
+  char** new_argv_address = new_argv.data();
+  google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
+  testing::InitGoogleTest(&argc, argv);
+  paddle::memory::Used(paddle::platform::CPUPlace());
+#ifdef PADDLE_WITH_CUDA
+  paddle::memory::Used(paddle::platform::GPUPlace(0));
+#endif
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
index 3d471a0c01..72911695bd 100644
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -54,7 +54,7 @@ if(WITH_TESTING)
   add_subdirectory(tests)
 endif()
 
-if(NOT WITH_C_API)
+if(NOT MOBILE_INFERENCE)
   add_paddle_exe(paddle_trainer TrainerMain.cpp)
   add_paddle_exe(paddle_merge_model MergeModel.cpp)
 
@@ -74,7 +74,5 @@ endif()
 if(WITH_GOLANG)
   add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
   target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
-  if(NOT WITH_C_API)
-    target_link_libraries(paddle_trainer paddle_pserver_cclient)
-  endif()
+  target_link_libraries(paddle_trainer paddle_pserver_cclient)
 endif(WITH_GOLANG)
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
index f3cfd9f97f..56c38015fb 100644
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
@@ -27,6 +27,9 @@ using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
 int main(int argc, char** argv) {
+  initMain(argc, argv);
+  initPython(argc, argv);
+
   if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() ||
       FLAGS_model_file.empty()) {
     LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 "
@@ -34,9 +37,6 @@ int main(int argc, char** argv) {
     return 0;
   }
 
-  initMain(argc, argv);
-  initPython(argc, argv);
-
   string confFile = FLAGS_config_file;
 #ifndef PADDLE_WITH_CUDA
   FLAGS_use_gpu = false;
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index b68e29cd5e..3e4a2b5fa8 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
     }
   }
 
+  if (FLAGS_use_mkldnn) {
+    CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer";
+  }
+
   if (testing) {
     LOG(INFO) << "trainer: in testing mode";
     if (config_->getOptConfig().use_sparse_remote_updater() ||
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index f01ad4142d..bd518d8598 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -1,20 +1,17 @@
-################# test_Compare ############################
-add_unittest_without_exec(test_Compare
-    test_Compare.cpp)
-add_test(NAME test_Compare
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Compare
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+set(PYTHON_PATH 
+   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
+   ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests)
+function(trainer_test TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
 
-################# test_Trainer ###########################
-add_unittest_without_exec(test_Trainer
-    test_Trainer.cpp)
-add_test(NAME test_Trainer
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/gen_proto_data.py &&
-        ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+trainer_test(test_Compare)
+trainer_test(test_PyDataProviderWrapper)
+trainer_test(test_recurrent_machine_generation)
+trainer_test(test_Trainer)
 
 ############### test_TrainerOnePass ##########################
 if(WITH_PYTHON)
@@ -23,60 +20,13 @@ if(WITH_PYTHON)
   add_unittest_without_exec(test_TrainerOnePass
       test_TrainerOnePass.cpp)
   add_test(NAME test_TrainerOnePass
-    COMMAND  ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-          ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
-          ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
+    COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port 
+          ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
       WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
-################ test_CompareTwoNets ######################
-add_unittest_without_exec(test_CompareTwoNets
-    test_CompareTwoNets.cpp)
-add_test(NAME test_CompareTwoNets
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
-            --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-
-############### test_CompareTwoOpts ###################
-add_unittest_without_exec(test_CompareTwoOpts
-    test_CompareTwoOpts.cpp)
-add_test(NAME test_CompareTwoOpts
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
-            --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
-            --num_passes=1 --need_high_accuracy=0
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-
-################# test_CompareSparse ##################
-add_unittest_without_exec(test_CompareSparse
-    test_CompareSparse.cpp)
-if(NOT ON_TRAVIS)
-  add_test(NAME test_CompareSparse
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-              ./.set_port.sh -p port -n 6
-                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-endif()
-################# test_recurrent_machine_generation ###############
-add_unittest_without_exec(test_recurrent_machine_generation
-    test_recurrent_machine_generation.cpp)
-add_test(NAME test_recurrent_machine_generation
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-
-#################### test_PyDataProviderWrapper #########################
-add_unittest_without_exec(test_PyDataProviderWrapper
-    test_PyDataProviderWrapper.cpp)
-
-add_test(NAME test_PyDataProviderWrapper
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-        ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
-        ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
 #################### test_config_parser #########################
 add_test(NAME test_config_parser
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
+  COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} 
+        ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
diff --git a/paddle/trainer/tests/chunking.conf b/paddle/trainer/tests/chunking.conf
deleted file mode 100644
index d88df919df..0000000000
--- a/paddle/trainer/tests/chunking.conf
+++ /dev/null
@@ -1,125 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-TrainData(ProtoData(
-  files = 'trainer/tests/train_files.txt',
-  usage_ratio = 1.0,
-))
-
-TestData(ProtoData(
-  files = 'trainer/tests/test_files.txt'
-))
-
-default_initial_std(1)
-default_decay_rate(4e-4)
-default_device(0)
-
-Inputs("features", "word", "pos", "chunk")
-
-Outputs("crf")
-
-Layer(
-    name = "features",
-    type = "data",
-    size = 4339,
-)
-
-Layer(
-    name = "word",
-    type = "data",
-    size = 478,
-)
-
-Layer(
-    name = "pos",
-    type = "data",
-    size = 45
-)
-
-Layer(
-    name = "chunk",
-    type = "data",
-    size = 23
-)
-
-Layer(
-    name = "output",
-    type = "mixed",
-    size = 23,
-    bias = False,
-    device = -1,
-    inputs = [
-        FullMatrixProjection("features", parameter_name="feature_weights"),
-    #    TableProjection("word"),
-    #    TableProjection("pos"),
-    ],
-)
-
-Layer(
-    name = "crf",
-    type = "crf",
-    size = 23,
-    device = -1,
-    inputs = [
-        Input("output", parameter_name="crfw"),
-        "chunk"
-    ]
-)
-
-Layer(
-    name = "crf_decoding",
-    type = "crf_decoding",
-    size = 23,
-    device = -1,
-    inputs = [
-        Input("output", parameter_name="crfw"),
-        "chunk"
-    ]
-)
-
-Evaluator(
-    name = "error",
-    type = "sum",
-    inputs = "crf_decoding",
-)
-
-'''
-# chuck evaluator cannot be used for GPU training
-Evaluator(
-    name = "chunk_f1",
-    type = "chunk",
-    inputs = ["crf_decoding", "chunk"],
-    chunk_scheme = "IOB",
-    num_chunk_types = 11,
-)
-'''
-
-Settings(
-    algorithm = 'sgd',
-    batch_size = 100,
-    average_window = 0.5,
-    max_average_window = 2500,
-    learning_rate = 1e-1,
-    learning_rate_decay_a = 5e-7,
-    learning_rate_decay_b = 0.75,
-    l1weight = 0,
-    l2weight = 1,
-    c1 = 0.0001,
-    backoff = 0.5,
-    owlqn_steps = 100,
-    max_backoff = 5,
-)
diff --git a/paddle/trainer/tests/compare_sparse_data b/paddle/trainer/tests/compare_sparse_data
deleted file mode 100644
index 18fc654138..0000000000
Binary files a/paddle/trainer/tests/compare_sparse_data and /dev/null differ
diff --git a/paddle/trainer/tests/data_bin_part b/paddle/trainer/tests/data_bin_part
deleted file mode 100644
index 66ede391b0..0000000000
--- a/paddle/trainer/tests/data_bin_part
+++ /dev/null
@@ -1,214 +0,0 @@
-F
-��X
-��X
-��X
-��X
-��X
-��X
-��X
-��X
-���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IYW��.��8��T˔I͚4��8��T��N��8��T��E��9��8��T��W��8��T��&��6ͅT�T��H��C��=��T��F��T��Iַ;><��.��8˔I͚4��8��+��E��9��8��W��8��&��6��8��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I86��H��C��=��T��F��T��Iַ;��W��8��T��;��8��T��J��J��8��T&$��H��=��T��F��T��I��W��8Ю+��J��J��8���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I ��H��C��=��T��F��T��Iַ;��@��?��H��=��T��F��T��I��@��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I86��8��T��8��T��&�9��C��6��H��C��=��T��F��T��Iַ;��B��T&$��8��8��&Ӗ5��H��=��T��F��T��I��B��T���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IVT��H��C��=��T��F��T��Iַ;��8��T��8��TͅT�T��8��T��&�8��6�;��8��T��@�N��8��T��8��T;9��H��=��T��F��T��I��8��8��8��8��&�8��6�;��8��@�N��8��8��H��C��=��T��F��T��Iַ;��H��=��T��F��T��IMK��H��C��=��T��F��T��Iַ;ٟ@��1��7ȣ8��Gȣ8�/��>��7��;��B��A��U��Q��U��T��0A?��H��=��T��F��T��Iٟ@��1��7��G�/��>��7��;��B��A��U��Q��U��T��0���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I����.��8��T˔I͚4��8��T��N��8��T��E��9��8��T��W��8��T��&��6ͅT�T��H��C��=��T��F��T��Iַ;����'���J��A��-��E�J��@��8��T��-��Eބ2�4��8��TYW��.��8˔I͚4��8��+��E��9��8��W��8��&��6��8��H��=��T��F��T��I����A��M��1��8��Mބ2�4��8���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IYW��.��8��T˔I͚4��8��T��N��8��T��E��9��8��T��W��8��T��&��6ͅT�T��H��C��=��T��F��T��Iַ;><��.��8˔I͚4��8��+��E��9��8��W��8��&��6��8��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I ��H��C��=��T��F��T��Iַ;��@��K��H��=��T��F��T��I��@��K���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I ��H��C��=��T��F��T��Iַ;��@��?��H��=��T��F��T��I��@��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I#!��1��4��UƕT��6��.��Q��8��T��@Ԛ<��1��4ƕT��6��.��Q��8��@Ԛ<���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IVT��H��C��=��T��F��T��Iַ;��8��T��8��TͅT�T��8��T��&�8��6�;��8��T��@�N��8��T��8��T;9��H��=��T��F��T��I��8��8��8��8��&�8��6�;��8��@�N��8��8��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;ܥ6��H��=��T��F��T��Iܥ6���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I;9��H��C��=��T��F��T��Iַ;��Q��;��B�� �������������!��H��=��T��F��T��I��Q��B���H��C��=��T��F��T��Iַ;��H��=��T��F��T��IYW��.��8��T˔I͚4��8��T��N��8��T��E��9��8��T��W��8��T��&��6ͅT�T��H��C��=��T��F��T��Iַ;><��.��8˔I͚4��8��+��E��9��8��W��8��&��6��8��H��=��T��F��T��I��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I53��H��W��8��T��;��8��T��8��T��H��C��=��T��F��T��Iַ;#!��H��W��8Ю+��8��H��=��T��F��T��I���H��C��=��T��F��T��Iַ;��H��=��T��F��T��I ��H��C��=��T��F��T��Iַ;��@��?��H��=��T��F��T��I��@��H��C��=��T��F��T��Iַ;��H��=��T��F��T��I&$��H��C��=��T��F��T��Iַ;��V��G��D��; ��H��=��T��F��T��I��V��G��D��;�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G����G͡S�<��%����&б��̣ ��Fۧ1��1ņAǧ1ņAņA�<��6ҥ3߫U��V�K��T��V��U��6��>��V��M��U��F��>��M��5��%��������������̋'wu��G͡S�<��%������̣ ��Fۧ1��1ņAǧ1ņAņA�<��6��U��V�K��T��V��6��>��V��M��U��F��>ʶM��%��������������̋'�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G̣ ��'��@��@��@	���@��@�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G&$��O��4��=ӪN��/��>��K��/��;��8�,��T ��O��4��=ӪN��/��>��K��;��,��T�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G><��,��9��O��8��.̣ ������T��B����0��O��!��.�/��W��D��S��W53��,��9��O��8��.��T��B����0��O��!��.�/��W��D��S��W�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G��:��=��X̣ ��Q��U��T��G܂=��X̣ ��Q��T��G�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G)'��=������	��0̣ ��M��6ͅT��O��,��@Ԛ<#!��=ؐ��0̣ ��M��6ͅT��O��,��@Ԛ<�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G/-��=������	��0̣ ��M��6ͅT��O��,��D��S�D��A)'��=ؐ��0̣ ��M��6ͅT��O��,��D��S�D��A�	̣ ��O��G	̣ ��O��G&$��Eʌ3��O��X��M��Q̣ ��Jʌ3��D��4��T#!��Eʌ3��O��X��M��Q̣ ��Jʌ3��U��T	̣ ��O��G	̣ ��O��G	̣ Ҧ)��G��G���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ20��4��A�Q��.ŞGщQ��H��A��V��T��J��D��8��D��A��P&$��4��A�Q.щQ��H��A��V��T��D��8��A���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ&$��R��4�Q��>��.ŞG��GщQ��6��?��@Ԛ<#!��R��4�Q��>.��GщQ��6��?��@Ԛ<���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ&$��4�Q��.ŞG��J�I��GщQ��D��S�D��A#!��4�Q.��J�I��GщQ��D��S�D��A���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ&$��.ŞGٟ@��6��G��5�I��GщQ��A�7��B.ٟ@��6��G��5�I��GщQ��+���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ53��4�Q��>��.ŞG��D��A��P��;��0��T��?��6��T��)����! ��4�Q��>.��A��;��T��6��T��)���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ53��4�Q��>��.ŞG��D��A��P��;��0��T��?��6��T��)����! ��4�Q��>.��A��;��T��6��T��)���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ><��4��9��K�Q��.ŞG��R��G��D��9��H�O��K�J��A��.ŞG��=�R��J/-��4�-�Q.��R��G��D��9��H��K�J��A.�R��J���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ53��4��A��I�Q��.ŞGщQ��H��A��V��T��J��D��8��D��A��P)'��4��A��I�Q.щQ��H��A��V��T��D��8��A���4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ ��4�Q��.ŞG����6��P����6��T��4�Q.����6��P����6��4�Q��>��.ŞG��GщQ��4�Q��>.��GщQ/-��4��=�R��4�Q��>��A��E��.ŞG��C��/��W��9��9 ��4�R��4�Q��>����C��/��W��9���C��P��H��5��C��P��H��5;9��H��9��1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ86��H���1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ��C��P��H��5��C��P��H��5��U��P��H���>��G��@Ԛ<��U��P��H���>��G��@Ԛ<���C��P��H��5��C��P��H��5&$��C��P��H��A���>��G��D��S��PԮK߀3#!��C��P��H��A���>��G��D��S��PٮK��C��P��H��5��C��P��H��5��A��H���A��C��P��G��@Ԛ<��A��H���A��C��P��@Ԛ<���C��P��H��5��C��P��H��5;9��H��9��1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ86��H���1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ��C��P��H��5��C��P��H��5MK��H���FșK��>��7��Q��K�H��.��C��Q��R��>J��M��B��>��W��M��L��G��,��@Ԛ<MK��H���FșK��>��7��Q��K�H��.��C��Q��R��>J��M��B��>��W��M��L��G��,��@Ԛ<���C��P��H��5��C��P��H��5&$��C��P��H��A���>��G��D��S��PԮK߀3#!��C��P��H��A���>��G��D��S��PٮK��C��P��H��5��C��P��H��553��A��H��M��D��P�5��8��Qٟ@�H��3��/��A��@��@��@/-��A��H��M��D��P��8��Qٟ@�H��3��/��A��@��@���C��P��H��5��C��P��H��5;9��H��9��1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ86��H���1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ��C��P��H��5��C��P��H��5#!��A��H���A��C��P��G��D��S�D��A ��A��H���A��C��P��D��S�D��A���C��P��H��5��C��P��H��5&$��C��P��H��A���>��G��D��S��PԮK߀3#!��C��P��H��A���>��G��D��S��PٮK��C��P��H��5��C��P��H��5YW��I��=��=�R��>��H���/��/��G�M��>ϪJ�R��K��2��2��U׵A��H��T��U��A��6�����)��ʪYW��I��=��=�R��>��H���/��/��G�M��>ϪJ�R��K��2��2��U׵A��H��T��U��A��6�����)��ʪ���C��P��H��5��C��P��H��5;9��H��9��1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ86��H���1��G��R��F��P�.ܤK��H��U��A��6�����)��ʪ��C��P��H��5��C��P��H��5 ��6��P��H����>�5��H��O��A��B ��6��P��H����>�5��H��O��A��B���C��P��H��5��C��P��H��5&$��C��P��H��A���>��G��D��S��PԮK߀3#!��C��P��H��A���>��G��D��S��PٮK��C��P��H��5��C��P��H��5��H���G��2��2��A��@��@��@��H���G��2��2��A��@��@�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O20����N߹-��7��B�O��1ַ;��L߹-��N��A��7��O��Iַ;)'����N߹-��7��B�O��1��;߹-��N��A��7��I�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O,*����N߹-��B�O߹-��7�O߹-ַ;�OʈF��<��4)'����N߹-��B�O߹-��7߹-ַ;�OʈF��<��4�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O&$��A����N߹-��B�O��>��8ֽHٟ@��@Ԛ<#!��A����N߹-��B�O��>��8ٟ@��@Ԛ<�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O/-�
-������N߹-��C��7��F��B�O��R��1��:��?��T)'�
-������Nں-��7��B�O��R��1��:��?��T�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O��߹-��7�O߹-��B��T��߹-��7߹-��B�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O/-����N߹-��B�O��7��F�O��O��?��L߹-��OǧB��T)'����N߹-��B�O��7�O��O��?��L߹-��O��T�����B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O><߹-����N��L��B��7��F�O��QӮD��D�A��4��0�A��T����(����",*߹-����N��L���O��QӮD��D��A��0�A��T�������B߹-�O����B߹-�O߹-��B��T��C�O��@��L�:߹-��B��C�O��@��L�:����B߹-�O����B߹-�O,*��߹-��7��B�OİU��1��>��C��B��B�U��Q��4,*��߹-��7��B�OİU��1��>��C��B��B�U��Q��4�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/><�
-������/��@ʡH��9��H��1��R��L��A¶7��/��J��D��O��8�,��T#!����N��9��1��L��N��/��J��D��,��T�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/b`��1��R��L��D��A¶7��/�
-������J��0��E��K��B��8�/��/��O��E��Kю2��E��,��/��W��T�����)��ʪDB��1��L��D��N��/����J��0��K��B��8�/��O��Eю2��E�������)��ʪ�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/20�
-������1��R��L��A¶7��/��J��0��E��O����@��K&$����1��L��N��/��J��0��E��O����@��K�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/��>��T��7�O��=��P��;��>��7��=��P�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��//-��D��A¶7��/��1��R��L��JʡH��W��W��T�%����! ��D��N��/��1��L��JʡH��W��WՄO�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/><�
-������N��1��R��L��A¶7��C��H��2��3��1��R��L��A¶7��/��/&$����N��1��L��N޻/��2��3��1��L��N��/�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��//-����L��G��R��1��¶7��/��1��7�>��>��G��<��T)'����L��+��¶7��/��1��7�>��>��G��<��T�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/JH��A¶7��/��C��1��R��L��H��7��/����N����=��,��:�:��8��4��S��Q��H�9��T86��N��/��C��1��L��+����N����=��,ў8��4��S��Q��H�9��T�����L��1��7��A¶7��J��/����L��1��7��N��J��/GE��/��1��R��L��A¶7��CʡH����=��;��>��W��=ѾC�
-��:��K��4��8��?��:��T86��/��1��L��N��CʡH����=��.��=ѾC�
-��:��4��8��?��:��T����L��1��7��A¶7��J��/����L��1��7��N��J��/DB�
-������/��@ʡH��9��H��1��R��L��A¶7��/��J��D��O��E��J��<��B��B,*����N��9��1��L��N��/��J��D��E��J��<��B��B���T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K><����N��T΂:��8��/��C��T΂:��K��T΂:��W��J��T΂:ì,��U��W��J&$����N��T����C��T�:��Tژ<��TЂ:�����T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K\Z��E��@��T΂:��:��T����T΂:��8��/��6����T΂:��8��/��K����T��8��/��;����T΂:��/��8��E��@JH��E��@��T΂:��:��T����T����6����T����K����T��8��;����T΂:��/��E��@���T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K ����N��,΂:��8��/��K��4��?�I����N��,����K��4��?�I���T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K86��΂:��8��/΂:��8��/��6��H΂:��/��8��K΂:��8��/��C��T#!��������6��H΂:��/��K����C��T���T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K><����N��T΂:��8��/��C��T΂:��K��T΂:��W��J��T΂:ì,��U��W��J&$����N��T����C��T�:��Tژ<��TЂ:�����T����N��?��8��/��K��T����N��?��8��KJH��P����T΂:��8��/��K����N��T΂:��8��/��C��T΂:��8��4��T΂:��8��/��;��653��P����T����K����N��T����C��Tނ:��4��T����;��6��T����N��?��8��/��K��T����N��?��8��K����N΂:��8��/��K�K΂:����N����K�K΂:�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ\Z��R��<��L��#��%��6��K��9��T��V��4��6��V��6��#����6��#��%��6��#����6��$����6��#�8���8GE��R��<��L��#��6��K��9��V��6��V��6��#��6��#��6��#��6����6��#�8���8�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ;9��>�R��>��%��B��>ڜ>��A��9��T��K�9�1��A��#��%��@��@��@20��>�R��>��%��B��>ڜ>��A��9��K�9�1��A��#��@��@�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ#!��#��%��9��T��Kڜ>��B��E�I��U��T��#��9��Kڜ>��B��E�I��U�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ#!��#��%��K��9��T��D��0��6�O��@Ԛ<��#��K��9��D��0��6��@Ԛ<�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ��#��%��9��T��CۚK��@Ԛ<��#��9��CۚK��@Ԛ<�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQGE��6��W��#��%��>��9��T�?��#��%��6��O�/�O��O�/��U��!�'��B�8��>ڜ>;9��6��W��#��>��9�?��#��6��O�/��O�/��U��!�'��B�8��>ڜ>�)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQ&$��#��%��9��T��@��A��6��W��D��P�D��A ��#��9��@��A��6��W��D��P�D��A)'��#��%��H��K��9��T��>��B��D��J��9��9щQ#!��#��H��K��9��>��B��D��J��9��9щQYW��#��%��9��T��>��K��-��A��9��6��T��W��B��:��O��S��R��Q��9��#����%ѾC��H��T��L��6��L��TJH��#��9��>��K��A��9��6��T��W��B��:��O��S��R��Q��9��#����%��5��L��6��L��T�,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��, ؓ���=��BܤK��S��/��C��8��Tœ�=��BܤK��S��8��T,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,DB��G��D��G��>��W��-��3��M�8��F�=��Bٟ@��6��S��9ܤK��ȟN��	��U��686��G��D��G��>��W��3�8��F�=��B��5��S��9ܤK��ȟN��	��U�,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,><��E��R�=��B��6�O��7��>��T��H��H�8��@��9��F��F��S��A��@Ԛ<53��E��R�=��B��6��7��>��H��H�8��@��9��F��F��A��@Ԛ<,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,PN��8��4��C�8�1�=��B��R��V��T��6��C��A��E��/��:��6�L��U��U��NԛL��@��;��6��GDB��8��C�8�1�=��B��R��V��T��C��A��E��:��6�L��U��U��NԛL��@��6��G�,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,JH��H�=��B��/��-�8��>ܤK��D��A��9��=��S˱U�8��Q��TָU��J�����)��ʪDB��H�=��B��/�8��>ܤK��D��A��9��=��S˱U�8��Q��T�U�����)��ʪ,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,GE��A��B��R��B��E�9��A��6��BϜ>�8�=��B��6ץR��R��D��O��6ө����ۆ	;9��A��B��R��B��E�9��A��6��BϜ>�8�=��B��6ץR����6ө��,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,;9��R��Q��S��A��E��M�8�=��B��>ץR��9��)��N��U��6��!��G��J53��R��Q��S��A��E��C�=��B��>ץR��9��)��N��U��6��!��1,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,/-��V��J��V�1�8�=��B��R��6��?��#��%��@��@��@)'��V��J��V�1�8�=��B��R��6��?��#��@��@�,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,����D��>��EȊ5��6��R��T���8��J��F�=��B��K��T��:�8��J�=��B��R��F��K��,��3��4��D��H��@��CӽDҾW��K��?��>��S��@��9�9��I��S��D��P�D��Azx��D��>��Eˊ5��R��T���8��S�=��BАT��:�8��J�=��B��R��F��,��3��4��D��H��@��CӽDҾW��K��?��>��S��@��9��I��S��D��P�D��A,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,,*��E��6��F��A��6ܤK��J��V�8�=��B��>��S��,����V��6�8��B��Xʉ5�=��B��>ܤK��%��������&��Ξ)ʉ5��V����T��V��E����X��G��V����X��G��V���8��G������&��Ξ)��V��E����B��V��Ɣ>��X��V����U��8P��=ۚK��C��>��J��U̟K��O��4��>��L����V��6�8��B��X�=��B��>ܤK��%��������&��Ξ)ʉ5��V����T��V��E����X��V����X��V���8��G������&��Ξ)��V��E����B��V��۔>��V����U��8��=��C��J�.��4��>���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X����D��C��G��R��@��N����D��C��G��@��N���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X#!����6��C��G��Dʉ5��>��R������#!����6��C��G��Dʉ5��>��R���������H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X86������G��R�>��R��P��>��R��6��9�9��V��A��D��S�D��A20����G��R�>��R��P��>��R��6��9��V��A��D��S�D��A���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X#!����D��R߻W�9��9������@��@��@����D��R߻W�9��9����@��@���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X����U��V�1�;��2��X��4����U��V�1�;��2��X���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��Xnl��>��A����6߻W��$��6��X��T��6�/ҥ3��)��T��:��6��X��-��6��M��E��@��E��U��%�������!�����)�������!MK��>��A����6߻W��$��6XɺRҥ3��?��:��6��X��-��6��E��@��E�������)����P���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��XA?������6��=��C߻W��E��D��>�3��K֟MȬT��T��(����#����$����!,*����6��=��C߻W��E��D��>�3��K֟MȬT��T�����H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X;9����6��6��GȂ3ʉ5��>��R��>��B��������C��T��6��;��3��D53����6��6��GȂ3ʉ5��>��R��>��B������C��T��;��3��D���H����D��6߻W��X��H����D��6߻W��X����C߻W��X��@Ԛ<����C߻W��X��@Ԛ<��H����D��6߻W��X��H����D��6߻W��X����D��C߻W��R��1��@��K����D��C�W��1��@��K�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6GE��6��/��K�Oٟ@P��=��>��8��E�9��R�B��H��A��V��T��J��D��8��D��A��P53��6��/�Oٟ@��=��>��8��E��R�B��H��A��V��T��D��8��A�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6&$��C��K�O�I�9��R�B��2��S��C��I��9��C�O�I��R�B��2��S��C��9�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6)'��L��P��K�O�9��R�B����6��P����6��T ��L��P�O��R�B����6��P����6�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6PN��6��K�O�9��R�B��E��I��T��6��>��S��K��?��K��IP��=��>��K��I��9��0��C��9��T><��6�O��R�B��E��I��T��6��>��S��K��?��K��=��>��K��9��0��C��T�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6,*кB��P��K�O��K��=��9��F�9��R��H��G��8��T#!кB��P�O��K��=��9��F��R��H��G��8�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6JH��H��K�O��>��6��/P��=�9��R��H��>��D��A��P��;��0��T��?��6��T��)����!/-��H�O��>��6��/��=��R��H��>��A��;��T��6��T��)�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6MK��K�O��6��/P��=��K�O��R��D�B��6�O��K��K�O��6�9��6��K�O��6щQ��@Ԛ<53�O��6��/��=�O��R��D�B��6��K�O��9�O��6щQ��@Ԛ<�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6,*��6��/��K�O�9��R�B��DǬ<��C��I��I�?��9 ��6��/�O��R�B��DǬ<��Cڗ?��9�,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6��@��@��@,*��S��P��K�OP��=��D�9��R�B��5�9��6��6#!��S��P�O��=��D��R�B��5�9��6��6><��A��2��I��P��K�O��K��G��C��=�9��R�B��D�9��6��D��P�D��A20��A��2��I��P�O��K��G��C��R�B��D��6��D��P�D��A���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=SQ��$����(����0�?�=��6��C�9��A��T��0��7��R��L��@��;��I��6��U��L��I��G��8��2��TMK��$������0�?�=��6��C�9��A��T��0��7��R��L��@��I��6��U��L��I��G��8��2��T���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=86��C�9��A��T��0�?�9��-�8ٟ@��6ǽ=��E��X��Eŧ;��>��P/-��C�9��A��T��0�?�9��-�8ٟ@��6��E��E��>��P���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=20ʻ?��<��C�9��A��T��0��6��AщQ��J��@��X��@��@��@/-ʻ?��<��C�9��A��T��0��6��AщQ��J��@��X��@��@���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��0�?��<��6��C�9��A��T��A�7��B��0�?��<��6��C�9��A��T��+���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=JH��D��>��0�?��9��<�9�=��C�9��A��T��V��B��$�����/��?��B��RÙK��B��TA?��D��>��0�?��9��<�9�=��C�9��A��T��V��B��$�����/��B��E��B���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=��0�?��<��C�9��A��T��0�?��<��C�9��A��T���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=PN�9��T��0�?��5��5�=��-щQ��X��S��C��A��E��/��:��6�L��U��U��NԛL��@��;��6��GGE�9��T��0�?��5��5�=��-щQƇX��C��A��E��:��6�L��U��U��NԛL��@��6��G���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=,*��0�?��<��6�9�=��C�9��A��T��D��S�D��A,*��0�?��<��6�9�=��C�9��A��T��D��S�D��A���C�9��A��T��0�?�=��C�9��A��T��0�?�=#!��C�9��A��T��0�?�9��-�8��@Ԛ<#!��C�9��A��T��0�?�9��-�8��@Ԛ<��C�9��A��T��0�?�=��C�9��A��T��0�?�=)'��0�?�9��<��C�9��A��T��6�O��P��B��6#!��0�?�9��<��C�9��A��T��6��P��B���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��B��7��Uח>��D��6��@Ԛ<��B��7��Uח>��D��6��@Ԛ<���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��Uח>��@��K��Uח>��@��K���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��B��7��Uח>ͦB��O��E��R��B��7��Uח>ͦB��O��E��R���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��B��7��Uח>��8��;��BٖT��T��B��7��Uח>��8��;��B��T���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>86��A��H��F��S��=��@��=՞R��U��7��0ח>��G��D��S��PԮK߀320��A��H��F��=��@��=՞R��U��7��0ח>��G��D��S��PٮK���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח> ��B��7��Uח>��D��T����(����"��B��7��Uח>��D��T�����6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��6��R��T��U��7��HˮD�D��A��6��T��U��7��HˮD�D��A���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>��B��7��Uח>��B��7��Uח>���6��R��T��U��7ח>��6��T��U��7ח>\Z��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��-��:��6��6����������(��UʡH��9��6��6SQ��H����S��=��HˮD��>��7��K�O��U��JҲ.щQ��H��T��:��6����������(��UʡH��9��6��6��R��T��U��7ח>��6��T��U��7ח>20��A��H��F��S��=��@��=՞R��U��7��0ח>��G��P��B��6,*��A��H��F��=��@��=՞R��U��7��0ח>��G��P��B�	��<��X��	��<��	53��7��<��X��7��N��4��R�W������1��E������A��T��B ӱ��Nߩ7������E������A��B	��<��X��	��<��	MK��7��<��X��7��Q��L��4��4��R�W��5���������Q��-��<��>��;��������G��B;9ӱ��Q��L��4ߩ7��5���������Q��-��<��>��;��������G�	��<��X��	��<��	53��7��<��X��7��N��4��R�W������1��E������A��T��B ӱ��Nߩ7������E������A��B	��<��X��	��<��	��N��<��;��0��@��K��,��N��;��0��K��,�	��<��X��	��<��	53��7��<��X��7��N��4��R�W������1��E������A��T��B ӱ��Nߩ7������E������A��B	��<��X��	��<��	#!��<��X��4��R�W��>��4��8��@��@��@��<ߩ7��>��4��8��@��@�	��<��X��	��<��	53��7��<��X��7��N��4��R�W������1��E������A��T��B ӱ��Nߩ7������E������A��B	��<��X��	��<��	/-�4��P��P��PՈP��R��L��1�4��:����N��P��P��P&$�4��P��P��PՈP���:����N��P��P��P���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K/-��3ϊX��4��C��3��7��Q��7����3ϊX��@����@��@ ԊX��4��C��3��3��ԊX��@����@���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K ��RН?��3ϊX��Q׆N��S��?��4�8��RН?ԊX��Q��N��?��4�8���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K&$��Q��T��3ϊX��7��Q��7׆N��S��?��4�8��QԊX��3��N��?��4�8���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K/-��I��7��Q����F��7��3ϊX��>��F�9�Q��?��WɤK��IԊX��>��F�9�Q��?��WɤK���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K&$��R��3ϊX��4��6߻W��L��Q��G��8��@Ԛ< ��RԊX��4��6߻W��L��Q��G��@Ԛ<���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K><��3ϊX��R��7��Q��7��@��4��7��5�@����:ȥ����B��@����A��T/-ԊX��R��3��@��4��7��5�@����:ȥ����B����A���3ϊX��Q��K	ԊX��Q��K,*��3ϊX��1��7��Q��7��G��/׆N��8����G��F̛<ԊX��1��3��G��/�N����G��F��3ϊX��Q��K	ԊX��Q��K����"����!��F��>��"��F���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��720��D��N��/��E��L��>�7��Aڶ>��F��7��C��Dƹ;��@Ԛ<,*��D��NȜM��L��>�7��Aڶ>��F��7��C��4��@Ԛ<���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7)'��H��N��/��K��N��/ڶ>��F��7��=��A�7��B#!��H��N��/��K��N��/ڶ>��F��7��=��+���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7 ��H��/��6��7P��=��D��G��@��K��H��/��6��7��=��D��@��K���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7ܤK��K��A�7��B	ܤK��K��+���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7�����7��>��1��T֛7ٟ@��9��F��6��U��>ʔ7��1��/��>ٟ@��6��L��D��7��>��/��I��/��>��=щQ��D��D��H��I��N��.��/��5�9Ԛ<ڶ>��S��-��=��D��N��@��U��W��=��-щQܭD�H��T��D��S��=��D��S�D��A�����7��>��1֛7ٟ@��9��F��6��U��>ʔ7��1��/��>��5��L��D��>��/��I��/��>��=щQ��D��D��H��I��N��.��/��5��1��S��-��=��D��N��@��U��W��=��-щQܭD�H��T��D��S��=��D��S�D��A���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��=�8��T��=��4ڶ>��F��7��S��@��@��@)'��D��N��=�8��T��=��4ڶ>��F��7��S��@��@���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7 ��H��/��6��7��Dƹ;��D��G��@��K��H��/��6��7��4��D��@��K���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7><��D��1ƹ;��T��Dƹ;��>��S��=��>��7ʗ7��4��=��>��S��B��7��S��T86��D��1ƹ;��T��4��>��S��=��>��7ʗ7��4��>��S��B��7��S��T���N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7,*��D��N��.ی'��7�9Ԛ<��=��/ڶ>��J��7��@Ԛ<)'��D��N��.ی'��7��1��=��/ڶ>��J��7��@Ԛ<��N߀3��/ڶ>��F��7��N߀3��/ڶ>��F��7��V��/��6��7��=��D��G��@��K��V��/��6��7��=��D��@��K���$��5��H��1��4�9��A��$��5��H��1��4�9��Aec������$��/��4��U��R��5��R��H������$��>��#��=��1��,��1��>��Bٟ@��T�9��A��L��Kٟ@��6��J�=��@Ԛ<\Z������$��/��4��U��5��R��H������$��>��#��=��1��,��1��>��@��T�9��A��L��Kٟ@��6�=��@Ԛ<��$��5��H��1��4�9��A��$��5��H��1��4�9��A><��E��R�9��U��5��1����$��L��2��;��N��@��6�1��O��D��S�D��A86��E��R�9��U��5��1����L��2��N��@��6�1��O��D��S�D��A���$��5��H��1��4�9��A��$��5��H��1��4�9��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A��$��5��H��1��4�9��A��$��5��H��1��4�9��ASQ��$��U��S�/��1��6��1��.��T�9��A��6��A��PɺD��E��X��>��EѾC��T��8��6��V��O��T��BA?��$��U��S�/��6��T�9��A��6��A��PɺD��E��X��E��T��8��V��O��T��B���$��5��H��1��4�9��A��$��5��H��1��4�9��Aec������$��/��4��U��R��5��R��H������$��>��#��=��1��,��1��>��Bٟ@��T�9��A��L��Kٟ@��6��J�=��@Ԛ<\Z������$��/��4��U��5��R��H������$��>��#��=��1��,��1��>��@��T�9��A��L��Kٟ@��6�=��@Ԛ<��$��5��H��1��4�9��A��$��5��H��1��4�9��A53��E��R�9�1��@��5��H��1��Bٟ@��4�9��A��E��@��@��@/-��E��R�9�1��@��5��H��1��@��4�9��A��E��@��@���$��5��H��1��4�9��A��$��5��H��1��4�9��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A��$��5��H��1��4�9��A��$��5��H��1��4�9��A,*��$�9���6��5��6��5��4�9�Q��5؂=��@Ԛ<,*��$�9���6��5��6��5��4�9�Q��5؂=��@Ԛ<���$��5��H��1��4�9��A��$��5��H��1��4�9��Aec������$��/��4��U��R��5��R��H������$��>��#��=��1��,��1��>��Bٟ@��T�9��A��L��Kٟ@��6��J�=��@Ԛ<\Z������$��/��4��U��5��R��H������$��>��#��=��1��,��1��>��@��T�9��A��L��Kٟ@��6�=��@Ԛ<��$��5��H��1��4�9��A��$��5��H��1��4�9��ADB��"��Ξ)����6�9��$��R��5��4�9��A��Iٟ@��T��N��>��C��J��@��@Ԛ<><��"������6�9��$��R��5��4�9��A��@��T��N��>��C��J��@��@Ԛ<���$��5��H��1��4�9��A��$��5��H��1��4�9��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A��$��5��H��1��4�9��A��$��5��H��1��4�9��A;9��E��4��W��N��$��R��B��5��H��4��L��D��L��IĪN��C��S��@��K;9��E��4��W��N��$��R��B��5��H��4��L��D��L��IĪN��C��S��@��K���$��5��H��1��4�9��A��$��5��H��1��4�9��Aec������$��/��4��U��R��5��R��H������$��>��#��=��1��,��1��>��Bٟ@��T�9��A��L��Kٟ@��6��J�=��@Ԛ<\Z������$��/��4��U��5��R��H������$��>��#��=��1��,��1��>��@��T�9��A��L��Kٟ@��6�=��@Ԛ<��$��5��H��1��4�9��A��$��5��H��1��4�9��A53��@��;��5��R��H����$��U��L��T�9��A��6��D��P�D��A/-��@��5��R��H����U��L��T�9��A��6��D��P�D��A���$��5��H��1��4�9��A��$��5��H��1��4�9��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A/-��$��U��5��/��8��=��4�9��Aٟ@��5��D��S�D��A��$��5��H��1��4�9��A��$��5��H��1��4�9��A86��D��P��>��E��5��H��"����$ĪN��L��=��4�9��6��A�7��B/-�P��>��E��5��H��"����$ĪN��L��=��4�9��6��+���:��/��SʡH��9��9��S��H��:��S��9��HDB��S��W�J��9��?��9��?��:��/��SʡH��9��9��:��/��SʡH��9��9��5ܛ?��M)'��W�J��9��9��:��S��9��:��S��9��5ܛ?��M��:��/��SʡH��9��9��S��H��:��S��9��H/-��:��/��SʡH��9��9��9��?��9��9��?��D����6��T��:��S��9��9��9��9��D����6���:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��S����6��T��V��:��S��9����6��:��/��SʡH��9��9��S��H��:��S��9��H#!��S��:��/��SʡH��9��9��9��?Έ;��F��:��S��9��9Έ;��F���:��/��SʡH��9��9��S��H��:��S��9��HDB��S��W�J��9��?��9��?��:��/��SʡH��9��9��:��/��SʡH��9��9��5ܛ?��M)'��W�J��9��9��:��S��9��:��S��9��5ܛ?��M��:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��9��?��<��V��V��:��S��9��9��<��V���:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��S����6��T��V��:��S��9����6��:��/��SʡH��9��9��S��H��:��S��9��H20��S��:��/��SʡH��9��9��9��?ёC؄/��H��;��0��D��T��:��S��9��9ґC��H��;��D��T���:��/��SʡH��9��9��S��H��:��S��9��HDB��S��W�J��9��?��9��?��:��/��SʡH��9��9��:��/��SʡH��9��9��5ܛ?��M)'��W�J��9��9��:��S��9��:��S��9��5ܛ?��M��:��/��SʡH��9��9��S��H��:��S��9��H)'��:��/��SʡH��9��9��S��:��?��B��6݆.��T��:��S��9��:��B݆.��T���:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��S����6��T��V��:��S��9����6��:��/��SʡH��9��9��S��H��:��S��9��H#!��S��9��?��:��/��SʡH��9��9��?��T��9��:��S��9��?��T���:��/��SʡH��9��9��S��H��:��S��9��HDB��S��W�J��9��?��9��?��:��/��SʡH��9��9��:��/��SʡH��9��9��5ܛ?��M)'��W�J��9��9��:��S��9��:��S��9��5ܛ?��M��:��/��SʡH��9��9��S��H��:��S��9��H��B��<��V�?��-��=�R��J��B��<��V�?�R��J���:��/��SʡH��9��9��S��H��:��S��9��H&$��S��V��:��/��SʡH��9��9��S����6��T��V��:��S��9����6��:��/��SʡH��9��9��S��H��:��S��9��H#!��S��:��/��SʡH��9��9��9��?Έ;��F��:��S��9��9Έ;��F���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��Kwu��7��R��D��H��>��<��>��K���,��0��7������R��2��������
-�.��T������ʆ��L��@ϡS��4��,ܢE��M��,�.��O��2��J��6MK������R��2��������
-�.��T������ʆ��L��@ϡS��4��,��E��,�.��O��J��6���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��KMK��R��D��H��>��<��>��K�,��0��I��O��9��4��9��1یV��0P��=P��H��>�.�E��6A?��R��H��>��<��>��K�,��0��I��O��9��4��V��0��=P��H��>�.�E��6���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K#!��R��D��>��H��<��K��1��Q��@��@��@��R��>��H��<��K��1��Q��@��@���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K/-��D��H��>��K��=��<��,��D��6�R��=��4��,��@Ԛ<&$��D��5�4��,��D��6�R��=��4��,��@Ԛ<���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K����7��R��D��H��>��<��>��K���2��>��7�.ʆ��J��6������ʆ��G������1��?������P��=��1��?����I��2����K��7����>��>����M��G����MߎM������6��>��J��Rʆ�.��J��6�~�.ʆ��J��6������ʆ��G������1��?��������=��1��?����I����K��7����>����M��G����MߎM������6��>��J��Rʆ�.��J��6���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K20��R��D��H��>��<��>��K�2��>��J��6��/��;��I��N��9,*��R��H��>��<��>��K�2��>��J��6��/��;��N��9���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��K_]��R��D��H��>��<��>��K�2��>ʆ��>��I��2́���N��4��TȇN��4��T��I��(Pބ2��>�N��4ʆ�N��4GE��R��H��>��<��>��K�2��>ʆ��>��I��(�N��4ȇN��4��I��(܉2��>��Nʆ��N���D��H��>��<��>��K��D��H��>��<��>��KJH��R��D��>��H��<��Kʆ��R��D��>��H��<��K��@ϡS��R��D��>��H��<��K�.��/��4A?��R��>��H��<��Kʆ��R��>��H��<��K��@ϡS��R��>��H��<��K�.��/��4��D��H��>��<��>��K��D��H��>��<��>��KGE��R��D��H��>��<��>��K�2��>��J��>��I��2ˏR��3˰(��I��B��>P�3ˏR��2;9��R��H��>��<��>��K�2��>�J��IˏR��3˰(��I��B��>�3ˏR��2�ʰD��B��N��M��G��>��B��M��G��>JHɵO��9��F��D��S��C��4ʰD��B��N��5��>��3��5��-��=�9��O��2���:��@��@��@53ɵO��9��D��S��C��4��B��5��>��I��-��=��O��2�G��@��@ʰD��B��N��M��G��>��B��M��G��> ��L��N��L��BʰD��B��N��@��@��@��L��N��L��B��@��@�ʰD��B��N��M��G��>��B��M��G��>JHɵO��9��F��D��S��C��4ʰD��B��N��5��>��3��5��-��=�9��O��2���:��@��@��@53ɵO��9��D��S��C��4��B��5��>��I��-��=��O��2�G��@��@ʰD��B��N��M��G��>��B��M��G��>)'�$������"��;��0��Q��8ҐJ��9ҽ6��W��H)'�$������"��;��0��Q��8ҐJ��9ҽ6��W��H���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A ��4��A��R��=��J��	��D��G��@��K��4��A��R��=��	��D��@��K���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��/��4��A��R��Q��=��J��B��4��/��4��A��R��Q��=��B���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A><İF��E��1��;��T��V��L��8��A��R��O��8��L��0��A��WН?��/Н?��T,*İF��B��T��V��L��8��A��R��O��8��L��AН?��-���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A20İF��E��1��;��T��V��L��8��A��R��O��8��L��0��A��W&$İF��B��T��V��L��8��A��R��O��8��L��A���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��APNİF��E��1��;��T��V��L��8��A��R��O��8��L��0��A��W��WН?��W��?��U��U��W��TН?��>;9İF��B��T��V��L��8��A��R��O��8��L��A��W��W��?��U��U��W��?���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A,*İF��E��1��;��T��V��L��8��O��3߫UТ@��H��T&$İF��B��T��V��L��8��O��3߫UТ@��H��T���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��A��R��4��J��A��R��4��J���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��L��/��4��A��R��Q��>��L��/��4��A��R��Q��>���C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A��4��A��R��Q��J��O��A��4��A��R��Q��J��O��C��A��R��V��C��W��O��A��W��C��A��R��V��C��W��O��A&$ŷ5��/��B��A��R��4��J��X��>��<��B��B#!ŷ5��/��B��A��R��4��X��>��<��B��B���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9��H��H��9��X�RʹE��>��B��H��H��9��;��>��B���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9��C��R��9��Xǌ8��@Ԛ<��C��R��9ǌ8��@Ԛ<���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9����9��X��U��T��I��9��X��N�S��;��U��O��Iַ;��U��R��I��I��I�K��I��H��B��O��F��;��F��;��<��U��O��U��M��U��B��I��O��U��-��4��I��9��P��;��P��-��7��;��U��R��I��4��;��Vnl��9��U��I��9��S��G��O��I��U��I��I��I��B��O��F��F��<��U��O��U��U��I��O��U��-��I��9��;��P��7��G��R��I��4��;��V���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9&$��9��X��9��C�5��I��9��1ӛ?��6��9�; ��9��F�5��I��9��1ӛ?��6��9�;���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9��9��X��U��C��;��-��9��U��C��-���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9\Z��G��9��X��W�F��5ԎB��@��J��P��1��1�<��T��I��:��2��O��:��9��X��C��E��I��>��.��3��>��7��2PN��G��9��W�F��5�B��J��P��1��1�<��T��I��:��2��O��:��9��C��I��>��.��3��>��7��2���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�986��9��X��B�9ԎB��@��@��O��L��W�F�R��9��B��9��<��C��T/-��9��B�9�B��@��O��L��W�F�R��9��B��9��<��C���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9&$��9��X��9��X��5��9�Q��C��ͦ(����!��9��9��5ƋQ��C�����9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9��9��X��@��?��9��@���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9YW��9��X��B�9ԎB��@��>��5��4��W�F�R��9��B��9�� ��I��C��7��0��FŔ6��A�D��M��Iַ;��7��0DB��9��B�9�B��>��5��4��W�F�R��9��B��9�� ��I�C��0��FŔ6��1��I��7���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9DB��9��X��C��K��2�9�R��5��>��9��X��W��A��/��1��C��2��O��D�K��O��D53��9��C��2��R��5��>��9��W��A��1��C��2��O��D�K��O��D���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9&$��9��X��>��K��T��CΚI�RН?��>��A��T��9��>��K��CΚI�R��?��A���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�920��C��C��T��C��7��V��C��E��I��Cַ;��C��;��-��C��T��C��C�C��VĸIַ;��C��-��C���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9DB��H��W��:��9��X��B�9ԎB��@��=ŉE��D��W�F�R��9��B��9��9��X��C��T86��H��W��:��9��B�9�B��=ŉE��D��W�F�R��9��B��9��9��C���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9��9��X��@��T��9��X��R��0ܥ6��9��@��T��9��Rܥ6���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9��G��7��;��C��T��G��7��;��C���9��X��R�9	��9��R�9&$�K��X��/��9��C�R��=��U�9��3��A��T��X��9��C�R��=��U�9��3��A��9��X��R�9	��9��R�9A?��Hʜ2��R��A��@�R��S�9��@��>��9��X��3��>��)��כ$��>��;��G��B;9��Hʜ2��R��A��@�R��S�9��@��>��9��3��>��)��כ$��>��;��G���9��X��R�9	��9��R�9PN��M��R��F��=��:��9��X�9��4��.б��H�>��N̛<��;��T�TН?��T��(����"��'����!53��M��R��F��=��:��9�9��4��.б��H�>��N��;�Tܞ?����9��X��R�9	��9��R�9><����N�R��=��9��X��C��9��S�9��9׵A��A��K��E��A��A��B��C��/;9����N�R��=��9��C��9��S�9��9׵A��A��K��E��A��A��B��C��/���=��@��K��E��=��=��@��E��=><��@��Q��0��H��@��KûA��Q��H��@��KûA��Q��,��H��P��H��C��B��020��@��0��H��@ûA��Q��H��@ûA��Q��,��H��P��H��B��0��=��@��K��E��=��=��@��E��=��=��@��J��@��K��I��5��@��=��@��J��@��I��5��@���=��@��K��E��=��=��@��E��=/-��7ûA��K��3��@��3��@��K��7��K��K��3������!#!��7ûA��K��3��@��3��@��7��K��3����=��@��K��E��=��=��@��E��=86��=��@��K��A��K��C��K��-��3��O��?��3��3��7��7����C��T)'��=��@��A��K��C��K��-��.��?�.��7����C���=��@��K��E��=��=��@��E��=/-��K��6��S�5��@��K��E��=��4��I��,��S��@��@��@)'��K��6��S�5��@��E��=��4��I��,��S��@��@��=��@��K��E��=��=��@��E��=��@��K��@��?��@��@���=��@��K��E��=��=��@��E��=)'��C��@ַ;��C��@��G��C��@��K��=��@��A��B&$��C��@ַ;��C��@��G��C��@��=��@��A��B��=��@��K��E��=��=��@��E��=DB��I��K��@��K��Q�Oַ;�O��E��6��V��=ԋ����J��>��J��T��7��L��J��653��I��K��@��Q�Oַ;�O��E��6��V��=�J��J��7��L��J��6�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+,*ԃP��;��Q��8ȘI��K��5ܢE��4��N��>��4��O�J��A��Q��8��K��5����>��4�O�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+53��Q�1��U��?��TܢE��4��N��=��D��.��4ԃP��;߽4��G��3 ��Q�1��U��?����D��4��A߽4��G�ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4б��X��Q��T)'��A��H��Q��8��K��5����C��>��4б��X��Q�ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+MKԃP��;��H��Q��8ȘI��K��5��N��C��>��4ԃP��;��Q��:��3��3ȘI��J��8��2��T��H��A><��A��H��Q��8��K��5��N��C��>��4��A��Q��:��3��3ȘI��J��8��2��H�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+/-ԃP��;��H��L��-��TܢE��4��N��C��4�1��TН?��> ��A��H��L��-��T����C��4�1��?�ԃP��;ܢE��4��J��Aˑ+��Dֈ;��0��O��F��ԃP��;ܢE��4��J��Aˑ+,*ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4 ��A��H��Q��8��K��5����C��>��4�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+)'ԃP��;��E��7��2��TܢE��4��NŇ7̛<��U��T��A��E��7��T��Ň7̛<��U�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4б��3��Q��T)'��A��H��Q��8��K��5����C��>��4б��3��Q�ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��T��?��T��CܢE��0&$��A��H��Q��8��K��5����C��T��?��T��/�ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��XН?��2��J&$��A��H��Q��8��K��5����C��>��4��X��2�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4Н?̛<��E��T)'��A��H��Q��8��K��5����C��>��4��?��E��T�ԃP��;ܢE��4��J��Aˑ+��Dֈ;��0��O��F��ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4Н?̛<��U��T&$��A��H��Q��8��K��5����C��>��4��?��U�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+#!��Q�1��NÚQ��8ȘI��K��TԃP��;��4��Q�1��N��8��K��T��A��4�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+53��-ԃP��;��HܢE��4��N��C��;�>��C��1��A��1��J��>��=)'��-��A��H����C��;�>��C��1��Aܹ1��>��=�ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+�~��4����N��U��.̤3��@��>ϥJ��=��T��.��-��0ܢE��4��N��5��H��0�1ԃP��;��R��:��?��=��N��.̤3��@��>��P��T��T��>��J��F��F��8��G��3b`��4����N��U��.��LϥJ��=��T��.��-��0����5��H��0�1��A��R��:��=��N��.��L��P��T��>��J��F��F��8��G�ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+20ԃP��;߽4��Q��8ȘI��K��5ܢE��4��N��,��4��U��/��T&$��A߽4��Q��8��K��5����,��4��U��/��T�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+,*��Q�1ʡH��9��B�X��TܢE��4��NН?̛<��7��T��Q�1��9�X��T����?��7�ԃP��;ܢE��4��J��Aˑ+��Dֈ;��0��O��F��ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4Н?��>��R��T&$��A��H��Q��8��K��5����C��>��4��?��R�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+20ԃP��;߽4��Q��8ȘI��K��5ܢE��4��N��,��4��X��Q��T#!��A߽4��Q��8��K��5����,��4��X��Q�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+MKԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��C��-��HН?̛<��&���#��#����!)'��A��H��Q��8��K��5����C��>��4��*��?���ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+SQԃP��;��H��QʡHɤU��B��U��H��MܢE��4��N��C��T۹/��8��H��M��T��>��J�T��8��:��G��3><��A��H��QʡHɤU��B��U��H����C��T۹/��8��H��T��>��J�T��:��G�ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+ecԃP��;߽4��Q��8ȘI��K��5ܢE��4��N��,��4ԃP��;ܢE��4��N��5��NģC��F��4��Q��O�1��M��J��Eа.��TН?��>;9��A߽4��Q��8��K��5����,��4��A������Q��O��-��Eа.��T��?�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+/-ԃP��;��H��UܢE��4��N��C��R��K��D��?��TيR̛<&$��A��H��U����C��R��K��D��?��TيR̛<�ԃP��;ܢE��4��J��Aˑ+��Dֈ;��0��O��F��ԃP��;ܢE��4��J��Aˑ+,*��7ԃP��;��E��7��2��TܢE��4��NН?��>��A��T��7��A��E��7��T����?��A�ԃP��;ܢE��4��J��Aˑ+86��Q���F��M�1��UܢE��4��NԃP��;��O��4��HН?��U��,��T#!��Q���F��M�1��U����A��O��H��,ԃP��;ܢE��4��J��Aˑ+86߹-�JН?̛<ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4&$��-��?��A��H��Q��8��K��5����C��>��4�ԃP��;ܢE��4��J��Aˑ+ ԃP��;�1ܢE��4��NН?̛<��E��T��A�1����?��E��TԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4б��X��Q��T)'��A��H��Q��8��K��5����C��>��4б��X��Q�ԃP��;ܢE��4��J��Aˑ+86ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4׶K��2�1��T)'��A��H��Q��8��K��5����C��>��4׶K��2��1ԃP��;ܢE��4��J��Aˑ+#!ԃP��;߽4��U��L��6��.��TܢE��4��N��A߽4��U��L��6��T���ԃP��;ܢE��4��J��Aˑ+><ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��HН?Н?��>��H��T,*��A��H��Q��8��K��5����C��>��4��H��?��H��TԃP��;ܢE��4��J��Aˑ+20ԃP��;߽4��U��7��2��TܢE��4��NԃP��;߽4��TН?��T ��A߽4��U��7��T����A߽4��Tܞ?�ԃP��;ܢE��4��J��Aˑ+53ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4��TН?��T&$��A��H��Q��8��K��5����C��>��4��Tܞ?ԃP��;ܢE��4��J��Aˑ+;9ԃP��;��H��Q��8ȘI��K��5ܢE��4��N��C��>��4����0̛<��Q��T)'��A��H��Q��8��K��5����C��>��4����0��Q���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��/-��T��I�O��Tބ2��B��J��7��6��8��T��7��P��4��J#!��T��I�O��Tބ2��BќJ��6��8��7��4���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��JH��R��T��I�O��T��4�/����>��BԚ<��P��D��U���%����!��,��9��=��9��@Ԛ<;9��R��T��I�O��T��4�/����>Ԛ<��P��D��U��,��9��=��9��@Ԛ<���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��#!��T��I�O��Tބ2��B��>��T��V��>��T#!��T��I�O��Tބ2��B��>��T��V��>��T���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��SQ��R��P��4��D��3��T��MɾS��B��T��I�O��T��L��;��U��$����N��,�����%����!��@Ԛ<;9��R��4��D��3��T��M��B��T��I�O��T��L��;��U����N��,��@Ԛ<���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��86��R��9��T��I�O��T��>����B��K��1١-��J��L�;��@��@��@/-��R��9��T��I�O��T��>����B��K��1١-��8��@��@���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��tr��T��>��I�O��Tմ2�O̤@��R�O��W��BǞV��<��>��MɾS��3��D����U��J��D��P��>��W��>��5ֈD��,��D��L��9��A��D��S�D��Aki��T��>��I�O��Tմ2��@��R�O��W��BȞV��>��M��3��D����U��J��D��P��>��W��>��5ֈD��,��D��L��9��A��D��S�D��A���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��53��T��I�O��T��*��B��6��J��7��6��8��T��7��P��4��J��2)'��T��I�O��T��*��B��6ќJ��6��8��7��4��2���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��,*��T��I�O��T��>����,��:���%����!��@Ԛ< ��T��I�O��T��>����,��:��@Ԛ<���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��A?��T��7��I�O��T��>����3��D��,��R��,��S��U��U��P��4��J��@��@��@53��T��7��I�O��T��>����3��D��,��R��,��S��U��4��@��@���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��86��R��T��>��I�O��T��K��>��S��F��>����P��4��J��@��@��@)'��R��T��>��I�O��T��K��S��>����4��@��@���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��/-��T��I�O��T����B��6��2��L��C��P��4��J��>��T#!��T��I�O��T����B��6��2��C��4��>���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��MK��9��Q��D��T��7��I�O��T��>��S��F��D��U��>��F��>����;��/��?��B��RÙK��B��T><��9��Q��D��T��7��I�O��T��S��D��U��>��F��>����;��/��B��E��B���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��20��P��4��J��T��I�O��T��S��U��Xߢ?��U��,��6��X��T&$��4��T��I�O��T��S��U��X��?��6��X��T���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��20��T��I�O��T��4��7��>��4����3��D��F��D��S�D��A,*��T��I�O��T��4��>����3��D��F��D��S�D��A���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��20��T��I�O��T��B��6��J��7��6��8��T��7��P��4��B��T)'��T��I�O��T��B��6ќJ��6��8��7��4��B��T���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��hf��R��T��>��I�O��T��>����U��P��4��>�4P��=��A��N��,��:��L���%����!��*��*��P��4��>٬J��=��$��@Ԛ<SQ��R��T��>��I�O��T��>����U��4��>�4��=��A��N��,��:��L��*��*��4��>٬J��=��$��@Ԛ<���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��DB��R��P��4��J��T��I�O��T�>��M�K��J��I�O��T��K��K����D��P�D��A><��R��4��T��I�O��T�>��M�K��J��I�O��T��K��K����D��P�D��A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A��%����A��G	��%��A��A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M����&����'��������%��I��I��A��$��ۏ"��&����'��������%��I��I����A��G��&����'��������%��I��I����:��A��G��D�3��A��T��(����%����!����A��G}{��&����'��������%��I��I��A����&����'��������%��I��I����A��&����'��������%��I��I����:��A��D�3��A��T��V����A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A��%����A������ ��%��A��A������ ���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M20��%��C��V��2��%��0��J��%��2��C��W��F��T��O��W��W)'��%��C��V��2��%��0��%��2��W��F��T��O��9���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M20��%��B��F��%��J��W����D��G��%��A��G��@��F��:��=#!��%��<��%��J����D��%��A��@��:��=���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��MJH��D��9��G��M��>��A��Qٟ@��D��B�U��,�G߇;�G��3��M��Vٟ@��6��D��P�D��A><��DٚG��>��A��Qٟ@��D��K��,�G߇;�G��3��M��V��5��D��P�D��A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M/-����A����G��%��;̽>��MŹ��(Źʿ��@��@��@)'����A����G��%��;�>Ź��(Źʿ��@��@���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M20��%��D��J��W��.��>��=��V��%��J��W����G��%����A)'��%��D��J��W��.��>��=��V��%��J����G��A��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��86��$����&���� ��C��2̙EϪJֈD��T�9��J��9��@����A��B/-������ ��C��2̙EϪJֈD��T��J��9��@����A��B��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��)'�� ��2��E��C��$����&��E̛<��0��>��W��T�� ��2��E��C����E��0��>��W��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��)'��$����&���� ��C��2��G��E��9ֈD��@Ԛ<#!������ ��C��2��G��E��9ֈD��@Ԛ<��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��;9��Sޡ8��$����&��>��&��2̙E�� ֈD��>ܤK��$��'��&��9�Q��')'��S����>��&��2̙E�� ֈD��>ܤK��ƋQ��'��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD���~��6��A��B��6��T�� ��2��EۈX��D��:ۈX��>ў7��&��B��$����&��,��&ίB��>��T��7��>��K��U��V��J�J��K��U��Q��T��I��1���R��/��0��Qec��6��A��B��6��T�� ��2��EۈX��D��:ۈX��>ў7��&��B����,��&ίB��>��T��7��K��V����Q��I��1���R��/��Q��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD����$����&��2��@�� ��8�,��T����2��@�� ��,��T��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��_]��$֗>��A��S�� ��1��9��E��Ź��4��(����>��&��2��4�� ��E��B߻WֈD��1��H��%��,�9��:�� ��>��I\Z��$֗>��A��S�� ��1��9��E��Ź��4��(����>��&��2��4�� ��E��B߻WֈD��1��H��%��,��:�� ��>��I��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��A?��$����&���� ۈX��2��@��Q��T��W��N��EܾW��,��;��P��T����,��T86������ ۈX��2��@��Q��T��W��N��EܾW��,��;ٱP����,��T��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��53��ޥ0��C��E��$����&��0��>�� ��2��EֈD��J��<��=�@,*��ޥ0��C��E����0��>�� ��2��EֈD�J��=�@���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��Hؕ7��;��E��E��@��;��Dؕ7��;��E��E��@���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��Xŷ5��D��/��D��/��Xŷ5��D��D���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D ��;��D��H��B��U��>��U��W��6��T��;��DΑB��>��U��6���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��H��D��H��D��H��T��;��D��D��D��T���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��H��>	��;��D��>���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��B��D��/��>	��B��D��>���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��H��=��<��T��;��D��=��T���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;ӈ5��U��D��>��D��H��D��H��;�5��D��>��D��D���E��1��?��0��;��E��1��?��0��;ַ;��E��1��?��,��;��@Ԛ<ַ;��E��1��?��,��;��@Ԛ<��E��1��?��0��;��E��1��?��0��;;9��K��6������>��H��E��1��K��/��Q��4��D�G��K��I����A��B86��K��6������>��H��E��1��K��/��Q��4��G��K��I����A��B���E��1��?��0��;��E��1��?��0��;GE��D��K��O��F��H��E��1��K��/��Q��4��D�G��K��O��JܤK��>��6��D��G��@��K20���H��E��1��K��/��Q��4��G��ܤK��>��6��D��@��K��E��1��?��0��;��E��1��?��0��;#!��D��E��1ߢ?��0��8��I��D��<��B��B ��D��E��1�?��8��I��D��<��B��B���E��1��?��0��;��E��1��?��0��;��1��A�?��Iַ;	��1��A��I��E��1��?��0��;��E��1��?��0��;��E��1ߢ?��0	��E��1�?���E��1��?��0��;��E��1��?��0��;ַ;��E��1��?��,��;��@Ԛ<ַ;��E��1��?��,��;��@Ԛ<��E��1��?��0��;��E��1��?��0��;PNڤ5��5ַ;��>��E��1��?��1��B��T��/��>׆B��/��1��/��6��9��I�P��T��R��;��I��@Ԛ<MKܤ5ַ;��>��E��1��?��1��B��T��/��>׆B��/��1��/��6��9��I�P��T��R��;��I��@Ԛ<���E��1��?��0��;��E��1��?��0��;GE��D��K��O��F��H��E��1��K��/��Q��4��D�G��K��O��JܤK��>��6��D��G��@��K20���H��E��1��K��/��Q��4��G��ܤK��>��6��D��@��K��E��1��?��0��;��E��1��?��0��;A?��A׆B��?��K��U��E��I��3�R��>��7��D��E��1��?��P��;��6��6��@Ԛ<;9��A׆B��?��K��U��E��I��3��>��7��D��E��1��?��P��;��6��@Ԛ<Q��E��1��?��0��;��E��1��?��0��;��1��A�?��Iַ;	��1��A��I    ���E��1��?��0��;��E��1��?��0��;ַ;��E��1��?��,��;��@Ԛ<ַ;��E��1��?��,��;��@Ԛ<��E��1��?��0��;��E��1��?��0��;53��A��U��E��1��A��Iٟ@��;��N��?�9��8��5��D��@��@��@/-��A��U��E��1��A��@��;��N��?�9��8��5��D��@��@���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F)'��A�OݰF��B��F��A����S��F��>��L��S��2 ��A�O�F��F��A����F��>��L��S���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F86ݰF��B��S��F��Q��B��J��7��6��8��T��7��QݰF��B��S��F��B&$�F��F��Q��BќJ��6��8��7��Q�F��F��B���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F ��AסE��T��/ݰF��Bٟ@��3��@Ԛ<��A��T�Fٟ@��3��@Ԛ<���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F��S��F��U��R��7��T��F��U��7��T���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J��1��1�F֎T���� ��V��>б��1��1�F֎T���� ��Vб���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J/-��S��A��S��1��1�F֎T��=��>щQ��C��E��@��@��@,*��S��A��S��1��1�F֎T��=��>щQ��C��E��@��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J)'������1��1�F֎T��=��?��N��;��7��8��K������1��1�F֎T����7��K���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J����=��?��N��;��C�;����M������C�;����M���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JJH��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��7��J��6������!A?��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��7��6�����1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J;9����1��1�F֎T��B��J��Hį-��H��U��Hڶ>��2��>��A��R��@Ԛ<;9����1��1�F֎T��B��J��Hį-��H��U��Hڶ>��2��>��A��R��@Ԛ<���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J ӪN��1��1�F֎T����E��@��@��@ӪN��1��1�F֎T����E��@��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J86��1��1�F֎T��0��3��V��C��J��7��6��8��T��7��1��1�F֎T20��1��1�F֎T��0��3��V��CќJ��6��8��7��1��1�F֎T���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J��1��1�F֎T��@��?��1��1�F֎T��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JSQ��D��R�0��7��>��I�8Ҳ0��2��A��Xڃ��N��>��1��1�F֎T��A��K��Aٟ@�H��D��P�D��APN��D��R�0��7��>��8Ҳ0��2��A��Xڃ��N��>��1��1�F֎T��A��K��Aٟ@�H��D��P�D��A���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J53��1��1�F֎T��W��"�����)����$������I��K��4��6)'��1��1�F֎T��W��"�����)������I��4���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JJH����7��&��:֎T��1��1�F֎T����T��T��T����=��?��N��;��T����������!53����7��&��:֎T��1��1�F֎T����T��T��T������T��K���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J&$��C��E��>��1��1�F֎T��@��0��=��@Ԛ<&$��C��E��>��1��1�F֎T��@��0��=��@Ԛ<���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JPN��1��1�F֎T����=��?��N��;����7��=��?��N��;��G����T��T��T������
-�� ����!.,��1��1�F֎T��������7����G����T��T��T�+���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�Jki��1��1�F֎T��K��Sħ;��S�� ��C��9��>��>��4��K��.��T��RҲ0��A��G�B�@��>�����=��?��N��;�����)��ʪ\Z��1��1�F֎T��K��S��S�� ��Cޖ>��>��4��K��.��T��RҲ0��A��G�B�@��>����������)��ʪ���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J53����&��1��1�F֎T��R��B��O��E��V����C��E��@��@��@,*��&��1��1�F֎T��R��B��O��E����C��E��@��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J,*����7��D��T��1��1�F֎T��6��U��=��?��N��;����7��T��1��1�F֎T��6�����1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J_]��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��$����U��-��CɤU��TҲ0�A��B������!YW��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��$����U��-��CɤU��TҲ0�A��B���86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CJH��X�,��1�9��CʡH��9��7�/��
-������N��W��=��H��D��E�8��K��D��G��@��K/-��X��1ʡH��9��7΂��N��W��/��D�8��K��D��@��K�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C\Z��-��A��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K�
-��:��K��4��8��?��:��T><��-��A��X��1ʡH��9��7����N��W��/��D�C�
-��:��4��8��?��:��T�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C;9��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��G����I#!��X��1��K����N��W��/��D��G����I86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�Cb`��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��K����?����I��U��>��D��E��?��T΂:��C̛<A?��X��1ʡH��9��7����N��W��/��D��E����?����I��U��D��?΂:��C̛<�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E�8��K΂:��4��T/-��X��1ʡH��9��7����N��W��/��D�8��K΂:��4��T�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��>��K��X��1��K����N��W��/��D��>�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CSQ��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��O��K��D��O��G��D��O��6��G20��X��1ʡH��9��7����N��W��/��D��O��D��G��D��6��G�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C;9��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��G����I#!��X��1��K����N��W��/��D��G����I86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CJH��X�,��1�9��CʡH��9��7�/�
-������/��@��C��H��W��D��EģC��KùB��N��L,*��X��1ʡH��9��7����N޻/��W��D�CùB��N��L�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�Cqo��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��>��K����L��2��8��A��W��T��6��O�0��UP��D��7��>��6��;PN��X��1ʡH��9��7����N��W��/��D��>����L��P��A��W��6��O�0��UP��D��7��>��6��;�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C><��D��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��K&$��D��X��1ʡH��9��7����N��W��/��D��E�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CPN��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E�/��K��I���/�/��C��T20��X��1ʡH��9��7����N��W��/��D��/��I���/��C��T�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C;9��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��G����I#!��X��1��K����N��W��/��D��G����I86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��P��K ��X��1��K����N��W��/��D��P��K�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E�8��K�/��6��T,*��X��1ʡH��9��7����N��W��/��D�8��K�/��6�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CMK��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K��:΂:��<��B��B20��X��1ʡH��9��7����N��W޻/��D�C��:΂:��<��B��B�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��P��K��G��D��O��6΂:��P��6��G;9��X��1ʡH��9��7����N��W��/��D��P��K��G��D��6΂:��P��6��G���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6#!��N��B��-��<��6��6�O��D��S�D��A��N��-��<��6��6��D��S�D��A���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��6><��-��I��6��6�O��E��6��0��FǂS��H��A��V��T��J��D��8��D��A��P/-��-��I��6��6��E��6��FǂS��H��A��V��T��D��8��A���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6#!��N��B��-��<��6��O��C��8��A��9��9��N��-��<��6��O��C��8��A��9���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��6��<��C��6��=�R��J��<��C��6�R��J���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6)'��D��-��I��H��D��6��/��E��6��-��1��1��6)'��D��-��I��H��D��6��/��E��6��-��1��1��6���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��6 кB��-��<ԋ/��C��6��6��JƱC��TкB��-��<ԋ/��C��6��6��JϱC���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6,*��N��B��-��<�I��6��C��?����6��P����6��T&$��N��-��<�I��6��C��?����6��P����6���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��686��5��-��<��H��C��?��D��A��P��;��0��T��?��6��T��)����!&$��5��-��<��H��C��?��A��;��T��6��T��)���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/53��4��U��İU��7�/��5�.��W��@ßN��W��F�/��U��I��T20��4��U��İU��7�/��5�.��W��@ßN��W�/��U��I��T���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐWН?,*��4��U��İU��7�/��5�.��W��@ßN��W�/̐W���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/;9��U��İU��7�/�.��W��@ßN����ۏ"����(����!��U��E��T(&��U��İU��7�/�.��W��@ßN�[��U��E��T���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��9��TŇ7��I��T/-��4��U��İU��7�/��5�.��W��@ßN��9��TŇ7��I���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/;9��4��U��İU��7�/��5�.��W��@ßN��0��W��F�9��G��3��Q��T86��4��U��İU��7�/��5�.��W��@ßN��0��W�9��G��3��Q��T���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/,*��4��U��İU��7�/��5�.��W��@ßN��W��F�/)'��4��U��İU��7�/��5�.��W��@ßN��W�/���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/,*��4��İU��7�/��5��:��S��9İUН?̛<��7��T&$��4��İU��7�/��5��:��S��9İU��?��7���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/��R��>Н?��T	��R��>ܞ?���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�//-����U��İU��7�/���.��W��@ßN�1��T��7̛<,*����U��İU��7�/���.��W��@ßN��1��7̛<���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/53��4��İU��7�/��5��:��S��9İU��:��4��K����"����!,*��4��İU��7�/��5��:��S��9İU��:��4��K��"���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/;9��4��U��İU��7�/��5�.��W��@ßN��W��F�?�9��G��HН?��T20��4��U��İU��7�/��5�.��W��@ßN��W�?�9��G��/���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/GE����U��İU��7�/�.����W��@ßN��W��F�/ɴ9Н?��Tɴ9ʡH��9��?�/��T;9����U��İU��7�/�.����W��@ßN��W�/ɴ9ܞ?ɴ9��9�/��T���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/#!��4��U��İU��7�/��5�.��W��@ßN#!��4��U��İU��7�/��5�.��W��@ßN���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/86��4��U��İU��7�/��5�.��W��@ßN��W��F�/̝5̛<��Q��T20��4��U��İU��7�/��5�.��W��@ßN��W�/̝5̛<��Q���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/53��T��1��4��U��İU��7�/��5�.��:��S��9İU��W��Q��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��W��Q���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/A?��4��U��İU��7�/��5�.��W��@ßN��W��F�/��E��:��Tɴ9��:��T�;86��4��U��İU��7�/��5�.��W��@ßN��W�/��E��:ɴ9��:�;�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J20��K��F��Eڶ>��FˎW��B��D��I��K��T�����)��ʪ/-��K��F��Eڶ>��FˎW��B��D��I��K�����)��ʪ�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J&$��Sį-��K��>��J���N��T���C��T��T#!��Sį-��K��>��J�ϞN���C��T��T�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J кB��6��Sį-��K��I��K��T��:��KкB��6��Sį-��K��I��K��:��K�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J#!��;��Kʗ,��/��Sտ7��P��C��@��;��B ��;��Kʗ,��/��Sտ7��P��C��;��B�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J20��3��B��B��D��K��6��S��9��A��@��S��@��0��6��0��T20��3��B��B��D��K��6��S��9��A��@��S��@��0��6��0��T�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J/-��;��Kʗ,��/P��L��>��C��B��F�R��K�A��K��B,*��;��Kʗ,��/P��L��>��C��B��F�R��K�A��K�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J)'��;��Kʗ,��/��S��K��D͙7��I��R��N͙7��T&$��;��Kʗ,��/��S��K��D��I��R��N͙7��T�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J#!��S��Kб��J��7��6��8��T��7��U��>��S��KбќJ��6��8��7��U��>���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;��,��B��R��/��>��4��7��,��B��R��/��>��4��7���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;;9ѹ6��7��,��BƸ=��D��J��7��.��K��/��B��9��A��=��B��@��@��@&$չ6��,��BƸ=��D��J��*��/��B��A��@��@���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;;9��R��/��B��,��B��.��P��԰��'��0��V��A������Uѹ6��F��G,*��R��/��B��,��B��.��P��0��V��A��"��Uݹ6��G���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;zx��R��/��,��B��C��M��R��/��@��B��B�R��-��P��2�K��O��N��J��7��6��8��T��7��;��2��/ޟEŮ<��N��6��9��A��=��Bǭ;��HΆO��-��5_]��R��/��,��B��C��M��R��/��@��B��B�R��-��P��2�K��O��NќJ��6��8��7��;��R��6��N��A�HΆO��-��5���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;86���
-��>ß1��QİL��	��R��/Ξ)��,��B��WβI��3��I��@��K/-��>ß1��QİL��	��R��/Ξ)��,��B��WβI��3��I��@���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;\Z��(����<��7����N��6����B����=��G�;��3��>��7����K���������������������#!��<����K����������������R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;)'��N��6��@��4��,��B��H��A��R��/��D��@Ԛ<&$��N��@��4��,��B��H��A��R��/��D��@Ԛ<���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;/-��R��/��,��B��@��Hٟ@ʜ2��I��A��N��6��@��@��@)'��R��/��,��B��@��Hٟ@ʜ2��I��A��N��@��@���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;,*��/��>��,��B��J��>��,��B��J��>��,��B��A��B,*��/��>��,��B��J��>��,��B��J��>��,��B��A��B���B��7��8��;��U��B��8��;��U��B��;��U��>��C��@��K��B��;��U��>��C��@��K��B��7��8��;��U��B��8��;��U,*ʡH��9��=��7��B��;��U��>��C��E��J��<��B��B,*ʡH��9��=��7��B��;��U��>��C��E��J��<��B��B���B��7��8��;��U��B��8��;��U ��B��;��U��>��C��7��C��<��B��B��B��;��U��>��C��7��<��B��B��B��7��8��;��U��B��8��;��U53��B��7��;��U��S�L��8��7��B��9��7��7��	��N����H��&$��B��;��U��S�L��8����	��N����H�����B��7��8��;��U��B��8��;��U&$��B��7��;��>��8��N��@���>��;��G��B ��B��;��>��8��N��@���>��;��G��B��7��8��;��U��B��8��;��U,*��B��7��;��>��8��N��@Ɓ-��6��7Ɓ-��6��H��T#!��B��;��>��8��N��@ȁ-��7ȁ-��H��T���B��7��8��;��U��B��8��;��U��7��B��;��U��>��C��8�,��T��7��B��;��U��>��C��,��T��B��7��8��;��U��B��8��;��U��B��;��U�L��C��8�,��T��B��;��U�L��C��,��T���B��7��8��;��U��B��8��;��U��B��;��U��>��C��@��K��B��;��U��>��C��@��K��B��7��8��;��U��B��8��;��U)'��7��B��;��U��>��C��B��U��8��J��<��B��B)'��7��B��;��U��>��C��B��U��8��J��<��B��B���B��7��8��;��U��B��8��;��U ��B��;��U��>��C��7��C��<��B��B��B��;��U��>��C��7��<��B��B��B��7��8��;��U��B��8��;��U,*��B��7��>��;��U��N��8��C��.��V��I��<��7��; ��B��>��;��U��N��8��C��.��I��7���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB��J��.ʭB�/��@��?��J��.ϭB��@���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB)'��J��.ʭB�/��L����F��;��F��?��8�,��T ��J��.ϭB��L����F��;��F��,��T���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB,*��J��.ʭB�/��L����F��U��O��'��G��N��O��C&$��J��.ϭB��L����F��O��'��G��N��O��C���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB,*��J��.ʭB�/��L����F��U��O��L��B��<��B��B&$��J��.ϭB��L����F��O��L��B��<��B��B���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB��J��.ʭB�/��>��L��J��.ϭB��>��L���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB53��J��.ʭB�/��8��N��J��.ʭB�/�G��>��9��8��F��>��T,*��J��.ϭB��8��N��J��.ϭB�G��>��9��8��F��>���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB/-��J��.ʭB�/��8��I��C¨0��3��?��;��9��<��>��T��J��.ϭB��8��IϨ0��-��<��>���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB��J��.ʭB�/�;��J��6��J��.ϭB�;��J��6���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=/-��?��;��8��W��B��=��&��;��WɾS��2��S��C��I��9)'��?��;��8��W��B��=��&��;��W��2��S��C��9���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=/-��H޽B��;��8��A��E��0��W��B��=щQ��U��P��.��T,*��H޽B��;��8��A��E��0��W��B��=щQ��U��P��.���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=JH��W��B��R��P��I��9��=��5��0��<��G��0��G��8��7��W��G��Q��D��2��G��OŒA��TDB��W��B��R��P��I��=��5��0��<��G��0��G��8��7��W��G��Q��D��2��GŒA��T���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=20��P��G��,��D��N��G��8��0��6��W��B��=��C��=��S��7,*��P��G��,��D��N��G��8��5��W��B��=��C��S��7���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=GE��W��=��D��,��?��R��;��G��0��G��8��D��N��@��W��G��7ӽD��I��E��CӽD��I><��W��=��D��7��R��;��G��0��G��8��D��N��@��W��GӽD��I��EӽD��I���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=20޽B��R��0��W��B��>��=�M��>��I��?��;��8щQ��@Ԛ<20޽B��R��0��W��B��>��=�M��>��I��?��;��8щQ��@Ԛ<���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=)'��;��8��0��W��B��=��D��>щQ��D��S�D��A&$��;��8��0��W��B��=ӗ>щQ��D��S�D��A���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=JHԓ4��5��9��D��0ԓ4��B��=����S��R��J�>��E��;��8��6��S��T�!����!����";9ԓ4��5��D��0ԓ4��B��=����S��R��J�>��E��;��8��6��S��T��X���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=#!��;��8��>��E��6��Q��W��B��=��@��N ��;��>��E��6��Q��W��B��=��@��N���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=��R����8��G��8��>��=��>��P��R����8��G��8��>��=��>��P���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=VTԓ4��5��9��D��0ԓ4��B��=��O��<��G��BǄP�B�@��;��8��>׽R��G��6��S��T�!����!����"DBԓ4��5��D��0ԓ4��B��=��O��<��G��BǄP��B��;��8��>׽R��G��6��S��T��X���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=DB��G׫;��@��2��>��H��8��G�K��0��G��8��W��B��=��F��?��H��G��,��H��,DB��G׫;��@��2��>��H��8��G�K��0��G��8��W��B��=��F��?��H��G��,��H��,���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=\Z��7��W��C����Ȼ����������2��2��H��G��/��C��N�K��0��8��W��=ߌ,��3��=��G��Gև9��>��TYW��7��W��C����Ȼ����������2��2��H��G��/��C��N�K��0��8��W��=ߌ,��3��=��G��Gև9��>���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=����H��$��,��G��G��8��8��W��-��B��G��H��H��H��$��,��G��G��8��8��W��-��B��G��H��H��Q��H��$��,��G��G��8��8��W��-��B��G��H��H��D��E��T��L��B��L��=��,��K����H��$��,��G��G��8��8��W��-��B��G��H��H��$��,��G��G��8��8��W��-��B��G��H��Q��H��$��,��G��G��8��8��W��-��B��G��H��D��E��T��L��B��L��,��K���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=86��G��,��D��N��G��8��0��6��W��B��=��C��=��Pֈ;̛<��A��T/-��G��,��D��N��G��8��5��W��B��=��C��Pֈ;̛<��A���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=DB��T��C��R�J��G��<��8��Q��G��8��O��6��0��G��6��U��<��8��Gڶ>��S��=86��C�J��G��<��8��Q��G��8��O��6��0��G��6��<��8��G��S��=���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKǄP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=DB��S��9��I��/��C��D��<��8�J��Gԓ4��G��W��B��-��R��N��=��
-��K��F�7DB��S��9��I��/��C��D��<��8�J��Gԓ4��G��W��B��-��R��N��=��
-��K��F�7���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��20��������Ͳ��4ʉ5��/��%��D�H��G��A��A��O��C��4ˉ5��%��D�H��A��A��O��C���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5�� ��������ʉ5ޚT��D��G��@��K����5��D��@��K���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��&$��������ۚKʉ5��R��G̛<��"����&��ۚK݉5��G̛<��"���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��ʉ5����������8�,��Tʉ5����,��T���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��20��7������������ʉ5ޚT��4��L��/ȈX��<��B��B��7��5��4��L��/ȈX��<��B��B���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��,*����������H���6��=��>ʉ5��B��-��A��B#!����H���6��=��>ʉ5��B��-��A��B���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��;9��������ۚK��4ʉ5��G��8��O��E��>έ;��L�S��DʡH��9�;,*��ۚK��4ʉ5��G��O��E��>٭;�S��DʡH��9�;���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��#!��@����������>ʉ5��D��S�D��A��@����>ʉ5��D��S�D��A���������ʉ5��	��ʉ5��ʉ5����������@��Kʉ5����@��K��������ʉ5��	��ʉ5��&$������$��6����������6ʉ5��@Ԛ<������$��6����6ʉ5��@Ԛ<�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6��@��K��-��;��@��K��-��;�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��653��.��H�B��@��M��6��4��A��6�O��I��0щQ��U��P��.��T/-��.��H�B��@��M��6��4��A��6��I��0щQ��U��P��.�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6><��.��H�B��@��M��6��4��A��6�O��H��A��V��T��J��D��8��D��A��P20��.��H�B��@��M��6��4��A��6��H��A��V��T��D��8��A�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6)'��.��4�9�B�3��I��6�O��F��U��P��U��T#!��.��4�9�B�3��I��6��F��U��P��U�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6A?ڤ5��5��D��>��.��1�B��@��D��4��A��=��������@��6�O��G��;��P20ܤ5��D��>��.��1�B��@��4��A��=����@��6��G��;��P�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6GE��.��J��S��=��H�B��@��D��H��4��A��D��A��P��;��0��T��?��6��T��)����!,*��.��S��H�B��@��H��4��A��A��;��T��6��T��)�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��620��.�B��K��M��4��A��H��A��V��T��J��D��8��D��A��P)'��.�B��K��M��4��A��H��A��V��T��D��8��A�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��.��H�B��@��D��4��A��=��6�O��G��U��P��9��T#!��.��H�B��@��4��A��S��G��U��P��9���@��G��M��T	��@��G��M><��VܞN��T��>�� ���� ��B�IɤU��1��.��@��G��M��T��C�3��G��9/-��V��N��>��B�IɤU��1��.��@��G��M��C�3��G��9��@��G��M��T	��@��G��MDB��@��G��.��M��T��A�/��B��@��G��.��M��T��Q��8ޚT��N��G��K��T��O��T,*��@��G��6��A�/��@��G��6��Q��8��+��K��T��O���@��G��M��T	��@��G��M;9��@��G��.��M��T��,��;��M��T��7��3��;��E��=��5��7��T��Iַ;)'��@��G��6��,��;��M��7��;��E��5��7��T��I��@��G��M��T	��@��G��M)'��@��G��@��M��T�/��-��5��6��P��9�?ַ;#!��@��G��@��M�/��-��5��6��P��9��?���@��G��M��T	��@��G��M����@��G��.��M��T��G��@��=��@��G��M��T��.��@��M��T��C�3��G�3��G��9ܞN��T��T��O��C�3��G��9��8��Iַ;��@��G��.��M��T��G��@��=��@��G��M��T��.��@��M��Tki��@��G��6��G��=��@��G��M��.��@��M��C�3��G�3��G��9��N��T��C�3��G��9��I��@��G��6��G��=��@��G��M��.��@��M��@��G��M��T	��@��G��M86��@��G��.�4��@ϚL��4��M��T�;��M�4߹-��W��Hԓ6��Iַ;&$��@��G��.��@��4��M��6߹-��W��Hԓ6��I���@��G��M��T	��@��G��M&$��@��G��.��M��T߹-�5��T��O��O��Iַ;��@��G��6߹-�5��T��O��I��@��G��M��T	��@��G��M����.��@��M��T��.��M��T��@��M��TܞN��D��>��.��M��T��E��=��.��M��T��=��.��M��T�IϪJ��1��.��M��@��G��.��@��M��T��D��C�3��G��9��8��Iַ;\Z��.��@��M��6��@��M�N��>��6��E��=��6��=��6�IϪJ��1��.��M��@��G��.��@��M��D��C�3��G��9��I���@��G��M��T	��@��G��M&$��.��M��@��G��M��T�J��-��U��@ؙD��T#!��.��M��@��G��M�J��-��U��@ؙD��T��@��G��M��T	��@��G��M)'��'��=��.��@��G��M��T��I��B��.��<��B��B#!��'��=��.��@��G��M��I��B��<��B��B���@��G��M��T	��@��G��M\Z��.��M��@��G��M��T��.��M��T��.��@��M��T��@��M��T��E��M��T��=��.��M��T��C�3��G��9��8��Iַ;><��.��M��@��G��M��6��.��@��M��@��M��E��M��=��6��C�3��G��9��I��@��G��M��T	��@��G��M/-��@��G��=��@��G��.��M��T��=��.��M��T�I��G��@ ��@��G��=��@��G��6��=��6�I��G���@��G��M��T	��@��G��M><��VܞN��T��>�� ���� ��B�IɤU��1��.��@��G��M��T��C�3��G��9/-��V��N��>��B�IɤU��1��.��@��G��M��C�3��G��9��@��G��M��T	��@��G��MDB��M��U��@��G��@��M��T��@��M��T��M��T��M��U��,��H��P��5ѳBʈF��P��?53��M��@��G��@��M��@��M��M��M��,��H��P��5ѳBʈF��P��?���@��G��M��T	��@��G��M;9��@��G��.��M��T��,��;��M��T��7��3��;��E��=��5��7��T��Iַ;)'��@��G��6��,��;��M��7��;��E��5��7��T��I��@��G��M��T	��@��G��M53��@��G��.��M��Tַ;��@��G��.��M��T��D��,��B��Pַ;Υ6&$��@��G��6ַ;��@��G��6��D��,��Pַ;Υ6c    ��I��6�;��0ڳQ	��+��0ڳQ ��I��6�;ٟ@�9ٟ@��0��A��@Ԛ<��+��9��0��A��@Ԛ<K    ��I��6�;��0ڳQ	��+��0ڳQ��I��6�;��-��N	��+��-��N�    ��I��6�;��0ڳQ	��+��0ڳQ20��I��6�;��0ʭBќ:��-��W��I��6�;��I��6��>��S��2&$��+��0ʭBќ:��-��W��I��6��I��6��>��So    ��I��6�;��0ڳQ	��+��0ڳQ&$��U��I��I��6�;��-��N�1��D��@��@��@��U��I��+��-��N�1ځD��@�    ��I��6�;��0ڳQ	��+��0ڳQSQ��������I��6�;��0��9��6�W��I��-��:��P��U��PޜF��T��IP��R��M��T��I��6ޜF��6JH��������+��0��9��6�W��I��-��:��P��U��PޜF��T��I��R��M��T��I��6ޜF��6�    ��I��6�;��0ڳQ	��+��0ڳQA?��Q��2�?��E��C��=��E��@��.��=��9�Q��C��B��9�Q��C��ͦ(����!)'��Q��2�?��E��C��=��@ƋQ��C��BƋQ��C��i    ��I��6�;��0ڳQ	��+��0ڳQ ��.��I��W��I��6�;��8�T��A��B��.��I��W��+��8�T��A��B�    ��I��6�;��0ڳQ	��+��0ڳQ86��I��6�;��6��U��=�9��=��>��C�<ʡH����6��I��H�<��T&$��+��6��9��>��C�<ʡH����6��I��H��T���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;GE��<��M��N��L��;��;��T��B��T��4��B��T��/��R��6��G��U��K��P��9��PگD��T><��<��M��N��L��;��;��B��4��B��T��/��R��6��G��U��K��9��PگD��T���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;53ä=��F��9��E��N��L��<��M��N��L��M��T��M��=��E��P��>,*ä=��F��B��N��L��<��M��N��L��M��T��M��E��P���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;����.��M�@��D��>��3��PϪJ��B��E҄J�J��9��R��>�9ֈD��C��S��W��9ٟ@��1��9��2��D��>��9��E��<��M��N��L��A��M��7��S�9��=�>��D��>��9��@��S��6��;��,��D��P��>��=��/��U��P��.��T����.��M�@��D��>��3��PϪJ��B��EԄJ��9��R��>��D��C��S��W�@��1��9��2��D��>��B��<��M��N��L��A��M��S��=�>��D��>��9��@��S��6��;��,��D��P��>��=��/��U��P��.���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;YW��J��9��E��<��M��N��L����5��?��J����7��7��E��B��=����H��Q��2��8����@ǆ9��V��T��P��HSQ��J��B��<��M��N��L����5��?��J����7��7��B��=����H��Q��2��8����@ǆ9��V��T��P��H���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;JH��D��9��>��6��E��>��<��M��N��LëO��8��2޽B��4��;щQʡHб��6��T��H�<��TGE��D��9��>��6��E��>��<��M��N��LëO��8��2޽B��4��;щQʡHб��6��T��H��T���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;#!��9��EʕV��<��M��N��LʕV��6��@Ԛ< ��BʕV��<��M��N��LʕV��6��@Ԛ<���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;PN��N��A��=��<��M��N��L��;��;��T�Rڶ>��S��E��>��C��=��,��B��/��7Ȼ;��T��=��.��LGE��N��A��=��<��M��N��L��;��;��T�R��S��E��>��C��,��B��/��7Ȼ;��T��=��L�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G53��>��W��5��CȥW��G��8��E��<����=��?��N��;M�8��T)'��>��W��5��CȥW��G��8��E��<����M��8�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��Ghf����;��>����>��WȥW��,��:��K��>��;����=��?��N��;����7��=��?��N��;��G����T��T��T������
-�� ����!FD����;��>����>��WȥW��,��:��K��>��;��������7����G����T��T��T�+�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>��W��5��C����WȥW��G��8��E��<��=��?��N��;����T��T��T��G�8̛<86��>��W��5��C����WȥW��G��8��E��<������T��T��T��G��8�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G20��ȥW��>��W��2��G�/��I֣.ŞG��9�/��;����7�;20��ȥW��>��W��2��G�/��I֣.ŞG��9�/��;����7�;�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G20����>����>��Q��R��@��8��S֗T��7��ȥW��@��@��@/-����>����>��Q��R��@��8��S֗T��7��ȥW��@��@�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G20����>��ȥW��S��8��D��0��;����T����=��?��N��;)'����>��ȥW��S��8��D��0��;����T���������ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G#!��ȥW��ȥW��K��ȥW��,��:ĝ�� ��ȥW��ȥW��K��ȥW��,��:؝�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G_]��N��9��U��L��=��>˾3��ȥW��>����G��/��N��Iǡ6����T��T��T����=��?��N��;��T����������!DB��N��U��=��>˾3��ȥW��>����G��/��N��Iǡ6����T��T��T������T��K�	��?�J�=	��?�J�=)'��V��H��D�J��>��4�=��5��D�3Ȼ;��>��T ��V��D��>��4�=��5��D�3Ȼ;��>	��?�J�=	��?�J�=��D�J��>��?��=��D�J��>��?��=}	��?�J�=	��?�J�=��?�J�=��Uа.��T��?�J�=��*	��?�J�=	��?�J�=�J��?��,��=�J��?��,��=�	��?�J�=	��?�J�=;9��?��E�J�=׍Q��7��E��7��0��	��NʡH��
-��H��0��6��4��T��53��?��E�J�=׍Q��,��0��	��NʡH��
-��H��0��6��4��T��	��?�J�=	��?�J�=��D�J��>��?��=��GĊA��>��T��D�J��>��?��=��GĊA��>�	��?�J�=	��?�J�=/-��D��9��D��D�G��?��>�J��>��,��N��D��S�D��A#!��9��G��?��>�J��>�=��D��S�D��A	��?�J�=	��?�J�=��?��E�J׍Q��D��G��@��K��?��E�J׍Q��D��@��K�	��?�J�=	��?�J�=��D��/��F��;	��D��F��;	��?�J�=	��?�J�=,*��D�J��>��?��=��E��?��N��K��L��F��9��@��K)'��D�J��>��?��=��E��?��N��K��L��F��9��@�	��?�J�=	��?�J�=��?��=��E�J�=׍Q��P��B��6��?��=��E�J�=׍Q��P��B	��?�J�=	��?�J�=����;��?��1��K��E�J��>�=׍Q��C��P��D��C��K��9��K��>ٟ@���9��@��9��W��>��4��R��/ҾW��B��1��.�O��>��N����B�9��K��J��K��>��N�9͝,ڪ3��.��WȻ��B�D��E��A¶7ģC��:��Q����;��?��1��K��E�J��>�=׍Q��C��P��D��C��9��>ٟ@���9��@��9��W��>��4��R��/ҾW��B��1��.�O��>��N����B��8��J��>��N�9Ν,��.��WȻ��B�D��E��NģC��:��Q�	��?�J�=	��?�J�=20����?����>��?�J��>��,��N��1��6��6��=��=��@Ԛ<)'����?����>��?�J��>�=��1��6��=��@Ԛ<	��?�J�=	��?�J�=��D�J��>��?��=��@��K��D�J��>��?��=��@��K�	��?�J�=	��?�J�=><��D�J��>��?��=��4��F��S��CܞN��/����O��������J��-��0��E/-��D�J��>��?��=��4��F��S�N����O����J��7��E	��?�J�=	��?�J�=��?�J�=��4Н?��A��3��A��T��?�J�=��4��A��A�	��?�J�=	��?�J�=)'��V��H��D�J��>��4�=��5��D�3Ȼ;��>��T ��V��D��>��4�=��5��D�3Ȼ;��>	��?�J�=	��?�J�= ��?��E�J�=׍Q��F��K��	��A��B ��?��E�J�=׍Q��F��K��	��A��B�	��?�J�=	��?�J�=��?�J�=��Uа.��T��?�J�=��*	��?�J�=	��?�J�=)'�J��>��?�=ʡH۩R��V��-��T��.��6��.��T&$�J��>��?�=ʡH۩R��V��-��T��.��6��.�	��?�J�=	��?�J�=;9��?��E�J�=׍Q��7��E��7��0��	��NʡH��
-��H��0��6��4��T��53��?��E�J�=׍Q��,��0��	��NʡH��
-��H��0��6��4��T��	��?�J�=	��?�J�=20��U��W��X�=��6��?��K��J�J�=��3��WН?��>��A��T,*��U��W��X�=��6��?��K��J�J�=��3��W��?��A�	��?�J�=	��?�J�=/-��D��9��D��D�G��?��>�J��>��,��N��D��S�D��A#!��9��G��?��>�J��>�=��D��S�D��A	��?�J�=	��?�J�=86��4��?߸3ѝ6��B��5��-��0��I�J��?߸3�=��=��I̛<��Q��T20��4��?߸3ѝ6��B��5��0��I�J��?߸3�=��=��I̛<��Q�	��?�J�=	��?�J�=��D��/��F��;	��D��F��;	��?�J�=	��?�J�=)'��D�J��7��?��>��=����F��>��>��@��>��T#!��D�J��7��>��=����F��>��>��@��>���I��F��E��T��>��I��F��T��>_]��I��F��E��7��1��U��C��5�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��TVT��I��F��7��1��U��C�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��I��F��E��T��>��I��F��T��>&$��1��F��E��W��K��.��W�K��C��:��E��T��1��F��W��K��W�K��C��:��E���I��F��E��T��>��I��F��T��>,*��I��F��E��D��6��A��S��1��F՟?��>��>��@Ԛ<#!��I��F��D��6��Aū1��?��>��>��@Ԛ<��I��F��E��T��>��I��F��T��>;9��I��F��E��A��W̋?�6��F��F��1��U�K��>�6�2��6��:��:��@20��I��F��A̋?�6�.��1��U�K��>�6�2��6��:��:��@���I��F��E��T��>��I��F��T��>_]��I��F��E��7��1��U��C��5�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��TVT��I��F��7��1��U��C�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��I��F��E��T��>��I��F��T��>&$��I�K��M��F��E��-��>��CϨH��Q��R��T��I�K��M��F��-��CΨQ��R��T���I��F��E��T��>��I��F��T��>,*��I��F��E��D��6��A��S��1��F՟?��>��>��@Ԛ<#!��I��F��D��6��Aū1��?��>��>��@Ԛ<��I��F��E��T��>��I��F��T��>20��I��F��E��D��6��A��S��1��F՟?��>��>��D��S�D��A)'��I��F��D��6��Aū1��?��>��>��D��S�D��A���I��F��E��T��>��I��F��T��>_]��I��F��E��7��1��U��C��5�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��TVT��I��F��7��1��U��C�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��I��F��E��T��>��I��F��T��>53��F��E��>��>��M��*��ɬI��*��I��*��5��5��T��H�<��T,*��F��>��>��M��*��ɬI��*��I��*��5��T��H��T���I��F��E��T��>��I��F��T��>,*��I��F��E��D��6��A��S��1��F՟?��>��>��@Ԛ<#!��I��F��D��6��Aū1��?��>��>��@Ԛ<��I��F��E��T��>��I��F��T��>53��H����F��E��>��>��@��I��U��>��J��-��F�>��T��L��P20��H����F��>��>��@��I��U��>��J��-��F�>��T��L��P���I��F��E��T��>��I��F��T��>_]��I��F��E��7��1��U��C��5�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��TVT��I��F��7��1��U��C�6�K��7��W��E��>�V��W��A��7��5��S��J��S��2��4��.��@��7�Uև9��>��I��F��E��T��>��I��F��T��>20��IP��=��E��>��>��F��E��D��H��>��Q��I��B��,ܔN)'��I��=��E��>��>��F��D��H��>��Q��I��B��G���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��:��O��;��4��P��@Ԛ<��:��;��4��P��@Ԛ<���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;�
-��:��O��;��W��L�/��?��T�
-��:��;��W��.���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��O��:��4��;��D��G��@��K��O��:��;��D��@��K���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O�D��>��;��@��K��:�D��>��;��@��K���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��D��O��;��2	��D��;��2���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��; ��:��O��;��2��,��L��D��G��@��K��:��;��2��,��D��@��K���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��:��O��;��2	��:��;��2���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O��;��2��8�,��T��:��;��2��,��T���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;&$����:��OƔ>��;��2�1��E��T��!����!����:Ɣ>��;��2�1��E��T�����D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O��;��2��8�,��T��:��;��2��,��T���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��; ��P��:��O��8��;��:��I̺@��:��T��P��:��8��;��:��@���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;#!��:��O��;��J��:��O��4��9��7��4��T��:��;��J��:��4��7��4��T���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��D��O��;��2��:��T��D��;��2��:��T���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O��;��2��D��G��@��K��:��;��2��D��@��K���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;��:��O��;��2��7��C��<��B��B��:��;��2��7��C��<��B��B���D��E��O��;��D��E��O��; ����N��:��O��;��B��F��8��@��K����N��:��;��B��F��8��@��K��D��E��O��;��D��E��O��;��:��O��;�I��@��K��:��;�I��@��K���D��E��O��;��D��E��O��;��:��O��;��J��@Ԛ<��:��;��J��@Ԛ<��D��E��O��;��D��E��O��;#!��:��O��KخG��5��K��;��D��G��@��K��:��KخG��5��K��D��@��K���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?&$��/��I��M��T��S��;ͺ?ٟ@��6��A�7��B��I��T��S��;ͺ?��5��+���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?&$��(��T��S��;��>��6��/��I��M��@��@��@��(��T��S��;��>��6��I��@��@���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?86��/��I��M��T��R��;��>��>��V��Bͺ?�C��7�=��V��-��A��B)'��I��T��R��;��>��>��Bͺ?�C��7��V��A��B���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?��(��T��S��6��4ͺ?��(��T��S��6��4ͺ?���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ? ��/��I��M��T��;ͺ?��D��S�D��A��I��T��;ͺ?��D��S�D��A���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?A?��/��I��M��P�D��;��Fͺ?��M��7��K��/��1�I��-�I��-������@Ԛ<53��I��P�D��;��Fͺ?��M��K��/�I��-�I��-������@Ԛ<���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ? ��/��I��M��F̽>��S��6��>��N��B��I��F̽>��S��6��>��N��B���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?53��;ͺ?�9��T��.��/��I��/��J��@��/��T��A��/��I��M��T,*��;ͺ?�9��T��.��I��/��J��@��/��T��A��I��T���(��T��;ͺ?��(��T��;ͺ?��(��T��R��;��>ͺ?��@Ԛ<��(��T��R��;��>ͺ?��@Ԛ<��(��T��;ͺ?��(��T��;ͺ?86��/��I��M̺ٟ@��6ʔ7��;��Vͺ?��2��(��/��I��M����I��@)'��I̺��5ʔ7��;��Vͺ?��2��(��I����I��@��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8&$��U��J�G��>�S��I��B��E��U��3��H��8��U�G��>�S��I��B��8��H��8��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8zx��7��HܞN��D�G��>�S��E��U��7��HܞN��D�G��>�S��E��U��Q��7��HܞN��D�G��>�S��E��U��D��E��T߹-��8��Lԓ6��Iַ;��C��=��.b`��7��H�N�G��>�S��8��7��H�N�G��>�S��8��Q��7��H�N�G��>�S��8��D��E��T߹-��8��Lԓ6��I��C��=��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8ki�/��K��@�G��>�S��E��U�S��T��S��U��Q��=��W��B�S��E��U�S��I��B��E��U߹-��=��E��M��S��Iַ;��B��U��1��TPN�/��K�G��>�S��8�SŘ<��Q��=��W��B�S��8�S��I��B��8߹-��=��E̠M��I��B��U��1��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8DB��-ܞN��D�G��>��!������)�S��E��U��Q��-����Q�;ۓR��T��C��G�0/-��-�N�G��>�S��8��Q��-����Q�;ۓR��C��G�0��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��886��R�N��U�G��>�S��E��U��I��B�S��E��U��)����:�/��B#!��N�G��>�S��8��I��B�S��8��:�/��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8/-ܞN��D�G��>�S��I��B��E��U�;�S��I��B��E��U&$�N�G��>�S��I��B��8�;�S��I��B��8��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8SQ��-ܞN��D�G��>�S��E��U��Q��D��2��V��Fȣ8��4�X��I��UҔB��<֗T��I�7��Iַ;ŒA��TJH��-�N�G��>�S��8��Q��D��2��V��Fȣ8��4�X��I��UҔB��<֗T��I�7��IŒA��T��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��8����P����P��G��>�S��E��U�G��>�S��8/-��L�4��4ȣ8�G��>��E��U�S��I��B��E��U̍�� ��L��4ȣ8�G��>��8�S��I��B��8�G��>�S��E��U�G��>�S��820��D�G��I��B��E��U�S��E��U��V��;��E��U��B��E��U#!��D�G��I��B��8�S��8��V��8��B��8� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>,*��B��D��C��D��9��4��>��,��6�O��D��P�D��A&$��B��D��R��9��4��>��,��6��D��P�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>#!��B��C��D��O��9��4��>��6�O��@Ԛ<��B��R��O��9��4��>��6��@Ԛ<� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>DB��D��B��D��>��C��D��.��N��A��>��%��>��R��6��Iٟ@��9��7��D��S�D��A53��B��>��R��.��N��A��>��%��>��R��6��@��4��D��S�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>20��D��C��D�9�7��U��D��E��4��Oٟ@��6��A��A�7��B ��D��R��5��U��D��M��Oٟ@��6��+� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>,*��B��D��C��D��9��4��>��,��6�O��D��P�D��A&$��B��D��R��9��4��>��,��6��D��P�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>PN��D��B��C��D��Iٟ@��9��6��4��E��>йS��D��K�9ٟ@��9��S��M��>��B�U��-щQ��@Ԛ<><��D��B��R��@��5��4��E��>޹S @��9��S��M��>��B�U��-щQ��@Ԛ<� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>DB��D��B��D��>��C��D��.��N��A��>��%��>��R��6��Iٟ@��9��7��D��S�D��A53��B��>��R��.��N��A��>��%��>��R��6��@��4��D��S�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>PN��O��D��6��>��D��=��7��A��D��B��D��C��D��=�9�>��D��Iٟ@��O��D��2�O��@��@��@><��O��D��6��>��D��=��7��A��B��R��9�>��D��@��O��D��2�O��@��@� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>,*��B��D��C��D��9��4��>��,��6�O��D��P�D��A&$��B��D��R��9��4��>��,��6��D��P�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>b`��D��B��D��C��D��Cٟ@��9ɤK��E��7��>��RɤK�/ϪJ��>��H��=��Q��9��9��6ɤK��A��>��A�9��1��0��T��DPN��B��R��@��9ɤK��E��7��>��RɤK�/ϪJ��>��H��=��Q��9��5ɤK��A��>��A��1��0��T��D� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>DB��D��B��D��>��C��D��.��N��A��>��%��>��R��6��Iٟ@��9��7��D��S�D��A53��B��>��R��.��N��A��>��%��>��R��6��@��4��D��S�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>_]��D��B��C��D��N��5�9��O��H��3��4��8��B��D��4��R��4��O��@��4��W��OŮP��O��4�/��T��D����O��TDB��D��B��R��N��5��O��3��8��B��D��4��M��O��@��4��W��X޵+��T����O��T� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>,*��B��D��C��D��9��4��>��,��6�O��D��P�D��A&$��B��D��R��9��4��>��,��6��D��P�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>#!��B��C��D��9��4��>��A��6�O��@Ԛ<��B��R��9��4��>��A��6��@Ԛ<� ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��>DB��D��B��D��>��C��D��.��N��A��>��%��>��R��6��Iٟ@��9��7��D��S�D��A53��B��>��R��.��N��A��>��%��>��R��6��@��4��D��S�D��A ��D��B��C��D��Iٟ@��9��6��4��>��D��B��R��@��5��4��> ��U��C��D��9��4��>��A��6��?��,��U��R��9��4��>��A��6��?��,� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PA?��B��T��S��6˩5؇9��?˩5��O��M��R��9��I����1��F��U��F��F��P��J86��B��S��6؇9��?˩5��O��M��R��9��I����1��F��U��F��P��J ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P/-��B؇9��6˩5ֲR����1��F��Q�?ٟ@��S��P��G��3&$��B؇9��6ֲR����1��F��Q�?ٟ@��S��G� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��U��C��>��B��0��6˩5��N��R��3��1��S��FщQ��@Ԛ</-��U��C��>��B��0��6��N��R��3��1��S��FщQ��@Ԛ< ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P#!��B��O��F��R��6˩5֛7��>��3��P��J��B��O��F��R��6�7��3��P��J� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PA?��B��T��S��6˩5؇9��?˩5��O��M��R��9��I����1��F��U��F��F��P��J86��B��S��6؇9��?˩5��O��M��R��9��I����1��F��U��F��P��J ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��Pki��B��T��S��6˩5��0��Q��N�?�9��H��9��R��I��J��IН?��T��X��L��I��/��I��/��I��/��B��=��6��I��6��B��=��-��0YW��B��S��6��0��Q��N�?�9��H��9��R��I��J��Iܞ?ɜX��I��/��I��I��/��B��=��6��I��6��B��=��0� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��U��C��>��B��0��6˩5��N��R��3��1��S��FщQ��@Ԛ</-��U��C��>��B��0��6��N��R��3��1��S��FщQ��@Ԛ< ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PDB��0�5��OȨK��F��D�9��I��V��B��T��E��LȨK��F�9��I��V��:��TН?��>/-��0�5��OӨK��D��I��V��B��E��LӨK��I��V��:��?� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PA?��B��T��S��6˩5؇9��?˩5��O��M��R��9��I����1��F��U��F��F��P��J86��B��S��6؇9��?˩5��O��M��R��9��I����1��F��U��F��P��J ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��B��T��S��6˩5��0��B��T��6˩5��1��T��7��H��;��T#!��B��S��6��0��B��6��1��7��H��;��T� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��U��C��>��B��0��6˩5��N��R��3��1��S��FщQ��@Ԛ</-��U��C��>��B��0��6��N��R��3��1��S��FщQ��@Ԛ< ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P ��B��T��6��6˩5��0��Q��GН?��>��B��6��6��0��Q��G��?� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��PA?��B��T��S��6˩5؇9��?˩5��O��M��R��9��I����1��F��U��F��F��P��J86��B��S��6؇9��?˩5��O��M��R��9��I����1��F��U��F��P��J ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P��B�R��6˩5��1��?��F��B��T��B�R��6��1��?��B��T� ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P20��U��C��>��B��0��6˩5��N��R��3��1��S��FщQ��@Ԛ</-��U��C��>��B��0��6��N��R��3��1��S��FщQ��@Ԛ< ��B��U��0��6˩5��F��E�9��1��P��B��U��0��6��F��E��1��P ��B��T؇9��6˩5��M��5��R��F��F��B؇9��6��M��5��R��F��F���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T��.��T��G��T��6��>��?��>��P��.��T��G��6��>��?��>��P���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T��@��N��>��P��C��@��N��>��P���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T;9��G߹-��.��T��G��TޚT��>��9��B��K��R�9��KϋI��LK��A��B20��G߹-��.��T��GޚT��>��B��R�9��KϋI��LK��A��B���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T)'��.��T��G��T��6��>��7��K��M��?��U��>��T ��.��T��G��6��>��7��K��M��?��,���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T20��.��T�9��Kʉ5�5��>��A��>��B��K��=��U��;Н?��T)'��.��T�9��Kʉ5�5��>��A��>��B��U��;ܞ?���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T/-��.��T�9��Kʉ5�5��>��A��>��B��K��=��3��R��T&$��.��T�9��Kʉ5�5��>��A��>��B��3ҔR���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��TDB��O��<��>��T��R��I��O��.��T��R��I��O�V��T��I��O��B��<ȬT��I��Q��>86��O��<��>��T��R��O��.��T��R��O�V��T��I��O��B��<ЬT��Q���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��TMK��.��T��G��T��9��.��D�S��>�9��>��A��K��@P��B��@��	��A��6�O��:��@��@��@><��.��T��G��9��.��D�S��>��I��A��K��@��B��@��	��A��6��:��@��@���C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T><��.��,��9��T��>��G��T��P��6ڜ>��K��D��T��W��AПC��D��S�D��A;9��.��,��9��T��>��G��P��6ڜ>��K��D��T��W��AПC��D��S�D��A��C��E��.��T��G��T��K��6��T��C��E��.��T��G��K��6��T��.��T��G��T��6��>��@Ԛ<��.��T��G��6��>��@Ԛ<�̾-�,�A��J��T��0ޡ8;-�A��J��T��0�A��J��T��?��@��K�A��J��T��?��@��K̾-�,�A��J��T��0ޡ8;-�A��J��T��0;9̾-�,��4��F��E��4�A��J��T��5��4��T��?��1��W��Q̛<��7��T/-;-��4��E��4�A��J��T��5��4��T��?��W��Q̛<��7�̾-�,�A��J��T��0ޡ8;-�A��J��T��0\Z̾-��-��,̾-�,��6��.ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��1��W��K��4��0ޡ8��>��1MK̾-��-��,;-��6ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��WåK��0��>��1̾-�,�A��J��T��0ޡ8;-�A��J��T��0PN̾-��-��H��,̾-�,��6��.ΩW��4��L��5��/��B��4��W��5�H��/��O��T��A��Nя7��>��1GE̾-��-��H��,;-��6ΩW��4��L��5��/��B��4��W��5�H��/��O��T��A��N��>��1�̾-�,�A��J��T��0ޡ8;-�A��J��T��0,*��7��H��9��8��4�A��J��T��5��4��>��0ޡ8��7 ̾-�,�A��J��T��0ޡ8;-�A��J��T��0̾-��/��X��T̾-��/��X��T�̾-�,�A��J��T��0ޡ8;-�A��J��T��0	̾-��C��T	̾-��C��T̾-�,�A��J��T��0ޡ8;-�A��J��T��0,*��H��8��4�A��J��T��5��4��0ޡ8��>��1��@��K)'��H��8��4�A��J��T��5��4��0��>��1��@��K�̾-�,�A��J��T��0ޡ8;-�A��J��T��0�A��J��T��?��@��K�A��J��T��?��@��K̾-�,�A��J��T��0ޡ8;-�A��J��T��0\Z��H��,̾-�,��XΩW��8��4�A��L��T��5��4��0ޡ8��>��1��1��D��D��>��7��U��	̾-��X̾-��X��-��TMK��H��,;-��X��8��4�A��L��T��5��4��0��>��1��1��D��>��7��U��	̾-��X̾-��X�-�̾-�,�A��J��T��0ޡ8;-�A��J��T��0\Z̾-��-��,̾-�,��6��.ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��1��W��K��4��0ޡ8��>��1MK̾-��-��,;-��6ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��WåK��0��>��1̾-�,�A��J��T��0ޡ8;-�A��J��T��0JH��8�A��J��T�O��C��6̾-�,��8��4��L��5��/��T��S��:��-�1��Q��B��U��/��;86��8�A��J��T�O��C��6;-��8��4��L��5��/��S��:��-ڠ#��/�̾-�,�A��J��T��0ޡ8;-�A��J��T��0,*��7��H��9��8��4�A��J��T��5��4��>��0ޡ8��7 ̾-�,�A��J��T��0ޡ8;-�A��J��T��0/-̾-��/��/��?��8��4�A��J��T��5��T��;��U��/��T&$̾-��/��/��8��4�A��J��T��5��T��;��*�̾-�,�A��J��T��0ޡ8;-�A��J��T��0	̾-��C��T	̾-��C��T̾-�,�A��J��T��0ޡ8;-�A��J��T��0;9̾-�,��E���A��J��T����T�DɍP��M��A��:��7��.��U��/��T/-;-��E���A��J��T����T�DӍP��A��:��7��.��*�̾-�,�A��J��T��0ޡ8;-�A��J��T��0�A��J��T��?��@��K�A��J��T��?��@��K̾-�,�A��J��T��0ޡ8;-�A��J��T��0��I��L��I��L�̾-�,�A��J��T��0ޡ8;-�A��J��T��0\Z̾-��-��,̾-�,��6��.ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��1��W��K��4��0ޡ8��>��1MK̾-��-��,;-��6ΩW��H��8��4�4��3�A��J��T��4��Q۹/��8��5��?��WåK��0��>��1̾-�,�A��J��T��0ޡ8;-�A��J��T��0&$̾-��4�A��T��9��5��/��?��V��/��?��T ̾-��4�A��T��9��5��/��@��?��T�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?,*��7��F��:��B��P����1��N��D��?��F��:��@Ԛ<#!��7��:��B��P����1��N��D��?��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?��R��6������!��8�,��T��R��6����,��T�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?20��7��F��:��B��P����1��N��D��?��F��:��6��S��@Ԛ<)'��7��:��B��P����1��N��D��?��6��S��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?53��1��K��>��Q�P��?��F��:��Bб��4��D��=��3��-��A��B,*��1��K��>��Q�P��?��Bб��4��D��=��-��A��B�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?,*��7��F��:��B��P����1��N��D��?��F��:��@Ԛ<#!��7��:��B��P����1��N��D��?��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?;9��4��F��:̔6��B��U��P��V��7����1��5��C��S��?��F��:��@Ԛ<20��4��:̔6��B��U��P��V��7����1��5��C��S��?��@Ԛ<�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?20��7��F��:��B��P����1��N��D��?��F��:��6��S��@Ԛ<)'��7��:��B��P����1��N��D��?��6��S��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?&$̔6ַ;�I��B��U��Vԋ/��C��S��?��F��:��1�I��B��U��Vԋ/��C��S��?�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?,*��7��F��:��B��P����1��N��D��?��F��:��@Ԛ<#!��7��:��B��P����1��N��D��?��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?GE��F��:̔6��B��U��>ȣ8��9��0��7����1��K��K��6��S��?��F��:��D��S�D��A><��:̔6��B��U��>ȣ8��9��0��7����1��K��K��6��S��?��D��S�D��A�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?20��7��F��:��B��P����1��N��D��?��F��:��6��S��@Ԛ<)'��7��:��B��P����1��N��D��?��6��S��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��? �D��A��7�O��=P��Rߑ4��P��T�D��A��7��=��Rߑ4��P��T�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?,*��7��F��:��B��P����1��N��D��?��F��:��@Ԛ<#!��7��:��B��P����1��N��D��?��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��? ��?��F��:��6��S��>��J��<��B��B��?��6��S܃>��<��B��B�)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?20��7��F��:��B��P����1��N��D��?��F��:��6��S��@Ԛ<)'��7��:��B��P����1��N��D��?��6��S��@Ԛ<)'��F��B��U��Q��D����N��D��6��S��?��F��: ��F��B��U��Q����N��D��6��S��?��?��F��:��6��S��@��K��?��6��S��@��K���U��N��D�H��F��/��U��D�H��F��/PN��U����2��Q��N��5��D�H��F��/��Bٟ@����S��K��D��N��D����S��C��>��K��2��@Ԛ<JH��U����2��Q��N��D�H��F��/��@����S��K��D��N��D����S��C��>��K��2��@Ԛ<��U��N��D�H��F��/��U��D�H��F��/#!��0�-��0��:��Nٟ@�H�F��V��F��T��0�-��:ٟ@�H��F��F��T���U��N��D�H��F��/��U��D�H��F��/GE��M��:��5��<��U��T��D��8��F��/����S��K��N��H��1��E����W��/��@��@��@;9��M��:��<��U��T��D��F����S��K��N��H��1��E����W��/��@��@��U��N��D�H��F��/��U��D�H��F��/,*��NԚ<��B��>��U��N��D��8��F��/��?��P��S��6 ��1��B��>��U��D��F��?��P��S��6���U��N��D�H��F��/��U��D�H��F��/PN��U����2��Q��N��5��D�H��F��/��Bٟ@����S��K��D��N��D����S��C��>��K��2��@Ԛ<JH��U����2��Q��N��D�H��F��/��@����S��K��D��N��D����S��C��>��K��2��@Ԛ<��U��N��D�H��F��/��U��D�H��F��//-ӟ;��N��@��R��>��8��F��S��/��"ҥ3��!��@��;��6&$ӟ;��N��R��>��8��F��S��"ҥ3��!��@��6���U��N��D�H��F��/��U��D�H��F��/GE��M��:��5��<��U��T��D��8��F��/����S��K��N��H��1��E����W��/��@��@��@;9��M��:��<��U��T��D��F����S��K��N��H��1��E����W��/��@��@��U��N��D�H��F��/��U��D�H��F��/#!��U��NۚK��/��D��8��F��D��S�D��A ��U��N��/��D��8��F��D��S�D��A���U��N��D�H��F��/��U��D�H��F��/PN��U����2��Q��N��5��D�H��F��/��Bٟ@����S��K��D��N��D����S��C��>��K��2��@Ԛ<JH��U����2��Q��N��D�H��F��/��@����S��K��D��N��D����S��C��>��K��2��@Ԛ<��U��N��D�H��F��/��U��D�H��F��/&$����F��S��5��<��U��T��=��N��@��>��/����F��S��<��Uǃ=��N��>��/���U��N��D�H��F��/��U��D�H��F��/GE��M��:��5��<��U��T��D��8��F��/����S��K��N��H��1��E����W��/��@��@��@;9��M��:��<��U��T��D��F����S��K��N��H��1��E����W��/��@��@��U��N��D�H��F��/��U��D�H��F��/)'��MƛK��U��2��Q��T��5��D�H��F��/��@Ԛ<#!��MƛK��U��2��T��D�H��F��/��@Ԛ<���U��N��D�H��F��/��U��D�H��F��/PN��U����2��Q��N��5��D�H��F��/��Bٟ@����S��K��D��N��D����S��C��>��K��2��@Ԛ<JH��U����2��Q��N��D�H��F��/��@����S��K��D��N��D����S��C��>��K��2��@Ԛ<��U��N��D�H��F��/��U��D�H��F��/><��U��N��@�9��O����F��/��P��K��Sϥ%��U��N��@��S��/�4��3��D20��U��@��O����F��/��P��K��Sϥ%��U��N��@��S��/��4���U��N��D�H��F��/��U��D�H��F��/GE��M��:��5��<��U��T��D��8��F��/����S��K��N��H��1��E����W��/��@��@��@;9��M��:��<��U��T��D��F����S��K��N��H��1��E����W��/��@��@��U��N��D�H��F��/��U��D�H��F��/��8��F��E��Nڜ>��/��@��K��8��F��E��Nܜ>��@��K�	��,ݠ.��A��,��A_]��O��7��0��C��T��,ݠ.��7��>��D�GܤK��P������0��4��T����V��A��V����0��7��>��?����Q��;��GE��O߫B��>��G��K������0��4��T����V��A��V����0��7��>��?����Q��;��	��,ݠ.��A��,��A86��,ݠ.��>��O��/��1��9��O��6��1��6��A��B��T��G��A�7��B/-��,��>��O��/��1��9��O��6��1��6��A��B��T��G��+�	��,ݠ.��A��,��A��,ݠ.��ݠ.��O��,��ݠ.��O	��,ݠ.��A��,��A��O��,ݠ.��B��:��D��G��@��K��O��,��B��D��@��K�	��,ݠ.��A��,��A_]��O��7��0��C��T��,ݠ.��7��>��D�GܤK��P������0��4��T����V��A��V����0��7��>��?����Q��;��GE��O߫B��>��G��K������0��4��T����V��A��V����0��7��>��?����Q��;��	��,ݠ.��A��,��A ��C��N��,ݠ.��Q��A��D��P�D��A��C��N��,��Q��A��D��P�D��A�	��,ݠ.��A��,��A��,ݠ.��ݠ.��O��,��ݠ.��O	��,ݠ.��A��,��A)'��Iַ;��D��N��0��C��T��,ݠ.��AщQ��@Ԛ<��Iַ;��D��N����AщQ��@Ԛ<�	��,ݠ.��A��,��A_]��O��7��0��C��T��,ݠ.��7��>��D�GܤK��P������0��4��T����V��A��V����0��7��>��?����Q��;��GE��O߫B��>��G��K������0��4��T����V��A��V����0��7��>��?����Q��;��	��,ݠ.��A��,��A,*��>��T��,ݠ.��9��A��B��A�A��4˛5��D�A��4��>��,��9��A��B��A˛5��D��An	��,ݠ.��A��,��A��,ݠ.��ݠ.��O��,��ݠ.��O	��,ݠ.��A��,��A��N��T��,ݠ.��Nĵ*�	��,ݠ.��A��,��A_]��O��7��0��C��T��,ݠ.��7��>��D�GܤK��P������0��4��T����V��A��V����0��7��>��?����Q��;��GE��O߫B��>��G��K������0��4��T����V��A��V����0��7��>��?����Q��;��	��,ݠ.��A��,��APN����X��>��T��9�;��;��>��X��>��Q��A��7��A�O��7��R��N��;��X��7��:��U��>��E�8DB����X��>��;��>��X��>��Q��A��7��A�O��7��N��;��X��7��:��U��>��E�8�	��,ݠ.��A��,��A��,ݠ.��ݠ.��O��,��ݠ.��O	��,ݠ.��A��,��AA?��O߹-��5��,ݠ.߹-��,ݠ.��:߹-��HİU��M��A��N��C�)��O��8�,��T53��O߹-��5��,߹-��,��:߹-��H��M��A��N��C�)��O��,��T���F��IֈD��:��0��D��F��I��:��0��D��IֈD��:��0��@��?��I��:��0��@��F��IֈD��:��0��D��F��I��:��0��D ��IֈD��N��0��D��:��D��G��@��K��I��N��0��D��D��@��K���F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A��F��IֈD��:��0��D��F��I��:��0��D��IֈD��G��C��?��D��I��G��C��?��D���F��IֈD��:��0��D��F��I��:��0��D��IֈD��:��0��@��?��I��:��0��@��F��IֈD��:��0��D��F��I��:��0��D��D��:��IֈD��1��4	��D��I��1���F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A��F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A���F��IֈD��:��0��D��F��I��:��0��D��IֈD��:��0��@��?��I��:��0��@��F��IֈD��:��0��D��F��I��:��0��D><يR��IֈD��:��0��DيR��4��IֈD��:��0��B��IيR��4��T��C��,��>)'يR��I��:��0��D�R��I��:��0��B��I�R��C���F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A��F��IֈD��:��0��D��F��I��:��0��D;9��IֈD��>��0��E��F��R��4��:��0��>ğCѭDӮD��:ٟ@�H��@Ԛ</-��I��>��0��E��F��M��:��0��>ɟCܮDٟ@�H��@Ԛ<���F��IֈD��:��0��D��F��I��:��0��D��IֈD��:��0��@��?��I��:��0��@��F��IֈD��:��0��D��F��I��:��0��D,*��IֈD��:��0ߢ?��D��T��7��N��7��9��U��A��T#!��I��:��0ߢ?��D��7��N��7��9��U��A���F��IֈD��:��0��D��F��I��:��0��D20��D��3Ԛ<��I��F��>��IֈD��0��>��D��:��D��S�D��A#!��3��I��F��>��I��>��D��D��S�D��A��F��IֈD��:��0��D��F��I��:��0��D><��0��IֈD��:��0��D��0��4��IֈD��:��0��B��I��0��4��T��C��,��>/-��0��I��:��0��D��0��4��I��:��0��B��I��0��4��C���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>;9��E��8��7�C��C��@��N��.��H˱U����=���F��CסE��@��@��@20��8��7�C��C��@��N��.��H˱U����=���F��C��@��@���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>GE����=��
-��>��C��@��N��7��U��0��>ٟ@��6��M��V�I��W��>��E��D��S�D��A><����=��
-��>��C��@��N��U��0��>ٟ@��6��V��=��>��E��D��S�D��A���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>A?��C��@��N����=��
-�F��>��E��Mӛ?ߤ8��>��4��F��C��@��N��@��@��@;9��C��@��N����=��
-�F��>��E��Mӛ?ߤ8��>��4��C��@��N��@��@���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB����=��
-��>��CסE��C��@��N��D��E��0��*��*ԑ4��9��A��*��/��@��@��@;9����=��
-��>��C��C��@��N��D��0��*��*ԑ4��9��A��*��/��@��@���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>b`��B˩5��5����=����>��O��*��7��C��@����N��7��C��@��ĕ6��T����F��R��/��H����F��H��4��ĕ6��TPN��B˩5��5����=����>��O��*����C��@��ĕ6��T����F��R��/��H����F��4��ĕ6��T���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>_]��6ɵO����=����>��C��@��Nð.��A��
-��>��>ٟ@�9ٟ@��D��DܢE��SܤK��A��@��CסE��SܤK��A��@Ԛ<SQ��6ɵO����=����>��C��@��Nð.��A��
-��>��>��9��DܢE��SܤK��A��@��C��SܤK��A��@Ԛ<���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>/-��7��C��@��N��7��
-��=���F��S������ÐW��7#!����
-��=���F��S������ÐW��7���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>><����=��
-�F��>��C��@��N��P��E��Mӛ?��M��>��>��Fٟ@��6��@Ԛ<;9����=��
-�F��>��C��@��N��P��E��Mӛ?��M��>��>��F��5��@Ԛ<���C��@��N����=��
-��>��>��C��@��N����=��
-��>��>DB�
-��������
-��H��E��0��7��!��7��������L��J��6��������7��7/-����H��0���������L��J��6��������7��7��C��@��N����=��
-��>��>��C��@��N����=��
-��>��>)'��C��@��N����=��
-�F��(��D��>��M��@Ԛ<)'��C��@��N����=��
-�F��(��D��>��M��@Ԛ<����H��S��/��O��N��4�/��:���H��S��O��N��4��/VT��7���H��>��S��/��O��B��4��ʡH��9ɰ5ȥ��7�/��:������į?��I����������E��L��/����������������E���H��S��/��O��N��4�/��:���H��S��O��N��4��/JHùB��L��W¶7��/��J��7���H��>��/��B��/����Wȥ��O��B��4��7��8��4�/��:ĹB��N��/��J��8��4��/����H��S��/��O��N��4�/��:���H��S��O��N��4��/,*���H��S��/��O��N��4�/��:ҁX��?��L��-��T#!���H��S��O��N��4��/ҁX��?��L�-���H��S��/��O��N��4�/��:���H��S��O��N��4��/&$���H��W��>��S��/��9��4�/��:��AƭI���H��W��>��S��9��4��/��A����H��S��/��O��N��4�/��:���H��S��O��N��4��/)'��7���H��/��B��/��>��O��N��4��7�/��:��/���H��S��/��O��N��4�/��:���H��S��O��N��4��/;9���H��>��S��/��>��4��N��O�/��:��/��/��9¶7��JùB��L��/#!����>��4��N��O��/��/��N��JĹB��/����H��S��/��O��N��4�/��:���H��S��O��N��4��/;9��7���H��>��S��/��>��O��B��4��7��B��R��:����0����A��B��B��R��:����0����A��B���H��S��/��O��N��4�/��:���H��S��O��N��4��/20�/��:��7���H��>��S��/����O��B��4����7�/��:��/��/����H��S��/��O��N��4�/��:���H��S��O��N��4��/VT��7���H��>��S��/��O��B��4��ʡH��9ɰ5ȥ��7�/��:������į?��I����������E��L��/����������������E���H��S��/��O��N��4�/��:���H��S��O��N��4��/&$���H��>��S��/��O��B��4��"����>������O��B��4��>������H��S��/��O��N��4�/��:���H��S��O��N��4��/,*���H��S��/��O��N��4�/��:ҁX��?��L��-��T#!���H��S��O��N��4��/ҁX��?��L�-���H��S��/��O��N��4�/��:���H��S��O��N��4��/PN��7���H��S��/��4��7�/��:����ȥ������Ƕ,��W¶7��/��>��;��������G��B20��/����ȥ������Ƕ,��N��/��>��;��������G����H��S��/��O��N��4�/��:���H��S��O��N��4��/)'��7���H��/��B��/��>��O��N��4��7�/��:��/���H��S��/��O��N��4�/��:���H��S��O��N��4��/kiùB��L��9¶7��/��J��7���H��S��/����9ȥ��4��N��O��7�/��:����9¶7��/�/��:����6��6ȈX��4��������&20ĹB��N��/��J��/����N��/��/����6��6ȈX��4��������H��S��/��O��N��4�/��:���H��S��O��N��4��/;9��7���H��>��S��/��>��O��B��4��7��B��R��:����0����A��B��B��R��:����0����A��B���H��S��/��O��N��4�/��:���H��S��O��N��4��/><ʡH�U٨I��7���H��S��/��4��7�/��:������:��,��A��F��>�� ʡH�U٨I��/������:��,��>������H��S��/��O��N��4�/��:���H��S��O��N��4��/VT��7���H��>��S��/��O��B��4��ʡH��9ɰ5ȥ��7�/��:������į?��I����������E��L��/����������������E���H��S��/��O��N��4�/��:���H��S��O��N��4��/��H��N�1��,��;��T��L��H�1��,��;��T��L���X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��686��X��:��8��6˩5��4��X��:��8��6��4��V��D��T����(����!)'��X��:��8��6��4��X��:��8��6��V��D��T�����X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��6)'��E��8��:��X��6��6˩5�H��3��8��@��@��@#!��E��8��:��X��6��6�H��3��8��@��@���X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��620��X��:��8��6ӻB��O��X��:��8��6˩5��Q��4��6��4��T#!��X��:��8��0��X��:��8��6��Q��6��T���X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��6GE��X��:��8��6ӻB��O��X��:��8��6ӻB��O��X��:��8��6��H��6��T��$������!20��X��:��8��0��X��:��8��6��O��X��:��8��6��6��T����X��:��8��6˩5��X��:��8��6&$��X��:��8��6˩5�>��X��:��8��6˩5��7 ��X��:��8��6�>��X��:��8��6��7��X��:��8��6˩5��X��:��8��6#!��5��X��:��8��>��6˩5��6�R��@Ԛ< ��5��X��:��8��>��6��6�R��@Ԛ<�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>߹-��U�.��8߹-��U�.��8�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>/-��U�.��>��D��P��?��1��4��:щQȻ;��T��=��.��L)'��U�.��>��D��?��1��4��:щQȻ;��T��=��L�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��>)'кB��U�.��6��:��D��P߇;Ȼ;��T��=��.��L#!кB��U�.��6��:��D߇;Ȼ;��T��=��L�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��> ��U�.��8߹-��U�.��.ʺB��P��T ��U�.��8߹-��U�.��.ʺB��P��T�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��> ߹-��U�.��:��/�0��E��F��T��6 ߹-��U�.��:��/�0��E��F��T��6�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��>&$��U�.����V��P����1��B��,��,��	��5&$��U�.����V��P����1��B��,��,��	��5�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>߹-��V��;��T��6��4߹-��V��;��T��6�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>MK߹-��U�.�6��>��P��5��,�A߹-��U�.˭V�6��,��3��T߹-˭V�6܈I��U��?��9�0GE߹-��U�.�6��>��P��5��9߹-��U�.˭V�6��,��3��T߹-˭V�6߈I��?��9�0�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��>MK����N����=��.��H����=���F��0��B��U�.��$��D��:����N����=��M��P��M��PJH����N����=��U����=���F��0��B��U�.��$��D��:����N����=��M��P��M��P�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>/-߹-��U�.��D��I��4��2��9��-��D��I��V��=�R��J#!߹-��U�.��D��4��2��9��-��+�R��J�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>53߹-��U�.��/�0��Bб��D��D�7��=��E��U��T۹/��U��D,*߹-��U�.��/�0��Bб��D��D�7��,��U��/��D�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��>;9߹-��U�.��0��N��U��Oބ2��E������	����	��=ĪC��'��A��B&$߹-��U�.��0��N��U��OǷ.����=��A��B�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>SQ��:��Aб��=����>��U�.��=��9��V��>��D��>�9Ԛ<��O��I��SÄN��2��6��8�9��F��T��6GE��:��Aб��=����>��U�.��V��>��D��>��1��O��I��SÄN��9��8�9��F��T��6�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>)'��U�.��>��D��P��6��:��,Ȼ;��T��=��.��L#!��U�.��>��D��6��:��,Ȼ;��T��=��L�߹-��U�.��:��D��>߹-��U�.��:��D��>ec��1��O��������.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ڤ5��5��J��@��C��;ϵ>͵A��T��J��.��P��TVT��1��O����.��9�.��R��U�.��,��D��6��/��E��D��C��D��>ܤ5��J��@��Aϵ>͵A��T��J��.��P߹-��U�.��:��D��>߹-��U�.��:��D��> ߹-��1��U�.��9��T��D��S�D��A ߹-��1��U�.��9��T��D��S�D��A�߹-��U�.��:��D��>߹-��U�.��:��D��>_]��A��U�.��?��:��D��>��>��Rٟ@��6��U��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��.��LYW��A��U�.��?��:��D��>��>��Rٟ@��6��>��G��1��@��:��?��;��0��@�W��;��0��:��GȻ;��T��=��L߹-��U�.��:��D��>߹-��U�.��:��D��>GE��3Ԛ<��U�.��D��Pڶ>��9��V��C��=��6��R��M��K��
-��P��T��6��1��T��P��653��1��U�.��D��9��V��C��6��R��K��
-��P��T��6��1��TڀP�߹-��U�.��:��D��>߹-��U�.��:��D��>#!߹-��U�.��:��D��M��=��6��S��@Ԛ<#!߹-��U�.��:��D��M��=��6��S��@Ԛ<߹-��U�.��:��D��>߹-��U�.��:��D��>;9߹-��U�.��:��S��M��?��B��;��BɵO��M��S��B��#��*��*��.��T53߹-��U�.��:��S��M��?��B��;��BֵO��S��B��#��*��*��.�	��U�8��S��U��S/-��1۠N�
-��FɹK��U��=��S��5ۓR��:ϡS��F��A��T#!ޠN�
-��F��U��=��S��5ۓR��:ݡS��A	��U�8��S��U��S&$��F��M��G��M��M��>��.��3ˠS�8��7��T��F��M��G��M��M��>��.�8��7�	��U�8��S��U��SA?����N��,ˏR��0��#��>ˌD��3��U��=��S��NۥN����������&����7><����N��,ˏR��0��#��>ьD��U��=��S��NۥN����������&����7	��U�8��S��U��S)'��V��X��?��A��MP��S��>��S��M�8��G��J#!��V��X��?��A��MP��S��>��SٶM��1�	��U�8��S��U��S/-��1۠N�
-��FɹK��U��=��S��5ۓR��:ϡS��F��A��T#!ޠN�
-��F��U��=��S��5ۓR��:ݡS��A	��U�8��S��U��S/-��U��=��Sб�.��6��5��J��?��O��4ʄ/��&�8��7&$��U��=��Sб�.��6�J��O��4ʄ/��&��8�	��U�8��S��U��SA?����N��,ˏR��0��#��>ˌD��3��U��=��S��NۥN����������&����7><����N��,ˏR��0��#��>ьD��U��=��S��NۥN����������&����7	��U�8��S��U��SVT��7��0�:��7�K���U�6��A�8��>��C������¾9�8��T��P��7P��X��>¾9�8��7��;><�K���U�6��A��>��C���¾9��8��P��7P��X��>¾9��8��;�	��U�8��S��U��S/-��1۠N�
-��FɹK��U��=��S��5ۓR��:ϡS��F��A��T#!ޠN�
-��F��U��=��S��5ۓR��:ݡS��A	��U�8��S��U��SA?��U��=��Sб����7̛<�8��7��E��7��C��7����7̛<��(��������!53��U��=��Sб����7̛<�8��7��E��7��C��7����7̛<���	��U�8��S��U��SA?����N��,ˏR��0��#��>ˌD��3��U��=��S��NۥN����������&����7><����N��,ˏR��0��#��>ьD��U��=��S��NۥN����������&����7	��U�8��S��U��S86��W��7��I��U�8��>��S��E��Sٟ@�M߫U��@��U��'��@��@��@20��W��7��I��U��>��S��E��Sٟ@�M߫U��@��U��'��@��@�	��U�8��S��U��S/-��1۠N�
-��FɹK��U��=��S��5ۓR��:ϡS��F��A��T#!ޠN�
-��F��U��=��S��5ۓR��:ݡS��A	��U�8��S��U��S><�����������������
-��4��J��6����N��L�F��;��8��T��786�����������������
-��4��J��6����N��L�F��8��7�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.;9��/�,��<��7��F����N��C��N��:��Q��E��I��/��4�O�5��.��L,*��/��<��7��F����N��N��:��+��@��4�O�5��L�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.b`��/��Q��E��T����N��/��Q��E��V��K��/��Q��E��L��9��O��/Լ=��E��T��/��Q��E��/��4��/��V��Q��E��1��WJH��/��+��T����N��/��+��V��/��+��L��9��O��/�=��T��/��+��/��/��V��7��1��W�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.&$����N��C��N��/��Q��E��L��<��W��Q��T����N��N��/��+��L��<��W��Q�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.86����N��C��N��/�,��7ΩWǔ:��Q��B��1�5�O��.��Q��E��T,*����N��N��/��7ǔ:��Q��B��1�5�O��.��+��T�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.������N��/��Q��E��/��4��/��Q��/��4��K��/��Q��E��/��4��H��/��Q��E��Q��W��J��/��Q��E��E��/��V��/��Q��/��4��8��/��Q�O��4��/��Q��E��/��/��Q��/��K����-qo����N��/��+��/��/��Q��/��K��/��+��/��H��/��+��Q��W��/��+��E��V��/��Q��/��8��/��6��4��/��7��/��/��Q��/��K����-�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��./-����N��/��X�O��6��E��/��4�O�5��L��Q��E��@/-����N��/��X�O��6��E��/��4�O�5��L��Q��E��@�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.;9��2����N��Q��E��/�O��O�5��2��Q�O��K��2��Q��O��K��"��W53��2����N��+��/�O��O�5��2��6��K��2��Q��O��K��"��W�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.�~����N��/�,��K�O��6��1ؙD��T�5�O��L��/��Q��:��L��E��G��:��9��/��4��E��.��E��J��Q��E��T����N��D��Q��E��N��K��7��9��1��W��Tec����N��/��K�O��6��1ؙD��T�5�O��L��/��Q��:��E��:��/��E��.��E��J��+��T����N۳9��E��N��7��9��1��W�����N��/�,��Q��E�O��.����N��/��+�O��.53����N��/�,��Q��I��/��E��4�O��L��/��.��D��Q��E��T)'����N��/��Q��@��E��4�O��L��/��.����T����N��/�,��Q��E�O��.����N��/��+�O��.&$����N��/��1��Q��I��/��4�O��.��D��Q ����N��/��1��Q��@��4�O��.۳9�	��@��U��E	��@��U��E��U��E��T��@��?	��U��E��@	��@��U��E	��@��U��E ��@��U��F��5��E�9��:��U��@Ԛ<��@��U��F��5��E��:��U��@Ԛ<���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>)'��U��C��9��S��;ٟ@��>��6��E��6��>��@Ԛ<)'��U��C��9��S��;ٟ@��>��6��E��6��>��@Ԛ<���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>&$��V��X��,��)��E��Bٟ@��&��EϜV��Q��T��V��X��)��E��@��&��EϜV��Q���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>DB��DԚ<��(��������!ٟ@��6��E��S��>��)����%������"��6��"����&)'��D��5��E��S��>��)����%������"��6��"���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>#!��;ښL��)��E��6��?����?��O��K��T��;ښL��)��E��6����?��A��T���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>&$��8��V��1��)ٟ@��>��6��E��6��>��@Ԛ<#!��8��1��)ٟ@��>��6��E��6��>��@Ԛ<���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>53��K��:��S��;ٟ@��Sٟ@��>��6��E��6��6��>��G��A�7��B/-��K��:��S��;ٟ@��Sٟ@��>��6��E��6��6��>��G��+���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>\Z��$��B��)��K��F��E��6��>��R��D�I��6��P��G��H��>��R��5��K��9��>��6��6��;��N��D��S��PԮK߀3VT��$��B��)��K��F��E��6��>��R��D�I��6��P��G��H��>��R��K��9��>��6��6��;��N��D��S��PٮK���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>/-��RPٟ@��)����%ٟ@��6��E��6��>��D��P�D��A&$��RPٟ@��)��5��E��6��>��D��P�D��A���)ٟ@��6��E��6��>��)��5��E��6��>	��)��@��?��)��@��)ٟ@��6��E��6��>��)��5��E��6��>GE��!��Rٟ@��6��E��6��>ٟ@��щQ��K��B��)��B��$����&��9��U��>щQ��@Ԛ<;9��!��R��5��E��6��>ٟ@��щQ��K��B��)��B����U��>щQ��@Ԛ<���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$hf��$��>��I��?��9��T��W��O��$�8��$��>��I��?��9��T��W��O��$�8��Q��$��>��I��?��9��T��W��O��$�8��,��9��PMK��$��>ɞ9��W��O��$�8��$��>ɞ9��W��O��$�8��Q��$��>ɞ9��W��O��$�8��,��9��P���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$)'��֥>��$�8��?��9��T��W�8��Q��H��.��T#!��֥>��$�8ɞ9��W�8��Q��H��.��T���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$)'��U��"����҈����$��4��T޲F��?��9��T��U��"����4��T޲Fɞ9���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$tr�L��:��V��1��T��>��B��;��W�8������׫B�!��U��H��?��I��?��9��T��$�8��C��W��O��?��9��8��W�8ɳQ��W��Q��B��H��O_]�L��:��V��1��T��>��;��W�8������׫B�!��U��H��?��Iɞ9��$�8��C��W��OǞ9��W�8ɳQ��WвQ��H���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$;9µ��$��?��9��Tµ��$��?��9��T�@��M��@��>��K��T��@�/��Bɞ9ɞ9�@ܱM��>��K��@�/���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$86��<��?��9��T��W�8��$��C��Q��-����Q�;ۓR��T��C��G�0/-��<ɞ9��W�8��$��C��Q��-����Q�;ۓR��C��G�0���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$��?��9��T��$�8��:ɞ9��$�8��:���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$86��?��9��T��$�8��?��9��T��$�8��,��?��9��T��$�8�/��P&$ɞ9��$�8ɞ9��$�8��,ɞ9��$�8�/��P���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$nl��4��T޲F��?��9��T��"����҈����$��A�/��B��4��T޲F��?��9��T��"����҈����$��Q��8ޚT��N��G��K��T��O��T><��4��T޲Fɞ9��"����A�/��4��T޲Fɞ9��"����Q��8��+��K��T��O���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$&$��?��9��T��Q��0��"ǉ:��?��9��TɳQ��Qɞ9��Q��0��"ǉ:ɞ9ɳQ��Q���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$ec����?��9��T��8�8��I��?��9��T��$�8��W��O��8�8��8��O��?��9�8��Q��D��2��C��I��0�C��9��8��>ŒA��TSQ��ɞ9��8�8��Iɞ9��$�8��W��O��8�8��8��O��?��9�8��Q��D��2��C��0�C��9��>ŒA��T���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$20��?��9��T��$�8ܞN��D֥>��W��8ݶ;��U��W��8��9��T#!ɞ9��$�8�N֥>��W΀8��U��8��9��T���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$,*��"����҈����$��4��T޲F��?��9��Tǉ:��"��"����4��T޲Fɞ9ǉ:��"���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$53��"����҈����$��4��T޲F��?��9��T��Q��D��2��D��T#!��"����4��T޲Fɞ9��Q��D��2��D��T���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$A?µ��$��?��9��T��A��=��U����L����E��Q��?Ǳ.��<��?��9��T��C��9/-ɞ9��A��=��U����L����E��Q��?Ǳ.��<ɞ9��C��9���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$)'����?��9��T��$�8����1��Ƨ!��"��K��0 ��ɞ9��$�8����1��'��"��K��0���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$_]��W�8��?��9��T��W�8��"��?��9��T��W�8��$���)��?��9��T��W�8��5��6��U��C��7��C��R��?��7��?A?��W�8ɞ9��W�8��"ɞ9��W�8ɞ9��W�8��5��6��U��C��C��R��?��7��?���?��9��T��$ɞ9��$A?��W��6�J�/��?��9��T��$�8��:��W��6�J��,��H��P��H��C��I��9��I20��W��F�/ɞ9��$�8��:��W��F��,��H��P��H��I��9��I��?��9��T��$ɞ9��$GE��"ǉ:��?��9��T��W�8��5��6��U��$��Ȓ ��������Ƨ!��G��8��O��<��T,*��"ǉ:ɞ9��W�8��5��6��U����G��8��O��<��T���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աOMK��/��N��/��4ʅ>߰4�>���N��.��O��X��,��F��J��O��:��9��/��N��/��4��@��@��@,*��N��4���N��O��X��,��F��O��:��9��N��@��@���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO><��/��N��/��4ʅ>߰4�>�N��.��Xҥ3߫U��B��W��O��F��J��U��Q��J&$��N��4�N��X��U��B��W��O��F��U��Q��J���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO����6��Mӛ?��6��Mӛ?��O�;��O��/��N��/��4��7��>��6��7��4��4��B����9��H��S����1����HŞ��1���K��į?��D��ߋ5����G��ބ2��4����P��K��ۥN��ɿC�R����S����2ބ2��B��@����Bބ2ͩ-��	ۥN������B�D��B���M��/����N��7����<��Q��B������J��7����1��R��6��Mӛ?گD��4��ɭ4��:��N��7����1������ӛ?��9��:������6��Mӛ?��6��Mӛ?աO��O��N��̻4��B����9��S����1����HŞ��1���K��į?��D��ߋ5����G��ބ2��4����P��K��ۥN��ۿC����S����2ބ2��B��@����B��D��	ۥN������B�D��B���M��/����N��7����<��Q��B������J��7����1��R��6��Mӛ?گD��4��ɭ4��:��N��7����1������ӛ?��9��:�����/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO86��/��N��/��4ʅ>߰4�>�N��.��X��W��B��O��F��J��U��Q��J#!��N��4�N��X��W��B��O��F��U��Q��J���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աOSQ��4��/��N��/��4��5��F��>��J��F��J��Iݩ5��O��Rܠ9��4��/��N��/��4��5��F��>��J��F��J,*��4��N��5��4��F��Iݩ5��O��M��4��N��5��4��F���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO ��0��0��,��B��4��.��Iַ;��@��?��0��,��B��4��.��I��@���/��N��/��4��0��O�;	��N��0աO����/��N��/��4ʅ>߰4�>���N��.��X��8��B��L��F��J��O�;��5��O��Rܠ9ݩ5�N��.��F��J��O��:ݩ5ʅ>߰4�>��O��X��F��J��O��Rܠ9љ5�5��K��U��S̛<��O�;ۤ3��:��U��S̛<��Q�0��O��CP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U˩5��O��Rܠ9ݩ5�J����U��S̛<����N��4���N��X��8��B��L��FաO��5��O��Mݩ5�N��F��O��:ݩ5��4��O��X��F��O��Mљ5��5��U��S̛<աO�3��U��S̛<��Q�0�OP��7��>��S��Q��T��0��7��>��S��7��S˩5��W˩5��U��O��Mߩ5����U��S̛<��/��N��/��4��0��O�;	��N��0աO&$����9��:��9��;��2P��X��>��9��:��;#!����9��:��9��;P��X��>��9��:��;���,��6��B��J��>��P��7��B��J��>��P><��7��6��N��J��F��3��P��;�7��N��@��N��;��J��T��;��J��Q��;��J/-��7��N��J��F��3��P��;�7��N��N��;��T��;��Q��;��,��6��B��J��>��P��7��B��J��>��P)'��7��6��B��J��P��T��;��<̖@��@��T��M��L&$��7��B��J��P��T��;��<̖@��@��T��M��L���,��6��B��J��>��P��7��B��J��>��P/-��1��7��6��N��Jǭ;�J��2��=��>��P��Q��@��@��@)'��1��7��N��Jǭ;�J��2��=��>��P��Q��@��@��,��6��B��J��>��P��7��B��J��>��P,*��C�F��7��6��B��JԿ7��;˨O��O��/��J��Iַ;&$��C�F��7��B��JԿ7��;˨O��O��/��J��I���,��6��B��J��>��P��7��B��J��>��P��U�F��J��B��7��6͎?��/��U�F��J��B��7͎?��/��,��6��B��J��>��P��7��B��J��>��P��P��H��I��L��2��C����O��JP��H��I��L��2��C����O��JسS��B��6��B��J��Dʿ7��E��>��PP��H��I��L��2��C����O��JP��H��I��L��2��C����O��J��Q��0��N��>��>��K��J��N����B��I��L��2��C����O��J��B��I��L��2��C����O��J۳S��6��B��JϿ7��E��>��P��B��I��L��2��C����O��J��B��I��L��2��C����O��J��Q��0��N׎>��KɏJ���,��6��B��J��>��P��7��B��J��>��P&$��7��6��B��J��>��P��P��/��M�G��Q��T ��7��B��J��>��P��P��M�G��Q��T��,��6��B��J��>��P��7��B��J��>��P,*��J��R��J�C��J��D��6��P��V��.��6��;��J��T)'��J��R��J�C��J��D��6��P��V��.��6��;��T���,��6��B��J��>��P��7��B��J��>��P ��6��B��J��Dʿ7��E��>��P��@Ԛ<��6��B��JϿ7��E��>��P��@Ԛ<��,��6��B��J��>��P��7��B��J��>��P)'��A��R��J��B��J��D��6��PщQ��U��;�7��P&$��A��R��J��B��J��D��6��PщQ��U��;��7���,��6��B��J��>��P��7��B��J��>��P/-��7��6��B��J��F��6�F��,��Q��V��M�G��.��D��6,*��7��B��J��F��6�F��,��Q��V��M�G��.��D��6��,��6��B��J��>��P��7��B��J��>��P53��BܥN��F��C��S��7��B��7��6��B��R��6��H��J��>��A��P/-��BܥN��F��C��S��7��B��7��B��7��H��J��>��A��P���,��6��B��J��>��P��7��B��J��>��P><��7��6��N��J��F��3��P��;�7��N��@��N��;��J��T��;��J��Q��;��J/-��7��N��J��F��3��P��;�7��N��N��;��T��;��Q��;��,��6��B��J��>��P��7��B��J��>��PYW��Jǭ;��N��,��6��>��P��Jǭ;��DƂGщQ��Jǭ;��D�@щQ��,��6��>��G��3��.ٟ@��DƂGщQ��@Ԛ<SQ��Jǭ;��N��7��>��P��Jǭ;��DƂGщQ��Jǭ;��D�@щQ��7��>��G��3��.ٟ@��DƂGщQ��@Ԛ<���,��6��B��J��>��P��7��B��J��>��P/-��1��7��6��N��Jǭ;�J��2��=��>��P��Q��@��@��@)'��1��7��N��Jǭ;�J��2��=��>��P��Q��@��@��,��6��B��J��>��P��7��B��J��>��P/-��F��Jō/��N��J��D��0��P��L�3��6��>��;��G��B&$��J��N��J��D��0��P��L�3��6��>��;��G���,��6��B��J��>��P��7��B��J��>��P��U�F��J��B��7��6͎?��/��U�F��J��B��7͎?��/��,��6��B��J��>��P��7��B��J��>��P#!��6ǭ;��>��Q��6��N��J��>��P��;�7 ��6��>��Q��6��N��J��>��P��;�7���,��6��B��J��>��P��7��B��J��>��P&$��7��6��B��J��>��P��P��/��M�G��Q��T ��7��B��J��>��P��P��M�G��Q��T��,��6��B��J��>��P��7��B��J��>��P��C�F��J��B��6ǭ;��@Ԛ<��C�F��J��B��6��@Ԛ<���,��6��B��J��>��P��7��B��J��>��P ��6��B��J��Dʿ7��E��>��P��@Ԛ<��6��B��JϿ7��E��>��P��@Ԛ<��,��6��B��J��>��P��7��B��J��>��P ��6��B��J��D��6��E��>��P��@Ԛ< ��6��B��J��D��6��E��>��P��@Ԛ<���,��6��B��J��>��P��7��B��J��>��P/-��7��6��B��J��F��6�F��,��Q��V��M�G��.��D��6,*��7��B��J��F��6�F��,��Q��V��M�G��.��D��6��,��6��B��J��>��P��7��B��J��>��P ��7��6��B��J��>��P��Hڶ>��@Ԛ<��7��B��J��>��P��Hڶ>��@Ԛ<���,��6��B��J��>��P��7��B��J��>��P><��7��6��N��J��F��3��P��;�7��N��@��N��;��J��T��;��J��Q��;��J/-��7��N��J��F��3��P��;�7��N��N��;��T��;��Q��;��,��6��B��J��>��P��7��B��J��>��P ��Lǭ;��BϨH��J��>��P��A�7��B��L��BϨH��J��>��P��+���,��6��B��J��>��P��7��B��J��>��P/-��1��7��6��N��Jǭ;�J��2��=��>��P��Q��@��@��@)'��1��7��N��Jǭ;�J��2��=��>��P��Q��@��@��,��6��B��J��>��P��7��B��J��>��P#!��Lǭ;��BϨH��J��>��P��D��G��@��K��L��BϨH��J��>��P��D��@��K���,��6��B��J��>��P��7��B��J��>��P��U�F��J��B��7��6͎?��/��U�F��J��B��7͎?��/��,��6��B��J��>��P��7��B��J��>��P��;��-��M��=��;��-��M��=���,��6��B��J��>��P��7��B��J��>��P&$��7��6��B��J��>��P��P��/��M�G��Q��T ��7��B��J��>��P��P��M�G��Q��T��,��6��B��J��>��P��7��B��J��>��P��;��-��M��=��;��-��M��=���,��6��B��J��>��P��7��B��J��>��P ��6��B��J��Dʿ7��E��>��P��@Ԛ<��6��B��JϿ7��E��>��P��@Ԛ<��,��6��B��J��>��P��7��B��J��>��P/-ϨH��Jō/��B��J��>��PϨH��Jō/��B��Jڶ>��F��=/-ϨH��Jō/��B��J��>��PϨH��Jō/��B��Jڶ>��F��=���,��6��B��J��>��P��7��B��J��>��P/-��7��6��B��J��F��6�F��,��Q��V��M�G��.��D��6,*��7��B��J��F��6�F��,��Q��V��M�G��.��D��6��,��6��B��J��>��P��7��B��J��>��P20��7��6��B��C��J��>��P��/��G��=��Q��>��B��D��>ÐW,*��7��B��C��J��>��P��G��=��Q��>��B��D��>ÐW���,��6��B��J��>��P��7��B��J��>��P><��7��6��N��J��F��3��P��;�7��N��@��N��;��J��T��;��J��Q��;��J/-��7��N��J��F��3��P��;�7��N��N��;��T��;��Q��;��,��6��B��J��>��P��7��B��J��>��P ��Lǭ;��BϨH��J��>��P��:ÐW��4��L��BϨH��J��>��P��:ÐW��4���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��> ��<��6��>��7��T��<��@��9��:��T��<��6��>��7��T��?��9��:���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��> ��M��4��T��C��T��7��@��<��@Ԛ<��M��4��C��T��7��@��@Ԛ<���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>53��9��T��B��@��>��T�K��7��<��:��7��@��<ǭ;��?��A��B,*��9��T��B��>��T�K��7��<��:��7��@ՄN��A��B���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>A?��>��T��<��@��>��/��2��6��S��C��S��E��T��<��@��>��-��/��7��B��6;9��>��T��?��>��/��2��6��S��C��S��E��T��?��>��-��/��7��B��6���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>/-��T��@��<��T��@��<��/��T��@��<��6��S��E��A��T&$��T��@��T��@��/��T��@��6��S��E��A��T���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>,*��4��T��<��@��H��A��V��T��J��D��8��D��A��P��4��?��H��A��V��T��D��8��A���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>��4��T��<�G��D��G��@��K��4��<�G��D��@��K���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>;9��4��T��R��F��7��@��<��5��@��2��D��0��O����6��P����6��T,*��4��R��I��@��5��@��2��0��O����6��P����6���4��T��7��@��<��<��>��4��7��@��<��>JH��>��C��T��<��7��@��<��6��R��>��1��6��R��>��7��,��O�9ϪJ��1��<��>��@Ԛ<;9��>��C��T��<��7��@��6��>��1��6��>��7��,��9��1��<��>��@Ԛ<��4��T��7��@��<��<��>��4��7��@��<��>,*��C��>��8��T��<��7��@��<��1��>��D��P�D��A&$��C��>��8��<��7��@��1��>��D��P�D��A���6��N��B��U��C��6�O	��@��U��6��6��N��B��V��1��U��C��6�O��@��V��1��U��6��6��N��B��U��C��6�O	��@��U��686��6��N��B��U��C��-�9Ԛ<��D��/щQ��6��C��U��,��C��<��P)'��@��U��-��1��D��/щQ��6��U��,��C��<��P���6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��D��P�D��A��@��U��D��P�D��A��6��N��B��U��C��6�O	��@��U��6#!��6��N��B��E��E��U�DщQ��C�D��P��@��E��E��U�DщQ��C�D��P���6��N��B��U��C��6�O	��@��U��6��6��N��B��V��1��U��C��6�O��@��V��1��U��6��6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��@Ԛ<��@��U��@Ԛ<���6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��D��P�D��A��@��U��D��P�D��A��6��N��B��U��C��6�O	��@��U��6PN��6��N��B��U��Cٟ@�9ٟ@��N��D��.��B��2�I�O��=��.��@��D��N��B��2��B��E��1�S;9��@��U��9��N��D��.��2�I�O��=��.��@��D��N��2��B��E��1�S���6��N��B��U��C��6�O	��@��U��6��6��N��B��V��1��U��C��6�O��@��V��1��U��6��6��N��B��U��C��6�O	��@��U��6&$��6��N��B��U��MP�C��.��6��D��@Ԛ< ��@��U��MP�C��.��6��D��@Ԛ<���6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��D��P�D��A��@��U��D��P�D��A��6��N��B��U��C��6�O	��@��U��6&$��6��N��B��U��Uӛ?��C��D��T��D�A��4��@��U��Uӛ?��C��D��T��D��A���6��N��B��U��C��6�O	��@��U��6��6��N��B��V��1��U��C��6�O��@��V��1��U��6��6��N��B��U��C��6�O	��@��U��620��6��N��B��U��C��-�9Ԛ<��6�Oݠ.��D����N��@Ԛ<#!��@��U��-��1��6ݠ.��D����N��@Ԛ<���6��N��B��U��C��6�O	��@��U��6��6��N��B��U��C��D��P�D��A��@��U��D��P�D��A��6��N��B��U��C��6�O	��@��U��6/-��6��N��B��6��O��U��C��N��3��>��E��T��B��E��T&$��@��6��O��C��N��3��>��E��T��B��E��T�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9/-��D��H��L��K��D?�O��A��O��6�:��,��A�7��B)'��D��H��L��K��D?�O��A��O��6�:��,��+�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��986��H��:��!��DƇ>�O��-��8�W��H�O��W��K��-��4��=�R��J53��H��:��!��DƇ>�O��-��8�W��H�O��W��K��-��4�R��J�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��986��N��A��9��=��H��5��D?�O��Jٟ@��6�:��G��2��@��@��@,*��N��9��H��5��D?�O��J��5�:��G��2��@��@�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9SQ��Dǭ;��D��Q��D�U��H��:��D�O��>��&��D�B��7��D�O��O��J��D��I��P��A��F��E��>��6MK��Dǭ;��D��Q��D�U��H��:��D�O��>��&��D�B��7��D�O��O��J��D��P��A��F��>��6�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9&$��H��=��Dć?�O��=�9��=ϷA��H����@��H��=��Dć?�O��9��A����@�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9A?��,�O��-��H��D��Bٟ@��;��?��=��1��P��K��@?�O��=�9��=��@Ԛ<53��,�O��-��H��D��@�?��=��1��P��K��@?�O��9��@Ԛ<�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9JH��H��=��W��K��=��:��B��:��D?�O��:��D��1��=��@�9��=��D��9��D��5��@Ԛ<><��H��=��W��K��=��:��B��:��D?�O��:��1��=��@��=��9��5��@Ԛ<�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9&$��H��=��Dć?�O��=�9��=��D��S�D��A ��H��=��Dć?�O��9��D��S�D��A�,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9 ��V��H��Lć?�O��D��6��L��@Ԛ< ��V��H��Lć?�O��D��6��L��@Ԛ<,*��@��CӽD��=��H��K��:��=��-Ƈ>�O��=�9��=&$��@��CӽD��=��H��K��:��=��-Ƈ>�O��9><��D��H��K��L��9�GϪJ��D?�O��=��D�?��/ٟ@��=��@��6��@Ԛ<;9��D��H��K��LݲLϪJ��D?�O��=��D�?��/ٟ@��=��@��6��@Ԛ<�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��120��H��,��8��2�,ֈ;��0��4��V��C��7��G��/��T��>��1)'��H��,��8��2ڈ;��4��V��Cî7��/��T��>��1�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1,*��V��@��,��1��V��2�,��7��C��7��G��.��V��@ �M��,��1��V��2��7��Cî7��.�M�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��153��H��8��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>,*��H��8��2��7��Cî7��/��T��>��1��?��T��J��Q�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1/-��W��?��A��;��O��V��2�,��7��C��7��G��A��.��T#!��W��?ҞM��O��V��2��7��Cî7��A��.�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1��2�,��>��B��-��4��5��J��2��>��B��-��5�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1hf��H��8�
-��N��2�,ԓ4��D��C��7��G��7��7��B��K��;��9��/��T��>��1��K��L��/��U��5�
-��5����>��2�,��W��FVT��H��8�
-��N��2��4��Cî7��7��B��K��;��9��/��T��>��1��K��L��/��U��5�
-��5����>��2��W�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��186��H��,��7��H��8��2�,��R��N��V��C��7��G��/��T��7��>��1��H��,��>��1�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1��2��C��2��C�)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1GE��2�,߀3՟?��4��H��8��V��2�,��7��C��7��G��/��T��>��1��?��T��J��Q��>;9��2߀3՟?��4��H��8��V��2��7��Cî7��/��T��>��1��?��T��J��Q)'��H��8��V��2�,��7��C��7��G��/��T��>��1#!��H��8��V��2��7��Cî7��/��T��>��1��V��2�,��7��C��7��G¶;��V��2��7��Cî7¶;���B��R��A��D��K��B��R��A��DDB��R��A��D��K��C��5P��=Pބ2��R��A��D��K��C��5��8�:��-��9��E��T86��R��A��D��C��5��=܉2��R��A��D��C��5��8�:��-��9��E��T��B��R��A��D��K��B��R��A��D><��@��G��D��5��ՂP��R��A��5��H��D��KϲL��K��2��!��Q��H�9��T;9��@��G��D��5��ՂP��R��A��5��H��DϲL��K��2��!��Q��H�9��T���B��R��A��D��K��B��R��A��DJH��H���5ՂP��2��C��D��K��L��A��R��A��K��3��D��K��M��K��5��D��Kև9��>��TA?��H���5ՂP��2��R��K��L��A��R��A��K��3��D��M��K��5��D��Kև9��>��B��R��A��D��K��B��R��A��D,*�F�7��C��P��L߫W��A��=��R��A��D��K��S��7)'�F�7��C��P��L߫W��A��=��R��A��D��S��7���B��R��A��D��K��B��R��A��DPNՂP��L��E��;ߏG��K��C��R��A��D��K��C��B��A��M��K��C��K�?��K��C��<��O��Sߋ5��,MKՂP��L��E��;ߏG��K��C��R��A��D��C��B��A��M��K��C��K�?��K��C��<��O��Sߋ5��,��B��R��A��D��K��B��R��A��DGE��,��9��;��D��K��1��?؇9��U��8ȴS��>��C��P��D��7��L��R��A��B��A��D��K><��,��9��;��D�R؇9��U��8ȴS��>��C��P��D��7��L��R��A��B��A��D���B��R��A��D��K��B��R��A��DDB��R��A��D��K��C��5P��=Pބ2��R��A��D��K��C��5��8�:��-��9��E��T86��R��A��D��C��5��=܉2��R��A��D��C��5��8�:��-��9��E��T��B��R��A��D��K��B��R��A��D\Z��:��D��K��C��L��C��B��C��L��5��?��L��F��L��>��H��D��K��C��R��A��K�?��M��KߏG��K��C��BùFPN��:��D��C��L��C��B��C��5��?��L��L��>��H��D��C��R��A��K�?��M��KߏG��K��C��BùF���B��R��A��D��K��B��R��A��DJH��H���5ՂP��2��C��D��K��L��A��R��A��K��3��D��K��M��K��5��D��Kև9��>��TA?��H���5ՂP��2��R��K��L��A��R��A��K��3��D��M��K��5��D��Kև9��>��B��R��A��D��K��B��R��A��Dqo�
-��2��C��D��KՂP��L����A��R��A��K��3��D��K��M��KߏG��K����HӒC��,��N��D��K��5��=��T��Uߋ5��,��,��=��>��:��J_]�
-��2��R��KՂP��L����A��R��A��K��3��D��M��KߏG��K����HӒC��,��D��5��=��T��Uߋ5��,��=��>��:���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�ѤI��;��;��K��F��A��BѤI��;��;��K��F��A��B���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;� ѤI��;��S��>��>ٟ@��6��;��@Ԛ<ѤI��;��S��>��>��5��;��@Ԛ<���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�JH��3Ԛ<ѤI��;��>��6��;��6��S��F����;�.��T��T��D��<��D�<��D��C��)�.��FDB��1ѤI��;��>��6��;��6��S��F����;�.��T��D��<��D�<��D��C��)�.��F���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�PNѤI��;��A�9ٟ@��6��-�9��A��4��>��T�M��SѤI��2��2��E��7��>��>��2��O��D��@��TDBѤI��;��A @��6��9��>��T�M��SѤI��2��2��E��7��>��>��2��O��D��@��T���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�><ѤI��;��B��2ѤI��;��2ѤI��;��0ѤI��;��S��NѤI��;��N��O��F��T><ѤI��;��B��2ѤI��;��2ѤI��;��0ѤI��;��S��NѤI��;��N��O��F��T���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�53ѤI��;��>��>��;��U�0��>��;��D��6��P��G��D��S�D��A53ѤI��;��>��>��;��U�0��>��;��D��6��P��G��D��S�D��A���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;�ѤI��;��M��@��KѤI��;��M��@��K���6ѤI��;���6ѤI��;�ѤI��;��@��?	ѤI��;��@��6ѤI��;���6ѤI��;� ѤI��;��8ٟ@��>��6��C��A�7��BѤI��;��8ٟ@��>��6��C��+�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��9��E��N��3��9��>ڹ3��T��2��1��M����1��T)'��9��E��N��3��9��>ڹ3��T��2��M����1��T&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��:��B�7��>��B��3��1��C��T��C��CԃP��-��C#!��:��B�7��>��B��3��1��C��CƠ<��C�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53�1��TН?��>��/��3��>��N��D��3��>��2��H��T��C��.��:)'�1��?��/��3��N��D��3��>��2��H��C��.��:&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G/-Ԋ/��B��N��P��9��2��K��1��W��>��2Ԋ/��Lؒ.��=#!Ԋ/��N��9��K��1��W��>��2��Lؒ.��=�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��V��>��N��Dͯ?ڹ3�F��1ȇN��;�9��F��G�B)'��V��>��N��Dͯ?ڹ3�F��1ׇN�9��F��G�B&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GDB��N��W��>ڹ3��2��1��%��K��9��E��?��A��F��F��?��D�J��EʡH��9��?�/86��N��W��>ڹ3��2��%��K��9��E��?��A��F��F��D�J��E��9�/�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G20��;�@�7��:��T��C��U��D��Tڹ3��>��NщQ��A�7��B#!��;�7��:��C��U��Dڹ3��>��NщQ��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G)'��A��F��F��?��9��E��1ڹ3��2��1��K��W��(#!��A��F��F��9��E��1ڹ3��2��K��W��(�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53��E��E��O��3��5��B��5��Iٟ@�7��A��:��5��Gς1��6��T&$��E��>��3��B��5��@�7��A��:��5��G��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G86��>��>��8��R��V��G��Bڹ3��;��2��F��5��>��H�K��7��<��653��>��>��8��R��V��G��Bڹ3��;��2��F��5��>��H��7��<��6�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GA?��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��J��D��8��D��A��P86��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��D��8��A&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GDB��D��N��A��D��V��9��3��>��R��9��B�>��:��D��:��T��C��S��-��@��@��@86��D��A��D��V��9��3��>��R��9��B�>��:��D��:��C��S��@��@�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G��J��B�7��>��J��3��/��:��J�7��>��J��3��/��:&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G20��A��F��?��9��C��1��N��Wڹ3��2��1��K��:��&�8��7)'��A��F��9��C��1��N��Wڹ3��2��K��:��&��8�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��9��E��N��3��9��>ڹ3��T��2��1��M����1��T)'��9��E��N��3��9��>ڹ3��T��2��M����1��T&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GSQ��9��D��B��B��3ҾW��1��9��6��5��9��D��:��Q��T��C��2ʶU��>��3�.ٟ@��6ǽ=��G��@Ԛ<A?��D��B��B��3ҾW��1��9��6��5ՔD��Q��C��2ʶU��>��3ٟ@��6��G��@Ԛ<�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53�1��TН?��>��/��3��>��N��D��3��>��2��H��T��C��.��:)'�1��?��/��3��N��D��3��>��2��H��C��.��:&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GDB��5��3��>��R��D��>��B�7��H��L��T��D��>��B�7��:��L��G��D��S�D��ADB��5��3��>��R��D��>��B�7��H��L��T��D��>��B�7��:��L��G��D��S�D��A�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��V��>��N��Dͯ?ڹ3�F��1ȇN��;�9��F��G�B)'��V��>��N��Dͯ?ڹ3�F��1ׇN�9��F��G�B&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��5�D��Bڹ3�G��>��<��9�7��>�?��L��S�:)'��5�D��Bڹ3�G��>��<��9�7��>��F��S�:�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G20��;�@�7��:��T��C��U��D��Tڹ3��>��NщQ��A�7��B#!��;�7��:��C��U��Dڹ3��>��NщQ��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��Gki��R��V��>��N��Dڹ3��2��į?��1��T�9��Fܫ7�M��N��6��K��9��D��,��K��6�9��.��1��R��3�R��F����B��O��B��Tec��/��>��N��Dڹ3��2��į?��1��T�9��Fܫ7�M��N��6��K��9��D��,��K��6�9��.��1��R��3�R��F����B��O��B�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53��E��E��O��3��5��B��5��Iٟ@�7��A��:��5��Gς1��6��T&$��E��>��3��B��5��@�7��A��:��5��G��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GSQ��6��3��P��7��S��4��D��T�9��I��8��7��1�Dڹ3��2��:��T��C��RٍB��KЅJ��C��G��>��6DB��6��3��P��7��S��C��T��8��7��1�Dڹ3��2��:��C��RٍB��KЅJ��C��>��6�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GA?��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��J��D��8��D��A��P86��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��D��8��A&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G&$��>��>��8��R��V��G��Bڹ3��;��2��F��5&$��>��>��8��R��V��G��Bڹ3��;��2��F��5�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��886يR��2��A��8��>يR��2��A��8��>��8��J��-��I��N��=��J��T)'يR��2ŞيR��2Ş��8��J��-��I��=��J��T�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8_]��>��7��JЁH��?��ʡH��W��O��U��A��7��J��1��H����N����=���F���F��HيR��2��8��>��G��@��K\Z��>��7��JЁH��?��ʡH��W��O��U��A��7��J��1��H����N����=���F���F��HيR��2��8��G��@��K�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8zx��(����"���������
-��U��A��7��J��1��H�
-��N����=���F���F��HيR��2��8��>��D��A��7��J��U��A��7��<��J��:��B��;��#_]�
-��U��A��7��J��1��H�
-��N����=���F���F��HيR��2��8��D��A��7��J��U��A��7��<��:��B��;��#�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8><يR��J��TيR��8��T��يR��<��J����N��	��=يR��J��-����8��T;9يR��J��TيR��8��T��يR��<����N��	��=يR��J��-����8��T�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��886يR��2��A��8��>يR��2��A��8��>��8��J��-��W��N����8��T,*يR��2ŞيR��2Ş��8��J��-��W��N����8��T�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8VT�
-��U��A��7��J��1��H����=���F���F��HيR��2��8��>�1��0�7����A��@��H۰M�3��AMK�
-��U��A��7��J��1��H����=���F���F��HيR��2��8�1��0�7����@��H�3��A�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8;9يR��2��8��>��9��K��A��8��D��6��P��>��JщQ��N��.��6��@Ԛ<53يR��2��8��9��K��A��8��D��P��>��JщQ��N��.��6��@Ԛ<�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8b`����U��A��7��J��1��H����N����=б�F���F��HيR��2��8��>��D��A��7��J��U��A��7��<��J��:��B��;\Z����U��A��7��J��1��H����N����=б�F���F��HيR��2��8��D��A��7��J��U��A��7��<��:��B��;���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD��/��U��P��G��,��N��K��Q��M��/��U��P��G��,��K��Q��M���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD ��/�J��P��C��9�8��?��U��P��T��/�J��P��9�8��?��U��P��T���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕDA?��/��/��P��/��/��P��OP��=��-��/��/��P��C�?��K��P��/��/��Pĩ8><��/��/��P��/��/��P��O��=��-��/��/��P��C�?��K��P��/��/��Pĩ8���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD><��N��R��9��K��U��P��S��>��9��S�9Ԛ<��/��D��9��D��R��K��@Ԛ<53��N��R��9��K��U��P��S��>��9��S��1��/��9��R��K��@Ԛ<���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD20��5��9��P��/ַ;��/��P��/��P��A��/��P��?��P��F��7,*��5��9��P��/ַ;��/��P��/��P��/��P��?��P��F���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕDPN��U��P�?İU��H��P��.��F��-��S��5��1��S�S��A��P��K�8��5��G��6�����)��ʪJH��U��P�?İU��H��1��F��-��S��5��1��S�S��A��P��K�8��G��6�����)��ʪ���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD ��U��PʡH��9��8��C��C��H��/��T��U��P��9��C��C��H��/��T���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD��PʰD��/��Fַ;��PʰD��/��1�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��WGE��>ܷT��1��8��W��>��/��2��6��S��C��S��EܷT��1��8��W��>��-��/��7��B��6A?��>��U��8��W��>��/��2��6��S��C��S��E��U��8��W��>��-��/��7��B��6�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W��ܷT��1��W��>��/��>��/��C��SܷT��1��W��>��/��U��SܷT��1��W��>��/ܷT��1��W��>��/��>�8��M����6��@��6��6��>ќ:��0��F��6�<��G��>��2��6��7��(����%����!����"�~��U��W��>��/��>��/��C��S��U��W��>��/��U��S��U��W��>��/��U��W��>��/��>�8��M����6��@��6��6��>ќ:��0��F��6�<��G��>��2��6��7���ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��WGE��>ܷT��1��8��W��>��/��2��6��S��C��S��EܷT��1��8��W��>��-��/��7��B��6A?��>��U��8��W��>��/��2��6��S��C��S��E��U��8��W��>��-��/��7��B��6�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��WA?��6��D��Q��6��6��NیV��O��H��2ܷT��1��8��W��/��Q��6��6��;��6��=;9��6��Q��6��6��NیV��O��H��2��U��8��W��/��Q��6��6��;��6��=�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W��8ܷT��1��O��W��=��;��8��U��O��W��=��;�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W ܷT��1��W��F��M��>Л6��;��@��K��U��W��F��>Л6��;��@�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W20��A��8��9�Q��EܷT��1��G��4��W��E��>��F��W��A��B)'��A��8ƋQ��E��U��G��4��W��E��>��F��A��B���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��XMK��:��?��:��?��L��I��M��W��#������D��E��=��X��<��F��#����#��%Ӳ&��Ӳ&��;9��:��?��:��?��L��M��W��#������D��E��=��XѶ<��#��#�Χ���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X ��W��R��:��?�9��Iʉ5��X��@Ԛ<��W��R��:��?��Iʉ5��X��@Ԛ<���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��XVT��9��W��I��>��:��?щQ��V��4��6��V��6��#����6��#��%��6��#����6��$����6��#�8���8GE��9��W��I��>��:��?щQ��V��6��V��6��#��6��#��6��#��6����6��#�8���8���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X�9��Wʉ5��X��@��N�9��Wʉ5��X��@��N���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X><��W��R��:��?�9��I��G��>ܤK��V��#����%ѾC��H��T��L��6��L��T53��W��R��:��?��I��G��>ܤK��V��#����%��5��L��6��L��T���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X;9��<��W��1��/��>��:��?б�����9��WڶU��5���P����R��T53��<��W��1��/��>��:��?���9��WڶU��5���P����R��T���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X53��E��W��N��6��=��A��9��S��:��?�9��I��>��<��G��V��620��E��W��N��6��=��A��9��S��:��?��I��>��<��G��V��6���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#����@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?��Gʉ5����B��W��/��U��X��7�A��E��3��D��#����&&$��:��?��G����B��W��/��U��X�A��E�����D��,��?��R��F��D��,��?��R��F;9��,��?��R��F��Q��U��B��D��A��P��;��0��T��?��6��T��)����!&$��,��?��R��F��Q��U��A��;��T��6��T��)��D��,��?��R��F��D��,��?��R��F/-��?��,��F��R��>��,��6��2ɀ?��E�B��P��2��2��>)'��8��F��R��>��,��6��2ɀ?��E�B��P��2��2���D��,��?��R��F��D��,��?��R��FMK��D��=��D��3��Dٟ@��F��R��?��,��1��@��?��>��1�9��Kٟ@�9ٟ@�-��4��,��@Ԛ<><��D��D��Rٟ@��F��R��8��1��@��?��>��1�9��K��9�-��4��,��@Ԛ<��D��,��?��R��F��D��,��?��R��F,*��R��F��>��BϨH��,��@��?��,��6��D��P�D��A&$��R��F��>��B؋8��@��8��6��D��P�D��A���D��,��?��R��F��D��,��?��R��F><��D��,��?��R��F��?ϨH��.��?��R��J��V��9��S��6��>��EщQ��@Ԛ<53��D��,�.��F��?��H�.��J��V��9��S��6��>��EщQ��@Ԛ<��D��,��?��R��F��D��,��?��R��F,*��R��F��,��?��H��A��V��T��J��D��8��D��A��P ��R��8��?��H��A��V��T��D��8��A���D��,��?��R��F��D��,��?��R��FGE��D��3��D����R�I��F��,��2��?��.����@��PیV��D��H��A��V��D��A��P��T><��D��R����R�I��F��,��2��?��.����@��PیV��D��H��A��V��A��T��D��,��?��R��F��D��,��?��R��F20��A��9��=��R��F��,��L��?��.��,��K��,��6щQ��@Ԛ<,*��9��R��F��,��L��?��.��,��K��,��6щQ��@Ԛ<���D��,��?��R��F��D��,��?��R��FA?��D��R��F��,íB��?��2��D��9��7��I��6��.��2��9��DܤK��4��@��@��@86��D��R��F��,íB��?��2��D��9��7��I��6��.��2�D��4��@��@��D��,��?��R��F��D��,��?��R��F��,��?��R��F��U��P��U��T��,�.��F��U��P��U���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I&$��9��C��Xֈ?�N��X��Iַ;��/��N��7��2��9��Xֈ?�N��X��I��/��NĚ7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I><��S��9��C��X��7ֈ?�N��X��Iַ;��W��N��I��,ڶ>��T��0��N��6�Q20��S��9��Xֈ?�N��X��I��W��N��,ڶ>��T��0��N��6�Q���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I,*��9��C��Xֈ?�N��X��Iַ;��M��/��TۓR��7��K ��9��Xֈ?�N��X��I��M��TۓR��7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I)'��9��C��Xֈ?�N��X��Iַ;��B�<ނB��<��T#!��9��Xֈ?�N��X��I��B�<ނB��<��T���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I ��:��-��9��C��Xֈ?�N��X��Iַ;��:��-��9��Xֈ?�N��X��I���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I&$��9��C��Xֈ?�N��X��Iַ;��/��N��7��2��9��Xֈ?�N��X��I��/��NĚ7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��Iec��/��S��B��X��D��/��S��:��X��D��<Υ6��1یV��0��/��S��B��X��U��B��O��B��E��B��V��B��,��B��-ނB��<��TMK��/��S��B��D��/��S�:��D��<Υ6��V��0��/��S��B��UüO��E��B��V��,��-ނB��<��T���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I,*��9��C��Xֈ?�N��X��Iַ;��M��/��TۓR��7��K ��9��Xֈ?�N��X��I��M��TۓR��7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I#!��9��C��FۨV��T��/��9��7ʡHб��6��9��F��8��/��9��7ʡHб��6���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I ��9��C��Xֈ?�N��X��Iַ;�7��4��9��Xֈ?�N��X��I�7���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I&$��9��C��Xֈ?�N��X��Iַ;��/��N��7��2��9��Xֈ?�N��X��I��/��NĚ7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��4ֈ?��Iַ;��6��B��T��9��4ֈ?��I��B��T���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�OPN��D��D��7��O��C��-��S�O֊2��>��S��DɵO��6��8�9��H��A��V��T��J��D��8��D��A��PA?��D��D��7��C��-��S�O��>��S��DɵO��6��8�9��H��A��V��T��D��8��A���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�OA?��7��8��B��Bر/��D��2ѺKٟ@��6��T��C��M��U�<��F������!����"20��7��8��B��Bر/��D��2ѺK��5��T��C��M��U�<��F��	���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�O ��5��7��:��C��D�O֊2ѺK��@Ԛ<��5��7��:��C��D�OѺK��@Ԛ<���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�ODBкB��9��N��7��:��C�O֊2��>ٟ@��6߇;��1��G��3��F��7;Q��6��7;Q��T><кB��9��N��7��:��C�O��>��5߇;��1��G��3��F��7;Q��6��7;Q��T���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�ODB��7��O��B��6��2��>��R��@��2��A��5��7;QԚ<��7;Q��T��7��N�3��>��M><��7��O��B��6��2��>��R��@��2��A��5��7;QԚ<��7;Q��T��7��N��>���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�O)'��V��D��D��7��B��C��9��2��>��/��6��7��T)'��V��D��D��7��B��C��9��2��>��/��6��7��T���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�O&$��V��7�J��R��1��:��2��R��<��@��@��@#!��V��7�J��R��1��:��2��R��<��@��@���2��8��>��1��S��6��Mſ2��>��1��5��M;9��K��S��6��MԚ<��2��8��D��A��P��;��0��T��?��6��T��)����!#!��K��5��MԚ<ſ2��A��;��T��6��T��)��2��8��>��1��S��6��Mſ2��>��1��5��M;9��2��8��=��S��0��M��2��8��G��N��0��6��W��,��6��4����6��T20ſ2��=��S��0��Mſ2��G��N��0��6��W��,��6��4����6���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8ٟ@��6�Q��>��D��1��.��S��6��M��>��=����6��P����6��T20ſ2��5�Q��>��D��1��.��5��M��>��=����6��P����6��2��8��>��1��S��6��Mſ2��>��1��5��M/-��2��8��>��M��2��8��1��S��6��MۓR��9��T��,��K&$ƿ2��>��Mƿ2��1��5��MۓR��9��T��,��K���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8��1��S��6��M��>��1�H��3��PیV��D��H��A��V��D��A��P��T20ſ2��1��5��M��>��1�H��3��PیV��D��H��A��V��A��T��2��8��>��1��S��6��Mſ2��>��1��5��M53��A��2��8��=χ7��1��S��6��M��N��1��SщQχ7��=�R��J,*��Aſ2��=χ7��1��5��M��N��1��SщQχ7�R��J���2��8��>��1��S��6��Mſ2��>��1��5��M;9��K��S��6��MԚ<��2��8��D��A��P��;��0��T��?��6��T��)����!#!��K��5��MԚ<ſ2��A��;��T��6��T��)��2��8��>��1��S��6��Mſ2��>��1��5��M��S��6��M��E��S��2��8��@Ԛ<��5��M��E��Sſ2��@Ԛ<���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8ٟ@��6�Q��>��D��1��.��S��6��M��>��=����6��P����6��T20ſ2��5�Q��>��D��1��.��5��M��>��=����6��P����6��2��8��>��1��S��6��Mſ2��>��1��5��MYW��D��NԚ<��2��8��1��S��6��M��1�H��3��Vٟ@��2��8��1��D��A��P��;��0��T��?��6��T��)����!86��Nſ2��1��5��M��1�H��3��Vٟ@ſ2��1��A��;��T��6��T��)���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8��1��S��6��M��>��1�H��3��PیV��D��H��A��V��D��A��P��T20ſ2��1��5��M��>��1�H��3��PیV��D��H��A��V��A��T��2��8��>��1��S��6��Mſ2��>��1��5��M53��M��Vٟ@��2��8��D��A��P��;��0��T��?��6��T��)����!��Mٟ@ſ2��A��;��T��6��T��)���2��8��>��1��S��6��Mſ2��>��1��5��M;9��K��S��6��MԚ<��2��8��D��A��P��;��0��T��?��6��T��)����!#!��K��5��MԚ<ſ2��A��;��T��6��T��)��2��8��>��1��S��6��Mſ2��>��1��5��M#!��A��2��8��1��S��6��M��D��S�D��A��Aƿ2��1��5��M��D��S�D��A�����P��J��>��R��JЍP��J��>��R��J,*��M��:��������D��>��J��6߻W��D��S�D��A,*��M��:��������D��>��J��6߻W��D��S�D��A����P��J��>��R��JЍP��J��>��R��JA?��V��N��N��,̥6��:��D��9��S��J��6�O��Q��Nέ;��L�S��DʡH��9�;86��V��N��,̥6��:��D��9��S��J��6��Q��N٭;�S��DʡH��9�;�����P��J��>��R��JЍP��J��>��R��J/-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ</-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ<����P��J��>��R��JЍP��J��>��R��J53����DȂ3��@��>��QP��JP��J��>��R��C��R��A�7��B/-����DȂ3��@��>��QP��JP��J��>��R��C��R��+�����P��J��>��R��JЍP��J��>��R��Jqo��=��>����Q��H��,��<��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R��N��7��>�8��0��9��D��S��PԮK߀3hf��=��>����Q��H��,��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R�7��>�8��0��9��D��S��PٮK����P��J��>��R��JЍP��J��>��R��J;9��5��$��,��U��,������6��D��>��:��5��JЂJ��D��J��A�7��B/-��5��$��,��,Ѝ��6��D��>��:��5��JЂJ��D��J��+�����P��J��>��R��JЍP��J��>��R��JPN������P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6ǽ=��E��X��>��PGEЍ��P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6��E��>��P����P��J��>��R��JЍP��J��>��R��J��G��7��T��Q��-��G��7��T��Q��-�����P��J��>��R��JЍP��J��>��R��J)'����QP��J��R��6�9��:��A��D��S�D��A#!����QP��J��R��9��A��D��S�D��A����P��J��>��R��JЍP��J��>��R��JDB�����������A��2ûR�9��?��A��>��;��B��TûR�9��?��A��>��5��653��T�A��2ûR�9��?��A��>��;��BûR�9��?��A��>��5��6�����P��J��>��R��JЍP��J��>��R��J,*��M��:��������D��>��J��6߻W��D��S�D��A,*��M��:��������D��>��J��6߻W��D��S�D��A����P��J��>��R��JЍP��J��>��R��J><������6��J��D��9��S��J��6ȻW̑-�9ٟ@P��J��>��R��J��@Ԛ<86����6��J��D��9��S��J��6ȻW̑- @P��J��>��R��J��@Ԛ<�����P��J��>��R��JЍP��J��>��R��J/-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ</-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ<����P��J��>��R��JЍP��J��>��R��JMK����6��6�D��J��QP��L��>��J��R��J��J��QP��L��@��B��J��9��Uڤ5��5��@Ԛ<GE����6��6�D��J��QP��L��>��J��R��J��J��QP��L��@��B��J��Uܤ5��@Ԛ<�����P��J��>��R��JЍP��J��>��R��Jqo��=��>����Q��H��,��<��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R��N��7��>�8��0��9��D��S��PԮK߀3hf��=��>����Q��H��,��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R�7��>�8��0��9��D��S��PٮK����P��J��>��R��JЍP��J��>��R��J����4��B��4յG��W��G��X��F����4��B��4��W��X�����P��J��>��R��JЍP��J��>��R��JPN������P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6ǽ=��E��X��>��PGEЍ��P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6��E��>��P����P��J��>��R��JЍP��J��>��R��J,*������R��:��D��>ڝJ��R��K��2��D��G��@��K&$Ѝ��R��:��D��>ڝJ��R��K��2��D��@��K�����P��J��>��R��JЍP��J��>��R��J)'����QP��J��R��6�9��:��A��D��S�D��A#!����QP��J��R��9��A��D��S�D��A����P��J��>��R��JЍP��J��>��R��J;9��5��$��,��U��,������6��D��>��:��5��JЂJ��9��W��W����C/-��5��$��,��,Ѝ��6��D��>��:��5��JЂJ��9āR��A�����P��J��>��R��JЍP��J��>��R��J,*��M��:��������D��>��J��6߻W��D��S�D��A,*��M��:��������D��>��J��6߻W��D��S�D��A����P��J��>��R��JЍP��J��>��R��J�����A��F��8�,��TЍ�A��F��,��T�����P��J��>��R��JЍP��J��>��R��J/-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ</-����R߻WPۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ<����P��J��>��R��JЍP��J��>��R��J	����4��B	����4��B�����P��J��>��R��JЍP��J��>��R��Jqo��=��>����Q��H��,��<��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R��N��7��>�8��0��9��D��S��PԮK߀3hf��=��>����Q��H��,��5�Wį?��;��>P��J��BPۃJ��D��9��S�I�F��>��J��6��R�7��>�8��0��9��D��S��PٮK����P��J��>��R��JЍP��J��>��R��J/-��U��C��U��TʡH��>��/��X��>����>�A��2���� ��U��UʡH��>��/��X����>��2Ѝ�����P��J��>��R��JЍP��J��>��R��JPN������P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6ǽ=��E��X��>��PGEЍ��P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6��E��>��P����P��J��>��R��JЍP��J��>��R��J�����A��F��P��C��<��B��BЍ�A��F��P��<��B��B�����P��J��>��R��JЍP��J��>��R��J)'����QP��J��R��6�9��:��A��D��S�D��A#!����QP��J��R��9��A��D��S�D��A����P��J��>��R��JЍP��J��>��R��J&$����U��R��:��D��>��J����B����/��T&$����U��R��:��D��>��J����B����/��T���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB���2��<��;��>��2��<��;��>���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�)'��V����2��P��K��C�4��EȯB��-�;��J��6&$��V����2��P��K��C�4��EȯB��-�;ϜJ���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�20�T��3��=��C��;��D��9��>��:��C��O��-֛7��<��B��B,*�T��=��C��;��D��9��>��:��C��O��-��<��B��B���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�/-ȯB��K��C��;��9��;��L�V��6�����)��ʪ��,*ȯB��K��C��;��;��L�V��6�����)��ʪ�����P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�53��2��<��F��2��D��C��D��:��LܾW��X��F��H��F��N��I��9)'��2��<��,��D��:��LܾW��X��F��H��F��N��I���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�#!��H����D��C��,ȯB��J��P��I��@Ԛ<#!��H����D��C��,ȯB��J��P��I��@Ԛ<���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�,*��>����2��P��:ȯB��K��6��N�K��D��S�D��A,*��>����2��P��:ȯB��K��6��N�K��D��S�D��A���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�#!��P��N��ȯB��>��9��H��-�B�V��6#!��P��N��ȯB��>��9��H��-�B�V��6���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�JH����D��P��RȯB��I��H��,��5��6��:��L��I��B��,��I��;�9��V��;��K��XܤK��$GE����D��P��RȯB��I��H��,��5��6��:��L��I��B��,��I��;��V��;��K��XܤK��$�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9,*ȏBҲU��>��R��<��G��I��X��I��C��E��#��CҮJ��B��>ɸ<��I��X��I��C��#߭J�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9GE����B߹-�;��:��XܷT��6˩5��J˩5��4����B߹-�;��:��XܷT��6˩5��/��7;9����B��-��:��XܷT��6��J˩5��4����B��-��:��XܷT��6��/��7�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9��6��T��'��߹-��X��6˩5��6��T��߹-��X��6�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9A?��Uٟ@����5߹-��:��X��D˩5��I��:��X��B��9��D˩5ƛK��6��@��@��@;9��Uٟ@����5߹-��:��X��D˩5��I��:��X��B��D˩5ƛK��6��@��@�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9GE����B��I�;��:��XܷT��6˩5��J˩5��4����B��I�;��:��XܷT��6˩5��/��7A?����B��I�;��:��XܷT��6��J˩5��4����B��I�;��:��XܷT��6��/��7�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9;9��5˱U̾-��C��3��C��I��Q��:����>����:��X��>��6˩5��,��;86��5˱U̾-��C��3��C��I��Q��:����>����:��X��>��6��,��;�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9\Z����B��H����6ӻB��O��߹-��:��XܷT��B��H��߹-��XܷT��;��W��;����N����=��	�F��J˩5��4YW����B��H����6��O��߹-��:��XܷT��B��H��߹-��XܷT��;��W��;����N����=��	�F��J˩5��4�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9)'����>��6˩5��5�W�R�9��:��D��S�D��A ����>��6��5�Wֲ9��D��S�D��A�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9A?б��=��	�F߹-��=��X����B˩5��H��F��S��T��:��TʡH��?��CگD��/86б��=��	�F߹-��=��X����B��H��F��S��T��T��9��CگD��/�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9nl�R��A߹-��:��X��>����B��6˩5��1��D��0�;��Hٟ@�R�9��:��K��B��B��>�5��I��B��E��K�R��F��T��D�>��6��@Ԛ<_]�R��A߹-��:��X��>����B��6��1��0��Hٟ@ֲ9��K��B��B��>�5��I��B��E��K�R��F��T��D�>��6��@Ԛ<�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9GE����K��B��6��N��E��I��:��X��5�R�9��:˩5��U�I�R��>��:��D��S�D��AA?����K��B��6��N��E��I��:��X��5ֲ9˩5��U�I�R��>��:��D��S�D��A�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9DB߹-��:��XܷT��6��H߹-��:��XܷT��6˩5��Q��'����Ѳ��B��6ӻB��O��453߹-��:��XܷT��6߹-��:��XܷT��6��Q����Ѳ��B��0��4�����X˩5�R�9��:����X˩5ֲ9/-��'����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*����ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<����X˩5�R�9��:����X˩5ֲ9/-��0��:��X��6˩5��0��:��X��6˩5�>��4��6��4��T&$��0��:��X��6��0��:��X��6�>��4��6��T�����X˩5�R�9��:����X˩5ֲ9GEݩ5��T��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<DBݩ5��C����B��6��:��X��/ݩ5ٟ@��5��U�I��:����.��X��>˩5��G��@Ԛ<����X˩5�R�9��:����X˩5ֲ9A?б��=��	�F߹-��=��X����B˩5��H��F��S��T��:��TʡH��?��CگD��/86б��=��	�F߹-��=��X����B��H��F��S��T��T��9��CگD��/�����X˩5�R�9��:����X˩5ֲ9��D��B��:����>˩5��A��K��B��:����>˩5��A��K����X˩5�R�9��:����X˩5ֲ9\Z��D�R��A��9į?߹-��=��X��>��6˩5��H��0��-��D��E��0��6��EщQ��I��.��6щQ��2��2��D��S�D��AYW��D�R��A��9į?߹-��=��X��>��6��H��0��-��D��E��0��6��EщQ��I��.��6щQ��2��2��D��S�D��A�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��;��W��H��E��K��9ݠ.��E��T����N��;��W��H��E��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��;��W��H��E��K��K��9ݠ.��E��T����N��;��W��H��E��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K)'����N��9ݠ.��;��W��K��E��	��9ݠ.��E��T ����N��9��;��W��K��	��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��;��W��K��E��K��9ݠ.��E��T����N��;��W��K��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��F��;��W��E��K��9ݠ.��E��T ����N��F��;��W��E��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K)'����N��F��;��W��2��T��9��K��9ݠ.��E��T&$����N��F��;��W��2��T��9��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��F��;��W��H��K��9ݠ.��E��T ����N��F��;��W��H��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K)'����N��9ݠ.��;��W��H��E��	��9ݠ.��E��T#!����N��9��;��W��H��E��	��9��E��T���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I/-��:��T��6��7��8��:��T��6��7��Iַ;P��=��8��-)'��:��T��6��7��8��:��T��6��7��I��=��8��-���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I ��T��1��8��:��C��T��6��7��Iַ;��T��1��8��:��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��Iܥ6��0��T��6��7ȣ8��Iַ;ܥ6��0��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I,*��:��0��E��U��P��U��,��I��:��T��6��7��Iַ;#!��:��0��8��P��,��I��:��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I��0��T��6��7ȣ8��Iַ;��0��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I��:��C��T��6��7��Iַ;��@��?��:��T��6��7��I��@���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I/-��:��C��T��6��7��Iַ;��:��C��T��6��7��Iַ;��;#!��:��T��6��7��I��:��T��6��7��I��;���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I,*��0��E��U��4��J��8��:��C��T��6��7��Iַ;ܥ6 ��0��8��4��J��:��T��6��7��Iܥ6���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I)'��:��C��T��6��7��Iַ;��M��/��TۓR��7��K��:��T��6��7��I��M��TۓR��7���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I)'��:��C��T��6��7��Iַ;��M��/��TۓR��7��K��:��T��6��7��I��M��TۓR��7���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I����P����P���4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B�:��D��>�7��5��.��T�:��D��>�7��5��.��T���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��BMK��%��X��6��Xޡ8��X��S��X��8��X��N��X��.��X��C��X�C��X��F��X��2��X��4��X��CA?��%��X��6��Xޡ8��X��X��8��X��N��X��X��X��X��F��X��2��X��4��X��C���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��BSQİF��E��1��;��/��6��4��X۹/��>��O��X۹/��>��TʭB��S��>��OʭB��S��>��T��U��>��6��K53İF��B��/��6��X��>��O��X��>��B��>��O��B��>��U��>��6���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��B��/��4��?����B��O��B��T��/��4��?����B��O��B���4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B)'��/��4��3��>��L��?��?��H��F��?����F��T#!��/��4��3��>��L��?��H��F����F��T���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��BGE˛5��9��/��=��T��4��>��X��?ޡ8��R��V��4��>��E��1��;��6��T��4��4��K��2,*���-��4��>����/��4��>��B��6��T��5��K��2���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��BVT��D�G��:��/��4��X��>��3��?��X��?��F��B��T��F��?ޡ8��H��?��.��:��FʭB��.��4��?��F��6><��G��:��/��4��X��>��3��?��X��F��B��T��Fޡ8��H��.��F����F��6���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��BVT��E��1��;��>��C��6��P��K��H��,��-��X��?��7�1�E��7��0����NʡH����H��0��6��4��T��DB��B��>��C��6��P��K��H��,��-��X����0����NʡH����H��0��6��4��T�����4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B#!��U�/��4��X��>��3��B��?��8�,��T ��U�/��4��X��>��3��B��?��,��T���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��B/-��-��4��4��6��M��;����-��>��>��@��W��>��W��>)'��-��5��6��M��;����-��>��>��@��>��W��>���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��B53��?��4��1��K��>��F����7��>�>��D��<��(��6հL��3��T53��?��4��1��K��>��F����7��>�>��D��<��(��6հL��3��T���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��B_]��E��1��;��1��-��X��?��P��@��4��B��S��?��H��-��M��>ԁ:�F��T��,��;��J��8��L�0��(��������!MK��B��1��-��X��?��P��@��4��B��S��?��H��-��M��>ԁ:�F��T��,��;��J��8��L�0��X���4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B)'��/��4��3��?��>��L��H��J��X�/��E��N��B#!��/��4��3��?��>��L��H��J��X��E��N���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��B����3��H��2��4��C��M΄/ǟ9��=��Tޡ8��?������$����������������������ڻ��������������(����$�������������������!����3��H��2��4��C��Mτ/��-�8������$����������������������ڻ��������������(����$�������������������!���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��B,*��/��4��3��?��>��L��H��J��X�/��E�1ʞ:��-)'��/��4��3��?��>��L��H��J��X��E�1ʞ:��-���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��B,*��H��E��1��;��>��/��4��H��?��L��B��<��B��B#!��H��B��>��/��4��H��L��B��<��B��B���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��4�>��B��W��A��Q��T��2��>��J��F��@Ԛ< ��4�>��B��W��A��L��>��S��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86��9��G��O��B��Q��T��2��>����P��V��P��.��5��A��J��>��P)'��9��G��O��B��L��>����P��P�.��J��>��P���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��R��0��W��6�>��BйS��Q��T��2��>��F��D��S�D��A,*��R��0��W��6�>��BйS��L��>��F��D��S�D��A��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��FA?�C��O��W��>��M�>��B��W��A��Q��T��2��9��6��O��8��G��D��S�D��A;9�C��O��W��>��M�>��B��W��A��L��9��6��O��8��G��D��S�D��A���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��FMK��R��9��G��M��W��W��A��I�>��B��N��=��=�9��=��A��>��M��N��S��9��=��A�7��B86��RךG��W��W��A��I�>��B��N��=��=��>��M��N��S��9��=��+��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��9��G�>��B��Q��T��2��>��V��J��7��6��8��T��7��=&$��9��G�>��B��L��>��VќJ��6��8��7��=���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F;9��D��9��D��I�>��B��W��R��Q��T��2��>��S��Q�U��>��V��@Ԛ<&$��9��I�>��B��W��R��L��S��>��V��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F/-��
-��F��W��L����S��J������$����2����A��B/-��
-��F��W��L����S��J������$����2����A��B���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86��D��S��8��G�>��B��W��A��Q��T��2��>��M��@��?��@��@��@/-��D��S��8��G�>��B��W��A��L��>��M��@��?��@��@��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��N��F��H��F��O��F��O��V��V��A��4��@��K&$��N��F��H��F��O��F��O��V��V��A��4��@���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F&$��W��>��V�>��B��W��A��Q��T��2��@Ԛ< ��W��>��V�>��B��W��A��L��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86�>��B��Q��T��2��>΂P��F��;�/��U����N��5��L��U��ٶ,*�>��B��L��>΂P��F��;�/��U����N��5��L��U���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��4�>��B��W��A��Q��T��2��>��J��F��@Ԛ< ��4�>��B��W��A��L��>��S��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F#!�>��B��W��B��Q��T��2��>��F��@Ԛ<�>��B��W��B��L��>��F��@Ԛ<���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��R��0��W��6�>��BйS��Q��T��2��>��F��D��S�D��A,*��R��0��W��6�>��BйS��L��>��F��D��S�D��A��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F#!��W��2��E�>��D��Q��T��2ϩN��F��B��W��E�>��D��LϩN��F���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��FMK��R��9��G��M��W��W��A��I�>��B��N��=��=�9��=��A��>��M��N��S��9��=��A�7��B86��RךG��W��W��A��I�>��B��N��=��=��>��M��N��S��9��=��+��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F��8��F��5��R��.��U��E��S��2��8��F��=��.��U��E��S���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F;9��D��9��D��I�>��B��W��R��Q��T��2��>��S��Q�U��>��V��@Ԛ<&$��9��I�>��B��W��R��L��S��>��V��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��J��>��R��8��"����������F��K��%��F��J��>��R������F��%��F���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86��D��S��8��G�>��B��W��A��Q��T��2��>��M��@��?��@��@��@/-��D��S��8��G�>��B��W��A��L��>��M��@��?��@��@��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F53��D��9��6��M��E��K�>��B��Q��T��2��>��V��D��@��@��@&$��D��9��6��E��K�>��B��L��>��V��@��@���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F&$��W��>��V�>��B��W��A��Q��T��2��@Ԛ< ��W��>��V�>��B��W��A��L��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F ֖F��>��P��Mމ6��J��6��J����7��+��Mމ6��J��6��J����7���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��4�>��B��W��A��Q��T��2��>��J��F��@Ԛ< ��4�>��B��W��A��L��>��S��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F/-��U�Mӛ?�1��?��7��F��,��7���M�����R��Q#!��U�Mӛ?�1��?���M�����R��Q���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��R��0��W��6�>��BйS��Q��T��2��>��F��D��S�D��A,*��R��0��W��6�>��BйS��L��>��F��D��S�D��A��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F/-��D��9��6��M�>��B��W��A��Q��T��2��D��S�D��A&$��D��9��6�>��B��W��A��L��D��S�D��A���I��D��T��0��I��I��D��0��IDB��D��T��3��0��I��D��T��3��0��I��4��D��T��3��0��IǱ.��>��4��I����?86��D��3��0��I��D��3��0��I��4��D��3��0��IǱ.��>��4��I��2��I��D��T��0��I��I��D��0��I,*��I��D��T��0��1�,��I��C��D��T��0��I��D��T#!��I��D��T��0��1��I��D��0��I��D��T���I��D��T��0��I��I��D��0��I&$��R��I��D��T��N��0��I��0��I��4��@Ԛ<��R��I��D��0��0��I��4��@Ԛ<��I��D��T��0��I��I��D��0��I/-��0��I��9��Q��6��S��=��K��I��8��K��I��:��@Ԛ</-��0��I��9��Q��6��S��=��K��I��8��K��I��:��@Ԛ<���I��D��T��0��I��I��D��0��I��0��I��5��I����?��0��I��5��I��2��I��D��T��0��I��I��D��0��IA?��D��T��3��I��5��8��D��T��3��8��I��5��D��T��3��X��5��8��I����?53��D��3��I��5��8��D��3��8��I��5��D��3��X��5��8��I��2���I��D��T��0��I��I��D��0��I��0��I��Iַ;��4��D��G��@��K��0��I��I��4��D��@��K��I��D��T��0��I��I��D��0��I&$��I��D��T��P��D��N��0��I��0��I��@Ԛ<��I��D��P��D��0��0��I��@Ԛ<���I��D��T��0��I��I��D��0��I#!��I��0��IػK��I��0��I��4��I��0��I#!��I��0��IػK��I��0��I��4��I��0��I��I��D��T��0��I��I��D��0��I/-��0��I��D��T��Fַ;��8��-����8��T��������!��0��D��1��8��-����8�����I��D��T��0��I��I��D��0��I��0��I��4��@Ԛ<��0��I��4��@Ԛ<��I��D��T��0��I��I��D��0��I20��D��0��I��D��0��I��4��D��0��IǱ.��>��4��I����?/-��D��0��I��D��0��I��4��D��0��IǱ.��>��4��I��2���I��D��T��0��I��I��D��0��I��0��IǱ.��>��4��@Ԛ<��0��IǱ.��>��4��@Ԛ<��I��D��T��0��I��I��D��0��I20��I��D��T��,�;��0��1�,��I��C��D��T��0��I��D��T)'��I��D��T��,�;��0��1��I��D��0��I��D��T���I��D��T��0��I��I��D��0��IDB��D��T��3��0��I��D��T��3��0��I��4��D��T��3��0��IǱ.��>��4��I����?86��D��3��0��I��D��3��0��I��4��D��3��0��IǱ.��>��4��I��2��I��D��T��0��I��I��D��0��IDB��,��TܷT��0��I��,��TܷT��0��I��4��,��TܷT��0��IǱ.��>��4��I����?86��,ܷT��0��I��,ܷT��0��I��4��,ܷT��0��IǱ.��>��4��I��2���I��D��T��0��I��I��D��0��I&$��R��I��D��T��N��0��I��0��I��4��@Ԛ<��R��I��D��0��0��I��4��@Ԛ<��I��D��T��0��I��I��D��0��IMK��D��T��3��N��0��I��D��T��3��N��0��I��4��D��T��3��N��0��IǱ.��>��4��I����?/-��D��3��0��D��3��0��4��D��3��0Ǳ.��>��4��I��2���I��D��T��0��I��I��D��0��I��0��I��5��I����?��0��I��5��I��2��I��D��T��0��I��I��D��0��I��0��IػK��4��@��K��0��IػK��4��@��K���I��D��T��0��I��I��D��0��I��0��I��Iַ;��4��D��G��@��K��0��I��I��4��D��@��K��I��D��T��0��I��I��D��0��I��0��I��D��G��@��K��0��I��D��@��K���I��D��T��0��I��I��D��0��I#!��I��0��IػK��I��0��I��4��I��0��I#!��I��0��IػK��I��0��I��4��I��0��I��I��D��T��0��I��I��D��0��I53��D��T��3��I��D��T��3��Iַ;��D��T��3�O�I��I����?#!��D��3��I��D��3��I��D��3�O��I��2��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8��L��7��B��6	�L��B��6��L��7ٟ@��8��Lٟ@��8�,*��6P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8��L��7��@��K	�L��@��K��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8��L��7��6��?	�L��6��?��L��7ٟ@��8��Lٟ@��8�,*��6P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8�,*��6�L��7��8��>ٟ@��H��F��@��F��7��6��>��P)'��6�L��8��>ٟ@��H��F��@��F��7��6��>��P��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8�20��A��7�L��7�Hٟ@��8��E��P��;��:��P��O��@��@��@,*��A��7�L�Hٟ@��8��E��P��;��:��P��O��@��@��L��7ٟ@��8��Lٟ@��8�,*��6P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8�P��,�L��7��?��6��0���?��6��0��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8�/-�L��7ٟ@��8��A��R��>��:��6��>��N��D��S�D��A,*�Lٟ@��8��A��R��>��:��6��>��N��D��S�D��A��L��7ٟ@��8��Lٟ@��8�,*��6P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8�wu�L��7��D��F��6�L��7��B��7�L��7��6��<��6P��,�L��7��
-�Gٟ@��8��6��7��@��7��5�L��7��8��>ٟ@�;��F��J��>��N��1�S_]�L��D��F��6�L��B��7�L��6Ǥ<���
-�Gٟ@��8��6��7��@��7��5�L��8��>ٟ@�;��F��J��>��N��1�S�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0��A��=��Tɾ=��S��N��.��W��0�A��T����(����"�0��=��T̗<��.��0�A��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S,*¨0��A��=��Tɾ=��C��P��I��/��C��/��9��?��T#!�0��=����P��I��/��C��/��9��?��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S¨0ʽ=��>��=��Tɾ=��R��@Ԛ<�0��>��=��Tɾ=��R��@Ԛ<¨0��A��=��Tɾ=��S�0��=��Tɾ=��S86¨0ʽ=��=��Tɾ=��C��6��=��Tɾ=��C��6��A�A��N��T��A��T#!�0��=��T�6��=��T�6��A��A��T��A�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0ʽ=��Dٟ@ޢ<�Qɾ=��@��=��Tɾ=��C��P��H��/��4&$�0��Dٟ@ޢ<�Qɾ=��@��=����H��/��4¨0��A��=��Tɾ=��S�0��=��Tɾ=��S,*¨0��A��=��Tɾ=��W����7��>��7��C��<��B��B)'�0��=��Tɾ=��W����7��>��7��C��<��B��B�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-¨0��A��D��>��=��Tɾ=��CѲ/��D��T����(����"�0��D��>��=��Ѳ/��D��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-��=��Tɾ=��>¨0ʽ=ʇX��Qޢ<�Qɾ=��Cݰ?��Q��.&$��=��Tɾ=��>�0ʇX��Qޢ<�Q̾=ݰ?��Q�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��C��D�A��4��A�A��T��UʡH��9�A��/,*�0��=��T˾=��D��A��A�A��T��UʡH��9�A��/¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��J��6��O��T¨0��A��=��Tɾ=��J��6��K�0ҳ��O��T�0ҳ��K�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0��A��=��Tɾ=��S��N��.��W��0�A��T����(����"�0��=��T̗<��.��0�A��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��SJH¨0ʽ=��PʇX��D�Q��=��Tɾ=��C��>ΉX˛5¨0��A��/��T��D¨0��A��/�A��4�J53�-ʇX��D�Q��=����>ΉX˛5�0��/��T��D�0��/��A�J�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S¨0ʽ=��>��=��Tɾ=��R��@Ԛ<�0��>��=��Tɾ=��R��@Ԛ<¨0��A��=��Tɾ=��S�0��=��Tɾ=��S)'¨0��A��=��Tɾ=��W�9��L��/͒�A��4��T�0��=��Tɾ=��W��/͒�A��4�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0ʽ=��Dٟ@ޢ<�Qɾ=��@��=��Tɾ=��C��P��H��/��4&$�0��Dٟ@ޢ<�Qɾ=��@��=����H��/��4¨0��A��=��Tɾ=��S�0��=��Tɾ=��SSQ¨0ʽ=��P��N��=��Tɾ=��C��@��Eޢ<�Qɾ=��C��D��/��D��/��QİU��4��4����������/-�-��N��=����@��Eޢ<�Q̾=��D��/��D��9��4�*�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-¨0��A��D��>��=��Tɾ=��CѲ/��D��T����(����"�0��D��>��=��Ѳ/��D��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S,*¨0��A��=��Tɾ=��6��=��T��3��O��T��D�A��4&$�0��=��Tɾ=��6��=��T��3��O��T��D��A�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��C��D�A��4��A�A��T��UʡH��9�A��/,*�0��=��T˾=��D��A��A�A��T��UʡH��9�A��/¨0��A��=��Tɾ=��S�0��=��Tɾ=��S ¨0��A��=��Tɾ=��6����8�,��T�0��=��Tɾ=��6����,��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0��A��=��Tɾ=��S��N��.��W��0�A��T����(����"�0��=��T̗<��.��0�A��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��SDB��S��4��8¨0ʽ=��P��=��Tɾ=��Cϛ)ϛ)�)�)�Q��Tɾ=��C��9��8��K��T/-ФO��8�-��=��ϛ)ϛ)�)�)�Q����9��8��K��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S¨0ʽ=��>��=��Tɾ=��R��@Ԛ<�0��>��=��Tɾ=��R��@Ԛ<¨0��A��=��Tɾ=��S�0��=��Tɾ=��S#!��E��=¨0ʽ=��=��Tɾ=��.��8��?̛<��=�0��=��Tɾ=��.��?�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0ʽ=��Dٟ@ޢ<�Qɾ=��@��=��Tɾ=��C��P��H��/��4&$�0��Dٟ@ޢ<�Qɾ=��@��=����H��/��4¨0��A��=��Tɾ=��S�0��=��Tɾ=��S;9¨0��A��=��Tɾ=��C��E��S��S��.��PщQ¨0��A�A��4��D�A��T&$�0��=����E��S��*щQ�0��A��D�A��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-¨0��A��D��>��=��Tɾ=��CѲ/��D��T����(����"�0��D��>��=��Ѳ/��D��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S ¨0��A��D��>��=��Tɾ=��C��@��K�0��D��>��=����@��K�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��C��D�A��4��A�A��T��UʡH��9�A��/,*�0��=��T˾=��D��A��A�A��T��UʡH��9�A��/¨0��A��=��Tɾ=��S�0��=��Tɾ=��S��=��Tɾ=��C��6¨0��A��T��=��T�6�0��T�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/)'��/��/Æ.��J��:��N��L��J��S�1��/��G��B ��/��/Æ.��J��:��N��L��W��/��G�	��S�1��/��W��/��S�1��/��B��;��A�A��T��W��/��B��A�A��T	��S�1��/��W��/��S�1�D��?	��W�D��?�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/53��S�1��/��E��7��0��C��/��7��7����S�1��/����A��B#!��W��/��E����7����W��/����A��Bw	��S�1��/��W��/��S�1��/��B��;��A�A��T��W��/��B��A�A��T	��S�1��/��W��/�A��B�A��B�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/20��9��J��/��?ſQ��5ߕJ��C��M��C��R��U��RН?��Q��T)'��9��J��/��?ſQ��5ߕJ��C��M��C��R����Q�	��S�1��/��W��/��S�1��/��B��;��A�A��T��W��/��B��A�A��T	��S�1��/��W��/><��B��U��>��9��@��V��W��F�?��Wַ;��;�E��-��S�1Н?��>��A��T20��B��>��9��@��V��W��F�?��Wַ;��;�E��-��W��?��A�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/��S�1��/��I�A��6��W��/��I�A��6���E��G��?��>��-��E��G��?��>��-/-��G��?�R�1��4��2��T��N��5��=�7��@��P��:��J#!��G��?�1��4��2��T��5��=��@��P��J��E��G��?��>��-��E��G��?��>��-)'��G��?��>��-��P��L΅/��Bڶ>��S��J��@Ԛ< ��G��?��>��-΅/��B��S��J��@Ԛ<���E��G��?��>��-��E��G��?��>��-��G��?��T�4��G��?��T�4��E��G��?��>��-��E��G��?��>��-��G��W��-��T��G��*���E��G��?��>��-��E��G��?��>��-��E�,��G��?�/��-��"��D��:��E��G��?�/��-��"��D��E��G��?��>��-��E��G��?��>��-20��G��?��>��-��G��6��4��?��9ʉ5��;˫N¶;�P��N��T,*��G��?��>��-��G��4��?��9ʉ5��;ΫN�P��N��T���E��G��?��>��-��E��G��?��>��-86��G��?��>��-��2��2΅/��8��B��?¶7ģC��CщQ��D��P�D��A,*��G��?��>��-΅/��8��B��N�CщQ��D��P�D��A��E��G��?��>��-��E��G��?��>��-20����N��E�,��G��?��>��-��?¶7ʡH��W��B��:ģC��O#!����N��E��G��?��>��-��N��W��:�C���E��G��?��>��-��E��G��?��>��-&$��G��?��>��-��8��G��?��>��-��4��-��2#!��G��?��>��-��G��?��>��-��4��-��2��E��G��?��>��-��E��G��?��>��- ��G��?��>��-�/��.��BʭBѡ8¶;��G��?��>��-�/��BʭBѡ8¶;���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>,*��K�=�9��:ׄ9��?��DϪJ��P��>؞C��@��@��@ ��=��:ׄ9��?��DϪJ��Pρ>��@��@���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>��>؞C��1��9��Tׄ9��?��@Ԛ<ρ>��1��9��Tׄ9��?��@Ԛ<���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>Ư8��Hׄ9��?��>؞C��@��@��@Ư8��Hׄ9��?ρ>��@��@���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>ׄ9��?��=��7ׄ9��?��=��7���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>&$��>؞Cׄ9��?��6��R�1��T��D��P�D��A ρ>ׄ9��?��6�1��T��D��P�D��A���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>DB����=����>��1��9��Tׄ9��?ׄ9��B��9��>��>Ư8��I��>؞CбM�8��6><����=����>��1��9��Tׄ9��?ׄ9��B��9��>��>Ư8��Iρ>бM��8���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>#!��UP۴2��>��M��N��,��B��MСG��T��U��P��>��M��N��B��MСG��T���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>hf��1��	��T��Sׄ9��?��A��J�9��J��O��T��,��Q����S��F��>��T��9��P��,��1�R��>؞Cб��:��6�����)��ʪ_]��1��	��T��Sׄ9��?��A˱9��O��T��,��Q����S��F��>��T��9��P��,��1�Rρ>б��:��6�����)��ʪ���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>A?��:��9�1��S��Tׄ9��?��9��M��,��.��T��>��BϪJ�9��>؞C��@��@��@;9��:��9�1��S��Tׄ9��?��9��M��,��.��T��>��BϪJ�9ρ>��@��@�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����6�9��4��2��A�7��B����6�9��4��+#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2#!��C��1����4��>��@��D��2��>��@Ԛ<#!��C��1����4��>��@��D��2��>��@Ԛ<�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��&��F��D��6�D��2��4�9��@��D��2��>����1��X��J��V����V53��&��F��D��6�D��2��4�9��@��D��2��>����1��J����V#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2SQ��&����L����4�9��2��IщQP��=��&��1��X��4��B��D��7��1��X��G��:��&������T��6GEީ��L����4�9��2��IщQ��=��&��1��4��B��D��7��1��G��:��&������T��6�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����6�9��4��2��A�7��B����6�9��4��+#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2 ��Cڜ>����4��2��K��.��B��@��K��Cڜ>����4��K��.��@��K�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��&��F��D��6�D��2��4�9��@��D��2��>����1��X��J��V����V53��&��F��D��6�D��2��4�9��@��D��2��>����1��J����V#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��6��1��&������6��P��>��4��2��9�Q��1��@����&��@��@��@20��6��1��&������6��P��>��4ƋQ��1��@����&��@��@�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����6�9��4��2��A�7��B����6�9��4��+#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2#!������6��E��4��2��4ڜ>��2����AЍ��6��E��4��4ڜ>��2����A�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��&��F��D��6�D��2��4�9��@��D��2��>����1��X��J��V����V53��&��F��D��6�D��2��4�9��@��D��2��>����1��J����V#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2��6�>����4��2��E��X��@��N��6�>����4��E��@��N�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����6�9��4��2��A�7��B����6�9��4��+#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2����>��4ڜ>��F��5��@Ԛ<����>��4ڜ>��F��5��@Ԛ<�#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��&��F��D��6�D��2��4�9��@��D��2��>����1��X��J��V����V53��&��F��D��6�D��2��4�9��@��D��2��>����1��J����V#!��&��6�D����>��4�9��@��P��>��2#!��&��6�D����>��4�9��@��P��>��2;9��C��R����W��6��?۱U��R��T��:����R��&��6��D��>��6�2��486��C��R����W��6��?۱U��R��T��:����R��&��6��D��>��D��4���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8MK����U��E��;��6֊2��>��W��6��,ϨH��@��F�L��6��,��B��,��T��E��;��>��A�7��BDB��Ǡ2��;��6֊2��>��W��6��,ϨH��@��F�L��6��,��B��,��T��E��;��>��+���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�853�D��U��E��>��W��@��P��2��1�H��S��V��9��;��W��@Ԛ<,*�DǠ2��>��W��@��7��1�H��S��V��9��=��@Ԛ<���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�886��U��E��2��V��=��L��2��9��6��T��=ȟN��2��D��S��>؞CԚ<20Ǡ2��2��V��=��L��2��9��6��T��=ȟN��2��D��Sρ>Ԛ<���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8JH��>؞C��X��A�N��;��W����S��V��6��2��6��D��U��E��=��W�L��6��,��6��@Ԛ<A?ρ>��X��A�N��=����S��V��6��2��6��DǠ2��=��W�L��6��,��6��@Ԛ<���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8PN��9��;��2��U��E��D����S��V�1��6��=��G��B��<�6��>؞Cб��:��6�����)��ʪDB��9��;��2Ǡ2��D����S��V�1��6��=��G��<ρ>б��:��6�����)��ʪ���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8DB��D��7��>��U��E��;��A����S��VϨH��,ϨH��W��;��6��2��>��T��6��@Ԛ<><��D��7��>Ǡ2��;��A����S��V؋8ϨH��W��;��6��2��>��T��6��@Ԛ<���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8;9��>��U��E��;��F��W��O��T��7��,��>��A��8��S��V��D��P�D��A20��>Ǡ2��;��F��W��3��7��,��>��A��S��V��D��P�D��A���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�886����U��E��6�1��6��=��V��6��>��6��L��=��>؞C��@��@��@/-��Ǡ2��6�1��6��=��V��6��>��6��L��=ρ>��@��@���U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8><��U��E��>��V��;��>��Wٟ@��2��>��6��@��2��>��6��>؞C��@��@��@53Ǡ2��>��V��;��>��Wٟ@��2��>��6��@��2��>��6ρ>��@��@��U��EϨH��W��V��@�8Ǡ2ϨH��W��V��@�8)'��>؞C��U��E��;��V��6��2��6��D��S�D��A#!ρ>Ǡ2��;��V��6��2��6��D��S�D��A���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��TJH�R��6��>��#����H��L��6��M��9ٟ@��U��V��UӁG��DܤK��8��<��#����@��@��@;9�R��6��>��#��H��L��6��M�@��U��V��UӁG��A��8��<��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T ��U��U��D��,��A��#��%��@��@��@��U��U��D��,��A��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T20��U��N��.��T��5ƛK��,��6�I�1��6��#��%��@��@��@,*��U��N��.��T��5ƛK��,��6�I�1��6��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��TDB�9Ԛ<��6ϪJ��>��#����>��Q�@��D��9��D��FҾW��SܤK��#����@��@��@,*��1��6��>��#��>��Q�@��9��FҾW��S��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T20��#��%��>��UӁG��D��9��D��.��7��>��#��%��@��@��@#!��#��>��UӁG��9��.��7��>��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��TMK�R��,����9��S��=ɵOʡH��9��B��>��U��U��D��=��UL�9��T��M��#����@��@��@;9�R��,����SɵO��9��>��U��U��D��=��UL�9��T��M��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T&$��DПC��,��UӁG��DܤK��#����@��@��@��DПC��,��UӁG��A��#��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T&$��E��>��F��#����U��D�K�0��@��@��@ ��E��>��F��#��U��D�K�0��@��@���#����U��D��T��#��U��D��T��@��@��@��#����U��D��T��#��U��D��T,*��#����U��D�K��-щQ��R��Q��#����@��@��@#!��#��U��D�K��-щQ��R��Q��#��@��@���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-DB�;��J��Iٟ@��F������X��H��-��E��DܤK��V�3��E��T�����)��ʪ86��;��@��Fح��X��H��-��E��D��V�3��E��T�����)��ʪ��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-YW��9��T����:��B��7ٟ@��������)��X��-���;��J��%��)ѾC��T��O��7��%��T�8��7��F��D�0A?��9��T����:��B��7ٟ@��X��-����;��)ѾC��T��O��7��%��T��8��F�0���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-86�;��J�8��5��S���������X��H��-�8��E��6�O��@Ԛ<&$��;�8��5��S��X��H��-�8��E��6��@Ԛ<��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-&$��5��M�;��J��.��B��7��H��1��R��@Ԛ<#!��5��M��;��.��B��7��H��1��R��@Ԛ<���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-><�;��J�8��5��S���������X��H��-�8��E��6�O��D��S�D��A,*��;�8��5��S��X��H��-�8��E��6��D��S�D��A��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-PN�;��J��A��5��D��N�8��R��8��E��B��S��;��7��6��X��H��-��N��F��K��,��D��P�D��AA?��;��A��5��D��N��R��N��B��S��;��5��X��H��-��N��F��,��D��P�D��A���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-DB�;��J��Iٟ@��F������X��H��-��E��DܤK��V�3��E��T�����)��ʪ86��;��@��Fح��X��H��-��E��D��V�3��E��T�����)��ʪ��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-_]��5��M�;��J��D��Cٟ@��F��2��6��K��:��X��-��R��B��9��S�8��@��D��6��9��>ҾW��D��,��D��P�D��APN��5��M��;��D��@��F��2��6��K��:��X��-��R��B��9��S�8��@��6ߖ>��D��,��D��P�D��A���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-86�;��J�8��5��S���������X��H��-�8��E��6�O��@Ԛ<&$��;�8��5��S��X��H��-�8��E��6��@Ԛ<��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-�;��J١-ܤK��S��/��@��N��;١-ܤK��S��@��N���T�;��J��C��;��X��H��-��T��;��C��;��X��H��-><�;��J�8��5��S���������X��H��-�8��E��6�O��D��S�D��A,*��;�8��5��S��X��H��-�8��E��6��D��S�D��A��T�;��J��C��;��X��H��-��T��;��C��;��X��H��-86��6��C��;����	��X��-��N��W��H��T�;��J�����)��ʪ/-��6��C��;����X��-��N��W��H��;�����)��ʪ�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�GE��5��D��R��9��3��A��7��.��8��R��AƛK��2��T��H�?��T�!��H��AM�8��6A?��5��D��R��9��3��A��7��8��R��AƛK��2��T��H�?��T�!��H��AM��8�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�86��5��R��Aб�D��9��3��A��7��.��8��R��A���!��@��@��@20��5��R��Aб�D��9��3��A��7��8��R��A���!��@��@�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�><ʡH��9��B��R��9����3��A��V��7��.��R��A��ϪJ��H��A��@��@��@20��9��R��9����3��A��V��7��R��A��ϪJ��H��A��@��@�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�;9�!��H��A��5��D��R��9��L��9��B��R��7��.��R��ϪJ��,��@Ԛ<86�!��H��A��5��D��R��9��L��9��B��R��7��R��ϪJ��,��@Ԛ<�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�ki��5��D��9��3��A��J��R��7��.��B��R��F��D��3����Bٟ@��7��5��Dٟ@��7��>��H��A��K��A��D��P�!��H��AM�8��6ec��5��D��9��3��A��J��R��7��B��R��F��D��3����Bٟ@��7��5��Dٟ@��7��>��H��A��K��A��D��P�!��H��AM��8�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�b`��5��D��R��9��L��9�D��R������7��.��3��>����3��R��Q��K��U��D��A��-��D��3��D�!��H��AM�8��6\Z��5��D��R��9��L��9�D��R������7��3��>����3��R��Q��K��U��D��A��-��D��3��D�!��H��AM��8�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�/-��5��D��9��L��9�D��7��.��R��ƭI�!��@��@��@)'��5��D��9��L��9�D��7��R��ƭI�!��@��@�/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�hf��9��5��L��9�D��R��G��7��.��3��A��W�D��E��W��KѾC��H��T��7��H��A��7��:��6�����������)��ʪ��VT��9��5��L��9�D��R��G��7��3��A��W�D��E��W��K��5����:��6�����������)��ʪ���/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�20��R��9�D��9��3��A��R����7��.��6ǽ=��D��P�D��A,*��R��9�D��9��3��A��R����7��6��D��P�D��A/-��5��D��R��9�D��9��3��A��8��R��R����7��.�,*��5��D��R��9�D��9��3��A��8��R��R����7�><ϪJ��A��H��A��R��A��9�D��9��3��A��R��A��D��ϪJ��7��.��K��5;9ϪJ��A��H��A��R��A��9�D��9��3��A��R��A��D��ϪJ��7��K��5���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<DB��W��<��-��7��R��:��.��6��O�/��1��E��Pٟ@�9ٟ@�M��Bʔ7��7��>��P/-��W��<��7��R��:����1��E��P��9�M��Bݔ7��>��P���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<��R��-��R��-���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<53��9��.�<��J��O��<��-щQ��.��6��O��.��6��U��7��7��T&$��9�<��J��O��<щQ��.��O��.��UԚ7��T���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<20��.��6��O��3��7��;��0��G����.��6��.��6��<��B��B&$��.��O��3�7��0��G����.��.��<��B��B���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<��.��6��O��8�I��6��T��.��O��8�I��6��T���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<��.��6��O��T����K��6��.��O��T����K��6���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<20����.��6��O�/��E��E��D��.��6��O�/��E��E��"��W#!������E��D��.��6��O�/��E��"��W���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<DB��W��<��-��7��R��:��.��6��O�/��1��E��Pٟ@�9ٟ@�M��B��D��S�D��A20��W��<��7��R��:����1��E��P��9�M��B��D��S�D��A���.��6��O��<��-	��.��O��</-��<��-��N��<��-��%������%��O܊7��<��0��>��T&$��<��-��N��<��%������%��O܊7��0��>��.��6��O��<��-	��.��O��<><��W��<��-��7��R��:��.��6��O�/��1��E��Pٟ@�9ٟ@�M��B��@Ԛ<,*��W��<��7��R��:����1��E��P��9�M��B��@Ԛ<�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1,*��U��D��1��M��>��4��M��5��4Н?��A��3��A��T ��U��D��1ձM��4��M��5��4��A��A�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1,*��U��D��1��C��T��%��8��>��9��S�1��M��E��;)'��U��D��1��C��T��%��8��>��9��S�1��M��;�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1 ��U��D��1��U��3ʡH��W��R�D��U ��U��D��1��U��3ʡH��W��R�D��U�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1 ��U��D��1��M��>��4��M��5��G��3��U��D��1ձM��4��M��5��G�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1/-��U��8��J��D��1��U��H��AʡH����R��G��M��=��T,*��U��8��J��D��1��U��H��AʡH����R��G��M��=�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��120��U��D��1��F��B��L��L¶7��JѾC��4��W��,��M��4��;#!��U��D��1�B��N��J�C��W��1��4��;�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1&$��U��D��1ʡH��R��:��D��G��AʈO��>��6#!��U��D��1ʡH��R��:��D��G��A��>��6�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1)'��UȂ3��.��1��P��D��>��J١-��-�� ��A��B&$��UȂ3��.��1��P��>��J١-��-�� ��A��B�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1SQ��>��K��U��9��D��1��M��.��O��G��UʡH��9�>��9��U��1��9��9��>��U��6�9��I��T��@Ԛ<DB��>��K��U��9��D��1��M��.ǼO��U�>��9��U��1��9��>��U��6��I��T��@Ԛ<�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1#!��U��D��1��9ҧK��1�BPϪJ��>��D ��U��D��1��9ҧK��1�B��J��>��D�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1#!��U��D��1��9ҧK��1�BPϪJ��>��D ��U��D��1��9ҧK��1�B��J��>��D�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1)'��UȂ3��1��M��CP��Q��>��DԃP��E��A��B ��U͂3�MP��Q��>��D�U��A��B�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1)'��U��D��1P��Rޚ6��H��U��4��9��QÐW��B&$��U��D��1��Rޚ6��H��U��4��9��QÐW��B�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1&$��U��D��1��W��>β7��UщQ��D��G��@��K ��U��D��1��W��>ƴ7щQ��D��@��K�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1��U��D��1��MʡH��R��H��U��U��D��1��MʡH��R��H��U�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1/-��U��D��1��@��1��G��M��3̛<��:��9��T�����!#!��U��D��1��@��1��G��M��3��:��T��W�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1��U��D��1��>��N��V����N��F��U��D��1��>��N��V����N��F�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1,*��U��D��1��MʡH��W��R��H��U��J��6��J����7,*��U��D��1��MʡH��W��R��H��U��J��6��J����7�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1JH��U��B��M�B��U��D��1��9ҧK��1�BPϪJ��>��D��S��1��U��B��D��B��N��@Ԛ<><��U��M�B��U��D��1��9ҧK��1�B��J��>��Dū1��U��D��B��N��@Ԛ<�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��120��AʋM��Q��U��,��D��1��U��>��4��,��3��T��5��=��T&$��A��Q��U��,��D��1��U��>��4��3��5��=�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��153��4��U��D��1��M��/��5��S��7��H��4��7����N����H��)'��4��U��D��1��M��5��S������N����H���	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1)'��U��D��1��U��/��VӲU��>��/��=��W��Q��T ��U��D��1��*ӲU��>��/��=��W��Q�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1DB��U��D��1��UʡH��W��R��6��U��>�G��=��S��U��/��T��(����)������!/-��U��D��1��UʡH��W��R��6��U��>��G��S��U��T���	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1#!��U��D��1��M��>��U��.��6��<��B��B��U��D��1ձM��U��.��<��B��B�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��120��U��D��1�K��U��>�1��D��3̛<��2��/ќ6��H��Q��T&$��U��D��1�K��U��>�1��D��3��/��H��Q�	��U��D��1	��U��D��153��U��D��1ۓR��D��;��1��6ǁR��K��3��K��"����'����!)'��U��D��1ۓR��D��;��1��6ǁR��K��3��K��	��U��D��1	��U��D��1DB��U��D��1ۓR��4��H��5�BPϪJ��>��D��3��K��T��(������(������!)'��U��D��1ۓR��4��H��5�B��J��>��D��3��K���EԼO��R��@����C��/��8��E������C��/��8#!��HԼO��R��@��C��/��8��>ٟ@��@Ԛ<��H����C��/��8��>ٟ@��@Ԛ<��EԼO��R��@����C��/��8��E������C��/��8qo��HԼO��R��@��C��/��D��8��>ٟ@��8��	��P��@��N��LΊ;�J��@��>��@��BΊ;��R��P��@��N��LΊ;�J��@��>��D��Oָ:��?��Thf��H����C��/��D��8��>ٟ@��8��	��P��@��N��LΊ;�J��@��>��@��BΊ;��R��P��@��N��LΊ;�J��@��>��Dָ:��?��T���EԼO��R��@����C��/��8��E������C��/��8ԼO��@��K��@��K�O��K��@��K��EԼO��R��@����C��/��8��E������C��/��8nl��HԼO��R��@��C��/��8��>ٟ@��8����P��HۇL��B��D��CɕH��5ǟV��G��R��P��H��G��L��B��D��=ږH��5��D��Oָ:��?��Tec��H����C��/��8��>ٟ@��8����P��HۇL��B��D��CɕH��5ǟV��G��R��P��H��G��L��B��D��=ږH��5��Dָ:��?��T���EԼO��R��@����C��/��8��E������C��/��8><ԼO��R��@��E��>��1��>��T��H��Iԓ4��C��/��8��>ٟ@����A�7��B,*����E��>��1��>��T����C��/��8��>ٟ@����+��EԼO��R��@����C��/��8��E������C��/��8}{��M��:İU��;ԼO��R��@��?��R��5��<��D��>��B��D�5��@��E��7��K՞R��W��K��D��5��C��/��8��>ٟ@��>��:��T��(������!��K��;�8��6_]��:����?��R��5��<��D��>��B��D�5��@��E��G՞RʼG��D��5��C��/��8��>ٟ@��>��:��T����K��;�8��6���EԼO��R��@����C��/��8��E������C��/��8#!��HԼO��R��@��C��/��8��>ٟ@��@Ԛ<��H����C��/��8��>ٟ@��@Ԛ<��EԼO��R��@����C��/��8��E������C��/��8#!ß<��:��Dć?ԼO��@��C��/��8��>ٟ@ ß<��:��Dć?�O��C��/��8��>ٟ@���EԼO��R��@����C��/��8��E������C��/��8ԼO��@��K��@��K�O��K��@��K��EԼO��R��@����C��/��8��E������C��/��8,*��HԼO��R��@��NܒM̺2��C��/��D��8��>ٟ@��#!��H����N�M��C��/��D��8��>ٟ@�����EԼO��R��@����C��/��8��E������C��/��8><ԼO��R��@��E��>��1��>��T��H��Iԓ4��C��/��8��>ٟ@����A�7��B,*����E��>��1��>��T����C��/��8��>ٟ@����+��EԼO��R��@����C��/��8��E������C��/��853��HԼO��R��@��C��/��D��8��>ٟ@����R��E��@��>��D��W/-��H����C��/��D��8��>ٟ@����R��E��@��>��D��W���EԼO��R��@����C��/��8��E������C��/��8#!��HԼO��R��@��C��/��8��>ٟ@��@Ԛ<��H����C��/��8��>ٟ@��@Ԛ<��EԼO��R��@����C��/��8��E������C��/��8&$��EԼO��@��C��/��8��>ٟ@��D��P�D��A#!��E�O��C��/��8��>ٟ@��D��P�D��A���EԼO��R��@����C��/��8��E������C��/��8ԼO��@��K��@��K�O��K��@��K��EԼO��R��@����C��/��8��E������C��/��8����HԼO��R��@��C��/��8��>ٟ@����D��>��A��I��H��!������D��>�H��I��H��:��@��>�Hٟ@��/ў7����:��@՞R��.ٟ@��/ў7��9��O��E��E��Xqo��H����C��/��8��>ٟ@����D��>��A��I��H��!������D��>�H��I��H��:��>�Hٟ@��/ў7����:��>ٟ@��/ў7��9��E��E��X���EԼO��R��@����C��/��8��E������C��/��8><ԼO��R��@��E��>��1��>��T��H��Iԓ4��C��/��8��>ٟ@����A�7��B,*����E��>��1��>��T����C��/��8��>ٟ@����+��EԼO��R��@����C��/��8��E������C��/��8;9��HԼO��@��C��/��8��>ٟ@����H��2��9��<��Cԓ4��>�9��2��653��H�O��C��/��8��>ٟ@����H��2��9��<��Cԓ4��>��2��6���EԼO��R��@����C��/��8��E������C��/��8#!��HԼO��R��@��C��/��8��>ٟ@��@Ԛ<��H����C��/��8��>ٟ@��@Ԛ<��EԼO��R��@����C��/��8��E������C��/��820��H��?��R��S��H��Iԓ4��>ԼO��@7��C��/��8��>ٟ@)'��H��?��R��S����>�O7��C��/��8��>ٟ@
\ No newline at end of file
diff --git a/paddle/trainer/tests/gen_proto_data.py b/paddle/trainer/tests/gen_proto_data.py
deleted file mode 100644
index 8cc6d44673..0000000000
--- a/paddle/trainer/tests/gen_proto_data.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from cStringIO import StringIO
-
-import paddle.proto.DataFormat_pb2 as DataFormat
-from google.protobuf.internal.encoder import _EncodeVarint
-
-import logging
-import pprint
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-logger = logging.getLogger('paddle')
-logger.setLevel(logging.INFO)
-
-OOV_POLICY_IGNORE = 0
-OOV_POLICY_USE = 1
-OOV_POLICY_ERROR = 2
-
-num_original_columns = 3
-
-# Feature combination patterns.
-# [[-1,0], [0,0]]  means previous token at column 0 and current token at
-# column 0 are combined as one feature.
-patterns = [
-    [[-2, 0]],
-    [[-1, 0]],
-    [[0, 0]],
-    [[1, 0]],
-    [[2, 0]],
-    [[-1, 0], [0, 0]],
-    [[0, 0], [1, 0]],
-    [[-2, 1]],
-    [[-1, 1]],
-    [[0, 1]],
-    [[1, 1]],
-    [[2, 1]],
-    [[-2, 1], [-1, 1]],
-    [[-1, 1], [0, 1]],
-    [[0, 1], [1, 1]],
-    [[1, 1], [2, 1]],
-    [[-2, 1], [-1, 1], [0, 1]],
-    [[-1, 1], [0, 1], [1, 1]],
-    [[0, 1], [1, 1], [2, 1]],
-]
-
-
-def make_features(sequence):
-    length = len(sequence)
-    num_features = len(sequence[0])
-
-    def get_features(pos):
-        if pos < 0:
-            return ['#B%s' % -pos] * num_features
-        if pos >= length:
-            return ['#E%s' % (pos - length + 1)] * num_features
-        return sequence[pos]
-
-    for i in xrange(length):
-        for pattern in patterns:
-            fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
-            sequence[i].append(fname)
-
-
-'''
-Source file format:
-Each line is for one timestep. The features are separated by space.
-An empty line indicates end of a sequence.
-
-cutoff: a list of numbers. If count of a feature is smaller than this,
- it will be ignored.
-if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
-i-th column.
-
-return a list of dict for each column
-'''
-
-
-def create_dictionaries(filename, cutoff, oov_policy):
-    def add_to_dict(sequence, dicts):
-        num_features = len(dicts)
-        for features in sequence:
-            l = len(features)
-            assert l == num_features, "Wrong number of features " + line
-            for i in xrange(l):
-                if features[i] in dicts[i]:
-                    dicts[i][features[i]] += 1
-                else:
-                    dicts[i][features[i]] = 1
-
-    num_features = len(cutoff)
-    dicts = []
-    for i in xrange(num_features):
-        dicts.append(dict())
-
-    f = open(filename, 'rb')
-
-    sequence = []
-
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            add_to_dict(sequence, dicts)
-            sequence = []
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    for i in xrange(num_features):
-        dct = dicts[i]
-        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
-        todo = []
-        for k, v in dct.iteritems():
-            if v < cutoff[i]:
-                todo.append(k)
-            else:
-                dct[k] = n
-                n += 1
-
-        if oov_policy[i] == OOV_POLICY_USE:
-            # placeholder so that len(dct) will be the number of features
-            # including OOV
-            dct['#OOV#'] = 0
-
-        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
-        for k in todo:
-            del dct[k]
-
-    f.close()
-    return dicts
-
-
-def encode_varint(v):
-    out = StringIO()
-    _EncodeVarint(out.write, v)
-    return out.getvalue()
-
-
-def write_proto(file, message):
-    s = message.SerializeToString()
-    packed_len = encode_varint(len(s))
-    file.write(packed_len + s)
-
-
-'''
-if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
-existed in dicts[i] will be assigned to id 0.
-if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
-in dicts[i].
-'''
-
-
-def gen_proto_file(input_file, dicts, oov_policy, output_file):
-    def write_sequence(out, sequence):
-        num_features = len(dicts)
-        is_beginning = True
-        for features in sequence:
-            assert len(features) == num_features, \
-                "Wrong number of features: " + line
-            sample = DataFormat.DataSample()
-            for i in xrange(num_original_columns):
-                id = dicts[i].get(features[i], -1)
-                if id != -1:
-                    sample.id_slots.append(id)
-                elif oov_policy[i] == OOV_POLICY_IGNORE:
-                    sample.id_slots.append(0xffffffff)
-                elif oov_policy[i] == OOV_POLICY_ERROR:
-                    logger.fatal("Unknown token: %s" % features[i])
-                else:
-                    sample.id_slots.append(0)
-
-            if patterns:
-                dim = 0
-                vec = sample.vector_slots.add()
-                for i in xrange(num_original_columns, num_features):
-                    id = dicts[i].get(features[i], -1)
-                    if id != -1:
-                        vec.ids.append(dim + id)
-                    elif oov_policy[i] == OOV_POLICY_IGNORE:
-                        pass
-                    elif oov_policy[i] == OOV_POLICY_ERROR:
-                        logger.fatal("Unknown token: %s" % features[i])
-                    else:
-                        vec.ids.append(dim + 0)
-
-                    dim += len(dicts[i])
-
-            sample.is_beginning = is_beginning
-            is_beginning = False
-            write_proto(out, sample)
-
-    num_features = len(dicts)
-    f = open(input_file, 'rb')
-    out = open(output_file, 'wb')
-
-    header = DataFormat.DataHeader()
-    if patterns:
-        slot_def = header.slot_defs.add()
-        slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE
-        slot_def.dim = sum(
-            [len(dicts[i]) for i in xrange(num_original_columns, len(dicts))])
-        logger.info("feature_dim=%s" % slot_def.dim)
-
-    for i in xrange(num_original_columns):
-        slot_def = header.slot_defs.add()
-        slot_def.type = DataFormat.SlotDef.INDEX
-        slot_def.dim = len(dicts[i])
-
-    write_proto(out, header)
-
-    num_sequences = 0
-    sequence = []
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            write_sequence(out, sequence)
-            sequence = []
-            num_sequences += 1
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    f.close()
-    out.close()
-
-    logger.info("num_sequences=%s" % num_sequences)
-
-
-dict2 = {
-    'B-ADJP': 0,
-    'I-ADJP': 1,
-    'B-ADVP': 2,
-    'I-ADVP': 3,
-    'B-CONJP': 4,
-    'I-CONJP': 5,
-    'B-INTJ': 6,
-    'I-INTJ': 7,
-    'B-LST': 8,
-    'I-LST': 9,
-    'B-NP': 10,
-    'I-NP': 11,
-    'B-PP': 12,
-    'I-PP': 13,
-    'B-PRT': 14,
-    'I-PRT': 15,
-    'B-SBAR': 16,
-    'I-SBAR': 17,
-    'B-UCP': 18,
-    'I-UCP': 19,
-    'B-VP': 20,
-    'I-VP': 21,
-    'O': 22
-}
-
-if __name__ == '__main__':
-    cutoff = [3, 1, 0]
-    cutoff += [3] * len(patterns)
-    oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
-    oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
-    dicts = create_dictionaries('trainer/tests/train.txt', cutoff, oov_policy)
-    dicts[2] = dict2
-    gen_proto_file('trainer/tests/train.txt', dicts, oov_policy,
-                   'trainer/tests/train_proto.bin')
-    gen_proto_file('trainer/tests/test.txt', dicts, oov_policy,
-                   'trainer/tests/test_proto.bin')
diff --git a/paddle/trainer/tests/mnist.list b/paddle/trainer/tests/mnist.list
deleted file mode 100644
index 703e87753d..0000000000
--- a/paddle/trainer/tests/mnist.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/mnist_bin_part
diff --git a/paddle/trainer/tests/mnist_bin_part b/paddle/trainer/tests/mnist_bin_part
deleted file mode 100644
index 08b93a0ebb..0000000000
Binary files a/paddle/trainer/tests/mnist_bin_part and /dev/null differ
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
deleted file mode 100644
index f189b21e86..0000000000
Binary files a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data and /dev/null differ
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
deleted file mode 100644
index 6b406dff0b..0000000000
--- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
+++ /dev/null
@@ -1 +0,0 @@
-./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
diff --git a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf b/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf
deleted file mode 100644
index 92f32a18c0..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_compare_sparse.conf
+++ /dev/null
@@ -1,154 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 999
-l1 = 0
-l2 = 0
-
-model_type("nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(        
-            type = "proto_sequence",
-            files = ('trainer/tests/train_sparse.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 32
-layer2_dim = 16
-layer3_dim = 16
-hidden_dim = 32
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        Layer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            type = "recurrent",
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            inputs = Input(slot_names[i] + "_embedding_" + network_name,
-                           parameter_name = "rnn1.w0")
-        )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf
deleted file mode 100644
index b1744db8d6..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-fc1 = fc_layer(input=data, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=fc1, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc1, fc2], size=10,
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=1)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf
deleted file mode 100644
index b1744db8d6..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-fc1 = fc_layer(input=data, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=fc1, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc1, fc2], size=10,
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=1)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf b/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
deleted file mode 100644
index d19222360c..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
+++ /dev/null
@@ -1,154 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 1451594
-l1 = 0
-l2 = 0
-
-model_type("nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(        
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        Layer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            type = "recurrent",
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            inputs = Input(slot_names[i] + "_embedding_" + network_name,
-                           parameter_name = "rnn1.w0")
-        )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/sample_trainer_config_rnn.conf b/paddle/trainer/tests/sample_trainer_config_rnn.conf
deleted file mode 100644
index b720d4d5a6..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_rnn.conf
+++ /dev/null
@@ -1,180 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_qb_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 1451594
-l1 = 0
-l2 = 0
-
-model_type("recurrent_nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def SimpleRecurrentLayer(name, 
-                         size, 
-                         active_type, 
-                         bias, 
-                         input_layer_name, 
-                         parameter_name,
-                         seq_reversed = False):
-    RecurrentLayerGroupBegin(name + "_layer_group", 
-                             in_links=[input_layer_name], 
-                             out_links=[name],
-                             seq_reversed=seq_reversed)
-    memory_name = Memory(name=name, size=size)
-    Layer(
-        name = name,
-        type = "mixed",
-        size = size,
-        active_type = active_type,
-        bias = bias,
-        inputs = [IdentityProjection(input_layer_name),
-                  FullMatrixProjection(memory_name,
-                                       parameter_name = parameter_name,
-                                       ),
-                  ]
-        )
-    RecurrentLayerGroupEnd(name + "_layer_group")
-
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        SimpleRecurrentLayer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            size = hidden_dim,
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            input_layer_name = slot_names[i] + "_embedding_" + network_name,
-            parameter_name = "rnn1.w0",
-            )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/test.txt b/paddle/trainer/tests/test.txt
deleted file mode 100644
index 3ad503b34f..0000000000
--- a/paddle/trainer/tests/test.txt
+++ /dev/null
@@ -1,1000 +0,0 @@
-Confidence NN B-NP
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-is VBZ B-VP
-widely RB I-VP
-expected VBN I-VP
-to TO I-VP
-take VB I-VP
-another DT B-NP
-sharp JJ I-NP
-dive NN I-NP
-if IN B-SBAR
-trade NN B-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-, , O
-due JJ B-ADJP
-for IN B-PP
-release NN B-NP
-tomorrow NN B-NP
-, , O
-fail VB B-VP
-to TO I-VP
-show VB I-VP
-a DT B-NP
-substantial JJ I-NP
-improvement NN I-NP
-from IN B-PP
-July NNP B-NP
-and CC I-NP
-August NNP I-NP
-'s POS B-NP
-near-record JJ I-NP
-deficits NNS I-NP
-. . O
-
-Chancellor NNP O
-of IN B-PP
-the DT B-NP
-Exchequer NNP I-NP
-Nigel NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-restated VBN I-NP
-commitment NN I-NP
-to TO B-PP
-a DT B-NP
-firm NN I-NP
-monetary JJ I-NP
-policy NN I-NP
-has VBZ B-VP
-helped VBN I-VP
-to TO I-VP
-prevent VB I-VP
-a DT B-NP
-freefall NN I-NP
-in IN B-PP
-sterling NN B-NP
-over IN B-PP
-the DT B-NP
-past JJ I-NP
-week NN I-NP
-. . O
-
-But CC O
-analysts NNS B-NP
-reckon VBP B-VP
-underlying VBG B-NP
-support NN I-NP
-for IN B-PP
-sterling NN B-NP
-has VBZ B-VP
-been VBN I-VP
-eroded VBN I-VP
-by IN B-PP
-the DT B-NP
-chancellor NN I-NP
-'s POS B-NP
-failure NN I-NP
-to TO B-VP
-announce VB I-VP
-any DT B-NP
-new JJ I-NP
-policy NN I-NP
-measures NNS I-NP
-in IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-last JJ B-NP
-Thursday NNP I-NP
-. . O
-
-This DT B-NP
-has VBZ B-VP
-increased VBN I-VP
-the DT B-NP
-risk NN I-NP
-of IN B-PP
-the DT B-NP
-government NN I-NP
-being VBG B-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-base NN B-NP
-rates NNS I-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-from IN B-PP
-their PRP$ B-NP
-current JJ I-NP
-15 CD I-NP
-% NN I-NP
-level NN I-NP
-to TO B-VP
-defend VB I-VP
-the DT B-NP
-pound NN I-NP
-, , O
-economists NNS B-NP
-and CC O
-foreign JJ B-NP
-exchange NN I-NP
-market NN I-NP
-analysts NNS I-NP
-say VBP B-VP
-. . O
-
-`` `` O
-The DT B-NP
-risks NNS I-NP
-for IN B-PP
-sterling NN B-NP
-of IN B-PP
-a DT B-NP
-bad JJ I-NP
-trade NN I-NP
-figure NN I-NP
-are VBP B-VP
-very RB B-ADVP
-heavily RB I-ADVP
-on IN B-PP
-the DT B-NP
-down JJ I-NP
-side NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Chris NNP B-NP
-Dillow NNP I-NP
-, , O
-senior JJ B-NP
-U.K. NNP I-NP
-economist NN I-NP
-at IN B-PP
-Nomura NNP B-NP
-Research NNP I-NP
-Institute NNP I-NP
-. . O
-
-`` `` O
-If IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-another DT B-NP
-bad JJ I-NP
-trade NN I-NP
-number NN I-NP
-, , O
-there EX B-NP
-could MD B-VP
-be VB I-VP
-an DT B-NP
-awful JJ I-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-, , O
-'' '' O
-noted VBD B-VP
-Simon NNP B-NP
-Briscoe NNP I-NP
-, , O
-U.K. NNP B-NP
-economist NN I-NP
-for IN B-PP
-Midland NNP B-NP
-Montagu NNP I-NP
-, , O
-a DT B-NP
-unit NN I-NP
-of IN B-PP
-Midland NNP B-NP
-Bank NNP I-NP
-PLC NNP I-NP
-. . O
-
-Forecasts NNS B-NP
-for IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-range VBP B-VP
-widely RB B-ADVP
-, , O
-but CC O
-few JJ B-NP
-economists NNS I-NP
-expect VBP B-VP
-the DT B-NP
-data NNS I-NP
-to TO B-VP
-show VB I-VP
-a DT B-NP
-very RB I-NP
-marked VBN I-NP
-improvement NN I-NP
-from IN B-PP
-the DT O
-# # O
-2 CD O
-billion CD O
--LRB- ( O
-$ $ B-ADJP
-3.2 CD O
-billion CD O
--RRB- ) O
-deficit NN B-NP
-in IN B-PP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-reported VBD B-VP
-for IN B-PP
-August NNP B-NP
-. . O
-
-The DT B-NP
-August NNP I-NP
-deficit NN I-NP
-and CC O
-the DT B-NP
-# # I-NP
-2.2 CD I-NP
-billion CD I-NP
-gap NN I-NP
-registered VBN B-VP
-in IN B-PP
-July NNP B-NP
-are VBP B-VP
-topped VBN I-VP
-only RB B-ADVP
-by IN B-PP
-the DT B-NP
-# # I-NP
-2.3 CD I-NP
-billion CD I-NP
-deficit NN I-NP
-of IN B-PP
-October NNP B-NP
-1988 CD I-NP
-. . O
-
-Sanjay NNP B-NP
-Joshi NNP I-NP
-, , O
-European JJ B-NP
-economist NN I-NP
-at IN B-PP
-Baring NNP B-NP
-Brothers NNPS I-NP
-& CC I-NP
-Co. NNP I-NP
-, , O
-said VBD B-VP
-there EX B-NP
-is VBZ B-VP
-no DT B-NP
-sign NN I-NP
-that IN B-SBAR
-Britain NNP B-NP
-'s POS B-NP
-manufacturing NN I-NP
-industry NN I-NP
-is VBZ B-VP
-transforming VBG I-VP
-itself PRP B-NP
-to TO B-VP
-boost VB I-VP
-exports NNS B-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-same JJ I-NP
-time NN I-NP
-, , O
-he PRP B-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-pessimistic JJ I-ADJP
-about IN B-PP
-the DT B-NP
-outlook NN I-NP
-for IN B-PP
-imports NNS B-NP
-, , O
-given VBN B-PP
-continued VBD B-NP
-high JJ I-NP
-consumer NN I-NP
-and CC I-NP
-capital NN I-NP
-goods NNS I-NP
-inflows NNS I-NP
-. . O
-
-He PRP B-NP
-reckons VBZ B-VP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-deficit NN I-NP
-will MD B-VP
-narrow VB I-VP
-to TO B-PP
-only RB B-NP
-# # I-NP
-1.8 CD I-NP
-billion CD I-NP
-in IN B-PP
-September NNP B-NP
-. . O
-
-However RB B-ADVP
-, , O
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-he PRP B-NP
-believes VBZ B-VP
-that IN B-SBAR
-a DT B-NP
-reduction NN I-NP
-in IN B-PP
-raw JJ B-NP
-material NN I-NP
-stockbuilding VBG I-NP
-by IN B-PP
-industry NN B-NP
-could MD B-VP
-lead VB I-VP
-to TO B-PP
-a DT B-NP
-sharp JJ I-NP
-drop NN I-NP
-in IN B-PP
-imports NNS B-NP
-. . O
-
-Combined VBN B-PP
-with IN B-PP
-at IN B-ADVP
-least JJS I-ADVP
-some DT B-NP
-rebound NN I-NP
-in IN B-PP
-exports NNS B-NP
-after IN B-PP
-August NNP B-NP
-'s POS B-NP
-unexpected JJ I-NP
-decline NN I-NP
-, , O
-the DT B-NP
-deficit NN I-NP
-could MD B-VP
-narrow VB I-VP
-to TO B-PP
-as RB B-NP
-little JJ I-NP
-as IN I-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-. . O
-
-Mr. NNP B-NP
-Briscoe NNP I-NP
-, , O
-who WP B-NP
-also RB B-ADVP
-forecasts VBZ B-VP
-a DT B-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-current JJ I-NP
-account NN I-NP
-gap NN I-NP
-, , O
-warns VBZ B-VP
-that IN B-SBAR
-even RB B-SBAR
-if IN I-SBAR
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-are VBP B-VP
-bullish JJ B-ADJP
-for IN B-PP
-sterling NN B-NP
-, , O
-the DT B-NP
-currency NN I-NP
-wo MD B-VP
-n't RB I-VP
-advance VB I-VP
-much JJ B-NP
-because IN B-SBAR
-investors NNS B-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-see VB I-VP
-further JJ B-NP
-evidence NN I-NP
-of IN B-PP
-the DT B-NP
-turnaround NN I-NP
-before IN B-PP
-adjusting VBG B-VP
-positions NNS B-NP
-. . O
-
-Nevertheless RB B-ADVP
-, , O
-he PRP B-NP
-noted VBD B-VP
-, , O
-`` `` O
-No DT B-NP
-one PRP I-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-go VB I-VP
-into IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-without IN B-PP
-a DT B-NP
-flat JJ I-NP
-position NN I-NP
-'' '' O
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-. . O
-
-Meanwhile RB B-ADVP
-, , O
-overall JJ B-NP
-evidence NN I-NP
-on IN B-PP
-the DT B-NP
-economy NN I-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-clouded VBN I-ADJP
-. . O
-
-In IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-, , O
-Mr. NNP B-NP
-Lawson NNP I-NP
-warned VBD B-VP
-that IN B-SBAR
-a DT B-NP
-further JJ I-NP
-slowdown NN I-NP
-can MD B-VP
-be VB I-VP
-expected VBN I-VP
-as IN B-SBAR
-the DT B-NP
-impact NN I-NP
-of IN B-PP
-the DT B-NP
-last JJ I-NP
-rise NN I-NP
-in IN B-PP
-interest NN B-NP
-rates NNS I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-takes VBZ B-VP
-effect NN B-NP
-. . O
-
-U.K. JJ B-NP
-base NN I-NP
-rates NNS I-NP
-are VBP B-VP
-at IN B-PP
-their PRP$ B-NP
-highest JJS I-NP
-level NN I-NP
-in IN B-PP
-eight CD B-NP
-years NNS I-NP
-. . O
-
-But CC O
-consumer NN B-NP
-expenditure NN I-NP
-data NNS I-NP
-released VBD B-VP
-Friday NNP B-NP
-do VBP B-VP
-n't RB I-VP
-suggest VB I-VP
-that IN B-SBAR
-the DT B-NP
-U.K. NNP I-NP
-economy NN I-NP
-is VBZ B-VP
-slowing VBG I-VP
-that DT B-ADVP
-quickly RB I-ADVP
-. . O
-
-The DT B-NP
-figures NNS I-NP
-show VBP B-VP
-that DT O
-spending NN B-NP
-rose VBD B-VP
-0.1 CD B-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-third JJ I-NP
-quarter NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-and CC O
-was VBD B-VP
-up IN B-ADVP
-3.8 CD B-NP
-% NN I-NP
-from IN B-PP
-a DT B-NP
-year NN I-NP
-ago RB B-ADVP
-. . O
-
-This DT B-NP
-compares VBZ B-VP
-with IN B-PP
-a DT B-NP
-1.6 CD I-NP
-% NN I-NP
-rise NN I-NP
-in IN B-PP
-the DT B-NP
-second NN I-NP
-from IN B-PP
-the DT B-NP
-first JJ I-NP
-quarter NN I-NP
-and CC O
-a DT B-NP
-5.4 CD I-NP
-% NN I-NP
-increase NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-of IN B-PP
-1988 CD B-NP
-. . O
-
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-the DT B-NP
-data NNS I-NP
-show VBP B-VP
-the DT B-NP
-economy NN I-NP
-`` `` O
-is VBZ B-VP
-still RB B-ADVP
-quite RB B-ADJP
-strong JJ I-ADJP
-, , O
-'' '' O
-but CC O
-suggestions NNS B-NP
-that IN B-SBAR
-much NN B-NP
-of IN B-PP
-the DT B-NP
-spending NN I-NP
-went VBD B-VP
-on IN B-PP
-services NNS B-NP
-rather RB B-PP
-than IN I-PP
-consumer NN B-NP
-goods NNS I-NP
-should MD B-VP
-reduce VB I-VP
-fears NNS B-NP
-of IN B-PP
-more JJR B-NP
-import NN I-NP
-rises NNS I-NP
-. . O
-
-Certainly RB B-ADVP
-, , O
-the DT B-NP
-chancellor NN I-NP
-has VBZ B-VP
-made VBN I-VP
-it PRP B-NP
-clear JJ B-ADJP
-that IN B-SBAR
-he PRP B-NP
-is VBZ B-VP
-prepared VBN I-VP
-to TO I-VP
-increase VB I-VP
-interest NN B-NP
-rates NNS I-NP
-again RB B-ADVP
-if IN B-SBAR
-necessary JJ B-ADJP
-to TO B-VP
-both DT I-VP
-ensure VB I-VP
-that IN B-SBAR
-a DT B-NP
-substantial JJ I-NP
-slowdown NN I-NP
-does VBZ B-VP
-take VB I-VP
-place NN B-NP
-and CC O
-that DT O
-sterling NN B-NP
-does VBZ B-VP
-n't RB I-VP
-decline VB I-VP
-further JJ B-ADVP
-. . O
-
-Thursday NNP B-NP
-, , O
-he PRP B-NP
-reminded VBD B-VP
-his PRP$ B-NP
-audience NN I-NP
-that IN B-SBAR
-the DT B-NP
-government NN I-NP
-`` `` O
-can MD B-VP
-not RB I-VP
-allow VB I-VP
-the DT B-NP
-necessary JJ I-NP
-rigor NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-to TO B-VP
-be VB I-VP
-undermined VBN I-VP
-by IN B-PP
-exchange NN B-NP
-rate NN I-NP
-weakness NN I-NP
-. . O
-'' '' O
-
-Analysts NNS B-NP
-agree VBP B-VP
-there EX B-NP
-is VBZ B-VP
-little JJ B-NP
-holding NN B-VP
-sterling NN B-NP
-firm NN B-ADJP
-at IN B-PP
-the DT B-NP
-moment NN I-NP
-other JJ B-ADJP
-than IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-that IN B-SBAR
-rates NNS B-NP
-will MD B-VP
-be VB I-VP
-pushed VBN I-VP
-higher JJR B-ADJP
-if IN B-SBAR
-necessary JJ B-ADJP
-. . O
-
-And CC O
-, , O
-they PRP B-NP
-warn VBP B-VP
-, , O
-any DT B-NP
-further JJ I-NP
-drop NN I-NP
-in IN B-PP
-the DT B-NP
-government NN I-NP
-'s POS B-NP
-popularity NN I-NP
-could MD B-VP
-swiftly RB I-VP
-make VB I-VP
-this DT B-NP
-promise NN I-NP
-sound NN B-VP
-hollow JJ B-ADJP
-. . O
-
-Sterling NNP B-NP
-was VBD B-VP
-already RB I-VP
-showing VBG I-VP
-some DT B-NP
-signs NNS I-NP
-of IN B-PP
-a DT B-NP
-lack NN I-NP
-of IN B-PP
-confidence NN B-NP
-in IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-Friday NNP B-NP
-. . O
-
-In IN B-PP
-European JJ B-NP
-trading NN I-NP
-it PRP B-NP
-declined VBD B-VP
-to TO B-PP
-$ $ B-NP
-1.5890 CD I-NP
-and CC O
-2.9495 CD B-NP
-marks NNS I-NP
-from IN B-PP
-$ $ B-NP
-1.5940 CD I-NP
-and CC O
-2.9429 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-Economists NNS B-NP
-suggested VBD B-VP
-that IN B-SBAR
-if IN B-SBAR
-the DT B-NP
-pound NN I-NP
-falls VBZ B-VP
-much JJ B-NP
-below IN B-PP
-2.90 CD B-NP
-marks NNS I-NP
-, , O
-the DT B-NP
-government NN I-NP
-will MD B-VP
-be VB I-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-rates NNS B-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-, , O
-both DT B-VP
-to TO I-VP
-halt VB B-VP
-any DT B-NP
-further JJ I-NP
-decline NN I-NP
-and CC O
-ensure VB B-VP
-that IN B-SBAR
-the DT B-NP
-balance NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-remains VBZ B-VP
-unchanged JJ B-ADJP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-Market NNP I-NP
-Activity NN I-NP
-
-The DT B-NP
-dollar NN I-NP
-posted VBD B-VP
-gains NNS B-NP
-in IN B-PP
-quiet JJ B-NP
-trading NN I-NP
-as IN B-SBAR
-concerns NNS B-NP
-about IN B-PP
-equities NNS B-NP
-abated VBN B-VP
-. . O
-
-Foreign JJ B-NP
-exchange NN I-NP
-dealers NNS I-NP
-said VBD B-VP
-that IN B-SBAR
-the DT B-NP
-currency NN I-NP
-market NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-distance VB I-VP
-itself PRP B-NP
-from IN B-PP
-the DT B-NP
-volatile JJ I-NP
-stock NN I-NP
-exchange NN I-NP
-, , O
-which WDT B-NP
-has VBZ B-VP
-preoccupied VBN I-VP
-the DT B-NP
-market NN I-NP
-since IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-, , O
-when WRB B-ADVP
-the DT B-NP
-Dow NNP I-NP
-Jones NNP I-NP
-Industrial NNP I-NP
-Average NNP I-NP
-plunged VBD B-VP
-more JJR B-NP
-than IN I-NP
-190 CD I-NP
-points NNS I-NP
-. . O
-
-Currency NN B-NP
-analysts NNS I-NP
-predict VBP B-VP
-that IN B-SBAR
-in IN B-PP
-the DT B-NP
-coming VBG I-NP
-week NN I-NP
-the DT B-NP
-foreign JJ I-NP
-exchange NN I-NP
-market NN I-NP
-will MD B-VP
-shift VB I-VP
-its PRP$ B-NP
-focus NN I-NP
-back RB B-ADVP
-to TO B-PP
-economic JJ B-NP
-fundamentals NNS I-NP
-, , O
-keeping VBG B-VP
-a DT B-NP
-close NN I-NP
-eye NN I-NP
-out IN B-ADVP
-for IN B-PP
-any DT B-NP
-signs NNS I-NP
-of IN B-PP
-monetary JJ B-NP
-easing NN I-NP
-by IN B-PP
-U.S. NNP B-NP
-Federal NNP I-NP
-Reserve NNP I-NP
-. . O
-
-Late RB B-ADVP
-in IN B-PP
-the DT B-NP
-New NNP I-NP
-York NNP I-NP
-trading NN I-NP
-day NN I-NP
-, , O
-the DT B-NP
-dollar NN I-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-1.8578 CD B-NP
-marks NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-1.8470 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-was VBD B-VP
-also RB I-VP
-changing VBG I-VP
-hands NNS B-NP
-at IN B-PP
-142.43 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-141.70 CD B-NP
-yen NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-In IN B-PP
-Tokyo NNP B-NP
-on IN B-PP
-Monday NNP B-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-opened VBD B-VP
-for IN B-PP
-trading NN B-NP
-at IN B-PP
-141.95 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-Tokyo NNP I-NP
diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/trainer/tests/testPyDataWrapper.py
index 2c29a27433..a76eeeacb9 100644
--- a/paddle/trainer/tests/testPyDataWrapper.py
+++ b/paddle/trainer/tests/testPyDataWrapper.py
@@ -20,28 +20,6 @@ import random
 import json
 import string
 
-
-@provider(slots=[
-    SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1),
-    IndexSlot(3)
-])
-def processNonSequenceData(obj, filename):
-    with open(filename, "rb") as f:
-        for line in f:
-            slots_str = line.split(';')
-            index = int(slots_str[0])
-            non_values = map(int, slots_str[1].split()[1:])
-            dense = map(float, slots_str[2].split()[1:])
-            strs = slots_str[4].strip().split(' ', 1)[1]
-
-            def __values_mapper__(s):
-                s = s.split(":")
-                return int(s[0]), float(s[1])
-
-            values = map(__values_mapper__, slots_str[3].split()[1:])
-            yield [non_values, dense, values, strs, index]
-
-
 SPARSE_ID_LIMIT = 1000
 SPARSE_ID_COUNT = 100
 SEQUENCE_LIMIT = 50
@@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name):
 
 
 if __name__ == "__main__":
-    pvd = processNonSequenceData("test.txt")
-    print pvd.getNextBatch(100)
     pvd = processSeqAndGenerateData("_")
     print pvd.getNextBatch(100)
     pvd = processSubSeqAndGenerateData("_")
diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp
deleted file mode 100644
index 383505f813..0000000000
--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/trainer/Trainer.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-
-DECLARE_bool(local);
-DECLARE_bool(use_gpu);
-
-DECLARE_string(config);
-DECLARE_string(nics);
-
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
-DEFINE_bool(need_high_accuracy,
-            true,
-            "whether need to run in double accuracy (recommended)");
-DEFINE_double(
-    max_diff_ratio,
-    0.0f,
-    "max diff ratio allowed for outputs and parameters (value/gradient)");
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(ComData& data, const string configFile) {
-  FLAGS_config = configFile;
-
-  FLAGS_local = true;
-  FLAGS_use_gpu = false;
-
-  FLAGS_nics = "";
-
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
-
-  data.parameters = trainer.getGradientMachine()->getParameters();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.train();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    if (diff > 0.0f &&
-        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_max_diff_ratio) {
-      nNum++;
-      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
-                << "    " << desB << " : " << B[i];
-    }
-  }
-  EXPECT_EQ(0, nNum);
-  LOG(INFO) << "\n\n";
-}
-
-void compareGradient(ComData& comDataA, ComData& comDataB) {
-  vector<Argument> outArgsA = comDataA.outArgs;
-  vector<Argument> outArgsB = comDataB.outArgs;
-
-  for (size_t i = 0; i < outArgsA.size(); ++i) {
-    CpuMatrix matA(outArgsA[i].value->getHeight(),
-                   outArgsA[i].value->getWidth());
-    CpuMatrix matB(outArgsB[i].value->getHeight(),
-                   outArgsB[i].value->getWidth());
-
-    matA.copyFrom(*outArgsA[i].value);
-    matB.copyFrom(*outArgsB[i].value);
-
-    LOG(INFO) << "\n--------------------------------"
-              << " Check Network Output_" << i << ":"
-              << " -------------------------------------\n";
-    checkBuffer(matA.getData(),
-                "network A output",
-                matB.getData(),
-                "network B output",
-                matA.getElementCnt(),
-                matA.getWidth());
-  }
-
-  vector<ParameterPtr>& parametersA = comDataA.parameters;
-  vector<ParameterPtr>& parametersB = comDataB.parameters;
-
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "Network A",
-                paraB.getData(),
-                "Network B",
-                paraA.getSize());
-
-    CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
-    CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
-              << " ; size : " << gradA.getSize() << " -----------";
-    checkBuffer(gradA.getData(),
-                "Network A",
-                gradB.getData(),
-                "Network B",
-                gradA.getSize());
-  }
-}
-
-TEST(Trainer, create) {
-  ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
-  LOG(INFO) << "\n\ntraining of Network A is finished\n\n";
-
-  ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
-  LOG(INFO) << "\n\ntraining of the Network B is finished\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  if (FLAGS_need_high_accuracy) {
-    LOG(INFO) << "skip test due to it's need high accuracy";
-    return 0;
-  }
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-4;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in low accuracy mode";
-  }
-#else
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-7;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in high accuracy mode";
-  }
-#endif
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
index 66ec65e340..92dc8aa9ec 100644
--- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
+++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
@@ -25,45 +25,9 @@ limitations under the License. */
 #include <unordered_set>
 #include "picojson.h"
 
-void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual);
 void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
 const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/";
 
-TEST(PyDataProviderWrapper, NoSequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module(std::string("testPyDataWrapper"));
-  conf.set_load_data_object(std::string("processNonSequenceData"));
-  conf.set_async_load_data(false);
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(100, &batchFromPy);
-
-  paddle::DataConfig conf2;
-  conf2.set_type("proto");
-  conf2.set_async_load_data(false);
-  conf2.clear_files();
-  conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist");
-
-  provider.reset(paddle::DataProvider::create(conf2, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromProto;
-  provider->getNextBatch(100, &batchFromProto);
-
-  std::vector<paddle::Argument>& pyArguments = batchFromPy.getStreams();
-  std::vector<paddle::Argument>& protoArguments = batchFromProto.getStreams();
-  EXPECT_EQ(pyArguments.size(), protoArguments.size());
-
-  for (size_t i = 0; i < pyArguments.size(); ++i) {
-    checkEqual(protoArguments[i], pyArguments[i]);
-  }
-}
-
 TEST(PyDataProviderWrapper, SequenceData) {
   paddle::DataConfig conf;
   conf.set_type("py");
@@ -148,66 +112,6 @@ int main(int argc, char** argv) {
   return RUN_ALL_TESTS();
 }
 
-void checkEqual(const paddle::Argument& expect,
-                const paddle::Argument& actual) {
-  if (expect.value) {
-    EXPECT_TRUE(actual.value != nullptr);
-    paddle::Matrix* e = expect.value.get();
-    paddle::Matrix* a = actual.value.get();
-    EXPECT_EQ(e->getWidth(), a->getWidth());
-    EXPECT_EQ(e->getHeight(), a->getHeight());
-    if (dynamic_cast<paddle::CpuSparseMatrix*>(e)) {
-      paddle::CpuSparseMatrix* se = dynamic_cast<paddle::CpuSparseMatrix*>(e);
-      paddle::CpuSparseMatrix* sa = dynamic_cast<paddle::CpuSparseMatrix*>(a);
-      EXPECT_EQ(se->getFormat(), sa->getFormat());
-      EXPECT_EQ(se->getElementCnt(), sa->getElementCnt());
-      size_t rowSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getElementCnt()
-                           : se->getHeight() + 1;
-      size_t colSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getWidth() + 1
-                           : se->getElementCnt();
-      for (size_t i = 0; i < rowSize; ++i) {
-        EXPECT_EQ(se->getRows()[i], sa->getRows()[i]);
-      }
-      for (size_t i = 0; i < colSize; ++i) {
-        EXPECT_EQ(se->getCols()[i], sa->getCols()[i]);
-      }
-      if (se->getValueType() == paddle::FLOAT_VALUE) {
-        EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType());
-        for (size_t i = 0; i < se->getElementCnt(); ++i) {
-          EXPECT_EQ(se->getValue()[i], sa->getValue()[i]);
-        }
-      }
-    } else if (dynamic_cast<paddle::CpuMatrix*>(e)) {
-      EXPECT_EQ(e->getElementCnt(), a->getElementCnt());
-      for (size_t i = 0; i < e->getElementCnt(); ++i) {
-        EXPECT_EQ(e->getData()[i], a->getData()[i]);
-      }
-    }
-  }
-
-  if (expect.ids) {
-    EXPECT_TRUE(actual.ids != nullptr);
-    paddle::VectorT<int>* e = expect.ids.get();
-    paddle::VectorT<int>* a = actual.ids.get();
-    EXPECT_EQ(e->getSize(), a->getSize());
-    for (size_t i = 0; i < e->getSize(); ++i) {
-      EXPECT_EQ(e->getData()[i], a->getData()[i]);
-    }
-  }
-
-  if (expect.strs) {
-    EXPECT_TRUE(actual.strs != nullptr);
-    std::vector<std::string>* e = expect.strs.get();
-    std::vector<std::string>* a = actual.strs.get();
-    EXPECT_EQ(e->size(), a->size());
-    for (size_t i = 0; i < e->size(); ++i) {
-      EXPECT_EQ((*e)[i], (*a)[i]);
-    }
-  }
-}
-
 void checkValue(std::vector<paddle::Argument>& arguments,
                 picojson::array& arr) {
   // CHECK SLOT 0, Sparse Value.
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 425b3d10a3..394038cf73 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -24,7 +24,6 @@ using namespace std;     // NOLINT
 static const string& configFile1 = "trainer/tests/sample_trainer_config.conf";
 static const string& configFile2 =
     "trainer/tests/sample_trainer_config_hsigmoid.conf";
-static const string& configFile3 = "trainer/tests/chunking.conf";
 static const string& configFile4 =
     "trainer/tests/sample_trainer_config_parallel.conf";
 
@@ -95,13 +94,6 @@ TEST(checkGradient, multi) {
 
 TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
-TEST(checkGradient, chunk) {
-  checkGradientTest(configFile3, false, false);
-#ifdef PADDLE_WITH_CUDA
-  checkGradientTest(configFile3, true, true);
-#endif
-}
-
 TEST(checkGradient, non_parallel) {
   checkGradientTest(configFile4, false, false);
 }
diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf
index d1bb9b877f..2f86aaa753 100644
--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
@@ -15,12 +15,7 @@
 
 from paddle.trainer_config_helpers import *
 
-TrainData(ProtoData(
-    files = "dummy_list",
-    constant_slots = [1.0],
-    async_load_data = True))
-
-TestData(SimpleData(
+TrainData(SimpleData(
     files = "trainer/tests/sample_filelist.txt",
     feat_dim = 3,
     context_len = 0,
diff --git a/paddle/trainer/tests/test_files.txt b/paddle/trainer/tests/test_files.txt
deleted file mode 100644
index 49002677a8..0000000000
--- a/paddle/trainer/tests/test_files.txt
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/test_proto.bin
diff --git a/paddle/trainer/tests/train.list b/paddle/trainer/tests/train.list
deleted file mode 100644
index f41e8e8893..0000000000
--- a/paddle/trainer/tests/train.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/data_bin_part
diff --git a/paddle/trainer/tests/train.txt b/paddle/trainer/tests/train.txt
deleted file mode 100644
index 2313aee987..0000000000
--- a/paddle/trainer/tests/train.txt
+++ /dev/null
@@ -1,5000 +0,0 @@
-Confidence NN B-NP
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-is VBZ B-VP
-widely RB I-VP
-expected VBN I-VP
-to TO I-VP
-take VB I-VP
-another DT B-NP
-sharp JJ I-NP
-dive NN I-NP
-if IN B-SBAR
-trade NN B-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-, , O
-due JJ B-ADJP
-for IN B-PP
-release NN B-NP
-tomorrow NN B-NP
-, , O
-fail VB B-VP
-to TO I-VP
-show VB I-VP
-a DT B-NP
-substantial JJ I-NP
-improvement NN I-NP
-from IN B-PP
-July NNP B-NP
-and CC I-NP
-August NNP I-NP
-'s POS B-NP
-near-record JJ I-NP
-deficits NNS I-NP
-. . O
-
-Chancellor NNP O
-of IN B-PP
-the DT B-NP
-Exchequer NNP I-NP
-Nigel NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-restated VBN I-NP
-commitment NN I-NP
-to TO B-PP
-a DT B-NP
-firm NN I-NP
-monetary JJ I-NP
-policy NN I-NP
-has VBZ B-VP
-helped VBN I-VP
-to TO I-VP
-prevent VB I-VP
-a DT B-NP
-freefall NN I-NP
-in IN B-PP
-sterling NN B-NP
-over IN B-PP
-the DT B-NP
-past JJ I-NP
-week NN I-NP
-. . O
-
-But CC O
-analysts NNS B-NP
-reckon VBP B-VP
-underlying VBG B-NP
-support NN I-NP
-for IN B-PP
-sterling NN B-NP
-has VBZ B-VP
-been VBN I-VP
-eroded VBN I-VP
-by IN B-PP
-the DT B-NP
-chancellor NN I-NP
-'s POS B-NP
-failure NN I-NP
-to TO B-VP
-announce VB I-VP
-any DT B-NP
-new JJ I-NP
-policy NN I-NP
-measures NNS I-NP
-in IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-last JJ B-NP
-Thursday NNP I-NP
-. . O
-
-This DT B-NP
-has VBZ B-VP
-increased VBN I-VP
-the DT B-NP
-risk NN I-NP
-of IN B-PP
-the DT B-NP
-government NN I-NP
-being VBG B-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-base NN B-NP
-rates NNS I-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-from IN B-PP
-their PRP$ B-NP
-current JJ I-NP
-15 CD I-NP
-% NN I-NP
-level NN I-NP
-to TO B-VP
-defend VB I-VP
-the DT B-NP
-pound NN I-NP
-, , O
-economists NNS B-NP
-and CC O
-foreign JJ B-NP
-exchange NN I-NP
-market NN I-NP
-analysts NNS I-NP
-say VBP B-VP
-. . O
-
-`` `` O
-The DT B-NP
-risks NNS I-NP
-for IN B-PP
-sterling NN B-NP
-of IN B-PP
-a DT B-NP
-bad JJ I-NP
-trade NN I-NP
-figure NN I-NP
-are VBP B-VP
-very RB B-ADVP
-heavily RB I-ADVP
-on IN B-PP
-the DT B-NP
-down JJ I-NP
-side NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Chris NNP B-NP
-Dillow NNP I-NP
-, , O
-senior JJ B-NP
-U.K. NNP I-NP
-economist NN I-NP
-at IN B-PP
-Nomura NNP B-NP
-Research NNP I-NP
-Institute NNP I-NP
-. . O
-
-`` `` O
-If IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-another DT B-NP
-bad JJ I-NP
-trade NN I-NP
-number NN I-NP
-, , O
-there EX B-NP
-could MD B-VP
-be VB I-VP
-an DT B-NP
-awful JJ I-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-, , O
-'' '' O
-noted VBD B-VP
-Simon NNP B-NP
-Briscoe NNP I-NP
-, , O
-U.K. NNP B-NP
-economist NN I-NP
-for IN B-PP
-Midland NNP B-NP
-Montagu NNP I-NP
-, , O
-a DT B-NP
-unit NN I-NP
-of IN B-PP
-Midland NNP B-NP
-Bank NNP I-NP
-PLC NNP I-NP
-. . O
-
-Forecasts NNS B-NP
-for IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-range VBP B-VP
-widely RB B-ADVP
-, , O
-but CC O
-few JJ B-NP
-economists NNS I-NP
-expect VBP B-VP
-the DT B-NP
-data NNS I-NP
-to TO B-VP
-show VB I-VP
-a DT B-NP
-very RB I-NP
-marked VBN I-NP
-improvement NN I-NP
-from IN B-PP
-the DT O
-# # O
-2 CD O
-billion CD O
--LRB- ( O
-$ $ B-ADJP
-3.2 CD O
-billion CD O
--RRB- ) O
-deficit NN B-NP
-in IN B-PP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-reported VBD B-VP
-for IN B-PP
-August NNP B-NP
-. . O
-
-The DT B-NP
-August NNP I-NP
-deficit NN I-NP
-and CC O
-the DT B-NP
-# # I-NP
-2.2 CD I-NP
-billion CD I-NP
-gap NN I-NP
-registered VBN B-VP
-in IN B-PP
-July NNP B-NP
-are VBP B-VP
-topped VBN I-VP
-only RB B-ADVP
-by IN B-PP
-the DT B-NP
-# # I-NP
-2.3 CD I-NP
-billion CD I-NP
-deficit NN I-NP
-of IN B-PP
-October NNP B-NP
-1988 CD I-NP
-. . O
-
-Sanjay NNP B-NP
-Joshi NNP I-NP
-, , O
-European JJ B-NP
-economist NN I-NP
-at IN B-PP
-Baring NNP B-NP
-Brothers NNPS I-NP
-& CC I-NP
-Co. NNP I-NP
-, , O
-said VBD B-VP
-there EX B-NP
-is VBZ B-VP
-no DT B-NP
-sign NN I-NP
-that IN B-SBAR
-Britain NNP B-NP
-'s POS B-NP
-manufacturing NN I-NP
-industry NN I-NP
-is VBZ B-VP
-transforming VBG I-VP
-itself PRP B-NP
-to TO B-VP
-boost VB I-VP
-exports NNS B-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-same JJ I-NP
-time NN I-NP
-, , O
-he PRP B-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-pessimistic JJ I-ADJP
-about IN B-PP
-the DT B-NP
-outlook NN I-NP
-for IN B-PP
-imports NNS B-NP
-, , O
-given VBN B-PP
-continued VBD B-NP
-high JJ I-NP
-consumer NN I-NP
-and CC I-NP
-capital NN I-NP
-goods NNS I-NP
-inflows NNS I-NP
-. . O
-
-He PRP B-NP
-reckons VBZ B-VP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-deficit NN I-NP
-will MD B-VP
-narrow VB I-VP
-to TO B-PP
-only RB B-NP
-# # I-NP
-1.8 CD I-NP
-billion CD I-NP
-in IN B-PP
-September NNP B-NP
-. . O
-
-However RB B-ADVP
-, , O
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-he PRP B-NP
-believes VBZ B-VP
-that IN B-SBAR
-a DT B-NP
-reduction NN I-NP
-in IN B-PP
-raw JJ B-NP
-material NN I-NP
-stockbuilding VBG I-NP
-by IN B-PP
-industry NN B-NP
-could MD B-VP
-lead VB I-VP
-to TO B-PP
-a DT B-NP
-sharp JJ I-NP
-drop NN I-NP
-in IN B-PP
-imports NNS B-NP
-. . O
-
-Combined VBN B-PP
-with IN B-PP
-at IN B-ADVP
-least JJS I-ADVP
-some DT B-NP
-rebound NN I-NP
-in IN B-PP
-exports NNS B-NP
-after IN B-PP
-August NNP B-NP
-'s POS B-NP
-unexpected JJ I-NP
-decline NN I-NP
-, , O
-the DT B-NP
-deficit NN I-NP
-could MD B-VP
-narrow VB I-VP
-to TO B-PP
-as RB B-NP
-little JJ I-NP
-as IN I-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-. . O
-
-Mr. NNP B-NP
-Briscoe NNP I-NP
-, , O
-who WP B-NP
-also RB B-ADVP
-forecasts VBZ B-VP
-a DT B-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-current JJ I-NP
-account NN I-NP
-gap NN I-NP
-, , O
-warns VBZ B-VP
-that IN B-SBAR
-even RB B-SBAR
-if IN I-SBAR
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-are VBP B-VP
-bullish JJ B-ADJP
-for IN B-PP
-sterling NN B-NP
-, , O
-the DT B-NP
-currency NN I-NP
-wo MD B-VP
-n't RB I-VP
-advance VB I-VP
-much JJ B-NP
-because IN B-SBAR
-investors NNS B-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-see VB I-VP
-further JJ B-NP
-evidence NN I-NP
-of IN B-PP
-the DT B-NP
-turnaround NN I-NP
-before IN B-PP
-adjusting VBG B-VP
-positions NNS B-NP
-. . O
-
-Nevertheless RB B-ADVP
-, , O
-he PRP B-NP
-noted VBD B-VP
-, , O
-`` `` O
-No DT B-NP
-one PRP I-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-go VB I-VP
-into IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-without IN B-PP
-a DT B-NP
-flat JJ I-NP
-position NN I-NP
-'' '' O
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-. . O
-
-Meanwhile RB B-ADVP
-, , O
-overall JJ B-NP
-evidence NN I-NP
-on IN B-PP
-the DT B-NP
-economy NN I-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-clouded VBN I-ADJP
-. . O
-
-In IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-, , O
-Mr. NNP B-NP
-Lawson NNP I-NP
-warned VBD B-VP
-that IN B-SBAR
-a DT B-NP
-further JJ I-NP
-slowdown NN I-NP
-can MD B-VP
-be VB I-VP
-expected VBN I-VP
-as IN B-SBAR
-the DT B-NP
-impact NN I-NP
-of IN B-PP
-the DT B-NP
-last JJ I-NP
-rise NN I-NP
-in IN B-PP
-interest NN B-NP
-rates NNS I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-takes VBZ B-VP
-effect NN B-NP
-. . O
-
-U.K. JJ B-NP
-base NN I-NP
-rates NNS I-NP
-are VBP B-VP
-at IN B-PP
-their PRP$ B-NP
-highest JJS I-NP
-level NN I-NP
-in IN B-PP
-eight CD B-NP
-years NNS I-NP
-. . O
-
-But CC O
-consumer NN B-NP
-expenditure NN I-NP
-data NNS I-NP
-released VBD B-VP
-Friday NNP B-NP
-do VBP B-VP
-n't RB I-VP
-suggest VB I-VP
-that IN B-SBAR
-the DT B-NP
-U.K. NNP I-NP
-economy NN I-NP
-is VBZ B-VP
-slowing VBG I-VP
-that DT B-ADVP
-quickly RB I-ADVP
-. . O
-
-The DT B-NP
-figures NNS I-NP
-show VBP B-VP
-that DT O
-spending NN B-NP
-rose VBD B-VP
-0.1 CD B-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-third JJ I-NP
-quarter NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-and CC O
-was VBD B-VP
-up IN B-ADVP
-3.8 CD B-NP
-% NN I-NP
-from IN B-PP
-a DT B-NP
-year NN I-NP
-ago RB B-ADVP
-. . O
-
-This DT B-NP
-compares VBZ B-VP
-with IN B-PP
-a DT B-NP
-1.6 CD I-NP
-% NN I-NP
-rise NN I-NP
-in IN B-PP
-the DT B-NP
-second NN I-NP
-from IN B-PP
-the DT B-NP
-first JJ I-NP
-quarter NN I-NP
-and CC O
-a DT B-NP
-5.4 CD I-NP
-% NN I-NP
-increase NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-of IN B-PP
-1988 CD B-NP
-. . O
-
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-the DT B-NP
-data NNS I-NP
-show VBP B-VP
-the DT B-NP
-economy NN I-NP
-`` `` O
-is VBZ B-VP
-still RB B-ADVP
-quite RB B-ADJP
-strong JJ I-ADJP
-, , O
-'' '' O
-but CC O
-suggestions NNS B-NP
-that IN B-SBAR
-much NN B-NP
-of IN B-PP
-the DT B-NP
-spending NN I-NP
-went VBD B-VP
-on IN B-PP
-services NNS B-NP
-rather RB B-PP
-than IN I-PP
-consumer NN B-NP
-goods NNS I-NP
-should MD B-VP
-reduce VB I-VP
-fears NNS B-NP
-of IN B-PP
-more JJR B-NP
-import NN I-NP
-rises NNS I-NP
-. . O
-
-Certainly RB B-ADVP
-, , O
-the DT B-NP
-chancellor NN I-NP
-has VBZ B-VP
-made VBN I-VP
-it PRP B-NP
-clear JJ B-ADJP
-that IN B-SBAR
-he PRP B-NP
-is VBZ B-VP
-prepared VBN I-VP
-to TO I-VP
-increase VB I-VP
-interest NN B-NP
-rates NNS I-NP
-again RB B-ADVP
-if IN B-SBAR
-necessary JJ B-ADJP
-to TO B-VP
-both DT I-VP
-ensure VB I-VP
-that IN B-SBAR
-a DT B-NP
-substantial JJ I-NP
-slowdown NN I-NP
-does VBZ B-VP
-take VB I-VP
-place NN B-NP
-and CC O
-that DT O
-sterling NN B-NP
-does VBZ B-VP
-n't RB I-VP
-decline VB I-VP
-further JJ B-ADVP
-. . O
-
-Thursday NNP B-NP
-, , O
-he PRP B-NP
-reminded VBD B-VP
-his PRP$ B-NP
-audience NN I-NP
-that IN B-SBAR
-the DT B-NP
-government NN I-NP
-`` `` O
-can MD B-VP
-not RB I-VP
-allow VB I-VP
-the DT B-NP
-necessary JJ I-NP
-rigor NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-to TO B-VP
-be VB I-VP
-undermined VBN I-VP
-by IN B-PP
-exchange NN B-NP
-rate NN I-NP
-weakness NN I-NP
-. . O
-'' '' O
-
-Analysts NNS B-NP
-agree VBP B-VP
-there EX B-NP
-is VBZ B-VP
-little JJ B-NP
-holding NN B-VP
-sterling NN B-NP
-firm NN B-ADJP
-at IN B-PP
-the DT B-NP
-moment NN I-NP
-other JJ B-ADJP
-than IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-that IN B-SBAR
-rates NNS B-NP
-will MD B-VP
-be VB I-VP
-pushed VBN I-VP
-higher JJR B-ADJP
-if IN B-SBAR
-necessary JJ B-ADJP
-. . O
-
-And CC O
-, , O
-they PRP B-NP
-warn VBP B-VP
-, , O
-any DT B-NP
-further JJ I-NP
-drop NN I-NP
-in IN B-PP
-the DT B-NP
-government NN I-NP
-'s POS B-NP
-popularity NN I-NP
-could MD B-VP
-swiftly RB I-VP
-make VB I-VP
-this DT B-NP
-promise NN I-NP
-sound NN B-VP
-hollow JJ B-ADJP
-. . O
-
-Sterling NNP B-NP
-was VBD B-VP
-already RB I-VP
-showing VBG I-VP
-some DT B-NP
-signs NNS I-NP
-of IN B-PP
-a DT B-NP
-lack NN I-NP
-of IN B-PP
-confidence NN B-NP
-in IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-Friday NNP B-NP
-. . O
-
-In IN B-PP
-European JJ B-NP
-trading NN I-NP
-it PRP B-NP
-declined VBD B-VP
-to TO B-PP
-$ $ B-NP
-1.5890 CD I-NP
-and CC O
-2.9495 CD B-NP
-marks NNS I-NP
-from IN B-PP
-$ $ B-NP
-1.5940 CD I-NP
-and CC O
-2.9429 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-Economists NNS B-NP
-suggested VBD B-VP
-that IN B-SBAR
-if IN B-SBAR
-the DT B-NP
-pound NN I-NP
-falls VBZ B-VP
-much JJ B-NP
-below IN B-PP
-2.90 CD B-NP
-marks NNS I-NP
-, , O
-the DT B-NP
-government NN I-NP
-will MD B-VP
-be VB I-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-rates NNS B-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-, , O
-both DT B-VP
-to TO I-VP
-halt VB B-VP
-any DT B-NP
-further JJ I-NP
-decline NN I-NP
-and CC O
-ensure VB B-VP
-that IN B-SBAR
-the DT B-NP
-balance NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-remains VBZ B-VP
-unchanged JJ B-ADJP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-Market NNP I-NP
-Activity NN I-NP
-
-The DT B-NP
-dollar NN I-NP
-posted VBD B-VP
-gains NNS B-NP
-in IN B-PP
-quiet JJ B-NP
-trading NN I-NP
-as IN B-SBAR
-concerns NNS B-NP
-about IN B-PP
-equities NNS B-NP
-abated VBN B-VP
-. . O
-
-Foreign JJ B-NP
-exchange NN I-NP
-dealers NNS I-NP
-said VBD B-VP
-that IN B-SBAR
-the DT B-NP
-currency NN I-NP
-market NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-distance VB I-VP
-itself PRP B-NP
-from IN B-PP
-the DT B-NP
-volatile JJ I-NP
-stock NN I-NP
-exchange NN I-NP
-, , O
-which WDT B-NP
-has VBZ B-VP
-preoccupied VBN I-VP
-the DT B-NP
-market NN I-NP
-since IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-, , O
-when WRB B-ADVP
-the DT B-NP
-Dow NNP I-NP
-Jones NNP I-NP
-Industrial NNP I-NP
-Average NNP I-NP
-plunged VBD B-VP
-more JJR B-NP
-than IN I-NP
-190 CD I-NP
-points NNS I-NP
-. . O
-
-Currency NN B-NP
-analysts NNS I-NP
-predict VBP B-VP
-that IN B-SBAR
-in IN B-PP
-the DT B-NP
-coming VBG I-NP
-week NN I-NP
-the DT B-NP
-foreign JJ I-NP
-exchange NN I-NP
-market NN I-NP
-will MD B-VP
-shift VB I-VP
-its PRP$ B-NP
-focus NN I-NP
-back RB B-ADVP
-to TO B-PP
-economic JJ B-NP
-fundamentals NNS I-NP
-, , O
-keeping VBG B-VP
-a DT B-NP
-close NN I-NP
-eye NN I-NP
-out IN B-ADVP
-for IN B-PP
-any DT B-NP
-signs NNS I-NP
-of IN B-PP
-monetary JJ B-NP
-easing NN I-NP
-by IN B-PP
-U.S. NNP B-NP
-Federal NNP I-NP
-Reserve NNP I-NP
-. . O
-
-Late RB B-ADVP
-in IN B-PP
-the DT B-NP
-New NNP I-NP
-York NNP I-NP
-trading NN I-NP
-day NN I-NP
-, , O
-the DT B-NP
-dollar NN I-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-1.8578 CD B-NP
-marks NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-1.8470 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-was VBD B-VP
-also RB I-VP
-changing VBG I-VP
-hands NNS B-NP
-at IN B-PP
-142.43 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-141.70 CD B-NP
-yen NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-In IN B-PP
-Tokyo NNP B-NP
-on IN B-PP
-Monday NNP B-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-opened VBD B-VP
-for IN B-PP
-trading NN B-NP
-at IN B-PP
-141.95 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-Tokyo NNP I-NP
-close NN I-NP
-of IN B-PP
-141.35 CD B-NP
-yen NN I-NP
-. . O
-
-On IN B-PP
-the DT B-NP
-Commodity NNP I-NP
-Exchange NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-, , O
-gold NN B-NP
-for IN B-PP
-current JJ B-NP
-delivery NN I-NP
-settled VBD B-VP
-at IN B-PP
-$ $ B-NP
-367.30 CD I-NP
-an DT B-NP
-ounce NN I-NP
-, , O
-up IN B-ADVP
-20 CD B-NP
-cents NNS I-NP
-. . O
-
-Estimated VBN B-NP
-volume NN I-NP
-was VBD B-VP
-a DT B-NP
-light NN I-NP
-2.4 CD I-NP
-million CD I-NP
-ounces NNS I-NP
-. . O
-
-In IN B-PP
-early JJ B-NP
-trading NN I-NP
-in IN B-PP
-Hong NNP B-NP
-Kong NNP I-NP
-Monday NNP B-NP
-, , O
-gold NN B-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-$ $ B-NP
-366.50 CD I-NP
-an DT B-NP
-ounce NN I-NP
-. . O
-
-East NNP B-NP
-Rock NNP I-NP
-Partners NNP I-NP
-Limited NNP I-NP
-Partnership NNP I-NP
-said VBD B-VP
-it PRP B-NP
-proposed VBD B-VP
-to TO I-VP
-acquire VB I-VP
-A.P. NNP B-NP
-Green NNP I-NP
-Industries NNP I-NP
-Inc. NNP I-NP
-for IN B-PP
-$ $ B-NP
-40 CD I-NP
-a DT B-NP
-share NN I-NP
-. . O
-
-In IN B-PP
-an DT B-NP
-Oct. NNP I-NP
-19 CD I-NP
-letter NN I-NP
-to TO B-PP
-A.P. NNP B-NP
-Green NNP I-NP
-'s POS B-NP
-board NN I-NP
-, , O
-East NNP B-NP
-Rock NNP I-NP
-said VBD B-VP
-the DT B-NP
-offer NN I-NP
-is VBZ B-VP
-subject NN B-ADJP
-to TO B-PP
-the DT B-NP
-signing NN I-NP
-of IN B-PP
-a DT B-NP
-merger NN I-NP
-agreement NN I-NP
-by IN B-PP
-no DT B-ADVP
-later RB I-ADVP
-than IN B-PP
-Oct. NNP B-NP
-31 CD I-NP
-. . O
-
-The DT B-NP
-letter NN I-NP
-, , O
-attached VBN B-VP
-to TO B-PP
-a DT B-NP
-filing NN I-NP
-with IN B-PP
-the DT B-NP
-Securities NNP I-NP
-and CC I-NP
-Exchange NNP I-NP
-Commission NNP I-NP
-, , O
-said VBD B-VP
-the DT B-NP
-approval NN I-NP
-is VBZ B-VP
-also RB B-ADVP
-contingent JJ B-ADJP
-upon IN B-PP
-obtaining VBG B-VP
-satisfactory JJ B-NP
-financing NN I-NP
-. . O
-
-An DT B-NP
-A.P. NNP I-NP
-Green NNP I-NP
-official NN I-NP
-declined VBD B-VP
-to TO I-VP
-comment VB I-VP
-on IN B-PP
-the DT B-NP
-filing NN I-NP
-. . O
-
-The DT B-NP
-$ $ I-NP
-40-a-share JJ I-NP
-proposal NN I-NP
-values VBZ B-VP
-the DT B-NP
-company NN I-NP
-at IN B-PP
-about RB B-NP
-$ $ I-NP
-106.6 CD I-NP
-million CD I-NP
-. . O
-
-A.P. NNP B-NP
-Green NNP I-NP
-currently RB B-ADVP
-has VBZ B-VP
-2,664,098 CD B-NP
-shares NNS I-NP
-outstanding JJ B-ADJP
-. . O
-
-Its PRP$ B-NP
-stock NN I-NP
-closed VBD B-VP
-at IN B-PP
-$ $ B-NP
-38 CD I-NP
-, , O
-up IN B-ADVP
-$ $ B-NP
-1.875 CD I-NP
-, , O
-in IN B-PP
-national JJ B-NP
-over-the-counter JJ I-NP
-trading NN I-NP
-. . O
-
-The DT B-NP
-company NN I-NP
-is VBZ B-VP
-a DT B-NP
-Mexico NNP I-NP
-, , I-NP
-Mo. NNP I-NP
-, , I-NP
-maker NN I-NP
-of IN B-PP
-refractory JJ B-NP
-products NNS I-NP
-. . O
-
-East NNP B-NP
-Rock NNP I-NP
-also RB B-ADVP
-said VBD B-VP
-in IN B-PP
-the DT B-NP
-filing NN I-NP
-that IN B-SBAR
-it PRP B-NP
-boosted VBD B-VP
-its PRP$ B-NP
-stake NN I-NP
-in IN B-PP
-A.P. NNP B-NP
-Green NNP I-NP
-to TO B-PP
-8.7 CD B-NP
-% NN I-NP
-. . O
-
-It PRP B-NP
-now RB B-ADVP
-holds VBZ B-VP
-233,000 CD B-NP
-A.P. NNP I-NP
-Green NNP I-NP
-common JJ I-NP
-shares NNS I-NP
-, , O
-including VBG B-PP
-30,000 CD B-NP
-shares NNS I-NP
-bought VBD B-VP
-last JJ B-NP
-Thursday NNP I-NP
-for IN B-PP
-$ $ B-NP
-35.50 CD I-NP
-to TO I-NP
-$ $ I-NP
-36.50 CD I-NP
-a DT B-NP
-share NN I-NP
-. . O
-
-New NNP B-NP
-York-based JJ I-NP
-John NNP I-NP
-Kuhns NNP I-NP
-and CC I-NP
-Robert NNP I-NP
-MacDonald NNP I-NP
-control NN B-VP
-East NNP B-NP
-Rock NNP I-NP
-Partners NNP I-NP
-Inc. NNP I-NP
-, , O
-the DT B-NP
-sole JJ I-NP
-general JJ I-NP
-partner NN I-NP
-of IN B-PP
-East NNP B-NP
-Rock NNP I-NP
-Partners NNP I-NP
-L.P NNP I-NP
-. . O
-
-The DT B-NP
-sole JJ I-NP
-limited JJ I-NP
-partner NN I-NP
-of IN B-PP
-the DT B-NP
-partnership NN I-NP
-is VBZ B-VP
-Westwood NNP B-NP
-Brick NNP I-NP
-Lime NNP I-NP
-Inc. NNP I-NP
-, , O
-an DT B-NP
-indirect JJ I-NP
-subsidiary NN I-NP
-of IN B-PP
-Westwood NNP B-NP
-Group NNP I-NP
-Inc NNP I-NP
-. . O
-
-Both DT B-NP
-Westwood NNP B-NP
-Brick NNP I-NP
-and CC O
-Westwood NNP B-NP
-Group NNP I-NP
-are VBP B-VP
-based VBN I-VP
-in IN B-PP
-Boston NNP B-NP
-. . O
-
-Freight NN B-NP
-rates NNS I-NP
-, , O
-declining VBG B-VP
-for IN B-PP
-most RBS B-NP
-of IN B-PP
-the DT B-NP
-decade NN I-NP
-because IN B-PP
-of IN I-PP
-competition NN B-NP
-spurred VBN B-VP
-by IN B-PP
-deregulation NN B-NP
-, , O
-are VBP B-VP
-bottoming VBG I-VP
-out IN B-PRT
-, , O
-turning VBG B-VP
-upward RB B-ADVP
-and CC O
-threatening VBG B-VP
-to TO I-VP
-fuel VB I-VP
-inflation NN B-NP
-. . O
-
-Trucking NNP B-NP
-, , I-NP
-shipping VBG I-NP
-and CC I-NP
-air-freight NN I-NP
-companies NNS I-NP
-have VBP B-VP
-announced VBN I-VP
-rate NN B-NP
-increases NNS I-NP
-, , O
-scheduled VBN B-VP
-for IN B-PP
-this DT B-NP
-fall NN I-NP
-or CC O
-early JJ B-NP
-next JJ I-NP
-year NN I-NP
-, , O
-reflecting VBG B-VP
-higher JJR B-NP
-costs NNS I-NP
-and CC O
-tightened VBD B-NP
-demand NN I-NP
-for IN B-PP
-freight NN B-NP
-transport NN I-NP
-. . O
-
-Major JJ B-NP
-shippers NNS I-NP
-say VBP B-VP
-they PRP B-NP
-expect VBP B-VP
-freight NN B-NP
-rates NNS I-NP
-to TO B-VP
-rise VB I-VP
-at IN B-ADVP
-least JJS I-ADVP
-as RB B-ADVP
-fast RB I-ADVP
-as IN B-PP
-inflation NN B-NP
-and CC B-ADVP
-maybe RB I-ADVP
-faster RBR B-ADVP
-in IN B-PP
-the DT B-NP
-next JJ I-NP
-few JJ I-NP
-years NNS I-NP
-. . O
-
-That DT B-NP
-'s VBZ B-VP
-a DT B-NP
-big JJ I-NP
-change NN I-NP
-from IN B-PP
-recent JJ B-NP
-years NNS I-NP
-when WRB B-ADVP
-freight NN B-NP
-haulage NN I-NP
-was VBD B-VP
-a DT B-NP
-bright JJ I-NP
-spot NN I-NP
-for IN B-PP
-U.S. NNP B-NP
-productivity NN I-NP
-, , O
-helping VBG B-VP
-to TO I-VP
-restrain VB I-VP
-inflation NN B-NP
-and CC O
-make VB B-VP
-U.S. NNP B-NP
-industry NN I-NP
-more RBR B-ADJP
-competitive JJ I-ADJP
-abroad RB B-ADVP
-. . O
-
-`` `` O
-Demand NN B-NP
-has VBZ B-VP
-caught VBN I-VP
-up IN B-PRT
-with IN B-PP
-the DT B-NP
-supply NN I-NP
-of IN B-PP
-certain JJ B-NP
-types NNS I-NP
-of IN B-PP
-freight NN B-NP
-transportation NN I-NP
-, , O
-and CC O
-rates NNS B-NP
-are VBP B-VP
-starting VBG I-VP
-to TO I-VP
-move VB I-VP
-up IN B-ADVP
-'' '' O
-at IN B-PP
-a DT B-NP
-rate NN I-NP
-`` `` O
-close RB B-ADJP
-to TO B-PP
-or CC O
-slightly RB B-ADJP
-more JJR I-ADJP
-than IN B-PP
-the DT B-NP
-inflation NN I-NP
-rate NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Clifford NNP B-NP
-Sayre NNP I-NP
-, , O
-director NN B-NP
-of IN B-PP
-logistics NNS B-NP
-at IN B-PP
-Du NNP B-NP
-Pont NNP I-NP
-Co NNP I-NP
-. . O
-
-Shippers NNS B-NP
-surveyed VBN B-VP
-recently RB B-ADVP
-by IN B-PP
-Ohio NNP B-NP
-State NNP I-NP
-University NNP I-NP
-said VBD B-VP
-they PRP B-NP
-expect VBP B-VP
-their PRP$ B-NP
-freight-transport JJ I-NP
-, , I-NP
-storage NN I-NP
-and CC I-NP
-distribution NN I-NP
-costs NNS I-NP
-to TO B-VP
-rise VB I-VP
-about IN B-NP
-4 CD I-NP
-% NN I-NP
-this DT B-NP
-year NN I-NP
-. . O
-
-Only RB B-NP
-10 CD I-NP
-% NN I-NP
-of IN B-PP
-the DT B-NP
-250 CD I-NP
-shippers NNS I-NP
-polled VBN B-VP
-expected VBN B-VP
-their PRP$ B-NP
-freight-transport JJ I-NP
-costs NNS I-NP
-to TO B-VP
-decrease VB I-VP
-, , O
-compared VBN B-PP
-with IN B-PP
-30 CD B-NP
-% NN I-NP
-who WP B-NP
-had VBD B-VP
-looked VBN I-VP
-to TO B-PP
-freight VB B-NP
-transport NN I-NP
-to TO B-VP
-reduce VB I-VP
-costs NNS B-NP
-in IN B-PP
-past JJ B-NP
-years NNS I-NP
-. . O
-
-`` `` O
-This DT B-NP
-is VBZ B-VP
-the DT B-NP
-first JJ I-NP
-year NN I-NP
-since IN B-PP
-transportation NN B-NP
-deregulation NN I-NP
-in IN B-PP
-1980 CD B-NP
-that IN B-ADVP
-we PRP B-NP
-have VBP B-VP
-had VBN I-VP
-such JJ B-NP
-a DT I-NP
-dramatic JJ I-NP
-and CC I-NP
-broad-based JJ I-NP
-upturn NN I-NP
-in IN B-PP
-perceived VBN B-NP
-transportation NN I-NP
-rates NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Bernard NNP B-NP
-LaLonde NNP I-NP
-, , O
-a DT B-NP
-transportation NN I-NP
-logistics NNS I-NP
-professor NN I-NP
-at IN B-PP
-Ohio NNP B-NP
-State NNP I-NP
-in IN B-PP
-Columbus NNP B-NP
-. . O
-
-The DT B-NP
-deregulation NN I-NP
-of IN B-PP
-railroads NNS B-NP
-and CC I-NP
-trucking NN I-NP
-companies NNS I-NP
-that WDT B-NP
-began VBD B-VP
-in IN B-PP
-1980 CD B-NP
-enabled VBD B-VP
-shippers NNS B-NP
-to TO B-VP
-bargain VB I-VP
-for IN B-PP
-transportation NN B-NP
-. . O
-
-Carriers NNP B-NP
-could MD B-VP
-use VB I-VP
-their PRP$ B-NP
-equipment NN I-NP
-more RBR B-ADVP
-efficiently RB I-ADVP
-, , O
-leading VBG B-VP
-to TO B-PP
-overcapacity NN B-NP
-they PRP B-NP
-were VBD B-VP
-eager JJ B-ADJP
-to TO B-VP
-fill VB I-VP
-. . O
-
-Shippers NNS B-NP
-cut VBP B-VP
-about RB B-NP
-$ $ I-NP
-35 CD I-NP
-billion CD I-NP
-from IN B-PP
-their PRP$ B-NP
-annual JJ I-NP
-, , I-NP
-inter-city JJ I-NP
-truck NN I-NP
-and CC I-NP
-rail NN I-NP
-costs NNS I-NP
-, , O
-to TO B-PP
-about RB B-NP
-$ $ I-NP
-150 CD I-NP
-billion CD I-NP
-, , O
-or CC O
-about IN B-NP
-6.4 CD I-NP
-% NN I-NP
-of IN B-PP
-gross JJ B-NP
-national JJ I-NP
-product NN I-NP
-, , O
-down RB B-ADVP
-from IN B-PP
-8 CD B-NP
-% NN I-NP
-of IN B-PP
-GNP NNP B-NP
-in IN B-PP
-1981 CD B-NP
-. . O
-
-But CC O
-with IN B-PP
-much NN B-NP
-of IN B-PP
-the DT B-NP
-inefficiency NN I-NP
-squeezed VBN B-VP
-out IN B-PP
-of IN B-PP
-the DT B-NP
-freight-transport JJ I-NP
-system NN I-NP
-, , O
-rising VBG B-NP
-costs NNS I-NP
-are VBP B-VP
-likely JJ B-ADJP
-to TO B-VP
-be VB I-VP
-reflected VBN I-VP
-directly RB B-ADVP
-in IN B-PP
-higher JJR B-NP
-freight NN I-NP
-rates NNS I-NP
-. . O
-
-`` `` O
-Shippers NNS B-NP
-are VBP B-VP
-saying VBG I-VP
-` `` O
-the DT B-NP
-party NN I-NP
-'s POS B-VP
-over IN B-ADJP
-, , O
-' '' O
-'' '' O
-said VBD B-VP
-Mr. NNP B-NP
-LaLonde NNP I-NP
-. . O
-
-`` `` O
-Shippers NNS B-NP
-wo MD B-VP
-n't RB I-VP
-be VB I-VP
-able JJ B-ADJP
-to TO B-VP
-look VB I-VP
-for IN B-PP
-transportation-cost JJ B-NP
-savings NNS I-NP
-as IN B-SBAR
-they PRP B-NP
-have VBP B-VP
-for IN B-PP
-the DT B-NP
-last JJ I-NP
-eight CD I-NP
-or CC I-NP
-nine CD I-NP
-years NNS I-NP
-. . O
-
-Transport NN B-NP
-rates NNS I-NP
-wo MD B-VP
-n't RB I-VP
-be VB I-VP
-an DT B-NP
-opportunity NN I-NP
-for IN B-PP
-offsetting VBG B-VP
-cost NN B-NP
-increases NNS I-NP
-in IN B-PP
-other JJ B-NP
-segments NNS I-NP
-of IN B-PP
-the DT B-NP
-economy NN I-NP
-. . O
-'' '' O
-
-Robert NNP B-NP
-Delaney NNP I-NP
-, , O
-a DT B-NP
-consultant NN I-NP
-at IN B-PP
-Arthur NNP B-NP
-D. NNP I-NP
-Little NNP I-NP
-Inc. NNP I-NP
-, , O
-Cambridge NNP B-NP
-, , O
-Mass. NNP B-NP
-, , O
-said VBD B-VP
-`` `` O
-We PRP B-NP
-'ve VBP B-VP
-gotten VBN I-VP
-all PDT B-NP
-the DT I-NP
-benefits NNS I-NP
-of IN B-PP
-deregulation NN B-NP
-in IN B-PP
-freight-cost JJ B-NP
-reductions NNS I-NP
-. . O
-
-Now RB B-ADVP
-we PRP B-NP
-are VBP B-VP
-starting VBG I-VP
-to TO I-VP
-see VB I-VP
-real JJ B-NP
-freight-rate JJ I-NP
-increases NNS I-NP
-as IN B-SBAR
-carriers NNS B-NP
-replace VBP B-VP
-equipment NN B-NP
-, , O
-pay VB B-VP
-higher JJR B-NP
-fuel NN I-NP
-costs NNS I-NP
-and CC O
-pay VB B-VP
-more JJR B-NP
-for IN B-PP
-labor NN B-NP
-. . O
-
-You PRP B-NP
-'ll MD B-VP
-see VB I-VP
-carriers NNS B-NP
-try VB B-VP
-to TO I-VP
-recoup VB I-VP
-some DT B-NP
-of IN B-PP
-the DT B-NP
-price NN I-NP
-cutting VBG I-NP
-that WDT B-NP
-occurred VBD B-VP
-previously RB B-ADVP
-. . O
-'' '' O
-
-Not RB B-NP
-everyone NN I-NP
-believes VBZ B-VP
-that IN B-SBAR
-the DT B-NP
-good JJ I-NP
-times NNS I-NP
-are VBP B-VP
-over IN B-ADJP
-for IN B-PP
-shippers NNS B-NP
-. . O
-
-`` `` O
-There EX B-NP
-'s VBZ B-VP
-still RB B-ADVP
-a DT B-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-on IN B-PP
-rates NNS B-NP
-in IN B-PP
-both DT B-NP
-rail NN I-NP
-and CC I-NP
-truck NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Gerard NNP B-NP
-McCullough NNP I-NP
-, , O
-lecturer NN B-NP
-in IN B-PP
-transportation NN B-NP
-at IN B-PP
-Massachusetts NNP B-NP
-Institute NNP I-NP
-of IN B-PP
-Technology NNP B-NP
-. . O
-
-Less-than-truckload JJ B-NP
-companies NNS I-NP
-, , O
-which WDT B-NP
-carry VBP B-VP
-the DT B-NP
-freight NN I-NP
-of IN B-PP
-several JJ B-NP
-shippers NNS I-NP
-in IN B-PP
-each DT B-NP
-truck NN I-NP
-trailer NN I-NP
-, , O
-discounted VBD B-VP
-away RB B-ADVP
-a DT B-NP
-4.7 CD I-NP
-% NN I-NP
-rate NN I-NP
-increase NN I-NP
-implemented VBD B-VP
-last JJ B-NP
-April NNP I-NP
-. . O
-
-The DT B-NP
-carriers NNS I-NP
-were VBD B-VP
-competing VBG I-VP
-fiercely RB B-ADVP
-for IN B-PP
-market NN B-NP
-share NN I-NP
-. . O
-
-Railroad-rate JJ B-NP
-increases NNS I-NP
-are VBP B-VP
-likely JJ B-ADJP
-to TO B-VP
-be VB I-VP
-restrained VBN I-VP
-by IN B-PP
-weakening VBG B-NP
-rail-traffic JJ I-NP
-levels NNS I-NP
-and CC O
-keen JJ B-NP
-competition NN I-NP
-for IN B-PP
-freight NN B-NP
-from IN B-PP
-trucks NNS B-NP
-. . O
-
-An DT B-NP
-official NN I-NP
-at IN B-PP
-Consolidated NNP B-NP
-Freightways NNP I-NP
-Inc. NNP I-NP
-, , O
-a DT B-NP
-Menlo NNP I-NP
-Park NNP I-NP
-, , I-NP
-Calif. NNP I-NP
-, , I-NP
-less-than-truckload JJ I-NP
-carrier NN I-NP
-, , O
-said VBD B-VP
-rate NN B-NP
-discounting NN I-NP
-in IN B-PP
-that DT B-NP
-industry NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-`` `` O
-stabilize VB B-VP
-. . O
-'' '' O
-
-Consolidated NNP B-NP
-Freightways NNP I-NP
-plans VBZ B-VP
-to TO I-VP
-raise VB I-VP
-its PRP$ B-NP
-rates NNS I-NP
-5.3 CD B-NP
-% NN I-NP
-late JJ B-NP
-this DT I-NP
-year NN I-NP
-or CC O
-early JJ B-NP
-next JJ I-NP
-year NN I-NP
-, , O
-and CC O
-at IN B-NP
-least JJS I-NP
-two CD I-NP
-competitors NNS I-NP
-have VBP B-VP
-announced VBN I-VP
-similar JJ B-NP
-increases NNS I-NP
-. . O
-
-Truckers NNS B-NP
-are VBP B-VP
-`` `` O
-trying VBG B-VP
-to TO I-VP
-send VB I-VP
-signals NNS B-NP
-that IN B-SBAR
-they PRP B-NP
-need VBP B-VP
-to TO I-VP
-stop VB I-VP
-the DT B-NP
-bloodletting NN I-NP
-, , O
-forget VB B-VP
-about IN B-PP
-market NN B-NP
-share NN I-NP
-and CC O
-go VB B-VP
-for IN B-PP
-higher JJR B-NP
-rates NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Michael NNP B-NP
-Lloyd NNP I-NP
-, , O
-an DT B-NP
-analyst NN I-NP
-at IN B-PP
-Salomon NNP B-NP
-Bros NNP I-NP
-. . O
-
-And CC O
-`` `` O
-shippers NNS B-NP
-are VBP B-VP
-getting VBG I-VP
-the DT B-NP
-feeling NN I-NP
-that IN B-SBAR
-they PRP B-NP
-have VBP B-VP
-played VBN I-VP
-one CD B-NP
-trucker NN I-NP
-off IN B-ADVP
-against IN B-PP
-another DT B-NP
-as RB B-NP
-much JJ I-NP
-as IN B-SBAR
-they PRP B-NP
-can MD B-VP
-, , O
-'' '' O
-he PRP B-NP
-said VBD B-VP
-. . O
-
-Air-freight NN B-NP
-carriers NNS I-NP
-raised VBD B-VP
-their PRP$ B-NP
-rates NNS I-NP
-for IN B-PP
-U.S. NNP B-NP
-products NNS I-NP
-going VBG B-VP
-across IN B-PP
-the DT B-NP
-Pacific NNP I-NP
-to TO B-PP
-Asia NNP B-NP
-by IN B-PP
-about IN B-NP
-20 CD I-NP
-% NN I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-. . O
-
-And CC O
-Japan NNP B-NP
-Air NNP I-NP
-Lines NNPS I-NP
-said VBD B-VP
-it PRP B-NP
-plans VBZ B-VP
-to TO I-VP
-boost VB I-VP
-its PRP$ B-NP
-rates NNS I-NP
-a DT B-NP
-further JJ I-NP
-25 CD I-NP
-% NN I-NP
-over IN B-PP
-the DT B-NP
-next JJ I-NP
-two CD I-NP
-years NNS I-NP
-. . O
-
-Such JJ B-NP
-rate NN I-NP
-increases NNS I-NP
-`` `` O
-will MD B-VP
-increase VB I-VP
-the DT B-NP
-total JJ I-NP
-cost NN I-NP
-of IN B-PP
-U.S. NNP B-NP
-products NNS I-NP
-and CC O
-slow JJ B-VP
-down RP B-PRT
-the DT B-NP
-rate NN I-NP
-of IN B-PP
-increase NN B-NP
-of IN B-PP
-U.S. NNP B-NP
-exports NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Richard NNP B-NP
-Connors NNP I-NP
-, , O
-a DT B-NP
-senior JJ I-NP
-vice NN I-NP
-president NN I-NP
-of IN B-PP
-Yusen NNP B-NP
-Air NNP I-NP
-& CC I-NP
-Sea NNP I-NP
-Service NNP I-NP
-U.S.A. NNP I-NP
-Inc. NNP I-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-air-freight-forwarding JJ I-NP
-subsidiary NN I-NP
-of IN B-PP
-Nippon NNP B-NP
-Yusen NNP I-NP
-Kaisha NNP I-NP
-of IN B-PP
-Japan NNP B-NP
-. . O
-
-Ship NN B-NP
-companies NNS I-NP
-carrying VBG B-VP
-bulk NN B-NP
-commodities NNS I-NP
-, , O
-such JJ B-PP
-as IN I-PP
-oil NN B-NP
-, , O
-grain NN B-NP
-, , O
-coal NN B-NP
-and CC O
-iron NN B-NP
-ore NN I-NP
-, , O
-have VBP B-VP
-been VBN I-VP
-able JJ B-ADJP
-to TO B-VP
-increase VB I-VP
-their PRP$ B-NP
-rates NNS I-NP
-in IN B-PP
-the DT B-NP
-last JJ I-NP
-couple NN I-NP
-of IN B-PP
-years NNS B-NP
-. . O
-
-Some DT B-NP
-bulk NN I-NP
-shipping VBG I-NP
-rates NNS I-NP
-have VBP B-VP
-increased VBN I-VP
-`` `` O
-3 CD B-NP
-% NN I-NP
-to TO I-NP
-4 CD I-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-past JJ I-NP
-few JJ I-NP
-months NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Salomon NNP B-NP
-'s POS B-NP
-Mr. NNP I-NP
-Lloyd NNP I-NP
-. . O
-
-And CC O
-ship NN B-NP
-lines NNS I-NP
-carrying VBG B-VP
-containers NNS B-NP
-are VBP B-VP
-also RB I-VP
-trying VBG I-VP
-to TO I-VP
-raise VB I-VP
-their PRP$ B-NP
-rates NNS I-NP
-. . O
-
-Carriers NNP B-NP
-boosted VBD B-VP
-rates NNS B-NP
-more JJR B-NP
-than IN I-NP
-10 CD I-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-North NNP I-NP
-Atlantic NNP I-NP
-between IN B-PP
-the DT B-NP
-U.S. NNP I-NP
-and CC O
-Europe NNP B-NP
-last JJ B-NP
-September NNP I-NP
-, , O
-hoping VBG B-VP
-to TO I-VP
-partly RB I-VP
-restore VB I-VP
-rates NNS B-NP
-to TO B-PP
-earlier JJR B-NP
-levels NNS I-NP
-. . O
-
-Ship NN B-NP
-lines NNS I-NP
-operating VBG B-VP
-in IN B-PP
-the DT B-NP
-Pacific NNP I-NP
-plan NN B-VP
-to TO I-VP
-raise VB I-VP
-rates NNS B-NP
-on IN B-PP
-containers NNS B-NP
-carrying VBG B-VP
-U.S. NNP B-NP
-exports NNS I-NP
-to TO B-PP
-Asia NNP B-NP
-about IN B-NP
-10 CD I-NP
-% NN I-NP
-, , O
-effective JJ B-ADJP
-next JJ B-NP
-April NNP I-NP
-. . O
-
-MGM NNP B-NP
-Grand NNP I-NP
-Inc. NNP I-NP
-said VBD B-VP
-it PRP B-NP
-filed VBD B-VP
-a DT B-NP
-registration NN I-NP
-statement NN I-NP
-with IN B-PP
-the DT B-NP
-Securities NNP I-NP
-and CC I-NP
-Exchange NNP I-NP
-Commission NNP I-NP
-for IN B-PP
-a DT B-NP
-public JJ I-NP
-offering NN I-NP
-of IN B-PP
-six CD B-NP
-million CD I-NP
-common JJ I-NP
-shares NNS I-NP
-. . O
-
-The DT B-NP
-Beverly NNP I-NP
-Hills NNP I-NP
-, , I-NP
-Calif.-based JJ I-NP
-company NN I-NP
-said VBD B-VP
-it PRP B-NP
-would MD B-VP
-have VB I-VP
-26.9 CD B-NP
-million CD I-NP
-common JJ I-NP
-shares NNS I-NP
-outstanding JJ B-ADJP
-after IN B-PP
-the DT B-NP
-offering NN I-NP
-. . O
-
-The DT B-NP
-hotel NN I-NP
-and CC I-NP
-Gaming NNP I-NP
-company NN I-NP
-said VBD B-VP
-Merrill NNP B-NP
-Lynch NNP I-NP
-Capital NNP I-NP
-Markets NNPS I-NP
-will MD B-VP
-lead VB I-VP
-the DT B-NP
-underwriters NNS I-NP
-. . O
-
-Proceeds NNS B-NP
-from IN B-PP
-the DT B-NP
-sale NN I-NP
-will MD B-VP
-be VB I-VP
-used VBN I-VP
-for IN B-PP
-remodeling VBG B-NP
-and CC I-NP
-refurbishing VBG I-NP
-projects NNS I-NP
-, , B-PP
-as RB I-PP
-well RB I-PP
-as IN I-PP
-for IN B-PP
-the DT B-NP
-planned VBN I-NP
-MGM NNP I-NP
-Grand NNP I-NP
-hotel\/casino NN I-NP
-and CC I-NP
-theme NN I-NP
-park NN I-NP
-. . O
-
-Bob NNP B-NP
-Stone NNP I-NP
-stewed JJ B-VP
-over IN B-PP
-a DT B-NP
-letter NN I-NP
-from IN B-PP
-his PRP$ B-NP
-manager NN I-NP
-putting VBG B-VP
-him PRP B-NP
-on IN B-PP
-probation NN B-NP
-for IN B-PP
-insubordination NN B-NP
-. . O
-
-Mr. NNP B-NP
-Stone NNP I-NP
-thought VBD B-VP
-the DT B-NP
-discipline NN I-NP
-was VBD B-VP
-unfair JJ B-ADJP
-; : O
-he PRP B-NP
-believed VBD B-VP
-that IN B-SBAR
-his PRP$ B-NP
-manager NN I-NP
-wanted VBD B-VP
-to TO I-VP
-get VB I-VP
-rid JJ B-ADJP
-of IN B-PP
-him PRP B-NP
-for IN B-PP
-personal JJ B-NP
-reasons NNS I-NP
-. . O
-
-Unable JJ B-ADJP
-to TO B-VP
-persuade VB I-VP
-the DT B-NP
-manager NN I-NP
-to TO B-VP
-change VB I-VP
-his PRP$ B-NP
-decision NN I-NP
-, , O
-he PRP B-NP
-went VBD B-VP
-to TO B-PP
-a DT B-NP
-`` `` I-NP
-company NN I-NP
-court NN I-NP
-'' '' O
-for IN B-PP
-a DT B-NP
-hearing NN I-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-scheduled VBN I-NP
-time NN I-NP
-, , O
-Mr. NNP B-NP
-Stone NNP I-NP
-entered VBD B-VP
-a DT B-NP
-conference NN I-NP
-room NN I-NP
-in IN B-PP
-a DT B-NP
-building NN I-NP
-near IN B-PP
-where WRB B-ADVP
-he PRP B-NP
-worked VBD B-VP
-. . O
-
-After IN B-SBAR
-the DT B-NP
-three CD I-NP
-members NNS I-NP
-of IN B-PP
-the DT B-NP
-court NN I-NP
-introduced VBD B-VP
-themselves PRP B-NP
-, , O
-the DT B-NP
-chairman NN I-NP
-of IN B-PP
-the DT B-NP
-panel NN I-NP
-said VBD B-VP
-: : O
-`` `` O
-Go VB B-VP
-ahead RB B-ADVP
-and CC O
-tell VB B-VP
-us PRP B-NP
-what WP B-NP
-happened VBD B-VP
-. . O
-
-We PRP B-NP
-may MD B-VP
-ask VB I-VP
-questions NNS B-NP
-as IN B-SBAR
-you PRP B-NP
-go VBP B-VP
-along IN B-PRT
-, , O
-or CC O
-we PRP B-NP
-may MD B-VP
-wait VB I-VP
-until IN B-PP
-the DT B-NP
-end NN I-NP
-. . O
-'' '' O
-
-No DT B-NP
-lawyers NNS I-NP
-or CC I-NP
-tape NN I-NP
-recorders NNS I-NP
-were VBD B-VP
-present JJ B-ADJP
-. . O
-
-The DT B-NP
-only RB I-NP
-extra JJ I-NP
-people NNS I-NP
-were VBD B-VP
-a DT B-NP
-couple NN I-NP
-of IN B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-, , O
-one CD B-NP
-of IN B-PP
-whom WP B-NP
-knew VBD B-VP
-Mr. NNP B-NP
-Stone NNP I-NP
-'s POS B-NP
-case NN I-NP
-intimately RB B-ADVP
-and CC O
-would MD B-VP
-help VB I-VP
-fill VB I-VP
-in IN B-PRT
-any DT B-NP
-facts NNS I-NP
-needed VBN B-VP
-to TO B-VP
-give VB I-VP
-the DT B-NP
-court NN I-NP
-the DT B-NP
-full JJ I-NP
-picture NN I-NP
-. . O
-
-Over IN B-PP
-a DT B-NP
-cup NN I-NP
-of IN B-PP
-coffee NN B-NP
-, , O
-Mr. NNP B-NP
-Stone NNP I-NP
-told VBD B-VP
-his PRP$ B-NP
-story NN I-NP
-. . O
-
-He PRP B-NP
-talked VBD B-VP
-about IN B-NP
-20 CD I-NP
-minutes NNS I-NP
-. . O
-
-When WRB B-ADVP
-he PRP B-NP
-was VBD B-VP
-through IN B-ADJP
-, , O
-the DT B-NP
-court NN I-NP
-members NNS I-NP
-asked VBD B-VP
-many JJ B-NP
-questions NNS I-NP
-, , O
-then RB B-ADVP
-the DT B-NP
-chairman NN I-NP
-said VBD B-VP
-they PRP B-NP
-would MD B-VP
-like VB I-VP
-to TO I-VP
-hear VB I-VP
-his PRP$ B-NP
-manager NN I-NP
-'s POS B-NP
-side NN I-NP
-and CC O
-talk VB B-VP
-to TO B-PP
-witnesses NNS B-NP
-. . O
-
-The DT B-NP
-chairman NN I-NP
-promised VBD B-VP
-Mr. NNP B-NP
-Stone NNP I-NP
-a DT B-NP
-decision NN I-NP
-within IN B-PP
-two CD B-NP
-weeks NNS I-NP
-. . O
-
-Bob NNP B-NP
-Stone NNP I-NP
-is VBZ B-VP
-a DT B-NP
-fictional JJ I-NP
-name NN I-NP
-, , O
-but CC O
-the DT B-NP
-incident NN I-NP
-described VBN B-VP
-is VBZ B-VP
-real JJ B-ADJP
-. . O
-
-It PRP B-NP
-happened VBD B-VP
-at IN B-PP
-Northrop NNP B-NP
-Corp. NNP I-NP
-in IN B-PP
-Los NNP B-NP
-Angeles NNP I-NP
-. . O
-
-The DT B-NP
-court NN I-NP
-is VBZ B-VP
-called VBN I-VP
-the DT B-NP
-Management NNP I-NP
-Appeals NNP I-NP
-Committee NNP I-NP
-, , O
-or CC O
-just RB B-NP
-`` `` I-NP
-MAC NNP I-NP
-, , O
-'' '' O
-and CC O
-it PRP B-NP
-is VBZ B-VP
-likely JJ B-ADJP
-to TO B-VP
-hear VB I-VP
-a DT B-NP
-couple NN I-NP
-of IN I-NP
-dozen NN I-NP
-cases VBZ I-NP
-a DT B-NP
-year NN I-NP
-. . O
-
-Alter VB B-VP
-some DT B-NP
-details NNS I-NP
-of IN B-PP
-this DT B-NP
-example NN I-NP
-and CC O
-it PRP B-NP
-could MD B-VP
-be VB I-VP
-taking VBG I-VP
-place NN B-NP
-today NN B-ADVP
-at IN B-PP
-Federal NNP B-NP
-Express NNP I-NP
-in IN B-PP
-Memphis NNP B-NP
-, , O
-the DT B-NP
-Defense NNP I-NP
-and CC I-NP
-Underseas NNP I-NP
-Systems NNP I-NP
-divisions NNS I-NP
-of IN B-PP
-Honeywell NNP B-NP
-in IN B-PP
-Minneapolis NNP B-NP
-, , O
-a DT B-NP
-General NNP I-NP
-Electric NNP I-NP
-plant NN I-NP
-in IN B-PP
-Columbia NNP B-NP
-, , O
-Md. NNP B-NP
-, , O
-or CC O
-a DT B-NP
-number NN I-NP
-of IN B-PP
-other JJ B-NP
-companies NNS I-NP
-. . O
-
-These DT B-NP
-firms NNS I-NP
-are VBP B-VP
-pioneers NNS B-NP
-in IN B-PP
-a DT B-NP
-significant JJ I-NP
-new JJ I-NP
-trend NN I-NP
-in IN B-PP
-the DT B-NP
-corporate JJ I-NP
-world NN I-NP
-: : O
-the DT B-NP
-rise NN I-NP
-of IN B-PP
-what WP B-NP
-I PRP B-NP
-call VBP B-VP
-corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-. . O
-
-Although IN B-SBAR
-corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-is VBZ B-VP
-practiced VBN I-VP
-today NN B-NP
-in IN B-PP
-few JJ B-NP
-companies NNS I-NP
--- : O
-perhaps RB B-ADVP
-40 CD B-NP
-to TO I-NP
-60 CD I-NP
--- : O
-it PRP B-NP
-is VBZ B-VP
-one CD B-NP
-of IN B-PP
-the DT B-NP
-fastest JJS I-NP
-developing VBG I-NP
-trends NNS I-NP
-in IN B-PP
-industry NN B-NP
-. . O
-
-In IN B-PP
-the DT B-NP
-coming VBG I-NP
-decade NN I-NP
-a DT B-NP
-majority NN I-NP
-of IN B-PP
-people-oriented JJ B-NP
-companies NNS I-NP
-are VBP B-VP
-likely JJ B-ADJP
-to TO B-VP
-adopt VB I-VP
-it PRP B-NP
-. . O
-
-Corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-appeals NNS B-VP
-to TO B-PP
-management NN B-NP
-for IN B-PP
-a DT B-NP
-variety NN I-NP
-of IN B-PP
-reasons NNS B-NP
-. . O
-
-It PRP B-NP
-reduces VBZ B-VP
-lawsuits NNS B-NP
-from IN B-PP
-disgruntled JJ B-NP
-employees NNS I-NP
-and CC I-NP
-ex-employees NNS I-NP
-, , O
-with IN B-PP
-all DT B-NP
-that WDT B-NP
-means VBZ B-VP
-for IN B-PP
-reduced VBN B-NP
-legal JJ I-NP
-costs NNS I-NP
-and CC O
-better RBR B-NP
-public JJ I-NP
-relations NNS I-NP
-. . O
-
-It PRP B-NP
-helps VBZ B-VP
-to TO I-VP
-keep VB I-VP
-out IN B-PRT
-unions NNS B-NP
-. . O
-
-It PRP B-NP
-increases VBZ B-VP
-employee NN B-NP
-commitment NN I-NP
-to TO B-PP
-the DT B-NP
-company NN I-NP
-, , O
-with IN B-PP
-all DT B-NP
-that WDT B-NP
-means VBZ B-VP
-for IN B-PP
-efficiency NN B-NP
-and CC O
-quality NN B-NP
-control NN I-NP
-. . O
-
-What WP B-NP
-must MD O
-your PRP$ B-NP
-management NN I-NP
-team NN I-NP
-do VBP B-VP
-to TO B-VP
-establish VB I-VP
-corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-? . O
-
-Here RB B-ADVP
-are VBP B-VP
-four CD B-NP
-key JJ I-NP
-steps NNS I-NP
-: : O
-
-1 CD B-LST
-. . O
-Make VB B-VP
-sure JJ B-ADJP
-you PRP B-NP
-have VBP B-VP
-a DT B-NP
-strong JJ I-NP
-personnel NNS I-NP
-department NN I-NP
-. . O
-
-It PRP B-NP
-must MD B-VP
-be VB I-VP
-able JJ B-ADJP
-to TO B-VP
-handle VB I-VP
-most RBS B-NP
-of IN B-PP
-the DT B-NP
-complaints NNS I-NP
-that WDT B-NP
-can MD B-VP
-not RB I-VP
-be VB I-VP
-solved VBN I-VP
-in IN B-PP
-the DT B-NP
-trenches NNS I-NP
-by IN B-PP
-managers NNS B-NP
-and CC O
-their PRP$ B-NP
-subordinates NNS I-NP
-, , O
-else RB B-ADVP
-the DT B-NP
-company NN I-NP
-court NN I-NP
-or CC I-NP
-adjudicators NNS I-NP
-will MD B-VP
-be VB B-VP
-inundated VBN I-VP
-with IN B-PP
-cases NNS B-NP
-. . O
-
-At IN B-PP
-Polaroid NNP B-NP
-, , O
-the DT B-NP
-Personnel NNP I-NP
-Policy NNP I-NP
-Planning NNP I-NP
-Committee NNP I-NP
-may MD B-VP
-hear VB I-VP
-only RB B-NP
-about IN I-NP
-20 CD I-NP
-cases VBZ I-NP
-a DT B-NP
-year NN I-NP
-; : O
-the DT B-NP
-rest NN I-NP
-of IN B-PP
-the DT B-NP
-many JJ I-NP
-hundreds NNS I-NP
-of IN B-PP
-complaints NNS B-NP
-are VBP B-VP
-resolved VBN I-VP
-at IN B-PP
-earlier JJR B-NP
-stages NNS I-NP
-. . O
-
-At IN B-PP
-TWA NNP B-NP
-, , O
-the DT B-NP
-System NNP I-NP
-Board NNP I-NP
-of IN B-PP
-Adjustment NNP B-NP
-hears VBZ B-VP
-50 CD B-NP
-to TO I-NP
-75 CD I-NP
-cases VBZ I-NP
-a DT B-NP
-year NN I-NP
-, , O
-only RB B-NP
-a DT I-NP
-fraction NN I-NP
-of IN B-PP
-the DT B-NP
-complaints NNS I-NP
-brought VBN B-VP
-to TO B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-. . O
-
-At IN B-PP
-Citicorp NNP B-NP
-, , O
-the DT B-NP
-Problem NNP I-NP
-Review NNP I-NP
-Board NNP I-NP
-may MD B-VP
-hear VB I-VP
-only RB B-NP
-12 CD I-NP
-or CC I-NP
-so RB I-NP
-cases VBZ I-NP
-because IN B-PP
-of IN I-PP
-personnel NNS B-NP
-'s POS B-NP
-skill NN I-NP
-in IN B-PP
-complaint-resolution NN B-NP
-. . O
-
-In IN B-PP
-a DT B-NP
-typical JJ I-NP
-year NN I-NP
-, , O
-up IN B-NP
-to TO I-NP
-20 CD I-NP
-% NN I-NP
-of IN B-PP
-the DT B-NP
-work NN I-NP
-force NN I-NP
-goes VBZ B-VP
-to TO B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-with IN B-PP
-complaints NNS B-NP
-of IN B-PP
-unfair JJ B-NP
-treatment NN I-NP
-. . O
-
-In IN B-PP
-a DT B-NP
-large JJ I-NP
-company NN I-NP
-that WDT B-NP
-means VBZ B-VP
-many JJ B-NP
-hundreds NNS I-NP
-of IN B-PP
-complaints NNS B-NP
-for IN B-PP
-personnel NNS B-NP
-to TO B-VP
-handle VB I-VP
-. . O
-
-2 CD B-LST
-. . O
-Formally RB B-ADVP
-or CC I-ADVP
-informally RB I-ADVP
-, , O
-train NN B-VP
-all DT B-NP
-your PRP$ I-NP
-managers NNS I-NP
-and CC I-NP
-supervisors NNS I-NP
-in IN B-PP
-the DT B-NP
-company NN I-NP
-'s POS B-NP
-due-process NN I-NP
-approach NN I-NP
-. . O
-
-See VB B-VP
-that IN B-SBAR
-they PRP B-NP
-know VBP B-VP
-company NN B-NP
-personnel NNS I-NP
-policy NN I-NP
-backwards RB B-ADVP
-and CC I-ADVP
-forwards RB I-ADVP
-, , O
-for IN O
-it PRP B-NP
-is VBZ B-VP
-the DT B-NP
-`` `` I-NP
-law NN I-NP
-'' '' O
-governing VBG B-VP
-company NN B-NP
-courts NNS I-NP
-and CC I-NP
-adjudicators NNS I-NP
-. . O
-
-Coach NNP B-VP
-them PRP B-NP
-in IN B-PP
-handling NN B-VP
-complaints NNS B-NP
-so RB B-SBAR
-that IN I-SBAR
-they PRP B-NP
-can MD B-VP
-resolve VB I-VP
-problems NNS B-NP
-immediately RB B-ADVP
-. . O
-
-In IN B-SBAR
-case NN O
-managers NNS B-NP
-and CC O
-personnel NNS B-NP
-specialists NNS I-NP
-are VBP B-VP
-unsuccessful JJ B-ADJP
-and CC O
-subordinates NNS B-NP
-take VBP B-VP
-their PRP$ B-NP
-complaints NNS I-NP
-to TO B-PP
-a DT B-NP
-company NN I-NP
-court NN I-NP
-or CC I-NP
-adjudicator NN I-NP
-, , O
-teach VB B-VP
-managers NNS B-NP
-to TO B-VP
-accept VB I-VP
-reversals NNS B-NP
-as IN B-PP
-a DT B-NP
-fact NN I-NP
-of IN B-PP
-business NN B-NP
-life NN I-NP
-, , O
-for IN O
-in IN B-PP
-a DT B-NP
-good JJ I-NP
-due-process NN I-NP
-system NN I-NP
-they PRP B-NP
-are VBP B-VP
-bound VBN I-VP
-to TO I-VP
-happen VB I-VP
-. . O
-
-In IN B-PP
-the DT B-NP
-15 CD I-NP
-companies NNS I-NP
-I PRP B-NP
-studied VBD B-VP
-, , O
-reversal NN B-NP
-rates NNS I-NP
-range VBP B-VP
-on IN B-PP
-the DT B-NP
-average NN I-NP
-from IN B-PP
-20 CD B-NP
-% NN I-NP
-to TO B-PP
-40 CD B-NP
-% NN I-NP
-. . O
-
-3 CD B-LST
-. . O
-Decide VB B-VP
-whether IN O
-you PRP B-NP
-want VBP B-VP
-a DT B-NP
-panel NN I-NP
-system NN I-NP
-or CC O
-a DT B-NP
-single JJ I-NP
-adjudicator NN I-NP
-. . O
-
-A DT B-NP
-panel NN I-NP
-system NN I-NP
-like IN B-PP
-that DT B-NP
-in NN B-PP
-the DT B-NP
-Bob NNP I-NP
-Stone NNP I-NP
-example NN I-NP
-enjoys VBZ B-VP
-such JJ B-NP
-advantages NNS I-NP
-as IN B-PP
-high JJ B-NP
-credibility NN I-NP
-and CC O
-, , O
-for IN B-PP
-the DT B-NP
-panelists NNS I-NP
-, , O
-mutual JJ B-NP
-support NN I-NP
-. . O
-
-An DT B-NP
-adjudicator NN I-NP
-system NN I-NP
--- : O
-that DT B-INTJ
-is VBZ I-INTJ
-, , O
-an DT B-NP
-investigator NN I-NP
-who WP B-NP
-acts VBZ B-VP
-first JJ B-ADVP
-as IN B-PP
-a DT B-NP
-fact-finder NN I-NP
-and CC O
-then RB O
-switches VBZ B-VP
-hats NNS B-NP
-and CC O
-arbitrates VBZ B-VP
-the DT B-NP
-facts NNS I-NP
--- : O
-has VBZ B-VP
-such JJ B-NP
-advantages NNS I-NP
-as IN B-PP
-speed NN B-NP
-, , O
-flexibility NN B-NP
-and CC O
-maximum JJ B-NP
-privacy NN I-NP
-. . O
-
-International NNP B-NP
-Business NNP I-NP
-Machines NNPS I-NP
-and CC O
-Bank NNP B-NP
-of IN B-PP
-America NNP B-NP
-are VBP B-VP
-among IN B-PP
-the DT B-NP
-companies NNS I-NP
-using VBG B-VP
-the DT B-NP
-single-adjudicator JJ I-NP
-approach NN I-NP
-. . O
-
-4 CD B-LST
-. . O
-Make VB B-VP
-your PRP$ B-NP
-due-process NN I-NP
-system NN I-NP
-visible JJ B-ADJP
-. . O
-
-It PRP B-NP
-wo MD B-VP
-n't RB I-VP
-do VB I-VP
-any DT B-NP
-good NN I-NP
-for IN B-PP
-anybody NN B-NP
-unless IN B-SBAR
-employees NNS B-NP
-know VBP B-VP
-about IN B-PP
-it PRP B-NP
-. . O
-
-Most JJS B-NP
-managements NNS I-NP
-hesitate VBP B-VP
-to TO I-VP
-go VB I-VP
-all DT B-ADVP
-out NN I-ADVP
-in IN B-PP
-advertising VBG B-VP
-their PRP$ B-NP
-due-process NN I-NP
-systems NNS I-NP
-for IN B-PP
-fear NN B-NP
-of IN B-PP
-encouraging VBG B-VP
-cranks NNS B-NP
-and CC O
-chronic JJ B-NP
-soreheads NNS I-NP
-to TO B-VP
-file VB I-VP
-complaints NNS B-NP
-. . O
-
-On IN B-PP
-the DT B-NP
-other JJ I-NP
-hand NN I-NP
-, , O
-they PRP B-NP
-make VBP B-VP
-sure JJ B-ADJP
-at IN B-PP
-a DT B-NP
-minimum NN I-NP
-that IN B-SBAR
-their PRP$ B-NP
-systems NNS I-NP
-are VBP B-VP
-described VBN I-VP
-in IN B-PP
-their PRP$ B-NP
-employee NN I-NP
-handbooks NNS I-NP
-and CC O
-talked VBD B-VP
-up IN B-PRT
-by IN B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-. . O
-
-Smith-Kline NNP B-NP
-Beecham NNP I-NP
-goes VBZ B-VP
-further JJ B-ADVP
-and CC O
-sometimes RB B-VP
-features VBZ I-VP
-its PRP$ B-NP
-grievance NN I-NP
-procedure NN I-NP
-in IN B-PP
-closed-circuit JJ B-NP
-TV NN I-NP
-programs NNS I-NP
-. . O
-
-Naturally RB B-ADVP
-, , O
-one CD B-NP
-of IN B-PP
-the DT B-NP
-best JJS I-NP
-ways NNS I-NP
-to TO B-VP
-guarantee VB I-VP
-visibility NN B-NP
-for IN B-PP
-your PRP$ B-NP
-due-process NN I-NP
-system NN I-NP
-is VBZ B-VP
-for IN B-SBAR
-top JJ B-NP
-management NN I-NP
-to TO B-VP
-support VB I-VP
-it PRP B-NP
-. . O
-
-At IN B-PP
-IBM NNP B-NP
-, , O
-the DT B-NP
-company NN I-NP
-'s POS B-NP
-Open NNP I-NP
-Door NNP I-NP
-system NN I-NP
-is VBZ B-VP
-sometimes RB B-ADVP
-the DT B-NP
-subject NN I-NP
-of IN B-PP
-memorandums NNS B-NP
-from IN B-PP
-the DT B-NP
-chief JJ I-NP
-executive NN I-NP
-. . O
-
-Federal NNP B-NP
-Express NNP I-NP
-goes VBZ B-VP
-further JJ B-ADVP
-in IN B-PP
-this DT B-NP
-respect NN I-NP
-than IN B-PP
-any DT B-NP
-company NN I-NP
-I PRP B-NP
-know VBP B-VP
-of IN B-PP
-with IN B-PP
-both DT B-NP
-Frederick NNP B-NP
-Smith NNP I-NP
-and CC O
-James NNP B-NP
-Barksdale NNP I-NP
-, , O
-chief JJ B-NP
-executive NN I-NP
-and CC O
-chief JJ B-NP
-operating VBG I-NP
-officer NN I-NP
-, , O
-respectively RB B-ADVP
-, , O
-sitting VBG B-VP
-in IN B-PRT
-on IN B-PP
-the DT B-NP
-Appeals NNP I-NP
-Board NNP I-NP
-almost RB B-NP
-every DT I-NP
-Tuesday NNP I-NP
-to TO B-VP
-decide VB I-VP
-cases NNS B-NP
-. . O
-
-Mr. NNP B-NP
-Ewing NNP I-NP
-is VBZ B-VP
-a DT B-NP
-consultant NN I-NP
-based VBN B-VP
-in IN B-PP
-Winchester NNP B-NP
-, , O
-Mass. NNP B-NP
-, , O
-and CC O
-author NN B-NP
-of IN B-PP
-`` `` O
-Justice NNP B-NP
-on IN B-PP
-the DT B-NP
-Job NNP I-NP
-: : O
-Resolving NNP B-VP
-Grievances NNP B-NP
-in IN B-PP
-the DT B-NP
-Nonunion NNP I-NP
-Workplace NN I-NP
-'' '' O
--LRB- ( O
-Harvard NNP B-NP
-Business NNP I-NP
-School NNP I-NP
-Press NNP I-NP
-, , O
-1989 CD B-NP
--RRB- ) O
-. . O
-
-Tokyo NNP B-NP
-stocks NNS I-NP
-closed VBD B-VP
-higher JJR B-ADVP
-in IN B-PP
-active JJ B-NP
-trading NN I-NP
-Friday NNP B-NP
-, , O
-marking VBG B-VP
-the DT B-NP
-fourth JJ I-NP
-consecutive JJ I-NP
-daily JJ I-NP
-gain NN I-NP
-since IN B-PP
-Monday NNP B-NP
-'s POS B-NP
-sharp JJ I-NP
-fall NN I-NP
-. . O
-
-London JJ B-NP
-shares NNS I-NP
-closed VBD B-VP
-moderately RB B-ADVP
-lower JJR I-ADVP
-in IN B-PP
-thin JJ B-NP
-trading NN I-NP
-. . O
-
-At IN B-PP
-Tokyo NNP B-NP
-, , O
-the DT B-NP
-Nikkei NNP I-NP
-index NN I-NP
-of IN B-PP
-225 CD B-NP
-selected VBN I-NP
-issues NNS I-NP
-was VBD B-VP
-up IN B-ADVP
-112.16 CD B-NP
-points NNS I-NP
-to TO B-PP
-35486.38 CD B-NP
-. . O
-
-The DT B-NP
-index NN I-NP
-advanced VBD B-VP
-266.66 CD B-NP
-points NNS I-NP
-Thursday NNP B-NP
-. . O
-
-In IN B-PP
-early JJ B-NP
-trading NN I-NP
-in IN B-PP
-Tokyo NNP B-NP
-Monday NNP B-NP
-, , O
-the DT B-NP
-Nikkei NNP I-NP
-index NN I-NP
-rose VBD B-VP
-101.98 CD B-NP
-points NNS I-NP
-to TO B-PP
-35588.36 CD B-NP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-volume NN I-NP
-on IN B-PP
-the DT B-NP
-First NNP I-NP
-Section NN I-NP
-was VBD B-VP
-estimated VBN I-VP
-at IN B-PP
-one CD B-NP
-billion CD I-NP
-shares NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-862 CD B-NP
-million CD I-NP
-Thursday NNP B-NP
-. . O
-
-Winners NNS B-NP
-outpaced VBD B-VP
-losers NNS B-NP
-, , O
-572 CD B-ADVP
-to TO I-ADVP
-368 CD I-ADVP
-, , O
-while IN B-SBAR
-181 CD B-NP
-issues NNS I-NP
-remained VBD B-VP
-unchanged JJ B-ADJP
-. . O
-
-With IN B-SBAR
-investors NNS B-NP
-relieved VBN B-ADJP
-at IN B-PP
-the DT B-NP
-overnight JJ I-NP
-gain NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-stocks NNS I-NP
-, , O
-small-lot JJ B-NP
-buying NN I-NP
-orders NNS I-NP
-streamed VBD B-VP
-into IN B-PP
-the DT B-NP
-market NN I-NP
-from IN B-PP
-early JJ B-NP
-morning NN I-NP
-, , O
-making VBG B-VP
-traders NNS B-NP
-believe VBP B-VP
-the DT B-NP
-market NN I-NP
-was VBD B-VP
-back RB B-ADVP
-to TO B-PP
-normal JJ B-NP
-. . O
-
-The DT B-NP
-Nikkei NNP I-NP
-, , O
-which WDT B-NP
-reached VBD B-VP
-as RB B-ADJP
-high JJ I-ADJP
-as IN B-PP
-35611.38 CD B-NP
-right NN B-ADVP
-after IN B-PP
-the DT B-NP
-opening NN I-NP
-, , O
-surrendered VBD B-VP
-part NN B-NP
-of IN B-PP
-its PRP$ B-NP
-early JJ I-NP
-advance NN I-NP
-toward IN B-PP
-the DT B-NP
-end NN I-NP
-of IN B-PP
-the DT B-NP
-day NN I-NP
-because IN B-PP
-of IN I-PP
-profit-taking NN B-NP
-. . O
-
-`` `` O
-Investors NNS B-NP
-, , B-NP
-especially RB I-NP
-dealers NNS B-NP
-, , O
-do VBP B-VP
-n't RB I-VP
-want VB I-VP
-to TO I-VP
-hold VB I-VP
-a DT B-NP
-position NN I-NP
-over IN B-PP
-the DT B-NP
-weekend NN I-NP
-, , O
-'' '' O
-a DT B-NP
-trader NN I-NP
-at IN B-PP
-Dai-ichi NNP B-NP
-Securities NNP I-NP
-said VBD B-VP
-, , O
-adding VBG B-VP
-, , O
-though RB B-ADVP
-, , O
-that IN B-SBAR
-the DT B-NP
-trading NN I-NP
-mood NN I-NP
-remained VBD B-VP
-positive JJ B-ADJP
-through IN B-PP
-the DT B-NP
-afternoon NN I-NP
-session NN I-NP
-. . O
-
-The DT B-NP
-Tokyo NNP I-NP
-Stock NNP I-NP
-Price NNP I-NP
-Index NNP I-NP
--LRB- ( O
-Topix NNP B-NP
--RRB- ) O
-of IN B-PP
-all DT B-NP
-issues NNS I-NP
-listed VBN B-VP
-in IN B-PP
-the DT B-NP
-First NNP I-NP
-Section NN I-NP
-, , O
-which WDT B-NP
-gained VBD B-VP
-22.78 CD B-NP
-points NNS I-NP
-Thursday NNP B-NP
-, , O
-was VBD B-VP
-up IN B-ADVP
-14.06 CD B-NP
-points NNS I-NP
-, , O
-or CC O
-0.53 CD B-NP
-% NN I-NP
-, , O
-at IN B-PP
-2679.72 CD B-NP
-. . O
-
-The DT B-NP
-Second JJ I-NP
-Section NN I-NP
-index NN I-NP
-, , O
-which WDT B-NP
-rose VBD B-VP
-15.72 CD B-NP
-points NNS I-NP
-Thursday NNP B-NP
-, , O
-was VBD B-VP
-up IN B-ADVP
-11.88 CD B-NP
-points NNS I-NP
-, , O
-or CC O
-0.32 CD B-NP
-% NN I-NP
-, , O
-to TO B-VP
-close VB I-VP
-at IN B-PP
-3717.46 CD B-NP
-. . O
-
-Volume NN B-NP
-in IN B-PP
-the DT B-NP
-second JJ I-NP
-section NN I-NP
-was VBD B-VP
-estimated VBN I-VP
-at IN B-PP
-30 CD B-NP
-million CD I-NP
-shares NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-28 CD B-NP
-million CD I-NP
-Thursday NNP B-NP
-. . O
-
-In IN B-PP
-turmoil NN B-NP
-caused VBN B-VP
-by IN B-PP
-the DT O
-previous JJ B-NP
-Friday NNP I-NP
-'s POS B-NP
-plunge NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-stocks NNS I-NP
-, , O
-the DT B-NP
-Nikkei NNP I-NP
-marked VBD B-VP
-a DT B-NP
-sharp JJ I-NP
-647.33-point JJ I-NP
-fall NN I-NP
-Monday NNP B-NP
-. . O
-
-But CC O
-the DT B-NP
-Nikkei NNP I-NP
-fell VBD B-VP
-an DT B-NP
-overall JJ I-NP
-1.8 CD I-NP
-% NN I-NP
-in IN B-PP
-value NN B-NP
-that DT B-NP
-day NN I-NP
-compared VBN B-PP
-with IN B-PP
-Wall NNP B-NP
-Street NNP I-NP
-'s POS I-NP
-far RB B-ADJP
-sharper JJR I-ADJP
-6.9 CD B-ADJP
-% NN I-ADJP
-drop NN B-NP
-on IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-. . O
-
-The DT B-NP
-Tokyo NNP I-NP
-market NN I-NP
-'s POS B-NP
-resiliency NN I-NP
-helped VBD B-VP
-participants NNS B-NP
-to TO B-VP
-regain VB I-VP
-confidence NN B-NP
-gradually RB B-ADVP
-as IN B-SBAR
-they PRP B-NP
-spent VBD B-VP
-more JJR B-NP
-time NN I-NP
-on IN B-PP
-analyzing VBG B-VP
-factors NNS B-NP
-that WDT B-NP
-caused VBD B-VP
-the DT B-NP
-Friday NNP I-NP
-plunge NN I-NP
-and CC O
-realized VBD B-VP
-these DT B-NP
-problems NNS I-NP
-were VBD B-VP
-unique JJ B-ADJP
-to TO B-PP
-New NNP B-NP
-York NNP I-NP
-stocks NNS I-NP
-and CC B-ADJP
-not RB I-ADJP
-directly RB B-ADJP
-related VBN I-ADJP
-to TO B-PP
-Tokyo NNP B-NP
-. . O
-
-The DT B-NP
-Nikkei NNP I-NP
-continued VBD B-VP
-to TO I-VP
-gain VB I-VP
-for IN B-PP
-the DT B-NP
-rest NN I-NP
-of IN B-PP
-the DT B-NP
-week NN I-NP
-, , O
-adding VBG B-VP
-1017.69 CD B-NP
-points NNS I-NP
-in IN B-PP
-four CD B-NP
-days NNS I-NP
--- : O
-more JJR B-VP
-than IN I-VP
-erasing VBG I-VP
-Monday NNP B-NP
-'s POS B-NP
-losses NNS I-NP
-. . O
-
-But CC O
-further JJ B-NP
-major JJ I-NP
-advances NNS I-NP
-on IN B-PP
-the DT B-NP
-Nikkei NNP I-NP
-are VBP B-VP
-n't RB I-VP
-foreseen VBN I-VP
-this DT B-NP
-week NN I-NP
-by IN B-PP
-market NN B-NP
-observers NNS I-NP
-. . O
-
-Investors NNS B-NP
-are VBP B-VP
-still RB I-VP
-waiting VBG I-VP
-to TO I-VP
-see VB I-VP
-how WRB B-ADVP
-the DT B-NP
-U.S. NNP I-NP
-government NN I-NP
-will MD B-VP
-decide VB I-VP
-on IN B-PP
-interest NN B-NP
-rates NNS I-NP
-and CC O
-how WRB B-ADVP
-the DT B-NP
-dollar NN I-NP
-will MD B-VP
-be VB I-VP
-stabilized VBN I-VP
-. . O
-
-Some DT B-NP
-high-priced JJ I-NP
-issues NNS I-NP
-made VBD B-VP
-a DT B-NP
-comeback NN I-NP
-Friday NNP B-NP
-. . O
-
-Pioneer NNP B-NP
-surged VBD B-VP
-450 CD B-NP
-yen NN I-NP
--LRB- ( O
-$ $ B-NP
-3.16 CD I-NP
--RRB- ) O
-to TO B-PP
-6,050 CD B-NP
-yen NN I-NP
--LRB- ( O
-$ $ B-NP
-42.60 CD I-NP
--RRB- ) O
-. . O
-
-Kyocera NNP B-NP
-advanced VBD B-VP
-80 CD B-NP
-yen NN I-NP
-to TO B-PP
-5,440 CD B-NP
-. . O
-
-Fanuc NNP B-NP
-gained VBD B-VP
-100 CD B-NP
-to TO B-PP
-7,580 CD B-NP
-. . O
-
-Breweries NNP B-NP
-attracted VBD B-VP
-investors NNS B-NP
-because IN B-PP
-of IN I-PP
-their PRP$ B-NP
-land NN I-NP
-property NN I-NP
-holdings NNS I-NP
-that WDT B-NP
-could MD B-VP
-figure VB I-VP
-in IN B-PP
-development NN B-NP
-or CC O
-other JJ B-NP
-plans NNS I-NP
-, , O
-traders NNS B-NP
-said VBD B-VP
-. . O
-
-Sapporo NNP B-NP
-gained VBD B-VP
-80 CD B-NP
-to TO B-PP
-1,920 CD B-NP
-and CC O
-Kirin NNP B-NP
-added VBD B-VP
-60 CD B-NP
-to TO B-PP
-2,070 CD B-NP
-. . O
-
-Housings NNS B-NP
-, , I-NP
-constructions NNS I-NP
-and CC I-NP
-pharmaceuticals NNS I-NP
-continued VBD B-VP
-to TO I-VP
-be VB I-VP
-bought VBN I-VP
-following VBG B-PP
-Thursday NNP B-NP
-'s POS B-NP
-gains NNS I-NP
-because IN B-PP
-of IN I-PP
-strong JJ B-NP
-earnings NNS I-NP
-outlooks NNS I-NP
-. . O
-
-Daiwa NNP B-NP
-House NNP I-NP
-gained VBD B-VP
-50 CD B-NP
-to TO B-PP
-2,660 CD B-NP
-. . O
-
-Misawa NNP B-NP
-Homes NNP I-NP
-was VBD B-VP
-up IN B-ADVP
-20 CD B-NP
-at IN B-PP
-2,960 CD B-NP
-. . O
-
-Kajima NNP B-NP
-advanced VBD B-VP
-40 CD B-NP
-to TO B-PP
-2,120 CD B-NP
-and CC O
-Ohbayashi NNP B-NP
-added VBD B-VP
-50 CD B-NP
-to TO B-PP
-1,730 CD B-NP
-. . O
-
-Fujisawa NNP B-NP
-added VBD B-VP
-80 CD B-NP
-to TO B-PP
-2,010 CD B-NP
-and CC O
-Mochida NNP B-NP
-advanced VBD B-VP
-230 CD B-NP
-to TO B-PP
-4,400 CD B-NP
-. . O
-
-London JJ B-NP
-share NN I-NP
-prices NNS I-NP
-were VBD B-VP
-influenced VBN I-VP
-largely RB B-ADVP
-by IN B-PP
-declines NNS B-NP
-on IN B-PP
-Wall NNP B-NP
-Street NNP I-NP
-and CC O
-weakness NN B-NP
-in IN B-PP
-the DT B-NP
-British JJ I-NP
-pound NN I-NP
-. . O
-
-The DT B-NP
-key JJ I-NP
-Financial NNP I-NP
-Times-Stock NNP I-NP
-Exchange NNP I-NP
-100-share JJ I-NP
-index NN I-NP
-ended VBD B-VP
-10.2 CD B-NP
-points NNS I-NP
-lower JJR B-ADVP
-at IN B-PP
-2179.1 CD B-NP
-, , O
-above IN B-ADVP
-its PRP$ B-NP
-intraday JJ I-NP
-low NN I-NP
-of IN B-PP
-2176.9 CD B-NP
-, , B-ADVP
-but CC I-ADVP
-off IN B-ADVP
-the DT B-NP
-day NN I-NP
-'s POS I-NP
-high NN B-NP
-of IN B-PP
-2189 CD B-NP
-. . O
-
-The DT B-NP
-index NN I-NP
-finished VBD B-VP
-2.4 CD B-NP
-% NN I-NP
-under IN B-PP
-its PRP$ B-NP
-close NN I-NP
-of IN B-PP
-2233.9 CD B-NP
-the DT B-NP
-previous JJ I-NP
-Friday NNP I-NP
-, , O
-although IN B-SBAR
-it PRP B-NP
-recouped VBD B-VP
-some DT B-NP
-of IN B-PP
-the DT B-NP
-sharp JJ I-NP
-losses NNS I-NP
-staged VBD B-VP
-early JJ B-NP
-last JJ I-NP
-week NN I-NP
-on IN B-PP
-the DT B-NP
-back RB I-NP
-of IN B-PP
-Wall NNP B-NP
-Street NNP I-NP
-'s POS B-NP
-fall NN I-NP
-. . O
-
-London NNP B-NP
-was VBD B-VP
-weak JJ B-ADJP
-throughout IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-trading NN I-NP
-, , O
-however RB B-ADVP
-, , O
-on IN B-PP
-what WP B-NP
-dealers NNS B-NP
-attributed VBD B-VP
-to TO B-PP
-generally RB B-NP
-thin JJ I-NP
-interest NN I-NP
-ahead RB B-ADVP
-of IN B-PP
-the DT B-NP
-weekend NN I-NP
-and CC O
-this DT B-NP
-week NN I-NP
-'s POS I-NP
-potentially RB B-ADJP
-important JJ I-ADJP
-U.K. NNP B-NP
-trade NN I-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-. . O
-
-The DT B-NP
-FT-SE NNP I-NP
-100 CD I-NP
-largely RB B-ADVP
-remained VBD B-VP
-within IN B-PP
-an DT B-NP
-11-point JJ I-NP
-range NN I-NP
-establshed VBN B-VP
-within IN B-PP
-the DT B-NP
-first JJ I-NP
-hour NN I-NP
-of IN B-PP
-trading NN B-NP
-before IN B-PP
-it PRP B-NP
-eased VBD B-VP
-to TO B-PP
-an DT B-NP
-intraday JJ I-NP
-low JJ I-NP
-late RB B-ADVP
-in IN B-PP
-the DT B-NP
-session NN I-NP
-when WRB B-ADVP
-a DT B-NP
-flurry NN I-NP
-of IN B-PP
-program NN B-NP
-selling VBG I-NP
-pushed VBN B-VP
-Wall NNP B-NP
-Street NNP I-NP
-lower JJR B-ADVP
-. . O
-
-The DT B-NP
-FT NNP I-NP
-30-share JJ I-NP
-index NN I-NP
-closed VBD B-VP
-11.0 CD B-NP
-points NNS I-NP
-lower JJR B-ADVP
-at IN B-PP
-1761.0 CD B-NP
-. . O
-
-Volume NN B-NP
-was VBD B-VP
-extremely RB B-ADJP
-thin JJ I-ADJP
-at IN B-PP
-351.3 CD B-NP
-million CD I-NP
-shares NNS I-NP
-, , O
-the DT B-NP
-lightest JJS I-NP
-volume NN I-NP
-of IN B-PP
-the DT B-NP
-week NN I-NP
-and CC O
-modestly RB B-ADVP
-under IN B-PP
-Thursday NNP B-NP
-'s POS B-NP
-387.4 CD I-NP
-million CD I-NP
-shares NNS I-NP
-. . O
-
-Dealers NNS B-NP
-said VBD B-VP
-the DT B-NP
-day NN I-NP
-'s POS B-NP
-action NN I-NP
-was VBD B-VP
-featureless JJ B-ADJP
-outside IN B-PP
-some DT B-NP
-response NN I-NP
-to TO B-PP
-sterling NN B-NP
-'s POS B-NP
-early JJ I-NP
-weakness NN I-NP
-against IN B-PP
-the DT B-NP
-mark NN I-NP
-, , O
-and CC O
-fears NNS B-NP
-that IN B-SBAR
-Wall NNP B-NP
-Street NNP I-NP
-might MD B-VP
-open RB I-VP
-lower JJR B-ADVP
-after IN B-PP
-its PRP$ B-NP
-strong JJ I-NP
-leap NN I-NP
-forward RB B-ADVP
-Thursday NNP B-NP
-. . O
-
-They PRP B-NP
-added VBD B-VP
-that IN B-SBAR
-market-makers NNS B-NP
-were VBD B-VP
-largely RB I-VP
-sidelined VBN I-VP
-after IN B-PP
-aggressively RB B-VP
-supporting VBG I-VP
-the DT B-NP
-market NN I-NP
-Thursday NNP B-NP
-in IN B-PP
-their PRP$ B-NP
-quest NN I-NP
-to TO B-VP
-cover VB I-VP
-internal JJ B-NP
-shortages NNS I-NP
-of IN B-PP
-FT-SE NNP B-NP
-100 CD I-NP
-shares NNS I-NP
-. . O
-
-Interest NN B-NP
-may MD B-VP
-remain VB I-VP
-limited JJ B-ADJP
-into IN B-PP
-tomorrow NN B-NP
-'s POS B-NP
-U.K. NNP I-NP
-trade NN I-NP
-figures NNS I-NP
-, , O
-which WDT B-NP
-the DT B-NP
-market NN I-NP
-will MD B-VP
-be VB I-VP
-watching VBG I-VP
-closely RB B-ADVP
-to TO B-VP
-see VB I-VP
-if IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-any DT B-NP
-improvement NN I-NP
-after IN B-PP
-disappointing JJ B-NP
-numbers NNS I-NP
-in IN B-PP
-the DT B-NP
-previous JJ I-NP
-two CD I-NP
-months NNS I-NP
-. . O
-
-The DT B-NP
-key JJ I-NP
-corporate JJ I-NP
-news NN I-NP
-of IN B-PP
-the DT B-NP
-day NN I-NP
-was VBD B-VP
-that IN B-SBAR
-British JJ B-NP
-Airways NNPS I-NP
-decided VBD B-VP
-to TO I-VP
-withdraw VB I-VP
-from IN B-PP
-a DT B-NP
-management-led JJ I-NP
-bid NN I-NP
-for IN B-PP
-UAL NNP B-NP
-Corp. NNP I-NP
-, , O
-the DT B-NP
-parent NN I-NP
-of IN B-PP
-United NNP B-NP
-Airlines NNPS I-NP
-. . O
-
-British JJ B-NP
-Airways NNPS I-NP
-rose VBD B-VP
-initially RB B-ADVP
-after IN B-PP
-announcing VBG B-VP
-its PRP$ B-NP
-withdrawal NN I-NP
-from IN B-PP
-the DT B-NP
-UAL NNP I-NP
-deal NN I-NP
-. . O
-
-Dealers NNS B-NP
-said VBD B-VP
-they PRP B-NP
-viewed VBD B-VP
-the DT O
-initial JJ O
-# # O
-390-million CD O
--LRB- ( O
-$ $ B-ADJP
-622 CD O
-million CD O
--RRB- ) O
-outlay NN B-NP
-for IN B-PP
-a DT B-NP
-15 CD I-NP
-% NN I-NP
-stake NN I-NP
-in IN B-PP
-the DT B-NP
-airline NN I-NP
-as IN B-PP
-a DT B-NP
-bit NN I-NP
-much JJ I-NP
-. . O
-
-Its PRP$ B-NP
-shares NNS I-NP
-slid VBD B-VP
-in IN B-PP
-late JJ B-NP
-dealings NNS I-NP
-to TO B-VP
-close VB I-VP
-a DT B-NP
-penny NN I-NP
-per IN B-PP
-share NN B-NP
-lower JJR B-ADVP
-at IN B-PP
-197 CD B-NP
-pence NN I-NP
-. . O
-
-The DT B-NP
-airline NN I-NP
-was VBD B-VP
-the DT B-NP
-most RBS I-NP
-active JJ I-NP
-FT-SE NNP I-NP
-100 CD I-NP
-at IN B-PP
-8.2 CD B-NP
-million CD I-NP
-shares NNS I-NP
-traded VBN B-VP
-. . O
-
-The DT B-NP
-next JJ I-NP
-most RBS I-NP
-active JJ I-NP
-top-tier JJ I-NP
-stock NN I-NP
-was VBD B-VP
-B.A.T NNP B-NP
-Industries NNPS I-NP
-, , O
-the DT B-NP
-target NN I-NP
-of IN B-PP
-Sir NNP B-NP
-James NNP I-NP
-Goldsmith NNP I-NP
-'s POS B-NP
-# # B-ADJP
-13.4 CD O
-billion CD O
-bid NN B-NP
-. . O
-
-The DT B-NP
-company NN I-NP
-gained VBD B-VP
-shareholder NN B-NP
-approval NN I-NP
-Thursday NNP B-NP
-to TO B-VP
-restructure VB I-VP
-in IN B-PP
-a DT B-NP
-bid NN I-NP
-to TO B-VP
-fend VB I-VP
-off IN B-PRT
-the DT B-NP
-hostile JJ I-NP
-takeover NN I-NP
-. . O
-
-Sir NNP B-NP
-James NNP I-NP
-said VBD B-VP
-Thursday NNP B-NP
-night NN I-NP
-that IN B-SBAR
-his PRP$ B-NP
-plans NNS I-NP
-for IN B-PP
-the DT B-NP
-takeover NN I-NP
-had VBD B-VP
-n't RB I-VP
-changed VBN I-VP
-. . O
-
-B.A.T NNP B-NP
-ended VBD B-VP
-the DT B-NP
-day NN I-NP
-at IN B-PP
-778 CD B-NP
-, , O
-down JJ B-ADVP
-5 NN B-NP
-, , O
-on IN B-PP
-turnover NN B-NP
-of IN B-PP
-7.5 CD B-NP
-million CD I-NP
-shares NNS I-NP
-. . O
-
-Dealers NNS B-NP
-said VBD B-VP
-it PRP B-NP
-was VBD B-VP
-hit VBN I-VP
-by IN B-PP
-some DT B-NP
-profit-taking NN I-NP
-after IN B-PP
-gains NNS B-NP
-since IN B-PP
-mid-week NN B-NP
-. . O
-
-In IN B-PP
-other JJ B-NP
-active JJ I-NP
-shares NNS I-NP
-, , O
-Trusthouse NNP B-NP
-Forte NNP I-NP
-shed VB B-VP
-10 CD B-NP
-to TO B-PP
-294 CD B-NP
-on IN B-PP
-volume NN B-NP
-of IN B-PP
-6.4 CD B-NP
-million CD I-NP
-shares NNS I-NP
-after IN B-PP
-a DT B-NP
-Barclays NNP I-NP
-De NNP I-NP
-Zoete NNP I-NP
-Wedd NNP I-NP
-downgrading NN I-NP
-, , O
-while IN B-SBAR
-Hillsdown NNP B-NP
-Holdings NNP I-NP
-, , O
-a DT B-NP
-food NN I-NP
-products NNS I-NP
-concern VBP I-NP
-, , O
-was VBD B-VP
-boosted VBN I-VP
-2 CD B-NP
-to TO B-PP
-271 CD B-NP
-after IN O
-it PRP B-NP
-disclosed VBD B-VP
-it PRP B-NP
-would MD B-VP
-seek VB I-VP
-shareholder NN B-NP
-approval NN I-NP
-to TO B-VP
-begin VB I-VP
-share NN B-NP
-repurchases NNS I-NP
-. . O
-
-Elsewhere RB B-ADVP
-in IN B-PP
-Europe NNP B-NP
-, , O
-share NN B-NP
-prices NNS I-NP
-closed VBD B-VP
-higher JJR B-ADVP
-in IN B-PP
-Stockholm NNP B-NP
-, , I-NP
-Brussels NNP I-NP
-and CC I-NP
-Milan NNP I-NP
-. . O
-
-Prices NNS B-NP
-were VBD B-VP
-lower JJR B-ADJP
-in IN B-PP
-Frankfurt NNP B-NP
-, , I-NP
-Zurich NNP I-NP
-, , I-NP
-Paris NNP I-NP
-and CC I-NP
-Amsterdam NNP I-NP
-. . O
-
-South JJ B-NP
-African JJ I-NP
-gold NN I-NP
-stocks NNS I-NP
-closed VBD B-VP
-moderately RB B-ADVP
-lower JJR I-ADVP
-. . O
-
-Share NN B-NP
-prices NNS I-NP
-closed VBD B-VP
-higher JJR B-ADVP
-in IN B-PP
-Sydney NNP B-NP
-, , O
-Taipei NNP B-NP
-, , O
-Wellington NNP B-NP
-, , O
-Manila NNP B-NP
-, , O
-Hong NNP B-NP
-Kong NNP I-NP
-and CC O
-Singapore NNP B-NP
-and CC O
-were VBD B-VP
-lower JJR B-ADJP
-in IN B-PP
-Seoul NNP B-NP
-. . O
-
-Here RB B-ADVP
-are VBP B-VP
-price NN B-NP
-trends NNS I-NP
-on IN B-PP
-the DT B-NP
-world NN I-NP
-'s POS B-NP
-major JJ I-NP
-stock NN I-NP
-markets NNS I-NP
-, , O
-as IN B-SBAR
-calculated VBN B-VP
-by IN B-PP
-Morgan NNP B-NP
-Stanley NNP I-NP
-Capital NNP I-NP
-International NNP I-NP
-Perspective NNP I-NP
-, , O
-Geneva NNP B-NP
-. . O
-
-To TO B-VP
-make VB I-VP
-them PRP B-NP
-directly RB B-ADJP
-comparable JJ I-ADJP
-, , O
-each DT B-NP
-index NN I-NP
-is VBZ B-VP
-based VBN I-VP
-on IN B-PP
-the DT B-NP
-close NN I-NP
-of IN B-PP
-1969 CD B-NP
-equaling VBG B-VP
-100 CD B-NP
-. . O
-
-The DT B-NP
-percentage NN I-NP
-change NN I-NP
-is VBZ B-VP
-since IN B-PP
-year-end NN B-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-is VBZ B-VP
-required VBN I-VP
-to TO I-VP
-notify VB I-VP
-foreign JJ B-NP
-dictators NNS I-NP
-if IN B-SBAR
-it PRP B-NP
-knows VBZ B-VP
-of IN B-PP
-coup NN B-NP
-plans NNS I-NP
-likely JJ B-ADJP
-to TO B-VP
-endanger VB I-VP
-their PRP$ B-NP
-lives NNS I-NP
-, , O
-government NN B-NP
-officials NNS I-NP
-said VBD B-VP
-. . O
-
-The DT B-NP
-notification NN I-NP
-policy NN I-NP
-was VBD B-VP
-part NN B-NP
-of IN B-PP
-a DT B-NP
-set NN I-NP
-of IN B-PP
-guidelines NNS B-NP
-on IN B-PP
-handling NN B-VP
-coups NNS B-NP
-outlined VBN B-VP
-in IN B-PP
-a DT B-NP
-secret JJ I-NP
-1988 CD I-NP
-exchange NN I-NP
-of IN B-PP
-letters NNS B-NP
-between IN B-PP
-the DT B-NP
-Reagan NNP I-NP
-administration NN I-NP
-and CC O
-the DT B-NP
-Senate NNP I-NP
-Intelligence NNP I-NP
-Committee NNP I-NP
-. . O
-
-The DT B-NP
-existence NN I-NP
-of IN B-PP
-the DT B-NP
-guidelines NNS I-NP
-has VBZ B-VP
-become VBN I-VP
-known VBN I-VP
-since IN B-SBAR
-President NNP B-NP
-Bush NNP I-NP
-disclosed VBD B-VP
-them PRP B-NP
-privately RB B-ADVP
-to TO B-PP
-seven CD B-NP
-Republican NNP I-NP
-senators NNS I-NP
-at IN B-PP
-a DT B-NP
-White NNP I-NP
-House NNP I-NP
-meeting NN I-NP
-last JJ B-NP
-Monday NNP I-NP
-. . O
-
-Officials NNS B-NP
-familiar JJ B-ADJP
-with IN B-PP
-the DT B-NP
-meeting NN I-NP
-said VBD B-VP
-Mr. NNP B-NP
-Bush NNP I-NP
-cited VBD B-VP
-the DT B-NP
-policy NN I-NP
-as IN B-PP
-an DT B-NP
-example NN I-NP
-of IN B-PP
-the DT B-NP
-sort NN I-NP
-of IN B-PP
-congressional JJ B-NP
-requirements NNS I-NP
-the DT B-NP
-administration NN I-NP
-contends VBZ B-VP
-contribute VB B-VP
-to TO B-PP
-the DT B-NP
-failure NN I-NP
-of IN B-PP
-such JJ B-NP
-covert JJ I-NP
-actions NNS I-NP
-as IN B-PP
-this DT B-NP
-month NN I-NP
-'s POS B-NP
-futile JJ I-NP
-effort NN I-NP
-to TO B-VP
-oust VB I-VP
-Panamanian JJ B-NP
-dictator NN I-NP
-Manuel NNP I-NP
-Noriega NNP I-NP
-. . O
-
-According VBG B-PP
-to TO B-PP
-the DT B-NP
-officials NNS I-NP
-, , O
-Mr. NNP B-NP
-Bush NNP I-NP
-even RB B-ADVP
-read VB B-VP
-to TO B-PP
-the DT B-NP
-senators NNS I-NP
-selections NNS B-NP
-from IN B-PP
-a DT B-NP
-highly RB I-NP
-classified VBN I-NP
-letter NN I-NP
-from IN B-PP
-the DT B-NP
-committee NN I-NP
-to TO B-PP
-the DT B-NP
-White NNP I-NP
-House NNP I-NP
-discussing VBG B-VP
-the DT B-NP
-guidelines NNS I-NP
-. . O
-
-They PRP B-NP
-said VBD B-VP
-the DT B-NP
-president NN I-NP
-conceded VBD B-VP
-the DT B-NP
-notification NN I-NP
-requirement NN I-NP
-did VBD B-VP
-n't RB I-VP
-affect VB I-VP
-his PRP$ B-NP
-decision NN I-NP
-to TO B-VP
-lend VB I-VP
-only RB B-NP
-minor JJ I-NP
-support NN I-NP
-to TO B-PP
-this DT B-NP
-month NN I-NP
-'s POS B-NP
-Panama NNP I-NP
-coup NN I-NP
-effort NN I-NP
-. . O
-
-No DT B-NP
-notification NN I-NP
-was VBD B-VP
-ever RB I-VP
-considered VBN I-VP
-, , O
-officials NNS B-NP
-said VBD B-VP
-, , O
-apparently RB B-ADVP
-because IN B-SBAR
-the DT B-NP
-U.S. NNP I-NP
-did VBD B-VP
-n't RB I-VP
-think VB I-VP
-the DT B-NP
-coup NN I-NP
-plotters NNS I-NP
-intended VBN B-VP
-to TO I-VP
-kill VB I-VP
-Mr. NNP B-NP
-Noriega NNP I-NP
-, , O
-but CC O
-merely RB B-VP
-sought VBD I-VP
-to TO I-VP
-imprison VB I-VP
-him PRP B-NP
-. . O
-
-What WP B-NP
-'s VBZ B-VP
-more JJR B-NP
-, , O
-both DT B-NP
-administration NN B-NP
-and CC O
-congressional JJ B-NP
-officials NNS I-NP
-hint VBP B-VP
-that IN B-SBAR
-the DT B-NP
-notification NN I-NP
-requirement NN I-NP
-is VBZ B-VP
-likely JJ B-ADJP
-to TO B-VP
-be VB I-VP
-dropped VBN I-VP
-from IN B-PP
-the DT B-NP
-guidelines NNS I-NP
-on IN B-PP
-coup NN B-NP
-attempts NNS I-NP
-that WDT B-NP
-are VBP B-VP
-being VBG I-VP
-rewritten VBN I-VP
-by IN B-PP
-the DT B-NP
-panel NN I-NP
-and CC O
-the DT B-NP
-White NNP I-NP
-House NNP I-NP
-. . O
-
-The DT B-NP
-rewriting VBG I-NP
-was VBD B-VP
-launched VBN I-VP
-at IN B-PP
-a DT B-NP
-meeting NN I-NP
-between IN B-PP
-Mr. NNP B-NP
-Bush NNP I-NP
-and CC O
-intelligence NN B-NP
-committee NN I-NP
-leaders NNS I-NP
-Oct. NNP B-NP
-12 CD I-NP
-, , O
-a DT B-NP
-few JJ I-NP
-days NNS I-NP
-before IN B-PP
-the DT B-NP
-meeting NN I-NP
-at IN B-PP
-which WDT B-NP
-the DT B-NP
-president NN I-NP
-complained VBD B-VP
-about IN B-PP
-the DT B-NP
-rules NNS I-NP
-. . O
-
-However RB B-ADVP
-, , O
-the DT B-NP
-disclosure NN I-NP
-of IN B-PP
diff --git a/paddle/trainer/tests/train_files.txt b/paddle/trainer/tests/train_files.txt
deleted file mode 100644
index 1c26891495..0000000000
--- a/paddle/trainer/tests/train_files.txt
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/train_proto.bin
diff --git a/paddle/trainer/tests/train_sparse.list b/paddle/trainer/tests/train_sparse.list
deleted file mode 100644
index 6ea020e220..0000000000
--- a/paddle/trainer/tests/train_sparse.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/compare_sparse_data
diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h
index 0add66da74..5c2c504f53 100644
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
@@ -17,8 +17,7 @@ limitations under the License. */
 
 #include <fenv.h>
 
-#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
-    !defined(__aarch64__)
+#if defined(__APPLE__) || defined(__OSX__)
 
 int fegetexcept(void);
 int feenableexcept(unsigned int excepts);
diff --git a/paddle/utils/arch/osx/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp
index 42ecaa06d2..ac44461578 100644
--- a/paddle/utils/arch/osx/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
@@ -14,9 +14,13 @@ limitations under the License. */
 
 #include "paddle/utils/Excepts.h"
 
-#if (defined(__APPLE__) || defined(__OSX__)) && !defined(__arm__) && \
-    !defined(__aarch64__)
-
+#if defined(__APPLE__) || defined(__OSX__)
+#if defined(__arm__) || defined(__arm64__)
+// TODO(liuyiqun): implement the arm version
+int fegetexcept(void) { return -1; }
+int feenableexcept(unsigned int excepts) { return -1; }
+int fedisableexcept(unsigned int excepts) { return -1; }
+#else
 int fegetexcept(void) {
   static fenv_t fenv;
   return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
@@ -49,5 +53,5 @@ int fedisableexcept(unsigned int excepts) {
 
   return (fesetenv(&fenv) ? -1 : old_excepts);
 }
-
+#endif
 #endif
diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp
index fdc914d1bc..248f58a7f2 100644
--- a/paddle/utils/tests/test_StringUtils.cpp
+++ b/paddle/utils/tests/test_StringUtils.cpp
@@ -18,6 +18,6 @@ limitations under the License. */
 
 TEST(StringUtil, to) {
   ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
-  ASSERT_DEATH(paddle::str::to<double>("12.45x23"), ".*");
-  ASSERT_DEATH(paddle::str::to<int>(""), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
 }
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index ebf0911d6e..1fbdd5bbd8 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -139,6 +139,8 @@ message PoolConfig {
   optional uint32 output_z = 16 [ default = 1 ];
   optional uint32 img_size_z = 17 [ default = 1 ];
   optional uint32 padding_z = 18 [ default = 1 ];
+
+  optional bool exclude_mode = 19;
 }
 
 message SppConfig {
@@ -321,6 +323,19 @@ message ClipConfig {
   required double max = 2;
 }
 
+message ROIPoolConfig {
+  required uint32 pooled_width = 1;
+  required uint32 pooled_height = 2;
+  required float spatial_scale = 3;
+  optional uint32 height = 4 [ default = 1 ];
+  optional uint32 width = 5 [ default = 1 ];
+}
+
+message ScaleSubRegionConfig {
+  required ImageConfig image_conf = 1;
+  required float value = 2;
+}
+
 message LayerInputConfig {
   required string input_layer_name = 1;
   optional string input_parameter_name = 2;
@@ -342,6 +357,8 @@ message LayerInputConfig {
   optional MultiBoxLossConfig multibox_loss_conf = 16;
   optional DetectionOutputConfig detection_output_conf = 17;
   optional ClipConfig clip_conf = 18;
+  optional ScaleSubRegionConfig scale_sub_region_conf = 19;
+  optional ROIPoolConfig roi_pool_conf = 20;
 }
 
 message LayerConfig {
@@ -525,6 +542,13 @@ message LayerConfig {
 
   // for switch order layer
   optional ReshapeConfig reshape_conf = 59;
+
+  // for batch normalization layer
+  // The small constant added to the variance to improve numeric stability.
+  optional double epsilon = 60 [ default = 0.00001 ];
+
+  // for factorization machine layer
+  optional uint32 factor_size = 61;
 }
 
 message EvaluatorConfig {
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 7bd6d59b00..c8632295a2 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -37,13 +37,14 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
 
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
         DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so)
 
 
 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    COMMAND touch stub.cc
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
@@ -65,7 +66,7 @@ if (WITH_TESTING)
     add_subdirectory(paddle/v2/tests)
     add_subdirectory(paddle/v2/reader/tests)
     add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/v2/framework/tests)
+    add_subdirectory(paddle/v2/fluid/tests)
   endif()
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index f662d68263..1030c94e16 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -11,3 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+try:
+    from version import full_version as __version__
+    from version import commit as __git_commit__
+except ImportError:
+    import sys
+    sys.stderr.write('''Warning with import paddle: you should not 
+     import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
+                     )
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index e88e962cff..239fe4204b 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1116,35 +1116,6 @@ def PyData(files=None,
     return data_config
 
 
-@config_func
-def ProtoData(files=None,
-              type=None,
-              file_group_queue_capacity=None,
-              load_file_count=None,
-              constant_slots=None,
-              load_thread_num=None,
-              **xargs):
-    data_config = create_data_config_proto(**xargs)
-    if type is None:
-        data_config.type = 'proto'
-    else:
-        data_config.type = type
-    data_config.files = files
-
-    # When type="proto_group", one data provider contains at most
-    # load_file_count files, and there are at most
-    # (queue_capacity + load_thread_num + 1) data providers in memory
-    if file_group_queue_capacity is not None:
-        data_config.file_group_conf.queue_capacity = file_group_queue_capacity
-    if load_file_count is not None:
-        data_config.file_group_conf.load_file_count = load_file_count
-    if load_thread_num is not None:
-        data_config.file_group_conf.load_thread_num = load_thread_num
-    if constant_slots:
-        data_config.constant_slots.extend(constant_slots)
-    return data_config
-
-
 #real data for training is actually provided by "sub_data" data providers.
 @config_func
 def MultiData(sub_data=[]):
@@ -1200,8 +1171,14 @@ def TestData(data_config, async_load_data=None):
 
 #caffe_mode: compute the output size using floor instead of ceil,
 #            which is consistent of caffe and CuDNN's convention.
-def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
-    output = (2 * padding + img_size - filter_size) / float(stride)
+def cnn_output_size(img_size,
+                    filter_size,
+                    padding,
+                    stride,
+                    caffe_mode,
+                    dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    output = (2 * padding + img_size - filter_s) / float(stride)
     if caffe_mode:
         return 1 + int(math.floor(output))
     else:
@@ -1210,8 +1187,14 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
 
 #calcualte image_size based on output_size for de-convolution (ConvTransLayer).
 #It is the reverse function of cnn_output_size
-def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
-    img_size = (output_size - 1) * stride + filter_size - 2 * padding
+def cnn_image_size(output_size,
+                   filter_size,
+                   padding,
+                   stride,
+                   caffe_mode,
+                   dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    img_size = (output_size - 1) * stride + filter_s - 2 * padding
     if not caffe_mode:
         img_size = img_size + 1
     return img_size
@@ -1250,12 +1233,12 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
     bilinear_conf.out_size_y = bilinear.out_size_y
 
 
-def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
+def parse_pool(pool, input_layer_name, pool_conf, ceil_mode, exclude_mode):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
-        'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
-    ], "pool-type %s is not in "
-                  "['max-projection', 'avg-projection', "
+        'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
+    ], "pool-type %s is not in " \
+              "['max-projection', 'avg-projection', 'max-pool-with-mask'," \
                   "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)
 
     pool_conf.channels = pool.channels
@@ -1279,6 +1262,8 @@ def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
     pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
                                          pool_conf.padding_y,
                                          pool_conf.stride_y, not ceil_mode)
+    if exclude_mode != None:
+        pool_conf.exclude_mode = exclude_mode
 
 
 def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode):
@@ -1376,6 +1361,12 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
     conv_conf.stride_y = conv.stride_y
     conv_conf.groups = conv.groups
     conv_conf.caffe_mode = conv.caffe_mode
+    if not conv.dilation:
+        conv.dilation = 1
+        conv.dilation_y = 1
+    else:
+        conv_conf.dilation = conv.dilation
+        conv_conf.dilation_y = conv.dilation_y
 
     if not trans:
         conv_conf.filter_channels = conv.channels / conv.groups
@@ -1383,20 +1374,20 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
             get_img_size(input_layer_name, conv.channels)
         conv_conf.output_x = cnn_output_size(
             conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
         conv_conf.output_y = cnn_output_size(
             conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
     else:
         conv_conf.filter_channels = num_filters / conv.groups
         conv_conf.output_x, conv_conf.output_y = \
             get_img_size(input_layer_name, conv.channels)
         conv_conf.img_size = cnn_image_size(
             conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
         conv_conf.img_size_y = cnn_image_size(
             conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
 
 
 #caffe_mode: compute the output size using floor instead of ceil,
@@ -1808,7 +1799,7 @@ class FCLayer(LayerBase):
             self.layer_type = 'mkldnn_fc'
             config_assert(
                 len(inputs) == 1,
-                "MkldnnFCLayer support one and only one input!")
+                "MKLDNNFCLayer support one and only one input!")
         super(FCLayer, self).__init__(
             name, self.layer_type, size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
@@ -1819,7 +1810,7 @@ class FCLayer(LayerBase):
             sparse = format == "csr" or format == "csc"
             if use_mkldnn:
                 config_assert(not sparse,
-                              "MkldnnFCLayer do not support sparse format yet")
+                              "MKLDNNFCLayer do not support sparse format yet")
                 if use_mkldnn_wgt:
                     dims = [self.config.size, input_layer.size]
             if sparse:
@@ -1835,7 +1826,7 @@ class FCLayer(LayerBase):
 
 
 @config_layer('mkldnn_fc')
-class MkldnnFcLayer(FCLayer):
+class MKLDNNFcLayer(FCLayer):
     layer_type = 'mkldnn_fc'
 
 
@@ -1969,6 +1960,18 @@ class DetectionOutputLayer(LayerBase):
         self.config.size = size
 
 
+@config_layer('roi_pool')
+class ROIPoolLayer(LayerBase):
+    def __init__(self, name, inputs, pooled_width, pooled_height, spatial_scale,
+                 num_channels, **xargs):
+        super(ROIPoolLayer, self).__init__(name, 'roi_pool', 0, inputs)
+        config_assert(len(inputs) == 2, 'ROIPoolLayer must have 2 inputs')
+        self.config.inputs[0].roi_pool_conf.pooled_width = pooled_width
+        self.config.inputs[0].roi_pool_conf.pooled_height = pooled_height
+        self.config.inputs[0].roi_pool_conf.spatial_scale = spatial_scale
+        self.set_cnn_layer(name, pooled_height, pooled_width, num_channels)
+
+
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self,
@@ -2036,13 +2039,20 @@ class ParameterReluLayer(LayerBase):
     def __init__(self, name, inputs, partial_sum=1, **args):
         super(ParameterReluLayer, self).__init__(
             name, self.layer_type, 0, inputs=inputs, **args)
+
         input_layer = self.get_input_layer(0)
         config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
         config_assert(input_layer.size % partial_sum == 0,
                       "a wrong setting for partial_sum")
+
+        dims = [1, input_layer.size / partial_sum]
         self.set_layer_size(input_layer.size)
         self.config.partial_sum = partial_sum
-        self.create_input_parameter(0, input_layer.size / partial_sum)
+        self.create_input_parameter(0, input_layer.size / partial_sum, dims)
+
+        self.set_layer_height_width(self.get_input_layer(0).height, \
+                                        self.get_input_layer(0).width)
+        self.set_layer_depth(self.get_input_layer(0).depth)
 
 
 @config_layer('conv')
@@ -2279,11 +2289,17 @@ class Conv3DLayer(Conv3DLayerBase):
 class NormLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
         super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs)
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        use_mkldnn = True if use_mkldnn and self.inputs[
+            0].norm.norm_type == 'cmrnorm-projection' else False
+        self.config.type = 'mkldnn_lrn' if use_mkldnn else self.config.type
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             norm_conf = self.config.inputs[input_index].norm_conf
             parse_norm(self.inputs[input_index].norm, input_layer.name,
                        norm_conf)
+            norm_conf.scale = self.inputs[
+                input_index].norm.scale if use_mkldnn else norm_conf.scale
             self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
                                norm_conf.channels, False)
             if norm_conf.norm_type == "cross-channel-norm":
@@ -2295,7 +2311,8 @@ class NormLayer(LayerBase):
 class PoolLayer(LayerBase):
     layer_type = 'pool'
 
-    def __init__(self, name, inputs, ceil_mode=True, **xargs):
+    def __init__(self, name, inputs, ceil_mode=True, exclude_mode=None,
+                 **xargs):
         use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
         if self.layer_type == "mkldnn_pool":
             config_assert(use_mkldnn, "mkldnn_pool only support MKLDNN")
@@ -2306,7 +2323,7 @@ class PoolLayer(LayerBase):
             input_layer = self.get_input_layer(input_index)
             pool_conf = self.config.inputs[input_index].pool_conf
             parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf, ceil_mode)
+                       pool_conf, ceil_mode, exclude_mode)
             self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
                                pool_conf.channels)
 
@@ -2392,6 +2409,14 @@ class CropLayer(LayerBase):
         image_conf.img_size_y = input_layer.height
         image_conf.channels = input_layer.size / (input_layer.width *
                                                   input_layer.height)
+        # only support for 4-dims inputs and NCHW order
+        if (len(self.config.inputs) == 2):
+            self.set_layer_height_width(
+                self.get_input_layer(1).height, self.get_input_layer(1).width)
+            self.set_layer_size(self.get_input_layer(1).size)
+        else:
+            self.set_layer_height_width(shape[-2], shape[-1])
+            self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
 
 
 @config_layer('batch_norm')
@@ -2404,6 +2429,7 @@ class BatchNormLayer(LayerBase):
                  bias=True,
                  img3D=False,
                  use_global_stats=True,
+                 epsilon=1e-5,
                  moving_average_fraction=0.9,
                  batch_norm_type=None,
                  mean_var_names=None,
@@ -2452,6 +2478,9 @@ class BatchNormLayer(LayerBase):
             self.config.use_global_stats = use_global_stats
         if moving_average_fraction is not None:
             self.config.moving_average_fraction = moving_average_fraction
+        if epsilon is not None:
+            assert epsilon >= 1e-5, "epsilon must be no less than 1e-5."
+            self.config.epsilon = epsilon
 
         input_layer = self.get_input_layer(0)
         image_conf = self.config.inputs[0].image_conf
@@ -2684,7 +2713,7 @@ Usage:
              max_sort_size = -1, inputs = ["output", "score"])
 
   Input data: Samples of the same query should be loaded as a sequence,
-          by ProtoDataProvider or PyDataProvider etc.. User should provide
+          by PyDataProvider etc.. User should provide
           scores for each sample. The score slot should be the 2nd
           input of lambdaRank layer.
 
@@ -2775,27 +2804,37 @@ class NCELayer(LayerBase):
 
 @config_layer('addto')
 class AddToLayer(LayerBase):
+    layer_type = 'addto'
+
     def __init__(self, name, inputs, bias=True, **xargs):
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        if self.layer_type == "mkldnn_addto":
+            config_assert(use_mkldnn, "mkldnn_addto only support MKLDNN")
+        self.layer_type = 'mkldnn_addto' if use_mkldnn else 'addto'
         super(AddToLayer, self).__init__(
-            name, 'addto', 0, inputs=inputs, **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
         config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')
 
-        if len(self.inputs) > 1:
-            for input_index in xrange(len(self.inputs)):
-                assert self.get_input_layer(0).height == self.get_input_layer(
-                    input_index).height
-                assert self.get_input_layer(0).width == self.get_input_layer(
-                    input_index).width
-                assert self.get_input_layer(0).depth == self.get_input_layer(
-                    input_index).depth
+        layer_size = self.get_input_layer(0).size
+        # To reserve heght, width, depth.
+        layer_with_hwc = self.get_input_layer(0)
+        for input_index in xrange(len(self.inputs)):
+            input_layer = self.get_input_layer(input_index)
+            assert layer_size == input_layer.size
+            if input_layer.height and input_layer.height and input_layer.height:
+                layer_with_hwc = input_layer
 
-        self.set_layer_size(self.get_input_layer(0).size)
-        self.set_layer_height_width(self.get_input_layer(0).height, \
-                                        self.get_input_layer(0).width)
-        self.set_layer_depth(self.get_input_layer(0).depth)
+        self.set_layer_size(layer_with_hwc.size)
+        self.set_layer_height_width(layer_with_hwc.height, layer_with_hwc.width)
+        self.set_layer_depth(layer_with_hwc.depth)
         self.create_bias_parameter(bias, self.config.size)
 
 
+@config_layer('mkldnn_addto')
+class MKLDNNAddtoLayer(AddToLayer):
+    layer_type = 'mkldnn_addto'
+
+
 @config_layer('agent')
 class AgentLayer(LayerBase):
     def __init__(self, name, size, device=None):
@@ -3168,6 +3207,18 @@ class SubNestedSequenceLayer(LayerBase):
         self.set_layer_size(size)
 
 
+@config_layer('dot_prod')
+class DotProdLayer(LayerBase):
+    def __init__(self, name, inputs, device=None):
+        super(DotProdLayer, self).__init__(
+            name, 'dot_prod', 0, inputs, device=device)
+        config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.')
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            "Two inputs should have the same size.")
+        self.set_layer_size(1)
+
+
 @config_layer('out_prod')
 class OuterProdLayer(LayerBase):
     def __init__(self, name, inputs, device=None):
@@ -3289,6 +3340,20 @@ class RowL2NormLayer(LayerBase):
         self.set_layer_size(input_layer.size)
 
 
+@config_layer('cos')
+class CosSimLayer(LayerBase):
+    def __init__(self, name, inputs, cos_scale=1, device=None):
+        super(CosSimLayer, self).__init__(
+            name, 'cos', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2,
+            'The CosSimLayer expects two and only two inputs.')
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            'The two inputs of CosSimLayer must have the same dimensionality.')
+        self.config.cos_scale = cos_scale
+
+
 @config_layer('cos_vm')
 class CosSimVecMatLayer(LayerBase):
     def __init__(self, name, size, inputs, cos_scale=1.0, device=None):
@@ -3296,10 +3361,24 @@ class CosSimVecMatLayer(LayerBase):
             name, 'cos_vm', size, inputs=inputs, device=device)
         self.config.cos_scale = cos_scale
         config_assert(
-            len(self.inputs) == 2, 'CosSimVecMatLayer must have 2 inputs')
+            len(self.inputs) == 2, 'The CosSimVecMatLayer must have 2 inputs.')
         config_assert(
             size * self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'Wrong input size for CosSimVecMatLayer')
+            'Wrong input size for CosSimVecMatLayer.')
+
+
+@config_layer('l2_distance')
+class L2DistanceLayer(LayerBase):
+    def __init__(self, name, inputs, device=None):
+        super(L2DistanceLayer, self).__init__(
+            name, 'l2_distance', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2, ('The L2DistanceLayer must have '
+                                    'and only have 2 inputs.'))
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            ('Two inputs of the L2DistanceLayer must have '
+             'the same dimensionality.'))
 
 
 @config_layer('sampling_id')
@@ -3343,18 +3422,6 @@ class AverageLayer(LayerBase):
         self.create_bias_parameter(bias, self.config.size)
 
 
-@config_layer('cos')
-class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=1, device=None):
-        super(CosSimLayer, self).__init__(
-            name, 'cos', 1, inputs=inputs, device=device)
-        config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs')
-        config_assert(
-            self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'inputs of CosSimLayer must have same dim')
-        self.config.cos_scale = cos_scale
-
-
 @config_layer('tensor')
 class TensorLayer(LayerBase):
     def __init__(self, name, size, inputs, bias=True, **xargs):
@@ -3465,11 +3532,17 @@ def ExpressionLayer(name, inputs, **xargs):
 
 @config_layer('concat')
 class ConcatenateLayer(LayerBase):
+    layer_type = 'concat'
+
     def __init__(self, name, inputs, bias=False, **xargs):
         config_assert(inputs, 'inputs cannot be empty')
         config_assert(not bias, 'ConcatenateLayer cannot support bias.')
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        if self.layer_type == "mkldnn_concat":
+            config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN")
+        self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat'
         super(ConcatenateLayer, self).__init__(
-            name, 'concat', 0, inputs=inputs, **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
         size = 0
         for input_index in xrange(len(self.inputs)):
             assert self.get_input_layer(0).height == self.get_input_layer(
@@ -3489,6 +3562,11 @@ class ConcatenateLayer(LayerBase):
         self.set_layer_size(size)
 
 
+@config_layer('mkldnn_concat')
+class MKLDNNConcatLayer(ConcatenateLayer):
+    layer_type = 'mkldnn_concat'
+
+
 # like concat layer, but each input layer was processed by a Projection.
 @config_layer('concat2')
 class ConcatenateLayer2(LayerBase):
@@ -3788,6 +3866,60 @@ class SwitchOrderLayer(LayerBase):
             name, 'switch_order', 0, inputs=inputs, **xargs)
         self.config.reshape_conf.height_axis.extend(reshape['height'])
         self.config.reshape_conf.width_axis.extend(reshape['width'])
+        input_layer = self.get_input_layer(0)
+        if reshape is None:
+            self.set_layer_size(input_layer.size)
+        else:
+            in_h = input_layer.height
+            in_w = input_layer.width
+            out_dims = None
+            if input_layer.has_depth():
+                in_d = input_layer.depth
+                in_c = input_layer.size / in_h / in_w / in_d
+                # batch_size, depth, height, width, channel
+                out_dims = [0, in_d, in_h, in_w, in_c]
+            else:
+                in_c = input_layer.size / in_h / in_w
+                # batch_size, height, width, channel
+                out_dims = [0, in_h, in_w, in_c]
+            # Because (reshape['width'][0] > 0) always be true.
+            # So out_dims[0] won't be used.
+            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
+            self.set_layer_size(size)
+
+
+@config_layer('scale_sub_region')
+class ScaleSubRegionLayer(LayerBase):
+    def __init__(self, name, inputs, value, **xargs):
+        super(ScaleSubRegionLayer, self).__init__(
+            name, 'scale_sub_region', 0, inputs=inputs, **xargs)
+        scale_sub_region_conf = self.config.inputs[0].scale_sub_region_conf
+        scale_sub_region_conf.value = value
+
+        # get channel, width and height from input_0 layer
+        input_layer = self.get_input_layer(0)
+        image_conf = scale_sub_region_conf.image_conf
+        image_conf.img_size = input_layer.width
+        image_conf.img_size_y = input_layer.height
+        image_conf.channels = input_layer.size / (input_layer.width *
+                                                  input_layer.height)
+        self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
+                           image_conf.channels)
+
+
+@config_layer('factorization_machine')
+class FactorizationMachineLayer(LayerBase):
+    def __init__(self, name, inputs, factor_size, **xargs):
+        super(FactorizationMachineLayer, self).__init__(
+            name, 'factorization_machine', size=1, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'factorization machine layer must have one and only one input.')
+        self.config.factor_size = factor_size
+        input_layer = self.get_input_layer(0)
+        psize = input_layer.size * factor_size
+        dims = [input_layer.size, factor_size]
+        self.create_input_parameter(0, psize, dims)
 
 
 # Deprecated, use a new layer specific class instead
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
index c749fa827f..00efc01c05 100644
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -17,7 +17,8 @@ __all__ = [
     "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
     'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
     "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
-    "LogActivation", "SqrtActivation", "ReciprocalActivation"
+    "LogActivation", "SqrtActivation", "ReciprocalActivation",
+    "SoftSignActivation"
 ]
 
 
@@ -243,8 +244,20 @@ class ReciprocalActivation(BaseActivation):
     Reciprocal Activation.
 
     .. math::
-       f(z) = 1/z
+       f(z)=\\frac{1}{z}
     """
 
     def __init__(self):
         BaseActivation.__init__(self, 'reciprocal', False)
+
+
+class SoftSignActivation(BaseActivation):
+    """
+    SoftSign Activation.
+
+    .. math::
+       f(z)=\\frac{z}{1 + |z|}
+    """
+
+    def __init__(self):
+        BaseActivation.__init__(self, 'softsign', False)
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 57979db4de..95797fba8f 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -297,7 +297,7 @@ def auc_evaluator(
 def pnpair_evaluator(
         input,
         label,
-        info,
+        query_id,
         weight=None,
         name=None, ):
     """
@@ -308,16 +308,20 @@ def pnpair_evaluator(
 
     .. code-block:: python
 
-       eval = pnpair_evaluator(input, label, info)
+       eval = pnpair_evaluator(input, label, query_id)
 
     :param input: Input Layer name. The output prediction of network.
     :type input: LayerOutput
     :param label: Label layer name.
     :type label: LayerOutput
-    :param info: Info layer name. (TODO, explaination)
-    :type info: LayerOutput
+    :param query_id: Query_id layer name. Query_id indicates that which query
+     each sample belongs to. Its shape should be
+     the same as output of Label layer.
+    :type query_id: LayerOutput
     :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. (TODO, explaination)
+                  [sample_num, 1] which indicates the weight of each sample.
+                  The default weight of sample is 1 if the weight layer is None.
+                  And the pair weight is the mean of the two samples' weight.
     :type weight: LayerOutput
     :param name: Evaluator name.
     :type name: None|basestring
@@ -326,8 +330,8 @@ def pnpair_evaluator(
         input = [input]
     if label:
         input.append(label)
-    if info:
-        input.append(info)
+    if query_id:
+        input.append(query_id)
     evaluator_base(
         input=input,
         type="pnpair",
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 6e8ac8838b..8c5cc25d6c 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -20,8 +20,8 @@ from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
     ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, BasePoolingType, \
-    CudnnAvgPooling, CudnnMaxPooling
+from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
+    CudnnAvgPooling, CudnnAvgInclPadPooling, CudnnMaxPooling
 from .attrs import *
 from .default_decorators import *
 
@@ -51,6 +51,7 @@ __all__ = [
     'last_seq',
     'first_seq',
     'cos_sim',
+    'l2_distance_layer',
     'hsigmoid',
     'conv_projection',
     'square_error_cost',
@@ -115,6 +116,7 @@ __all__ = [
     'huber_classification_cost',
     'block_expand_layer',
     'maxout_layer',
+    'dot_prod_layer',
     'out_prod_layer',
     'printer_layer',
     'print_layer',
@@ -122,6 +124,7 @@ __all__ = [
     'cross_channel_norm_layer',
     'multibox_loss_layer',
     'detection_output_layer',
+    'roi_pool_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -143,6 +146,9 @@ __all__ = [
     'scale_shift_layer',
     'img_conv3d_layer',
     'resize_layer',
+    'sub_seq_layer',
+    'scale_sub_region_layer',
+    'factorization_machine',
 ]
 
 
@@ -164,6 +170,7 @@ class LayerType(object):
     COST = 'cost'
     COSINE_SIM_VEC = 'cos_vm'
     COSINE_SIM = 'cos'
+    L2_DISTANCE = 'l2_distance'
     HSIGMOID = 'hsigmoid'
     CONV_LAYER = 'conv'
     CONVTRANS_LAYER = 'convt'
@@ -194,6 +201,7 @@ class LayerType(object):
     SCALING_LAYER = 'scaling'
     TRANS_LAYER = 'trans'
     ROTATE_LAYER = 'rotate'
+    DOT_PROD_LAYER = 'dot_prod'
     OUT_PROD_LAYER = 'out_prod'
     FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
 
@@ -219,6 +227,7 @@ class LayerType(object):
     PRIORBOX_LAYER = 'priorbox'
     MULTIBOX_LOSS_LAYER = 'multibox_loss'
     DETECTION_OUTPUT_LAYER = 'detection_output'
+    ROI_POOL_LAYER = 'roi_pool'
 
     CTC_LAYER = 'ctc'
     WARP_CTC_LAYER = 'warp_ctc'
@@ -252,6 +261,11 @@ class LayerType(object):
     SCALE_SHIFT_LAYER = 'scale_shift'
 
     RESIZE = 'resize'
+    SUB_SEQ_LAYER = 'subseq'
+
+    SCALE_SUB_REGION_LAYER = 'scale_sub_region'
+
+    FACTORIZATION_MACHINE = 'factorization_machine'
 
     @staticmethod
     def is_layer_type(type_name):
@@ -784,10 +798,9 @@ class MixedLayerType(LayerOutput):
         :type size: int
         :param act: Activation type.
         :type act: BaseActivation
-        :param bias_attr: The Bias Attribute. If the parameter is set to
-                          False or something not type of ParameterAttribute,
-                          no bias is defined. If the parameter is set to
-                          True, the bias is initialized to zero.
+        :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                          whose type is not ParameterAttribute, no bias is defined. If the
+                          parameter is set to True, the bias is initialized to zero.
         :type bias_attr: ParameterAttribute | None | bool | Any
         :param layer_attr: Extra Layer Attribute.
         :type layer_attr: ExtraLayerAttribute or None
@@ -882,12 +895,11 @@ def mixed_layer(size=0,
     :type size: int
     :param input: The input of this layer. It is an optional parameter. If set,
                   then this function will just return layer's name.
-    :param act: Activation Type. LinearActivation is the default.
+    :param act: Activation Type. LinearActivation is the default activation.
     :type act: BaseActivation
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: The extra layer config. Default is None.
     :type layer_attr: ExtraLayerAttribute
@@ -1025,14 +1037,13 @@ def fc_layer(input,
     :type input: LayerOutput | list | tuple
     :param size: The layer dimension.
     :type size: int
-    :param act: Activation Type. TanhActivation is the default.
+    :param act: Activation Type. TanhActivation is the default activation.
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute|list.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer config.
     :type layer_attr: ExtraLayerAttribute | None
@@ -1303,6 +1314,50 @@ def detection_output_layer(input_loc,
         name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
 
 
+@wrap_name_default("roi_pool")
+def roi_pool_layer(input,
+                   rois,
+                   pooled_width,
+                   pooled_height,
+                   spatial_scale,
+                   num_channels=None,
+                   name=None):
+    """
+    A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+    feature map.
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput.
+    :param rois: The input ROIs' data.
+    :type rois: LayerOutput.
+    :param pooled_width: The width after pooling.
+    :type pooled_width: int
+    :param pooled_height: The height after pooling.
+    :type pooled_height: int
+    :param spatial_scale: The spatial scale between the image and feature map.
+    :type spatial_scale: float
+    :param num_channels: number of input channel.
+    :type num_channels: int
+    :return: LayerOutput
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+    size = num_channels * pooled_width * pooled_height
+    Layer(
+        name=name,
+        type=LayerType.ROI_POOL_LAYER,
+        inputs=[input.name, rois.name],
+        pooled_width=pooled_width,
+        pooled_height=pooled_height,
+        spatial_scale=spatial_scale,
+        num_channels=num_channels)
+    return LayerOutput(
+        name, LayerType.ROI_POOL_LAYER, parents=[input, rois], size=size)
+
+
 @wrap_name_default("cross_channel_norm")
 def cross_channel_norm_layer(input, name=None, param_attr=None):
     """
@@ -1385,10 +1440,9 @@ def pooling_layer(input,
     :type pooling_type: BasePoolingType | None
     :param stride: The step size between successive pooling regions.
     :type stride: Int
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: The Extra Attributes for layer, such as dropout.
     :type layer_attr: ExtraLayerAttribute | None
@@ -1480,16 +1534,15 @@ def lstmemory(input,
     :type input: LayerOutput
     :param reverse: is sequence process reversed or not.
     :type reverse: bool
-    :param act: Activation type. TanhActivation is the default. :math:`h_t`
+    :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
     :param gate_act: gate activation type, SigmoidActivation by default.
     :type gate_act: BaseActivation
     :param state_act: state activation type, TanhActivation by default.
     :type state_act: BaseActivation
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: Parameter Attribute.
     :type param_attr: ParameterAttribute | None | False
@@ -1612,10 +1665,9 @@ def grumemory(input,
                      This activation affects the :math:`z_t` and :math:`r_t`. It is the
                      :math:`\\sigma` in the above formula.
     :type gate_act: BaseActivation
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: Parameter Attribute.
     :type param_attr: ParameterAttribute | None | False
@@ -1812,10 +1864,9 @@ def expand_layer(input,
     :type expand_as: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param expand_level: whether input layer is timestep(default) or sequence.
     :type expand_level: ExpandLevel
@@ -1852,9 +1903,12 @@ def repeat_layer(input,
     A layer for repeating the input for num_repeats times.
 
     If as_row_vector:
+
     .. math::
        y  = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
+
     If not as_row_vector:
+
     .. math::
        y  = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
 
@@ -1867,19 +1921,19 @@ def repeat_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_repeats: Repeat the input so many times
+    :param num_repeats: The times of repeating the input.
     :type num_repeats: int
     :param name: The name of this layer. It is optional.
-    :param as_row_vector: True for treating input as row vector and repeating
-                          in the column direction.  This is equivalent to apply
-                          concat_layer() with num_repeats same input.
-                          False for treating input as column vector and repeating
-                          in the row direction.
+    :type name: basestring
+    :param as_row_vector: Whether to treat the input as row vectors or not. If
+                          the parameter is set to True, the repeating operation
+                          will be performed in the column direction. Otherwise,
+                          it will be performed in the row direction.
     :type as_row_vector: bool
-    :param act: Activation type. IdentityActivation is the default.
+    :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1926,18 +1980,18 @@ def seq_reshape_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param reshape_size: the size of reshaped sequence.
+    :param reshape_size: The dimension of the reshaped sequence.
     :type reshape_size: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param act: Activation type. IdentityActivation is the default.
+    :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1961,7 +2015,7 @@ def seq_reshape_layer(input,
 @layer_support()
 def interpolation_layer(input, weight, name=None, layer_attr=None):
     """
-    This layer is for linear interpolation with two inputs,
+    This layer performs linear interpolation on two inputs,
     which is used in NEURAL TURING MACHINE.
 
     .. math::
@@ -1983,7 +2037,8 @@ def interpolation_layer(input, weight, name=None, layer_attr=None):
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2017,7 +2072,7 @@ def bilinear_interp_layer(input,
                           name=None,
                           layer_attr=None):
     """
-    This layer is to implement bilinear interpolation on conv layer output.
+    This layer implements bilinear interpolation on convolutional layer's output.
 
     Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation
 
@@ -2027,18 +2082,19 @@ def bilinear_interp_layer(input,
 
        bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
 
-    :param   input:        A input layer.
-    :type    input:        LayerOutput.
-    :param   out_size_x:   bilinear interpolation output width.
-    :type    out_size_x:   int | None
-    :param   out_size_y:   bilinear interpolation output height.
-    :type    out_size_y:   int | None
-    :param   name:         The layer's name, which cna not be specified.
-    :type    name:         None | basestring
-    :param   layer_attr:   Extra Layer attribute.
-    :type    layer_attr:   ExtraLayerAttribute
+    :param input: The input of this layer.
+    :type input: LayerOutput.
+    :param out_size_x: The width of the output.
+    :type out_size_x: int
+    :param out_size_y: The height of the output.
+    :type out_size_y: int
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype:  LayerOutput
+    :rtype: LayerOutput
     """
     assert input.layer_type == LayerType.CONV_LAYER
     assert isinstance(input.activation, LinearActivation)
@@ -2073,8 +2129,8 @@ def power_layer(input, weight, name=None, layer_attr=None):
     .. math::
        y = x^w
 
-    where :math:`x` is a input vector, :math:`w` is scalar weight,
-    and :math:`y` is a output vector.
+    where :math:`x` is an input vector, :math:`w` is a scalar exponent,
+    and :math:`y` is an output vector.
 
     The example usage is:
 
@@ -2084,11 +2140,12 @@ def power_layer(input, weight, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The exponent of the power.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2128,11 +2185,12 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The weight of each sample.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2170,7 +2228,8 @@ def trans_layer(input, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2206,11 +2265,14 @@ def rotate_layer(input, height, width, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param height: The height of the sample matrix
+    :param height: The height of the sample matrix.
     :type height: int
+    :param width: The width of the sample matrix.
+    :type width: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2255,15 +2317,15 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: input layer a
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: input layer b
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param scale: scale for cosine value. default is 5.
+    :param scale: The scale of the cosine similarity. 1 is the default value.
     :type scale: float
-    :param size: layer size. NOTE size_a * size should equal size_b.
+    :param size: The dimension of this layer. NOTE size_a * size should equal size_b.
     :type size: int
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2289,6 +2351,51 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
     return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b], size=size)
 
 
+@wrap_name_default()
+@layer_support()
+def l2_distance_layer(x, y, name=None, layer_attr=None):
+    """
+    This layer calculates and returns the Euclidean distance between two input
+    vectors x and y. The equation is as follows:
+
+    ..  math::
+        l2_distance(\\mathbf{x}, \\mathbf{y}) = \\sqrt{\\sum_{i=1}^D(x_i - y_i)}
+
+    The output size of this layer is fixed to be 1. Note that the above
+    computation is for one sample. Multiple samples are processed in one batch.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       l2_sim = l2_distance(x=layer1, y=layer2)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param x: The first input x for this layer, whose output is a matrix with
+              dimensionality N x D. N is the sample number in a mini-batch.
+              D is the dimensionality of x's output.
+    :type x: LayerOutput
+    :param y: The second input y for this layer, whose output is a matrix with
+              dimensionality N x D. N is the sample number in a mini-batch.
+              D is the dimensionality of y's output.
+    :type y: LayerOutput
+    :param layer_attr: The extra layer attributes, for example, drop rate.
+                       See ExtraLayerAttribute for more details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: The returned LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(x, LayerOutput) and isinstance(y, LayerOutput)
+    Layer(
+        name=name,
+        type=LayerType.L2_DISTANCE,
+        inputs=[x.name, y.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(name, LayerType.L2_DISTANCE, parents=[x, y], size=1)
+
+
 @wrap_name_default()
 @wrap_bias_attr_default(has_bias=True)
 @wrap_param_attr_default()
@@ -2303,8 +2410,10 @@ def hsigmoid(input,
     """
     Organize the classes into a binary tree. At each node, a sigmoid function
     is used to calculate the probability of belonging to the right branch.
-    This idea is from "F. Morin, Y. Bengio (AISTATS 05):
-    Hierarchical Probabilistic Neural Network Language Model."
+
+    Reference:
+        `Hierarchical Probabilistic Neural Network Language Model
+        <http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf>`_
 
     The example usage is:
 
@@ -2315,20 +2424,21 @@ def hsigmoid(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput | list | tuple
-    :param label: Label layer.
+    :param label: The input label.
     :type label: LayerOutput
-    :param num_classes: number of classes.
-    :type num_classes: int | None
+    :param num_classes: The number of classes. And it should be larger than 2. If the parameter
+                        is not set or set to None, its actual value will be automatically set to
+                        the number of labels.
+    :type num_classes: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: Parameter Attribute. None means default parameter.
-    :type param_attr: ParameterAttribute | None
-    :param layer_attr: Extra Layer Attribute.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2416,12 +2526,12 @@ def img_conv_layer(input,
     input is raw pixels of image(mono or RGB), or it may be the previous layer's
     num_filters * num_group.
 
-    There are several group of filter in PaddlePaddle implementation.
-    Each group will process some channel of the inputs. For example, if an input
+    There are several groups of filters in PaddlePaddle implementation.
+    Each group will process some channels of the input. For example, if
     num_channel = 256, group = 4, num_filter=32, the PaddlePaddle will create
-    32*4 = 128 filters to process inputs. The channels will be split into 4
-    pieces. First 256/4 = 64 channels will process by first 32 filters. The
-    rest channels will be processed by rest group of filters.
+    32*4 = 128 filters to process the input. The channels will be split into 4
+    pieces. First 256/4 = 64 channels will be processed by first 32 filters. The
+    rest channels will be processed by the rest groups of filters.
 
     The example usage is:
 
@@ -2437,54 +2547,68 @@ def img_conv_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
-                        two image dimension.
+    :param filter_size: The dimensions of the filter kernel. If the parameter is
+                        set to one integer, the two dimensions on x and y axises
+                        will be same when filter_size_y is not set. If it is set
+                        to a list, the first element indicates the dimension on
+                        the x axis, and the second is used to specify the dimension
+                        on the y axis when filter_size_y is not provided.
     :type filter_size: int | tuple | list
-    :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle
-                        currently supports rectangular filters, the filter's
-                        shape will be (filter_size, filter_size_y).
-    :type filter_size_y: int | None
+    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
+                          is not set, it will be set automatically according to filter_size.
+    :type filter_size_y: int
     :param num_filters: Each filter group's number of filter
-    :param act: Activation type. ReluActivation is the default.
+    :param act: Activation type. ReluActivation is the default activation.
     :type act: BaseActivation
-    :param groups: Group size of filters.
+    :param groups: The group number. 1 is the default group number.
     :type groups: int
-    :param stride: The x dimension of the stride. Or input a tuple for two image
-                   dimension.
+    :param stride: The strides. If the parameter is set to one integer, the strides
+                   on x and y axises will be same when stride_y is not set. If it is
+                   set to a list, the first element indicates the stride on the x axis,
+                   and the second is used to specify the stride on the y axis when
+                   stride_y is not provided. 1 is the default value.
     :type stride: int | tuple | list
-    :param stride_y: The y dimension of the stride.
+    :param stride_y: The stride on the y axis.
     :type stride_y: int
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension
+    :param padding: The padding sizes. If the parameter is set to one integer, the padding
+                    sizes on x and y axises will be same when padding_y is not set. If it
+                    is set to a list, the first element indicates the padding size on the
+                    x axis, and the second is used to specify the padding size on the y axis
+                    when padding_y is not provided. 0 is the default padding size.
     :type padding: int | tuple | list
-    :param padding_y: The y dimension of the padding.
+    :param padding_y: The padding size on the y axis.
     :type padding_y: int
-    :param dilation: The x dimension of the dilation. Or input a tuple for two
-                    image dimension
+    :param dilation: The dimensions of the dilation. If the parameter is set to one integer,
+                     the two dimensions on x and y axises will be same when dilation_y is not
+                     set. If it is set to a list, the first element indicates the dimension
+                     on the x axis, and the second is used to specify the dimension on the y
+                     axis when dilation_y is not provided. 1 is the default dimension.
     :type dilation: int | tuple | list
-    :param dilation_y: The y dimension of the dilation.
+    :param dilation_y: The dimension of the dilation on the y axis.
     :type dilation_y: int
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param num_channels: number of input channels. If None will be set
-                        automatically from previous output.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
     :type num_channels: int
-    :param param_attr: Convolution param attribute. None means default attribute
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param shared_biases: Is biases will be shared between filters or not.
+    :param shared_biases: Whether biases will be shared between filters or not.
     :type shared_biases: bool
-    :param layer_attr: Layer Extra Attribute.
+    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param trans: true if it is a convTransLayer, false if it is a convLayer
+    :param trans: True if it is a convTransLayer, False if it is a convLayer
     :type trans: bool
-    :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt" or "cudnn_convt",
-                       otherwise layer_type has to be either "exconv" or
-                       "cudnn_conv"
-    :type layer_type: String
+    :param layer_type: Specify the layer type. If the dilation's dimension on one axis is
+                       larger than 1, layer_type has to be "cudnn_conv" or "cudnn_convt".
+                       If trans=True, layer_type has to be "exconvt" or "cudnn_convt",
+                       otherwise layer_type has to be either "exconv" or "cudnn_conv".
+    :type layer_type: basestring
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2530,7 +2654,9 @@ def img_conv_layer(input,
 
     if layer_type:
         if dilation > 1 or dilation_y > 1:
-            assert layer_type in ["cudnn_conv", "cudnn_convt"]
+            assert layer_type in [
+                "cudnn_conv", "cudnn_convt", "exconv", "exconvt"
+            ]
         if trans:
             assert layer_type in ["exconvt", "cudnn_convt"]
         else:
@@ -2583,11 +2709,12 @@ def img_pool_layer(input,
                    pool_size_y=None,
                    stride_y=None,
                    padding_y=None,
-                   ceil_mode=True):
+                   ceil_mode=True,
+                   exclude_mode=None):
     """
     Image pooling Layer.
 
-    The details of pooling layer, please refer ufldl's pooling_ .
+    The details of pooling layer, please refer to ufldl's pooling_ .
 
     .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
 
@@ -2619,33 +2746,43 @@ def img_pool_layer(input,
                                  padding_y=2,
                                  pool_type=MaxPooling())
 
-    :param padding: pooling padding width.
+    :param padding: The padding size on the x axis. 0 is the default padding size.
     :type padding: int
-    :param padding_y: pooling padding height. It's equal to padding by default.
-    :type padding_y: int | None
-    :param name: name of pooling layer
-    :type name: basestring.
+    :param padding_y: The padding size on the y axis. If the parameter is not set
+                      or set to None, it will be set to 'padding' automatically.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param pool_size: pooling window width
+    :param pool_size: The pooling window length on the x axis.
     :type pool_size: int
-    :param pool_size_y: pooling window height. It's eaqual to pool_size by default.
-    :type pool_size_y: int | None
-    :param num_channels: number of input channel.
+    :param pool_size_y: The pooling window length on the y axis. If the parameter is
+                        not set or set to None, its actual value will be automatically
+                        set to pool_size.
+    :type pool_size_y: int
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
     :type num_channels: int
-    :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
-                      MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
     :type pool_type: BasePoolingType
-    :param stride: stride width of pooling.
+    :param stride: The stride on the x axis. 1 is the default value.
     :type stride: int
-    :param stride_y: stride height of pooling. It is equal to stride by default.
-    :type stride_y: int | None
-    :param layer_attr: Extra Layer attribute.
+    :param stride_y: The stride on the y axis. If the parameter is not set or set to
+                     None, its actual value will be automatically set to 'stride'.
+    :type stride_y: int
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use ceil mode to calculate output height and with.
-                      Defalut is True. If set false, Otherwise use floor.
-
+    :param ceil_mode: Whether to use the ceil function to calculate output height and width.
+                      True is the default. If it is set to False, the floor function will
+                      be used.
     :type ceil_mode: bool
+    :param exclude_mode: Whether to exclude the padding cells when calculating, but only 
+                         work when pool_type is AvgPooling. If None, also exclude the padding 
+                         cells. If use cudnn, use CudnnAvgPooling or CudnnAvgInclPadPooling 
+                         as pool_type to identify the mode.
+    :type exclude_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2658,9 +2795,9 @@ def img_pool_layer(input,
     elif isinstance(pool_type, AvgPooling):
         pool_type.name = 'avg'
 
-    assert type(pool_type) in [AvgPooling, MaxPooling, CudnnAvgPooling,
-                               CudnnMaxPooling], \
-        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling are supported"
+    assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
+                               CudnnMaxPooling, CudnnAvgInclPadPooling], \
+        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"
 
     type_name = pool_type.name + '-projection' \
         if (
@@ -2688,6 +2825,7 @@ def img_pool_layer(input,
                     padding_y=padding_y))
         ],
         ceil_mode=ceil_mode,
+        exclude_mode=exclude_mode,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -2750,24 +2888,32 @@ def img_pool3d_layer(input,
 
     :param padding: pooling padding width.
     :type padding: int | tuple | list
-    :param name: name of pooling layer
+    :param name: The name of this layer. It is optional.
     :type name: basestring.
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param pool_size: pooling window width
+    :param pool_size: The pooling window lengths along three axises. If the parameter
+                      is set to one integer, the three lengths will be same.
     :type pool_size: int | tuple | list
-    :param num_channels: number of input channel.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
     :type num_channels: int
-    :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
-                      MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
     :type pool_type: BasePoolingType
-    :param stride: stride width of pooling.
+    :param stride: The strides of the pooling along three axises. If the parameter
+                   is set to one integer, the three strides will be same. 1 is the
+                   default value.
     :type stride: int | tuple | list
-    :param layer_attr: Extra Layer attribute.
+    :param padding: The sizes of padding along three axises. If the parameter is set to
+                    one integer, they will be same. 0 is the default padding size.
+    :type padding: int | tuple | list
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use ceil mode to calculate output height and with.
-                      Defalut is True. If set false, Otherwise use floor.
-
+    :param ceil_mode: Wether to use the ceil function to calculate output height and width.
+                      True is the default. If it is set to False, the floor function will
+                      be used.
     :type ceil_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2846,9 +2992,11 @@ def spp_layer(input,
               pyramid_height=None,
               layer_attr=None):
     """
-    Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition.
-    The details please refer to
-    `Kaiming He's paper <https://arxiv.org/abs/1406.4729>`_.
+    A layer performs spatial pyramid pooling.
+
+    Reference:
+        `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
+        https://arxiv.org/abs/1406.4729`_
 
     The example usage is:
 
@@ -2863,13 +3011,16 @@ def spp_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_channels: number of input channel.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
     :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling or AveragePooling. Default is MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
     :type scale: BasePoolingType
-    :param pyramid_height: pyramid height.
+    :param pyramid_height: The pyramid height of this pooling.
     :type pyramid_height: int
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2944,8 +3095,10 @@ def img_cmrnorm_layer(input,
                       layer_attr=None):
     """
     Response normalization across feature maps.
-    The details please refer to
-    `Alex's paper <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_.
+
+    Reference:
+        `ImageNet Classification with Deep Convolutional Neural Networks
+        http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf`_
 
     The example usage is:
 
@@ -2954,7 +3107,7 @@ def img_cmrnorm_layer(input,
         norm = img_cmrnorm_layer(input=net, size=5)
 
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring
+    :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
     :param size: Normalize in number of :math:`size` feature maps.
@@ -2963,9 +3116,11 @@ def img_cmrnorm_layer(input,
     :type scale: float
     :param power: The hyper-parameter.
     :type power: float
-    :param num_channels: input layer's filers number or channels. If
-                         num_channels is None, it will be set automatically.
-    :param layer_attr: Extra Layer Attribute.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
+    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2989,11 +3144,12 @@ def batch_norm_layer(input,
                      param_attr=None,
                      layer_attr=None,
                      batch_norm_type=None,
+                     epsilon=1e-5,
                      moving_average_fraction=0.9,
                      use_global_stats=None,
                      mean_var_names=None):
     """
-    Batch Normalization Layer. The notation of this layer as follow.
+    Batch Normalization Layer. The notation of this layer is as follows.
 
     :math:`x` is the input features over a mini-batch.
 
@@ -3007,8 +3163,10 @@ def batch_norm_layer(input,
         \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
         y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
 
-    The details of batch normalization please refer to this
-    `paper <http://arxiv.org/abs/1502.03167>`_.
+    Reference:
+        `Batch Normalization: Accelerating Deep Network Training by Reducing
+        Internal Covariate Shift
+        http://arxiv.org/abs/1502.03167`_
 
     The example usage is:
 
@@ -3018,48 +3176,49 @@ def batch_norm_layer(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: batch normalization input. Better be linear activation.
-                Because there is an activation inside batch_normalization.
+    :param input: This layer's input which is to be performed batch normalization on.
     :type input: LayerOutput
     :param batch_norm_type: We have batch_norm, mkldnn_batch_norm and cudnn_batch_norm.
                             batch_norm supports CPU, MKLDNN and GPU. cudnn_batch_norm
                             requires cuDNN version greater or equal to v4 (>=v4).
                             But cudnn_batch_norm is faster and needs less
                             memory than batch_norm. mkldnn_batch_norm requires
-                            enable use_mkldnn. By default (None), we will
-                            automaticly select cudnn_batch_norm for GPU,
+                            use_mkldnn is enabled. By default (None), we will
+                            automatically select cudnn_batch_norm for GPU,
                             mkldnn_batch_norm for MKLDNN and batch_norm for CPU.
-                            Otherwise, select batch norm type based on the
-                            specified type. If you use cudnn_batch_norm,
-                            we suggested you use latest version, such as v5.1.
+                            Users can specify the batch norm type. If you use
+                            cudnn_batch_norm, we suggested you use latest version,
+                            such as v5.1.
     :type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm"
                            or "mkldnn_batch_norm"
-    :param act: Activation Type. Better be relu. Because batch
-                     normalization will normalize input near zero.
+    :param act: Activation type. ReluActivation is the default activation.
     :type act: BaseActivation
-    :param num_channels: num of image channels or previous layer's number of
-                         filters. None will automatically get from layer's
-                         input.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
     :type num_channels: int
-    :param bias_attr: :math:`\\beta`, better be zero when initialize. So the
-                      initial_std=0, initial_mean=1 is best practice.
+    :param bias_attr: :math:`\\beta`. The bias attribute. If the parameter is set to
+                      False or an object whose type is not ParameterAttribute, no
+                      bias is defined. If the parameter is set to True, the bias is
+                      initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: :math:`\\gamma`, better be one when initialize. So the
-                       initial_std=0, initial_mean=1 is best practice.
+    :param param_attr: :math:`\\gamma`. The parameter attribute. See ParameterAttribute
+                       for details.
     :type param_attr: ParameterAttribute
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param use_global_stats: whether use moving mean/variance statistics
-                             during testing peroid. If None or True,
-                             it will use moving mean/variance statistics during
-                             testing. If False, it will use the mean
-                             and variance of current batch of test data for
-                             testing.
+    :param use_global_stats: Whether use moving mean/variance statistics during
+                             testing peroid. If the parameter is set to None or
+                             True, it will use moving mean/variance statistics
+                             during testing. If the parameter is set to False, it
+                             will use the mean and variance of the current batch
+                             of test data.
     :type use_global_stats: bool | None.
-    :param moving_average_fraction: Factor used in the moving average
-                                   computation, referred to as facotr,
-                                   :math:`runningMean = newMean*(1-factor)
-                                   + runningMean*factor`
+    :param epsilon: The small constant added to the variance to improve numeric stability.
+    :type epsilon: float.
+    :param moving_average_fraction: Factor used in the moving average computation.
+                                   :math:`runningMean = newMean*(1-factor) + runningMean*factor`
     :type moving_average_fraction: float.
     :param mean_var_names: [mean name, variance name]
     :type mean_var_names: string list
@@ -3075,6 +3234,7 @@ def batch_norm_layer(input,
     assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \
            (batch_norm_type == "mkldnn_batch_norm") or \
            (batch_norm_type == "cudnn_batch_norm")
+
     l = Layer(
         name=name,
         img3D=img3D,
@@ -3084,6 +3244,7 @@ def batch_norm_layer(input,
         type=LayerType.BATCH_NORM_LAYER,
         batch_norm_type=batch_norm_type,
         bias=ParamAttr.to_bias(bias_attr),
+        epsilon=epsilon,
         moving_average_fraction=moving_average_fraction,
         use_global_stats=use_global_stats,
         mean_var_names=mean_var_names,
@@ -3121,8 +3282,9 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
-    :type layer_attr: ExtraLayerAttribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
+                       for details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3157,7 +3319,8 @@ def row_l2_norm_layer(input, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
+                       for details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3194,32 +3357,27 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
                             act=ReluActivation(),
                             bias_attr=False)
 
-    This layer just simply add all input layers together, then activate the sum
-    inputs. Each input of this layer should be the same size, which is also the
-    output size of this layer.
+    This layer just simply adds all input layers together, then activates the
+    sum. All inputs should share the same dimension, which is also the dimension
+    of this layer's output.
 
     There is no weight matrix for each input, because it just a simple add
     operation. If you want a complicated operation before add, please use
     mixed_layer.
 
-    It is a very good way to set dropout outside the layers. Since not all
-    PaddlePaddle layer support dropout, you can add an add_to layer, set
-    dropout here.
-    Please refer to dropout_layer for details.
-
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Input layers. It could be a LayerOutput or list/tuple of
+    :param input: The input layers. It could be a LayerOutput or list/tuple of
                  LayerOutput.
     :type input: LayerOutput | list | tuple
-    :param act: Activation Type. LinearActivation is the default.
+    :param act: Activation Type. LinearActivation is the default activation.
     :type act: BaseActivation
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: Extra Layer attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3258,8 +3416,8 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
 @layer_support(DROPOUT, ERROR_CLIPPING)
 def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
     """
-    Concat all input vector into one huge vector.
-    Inputs can be list of LayerOutput or list of projection.
+    Concatenate all input vectors to one vector.
+    Inputs can be a list of LayerOutput or a list of projection.
 
     The example usage is:
 
@@ -3269,11 +3427,12 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: input layers or projections
+    :param input: The input layers or projections
     :type input: list | tuple | collections.Sequence
-    :param act: Activation type. IdentityActivation is the default.
+    :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3343,7 +3502,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
 def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
                      bias_attr=None):
     """
-    Concat sequence a with sequence b.
+    Concatenate sequence a and sequence b.
 
     Inputs:
       - a = [a1, a2, ..., am]
@@ -3362,18 +3521,18 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: input sequence layer
+    :param a: The first input sequence layer
     :type a: LayerOutput
-    :param b: input sequence layer
+    :param b: The second input sequence layer
     :type b: LayerOutput
-    :param act: Activation type. IdentityActivation is the default.
+    :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3406,31 +3565,25 @@ def memory(name,
            boot_bias_active_type=None,
            boot_with_const_id=None):
     """
-    The memory layers is a layer cross each time step. Reference this output
-    as previous time step layer :code:`name` 's output.
-
-    The default memory is zero in first time step, previous time step's
-    output in the rest time steps.
+    The memory takes a layer's output at previous time step as its own output.
 
-    If boot_bias, the first time step value is this bias and
-    with activation.
+    If boot_bias, the activation of the bias is the initial value of the memory.
 
-    If boot_with_const_id, then the first time stop is a IndexSlot, the
-    Arguments.ids()[0] is this :code:`cost_id`.
+    If boot_with_const_id is set, then the memory's output at the first time step
+    is a IndexSlot, the Arguments.ids()[0] is this :code:`cost_id`.
 
-    If boot_layer is not null, the memory is just the boot_layer's output.
-    Set :code:`is_seq` is true boot layer is sequence.
+    If boot_layer is specified, the memory's output at the first time step will
+    be the boot_layer's output.
 
-    The same name layer in recurrent group will set memory on each time
-    step.
+    In other case, the default memory's output at the first time step is zero.
 
     .. code-block:: python
 
        mem = memory(size=256, name='state')
        state = fc_layer(input=mem, size=256, name='state')
 
-    If you do not want to specify the name, you can equivalently use set_input()
-    to specify the layer needs to be remembered as the following:
+    If you do not want to specify the name, you can also use set_input()
+    to specify the layer to be remembered as the following:
 
     .. code-block:: python
 
@@ -3438,26 +3591,31 @@ def memory(name,
        state = fc_layer(input=mem, size=256)
        mem.set_input(mem)
 
-    :param name: the name of the layer which this memory remembers.
+    :param name: The name of the layer which this memory remembers.
                  If name is None, user should call set_input() to specify the
                  name of the layer which this memory remembers.
     :type name: basestring
-    :param size: size of memory.
+    :param size: The dimensionality of memory.
     :type size: int
-    :param memory_name: the name of the memory.
-                        It is ignored when name is provided.
+    :param memory_name: The name of the memory. It is ignored when name is provided.
     :type memory_name: basestring
     :param is_seq: DEPRECATED. is sequence for boot_layer
     :type is_seq: bool
-    :param boot_layer: boot layer of memory.
+    :param boot_layer: This parameter specifies memory's output at the first time
+                       step and the output is boot_layer's output.
     :type boot_layer: LayerOutput | None
-    :param boot_bias: boot layer's bias
+    :param boot_bias: The bias attribute of memory's output at the first time step.
+                      If the parameter is set to False or an object whose type is not
+                      ParameterAttribute, no bias is defined. If the parameter is set
+                      to True, the bias is initialized to zero.
     :type boot_bias: ParameterAttribute | None
-    :param boot_bias_active_type: boot layer's active type.
+    :param boot_bias_active_type: Activation type for memory's bias at the first time
+                                  step. LinearActivation is the default activation.
     :type boot_bias_active_type: BaseActivation
-    :param boot_with_const_id: boot layer's id.
+    :param boot_with_const_id: This parameter specifies memory's output at the first
+                               time step and the output is an index.
     :type boot_with_const_id: int
-    :return: LayerOutput object which is a memory.
+    :return: LayerOutput object.
     :rtype: LayerOutput
     """
     if boot_bias_active_type is None:
@@ -3533,32 +3691,32 @@ def lstm_step_layer(input,
         ...
 
 
-    This layer has two outputs. Default output is :math:`h_t`. The other
-    output is :math:`o_t`, whose name is 'state' and can use
+    This layer has two outputs. The default output is :math:`h_t`. The other
+    output is :math:`o_t`, whose name is 'state' and users can use
     :code:`get_output_layer` to extract this output.
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param size: Layer's size. NOTE: lstm layer's size, should be equal to
-                 :code:`input.size/4`, and should be equal to
-                 :code:`state.size`.
+    :param size: The dimension of this layer's output, which must be
+                 equal to the dimension of the state.
     :type size: int
-    :param input: input layer. :math:`Wx_t + Wh_{t-1}`
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param state: State Layer. :math:`c_{t-1}`
+    :param state: The state of the LSTM unit.
     :type state: LayerOutput
-    :param act: Activation type. TanhActivation is the default.
+    :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param gate_act: Gate Activation Type. SigmoidActivation is the default.
+    :param gate_act: Activation type of the gate. SigmoidActivation is the
+                     default activation.
     :type gate_act: BaseActivation
-    :param state_act: State Activation Type. TanhActivation is the default.
+    :param state_act: Activation type of the state. TanhActivation is the
+                      default activation.
     :type state_act: BaseActivation
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3603,23 +3761,31 @@ def gru_step_layer(input,
                    layer_attr=None):
     """
 
-    :param input:
+    :param input: The input of this layer, whose dimension can be divided by 3.
     :type input: LayerOutput
-    :param output_mem:
-    :param size:
-    :param act:
+    :param output_mem: A memory which memorizes the output of this layer at previous
+                       time step.
+    :type output_mem: LayerOutput
+    :param size: The dimension of this layer's output. If it is not set or set to None,
+                 it will be set to one-third of the dimension of the input automatically.
+    :type size: int
+    :param act: Activation type of this layer's output. TanhActivation
+                is the default activation.
     :type act: BaseActivation
     :param name: The name of this layer. It is optional.
-    :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
+    :type name: basestring
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is
+                     the default activation.
     :type gate_act: BaseActivation
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: the parameter_attribute for transforming the output_mem
-                       from previous step.
-    :param layer_attr:
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3664,25 +3830,34 @@ def gru_step_naive_layer(input,
                          param_attr=None,
                          layer_attr=None):
     """
-    GRU Step Layer, but using MixedLayer to generate. It support ERROR_CLIPPING
+    GRU Step Layer, which is realized using PaddlePaddle API. It supports ERROR_CLIPPING
     and DROPOUT.
 
-    :param input:
-    :param output_mem:
-    :param size:
+    :param input: The input of this layer, whose dimensionality can be divided by 3.
+    :param output_mem: A memory which memorizes the output of this layer at previous
+                       time step.
+    :type output_mem: LayerOutput
+    :param size: The dimension of this layer's output. If it is not set or set to None,
+                 it will be set to one-third of the dimension of the input automatically.
+    :type size: int
     :param name: The name of this layer. It is optional.
-    :param act:
+    :type name: basestring
+    :param act: Activation type of this layer's output. TanhActivation
+                is the default activation.
     :type act: BaseActivation
-    :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation
+                     is the default activation.
     :type gate_act: BaseActivation
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr:
-    :param layer_attr:
-    :return:
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
     :rtype: LayerOutput
     """
     if input.size % 3 != 0:
@@ -3744,12 +3919,13 @@ def get_output_layer(input, arg_name, name=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: get output layer's input. And this layer should contains
+    :param input: The input layer. And this layer should contain
                    multiple outputs.
     :type input: LayerOutput
-    :param arg_name: Output name from input.
+    :param arg_name: The name of the output to be extracted from the input layer.
     :type arg_name: basestring
-    :param layer_attr: Layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3806,18 +3982,20 @@ def recurrent_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param act: Activation type. TanhActivation is the default.
+    :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If the parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: parameter attribute.
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3842,7 +4020,7 @@ def recurrent_layer(input,
 class StaticInput(object):
     """
     StaticInput is only used in recurrent_group which defines a read-only memory
-    that can be a sequence or non-sequence.
+    and can be a sequence or non-sequence.
     :param size: DEPRECATED
     :param is_seq: DEPRECATED
     """
@@ -3875,8 +4053,8 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
     Recurrent layer group is an extremely flexible recurrent unit in
     PaddlePaddle. As long as the user defines the calculation done within a
     time step, PaddlePaddle will iterate such a recurrent calculation over
-    sequence input. This is extremely usefull for attention based model, or
-    Neural Turning Machine like models.
+    sequence input. This is useful for attention-based models, or Neural
+    Turning Machine like models.
 
     The basic usage (time steps) is:
 
@@ -3898,18 +4076,17 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
                   demo/seqToseq/seqToseq_net.py
     - sequence steps: paddle/gserver/tests/sequence_nest_layer_group.conf
 
-    :param step: recurrent one time step function.The input of this function is
-                 input of the group. The return of this function will be
-                 recurrent group's return value.
+    :param step: A step function which takes the input of recurrent_group as its own
+                 input and returns values as recurrent_group's output every time step.
 
-                 The recurrent group scatter a sequence into time steps. And
-                 for each time step, will invoke step function, and return
-                 a time step result. Then gather each time step of output into
+                 The recurrent group scatters a sequence into time steps. And
+                 for each time step, it will invoke step function, and return
+                 a time step result. Then gather outputs of each time step into
                  layer group's output.
 
     :type step: callable
 
-    :param name: recurrent_group's name.
+    :param name: The recurrent_group's name. It is optional.
     :type name: basestring
 
     :param input: Input links array.
@@ -3917,11 +4094,11 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
                   LayerOutput will be scattered into time steps.
                   SubsequenceInput will be scattered into sequence steps.
                   StaticInput will be imported to each time step, and doesn't change
-                  through time. It's a mechanism to access layer outside step function.
+                  over time. It's a mechanism to access layer outside step function.
 
     :type input: LayerOutput | StaticInput | SubsequenceInput | list | tuple
 
-    :param reverse: If reverse is set true, the recurrent unit will process the
+    :param reverse: If reverse is set to True, the recurrent unit will process the
                     input sequence in a reverse order.
     :type reverse: bool
 
@@ -4056,7 +4233,8 @@ def maxid_layer(input, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4075,6 +4253,45 @@ def maxid_layer(input, name=None, layer_attr=None):
         size=l.config.size)
 
 
+@wrap_name_default()
+def dot_prod_layer(input1, input2, name=None, layer_attr=None):
+    """
+    A layer for computing the dot product of two vectors.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input1: The first input layer.
+    :type input1: LayerOutput
+    :param input2: The second input layer.
+    :type input2: LayerOutput
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input1, LayerOutput)
+    assert isinstance(input2, LayerOutput)
+    assert input1.size == input2.size, ("Two inputs should have the same size.")
+
+    l = Layer(
+        name=name,
+        type=LayerType.DOT_PROD_LAYER,
+        inputs=[input1.name, input2.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.DOT_PROD_LAYER,
+        parents=[input1, input2],
+        size=l.config.size)
+
+
 @wrap_name_default()
 def out_prod_layer(input1, input2, name=None, layer_attr=None):
     """
@@ -4089,11 +4306,12 @@ def out_prod_layer(input1, input2, name=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input1: The first input layer name.
+    :param input1: The first input layer.
     :type input: LayerOutput
-    :param input2: The second input layer name.
+    :param input2: The second input layer.
     :type input2: LayerOutput
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4132,9 +4350,10 @@ def eos_layer(input, eos_id, name=None, layer_attr=None):
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param eos_id: end id of sequence
+    :param eos_id: End id of sequence
     :type eos_id: int
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4195,8 +4414,9 @@ def beam_search(step,
     - machine translation : demo/seqToseq/translation/gen.conf \
                             demo/seqToseq/seqToseq_net.py
 
-    :param name: Name of the recurrent unit that generates sequences.
-    :type name: base string
+    :param name: The name of the recurrent unit that is responsible for
+                 generating sequences. It is optional.
+    :type name: basestring
     :param step: A callable function that defines the calculation in a time
                  step, and it is applied to sequences with arbitrary length by
                  sharing a same set of weights.
@@ -4321,16 +4541,18 @@ def square_error_cost(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Network prediction.
+    :param input: The first input layer.
     :type input: LayerOutput
-    :param label: Data label.
+    :param label: The input label.
     :type label: LayerOutput
-    :param weight: The weight affects the cost, namely the scale of cost.
-                   It is an optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4363,17 +4585,20 @@ def classification_cost(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: input layer name. network output.
+    :param input: The first input layer.
     :type input: LayerOutput
-    :param label: label layer name. data_layer often.
+    :param label: The input label.
     :type label: LayerOutput
-    :param weight: The weight affects the cost, namely the scale of cost.
-                   It is an optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param evaluator: Evaluator method.
-    :param layer_attr: layer's extra attribute.
+    :param evaluator: Evaluator method. classification_error_evaluator is the default.
+    :type evaluator: Evaluator method
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4426,7 +4651,7 @@ def conv_operator(img,
     Different from img_conv_layer, conv_op is an Operator, which can be used
     in mixed_layer. And conv_op takes two inputs to perform convolution.
     The first input is the image and the second is filter kernel. It only
-    support GPU mode.
+    supports GPU mode.
 
     The example usage is:
 
@@ -4438,27 +4663,31 @@ def conv_operator(img,
                           num_filters=64,
                           num_channels=64)
 
-    :param img: input image
+    :param img: The input image.
     :type img: LayerOutput
-    :param filter: input filter
+    :param filter: The input filter.
     :type filter: LayerOutput
-    :param filter_size: The x dimension of a filter kernel.
+    :param filter_size: The dimension of the filter kernel on the x axis.
     :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since
-                        PaddlePaddle now supports rectangular filters,
-                        the filter's shape can be (filter_size, filter_size_y).
+    :param filter_size_y: The dimension of the filter kernel on the y axis.
+                          If the parameter is not set or set to None, it will
+                          set to 'filter_size' automatically.
     :type filter_size_y: int
-    :param num_filters: channel of output data.
+    :param num_filters: The number of the output channels.
     :type num_filters: int
-    :param num_channels: channel of input data.
+    :param num_channels: The number of the input channels. If the parameter is not set
+                         or set to None, it will be automatically set to the channel
+                         number of the 'img'.
     :type num_channels: int
-    :param stride: The x dimension of the stride.
+    :param stride: The stride on the x axis.
     :type stride: int
-    :param stride_y: The y dimension of the stride.
+    :param stride_y: The stride on the y axis. If the parameter is not set or
+                     set to None, it will be set to 'stride' automatically.
     :type stride_y: int
-    :param padding: The x dimension of padding.
+    :param padding: The padding size on the x axis.
     :type padding: int
-    :param padding_y: The y dimension of padding.
+    :param padding_y: The padding size on the y axis. If the parameter is not set
+                      or set to None, it will be set to 'padding' automatically.
     :type padding_y: int
     :return: A ConvOperator Object.
     :rtype: ConvOperator
@@ -4509,9 +4738,9 @@ def conv_projection(input,
                     param_attr=None,
                     trans=False):
     """
-    Different from img_conv_layer and conv_op, conv_projection is an Projection,
-    which can be used in mixed_layer and conat_layer. It use cudnn to implement
-    conv and only support GPU mode.
+    Different from img_conv_layer and conv_op, conv_projection is a Projection,
+    which can be used in mixed_layer and concat_layer. It uses cudnn to implement
+    convolution and only supports GPU mode.
 
     The example usage is:
 
@@ -4524,32 +4753,45 @@ def conv_projection(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel.
-    :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since
-                          PaddlePaddle now supports rectangular filters,
-                          the filter's shape can be (filter_size, filter_size_y).
+    :param filter_size: The dimensions of the filter kernel. If the parameter is
+                        set to one integer, the two dimensions on x and y axises
+                        will be same when filter_size_y is not set. If it is set
+                        to a list, the first element indicates the dimension on
+                        the x axis, and the second is used to specify the dimension
+                        on the y axis when filter_size_y is not provided.
+    :type filter_size: int | tuple | list
+    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
+                          is not set, it will be set automatically according to filter_size.
     :type filter_size_y: int
-    :param num_filters: channel of output data.
+    :param num_filters: The number of filters.
     :type num_filters: int
-    :param num_channels: channel of input data.
+    :param num_channels: The number of the input channels.
     :type num_channels: int
-    :param stride: The x dimension of the stride.
-    :type stride: int
-    :param stride_y: The y dimension of the stride.
+    :param stride: The strides. If the parameter is set to one integer, the strides
+                   on x and y axises will be same when stride_y is not set. If it is
+                   set to a list, the first element indicates the stride on the x axis,
+                   and the second is used to specify the stride on the y axis when
+                   stride_y is not provided.
+    :type stride: int | tuple | list
+    :param stride_y: The stride on the y axis.
     :type stride_y: int
-    :param padding: The x dimension of padding.
-    :type padding: int
-    :param padding_y: The y dimension of padding.
+    :param padding: The padding sizes. If the parameter is set to one integer, the padding
+                    sizes on x and y axises will be same when padding_y is not set. If it
+                    is set to a list, the first element indicates the padding size on the
+                    x axis, and the second is used to specify the padding size on the y axis
+                    when padding_y is not provided.
+    :type padding: int | tuple | list
+    :param padding_y: The padding size on the y axis.
     :type padding_y: int
     :param groups: The group number.
     :type groups: int
-    :param param_attr: Convolution param attribute. None means default attribute
+    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param trans: whether it is convTrans or conv
+    :param trans: Whether it is ConvTransProjection or ConvProjection
     :type trans: bool
-    :return: A DotMulProjection Object.
-    :rtype: DotMulProjection
+    :return: A Projection Object.
+    :rtype: ConvTransProjection | ConvProjection
     """
     if num_channels is None:
         assert input.num_filters is not None
@@ -4614,13 +4856,13 @@ def pad_layer(input,
               layer_attr=None):
     """
     This operation pads zeros to the input data according to pad_c,pad_h
-    and pad_w. pad_c, pad_h, pad_w specifies the which dimension and size
-    of padding. And the input data shape is NCHW.
+    and pad_w. pad_c, pad_h, pad_w specify the size in the corresponding
+    dimension. And the input data shape is NCHW.
 
-    For example, pad_c=[2,3] means padding 2 zeros before the
-    input data and 3 zeros after the input data in channel dimension.
-    pad_h means padding zeros in height dimension. pad_w means padding zeros
-    in width dimension.
+    For example, pad_c=[2,3] means padding 2 zeros before the input data
+    and 3 zeros after the input data in the channel dimension. pad_h means
+    padding zeros in the height dimension. pad_w means padding zeros in the
+    width dimension.
 
     For example,
 
@@ -4657,13 +4899,14 @@ def pad_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param pad_c: padding size in channel dimension.
+    :param pad_c: The padding size in the channel dimension.
     :type pad_c: list | None
-    :param pad_h: padding size in height dimension.
+    :param pad_h: The padding size in the height dimension.
     :type pad_h: list | None
-    :param pad_w: padding size in width dimension.
+    :param pad_w: The padding size in the width dimension.
     :type pad_w: list | None
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -4712,7 +4955,7 @@ def pad_layer(input,
 @layer_support()
 def conv_shift_layer(a, b, name=None, layer_attr=None):
     """
-    This layer performs cyclic convolution for two input. For example:
+    This layer performs cyclic convolution on two inputs. For example:
       - a[in]: contains M elements.
       - b[in]: contains N elements (N should be odd).
       - c[out]: contains M elements.
@@ -4721,7 +4964,7 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):
 
         c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}
 
-    In this formular:
+    In this formula:
      - a's index is computed modulo M. When it is negative, then get item from
        the right side (which is the end of array) to the left.
      - b's index is computed modulo N. When it is negative, then get item from
@@ -4735,11 +4978,12 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: Input layer a.
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: input layer b.
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param layer_attr: layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4770,8 +5014,8 @@ def tensor_layer(a,
                  bias_attr=None,
                  layer_attr=None):
     """
-    This layer performs tensor operation for two input.
-    For example, each sample:
+    This layer performs tensor operation on two inputs.
+    For example:
 
     .. math::
        y_{i} = a * W_{i} * {b^\mathrm{T}}, i=0,1,...,K-1
@@ -4791,22 +5035,24 @@ def tensor_layer(a,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: Input layer a.
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: input layer b.
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param size: the layer dimension.
-    :type size: int.
-    :param act: Activation type. LinearActivation is the default.
+    :param size: The dimension of this layer.
+    :type size: int
+    :param act: Activation type. LinearActivation is the default activation.
     :type act: BaseActivation
-    :param param_attr: The Parameter Attribute.
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: Extra Layer config.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4842,7 +5088,7 @@ def selective_fc_layer(input,
                        layer_attr=None):
     """
     Selectived fully connected layer. Different from fc_layer, the output
-    of this layer maybe sparse. It requires an additional input to indicate
+    of this layer can be sparse. It requires an additional input to indicate
     several selected columns for output. If the selected columns is not
     specified, selective_fc_layer acts exactly like fc_layer.
 
@@ -4856,22 +5102,34 @@ def selective_fc_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput | list | tuple
-    :param select: The select layer. The output of select layer should be a
-                   sparse binary matrix, and treat as the mask of selective fc.
-                   If is None, acts exactly like fc_layer.
+    :param select: The layer to select columns to output. It should be a sparse
+                   binary matrix, and is treated as the mask of selective fc. If
+                   it is not set or set to None, selective_fc_layer acts exactly
+                   like fc_layer.
     :type select: LayerOutput
-    :param size: The layer dimension.
+    :param size: The dimension of this layer, which should be equal to that of
+                 the layer 'select'.
     :type size: int
-    :param act: Activation type. TanhActivation is the default.
+    :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param param_attr: The Parameter Attribute.
+    :param pass_generation: The flag which indicates whether it is during generation.
+    :type pass_generation: bool
+    :param has_selected_colums: The flag which indicates whether the parameter 'select'
+                                has been set. True is the default.
+    :type has_selected_colums: bool
+    :param mul_ratio: A ratio helps to judge how sparse the output is and determine
+                      the computation method for speed consideration.
+    :type mul_ratio: float
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: Extra Layer config.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4922,7 +5180,7 @@ def selective_fc_layer(input,
 @layer_support()
 def sampling_id_layer(input, name=None, layer_attr=None):
     """
-    A layer for sampling id from multinomial distribution from the input layer.
+    A layer for sampling id from a multinomial distribution from the input layer.
     Sampling one id for one sample.
 
     The simple usage is:
@@ -4935,8 +5193,9 @@ def sampling_id_layer(input, name=None, layer_attr=None):
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4957,8 +5216,7 @@ def slope_intercept_layer(input,
                           intercept=0.0,
                           layer_attr=None):
     """
-    This layer for applying a slope and an intercept to the input
-    element-wise. There is no activation and weight.
+    This layer for applying a slope and an intercept to the input.
 
     ..  math::
         y = slope * x + intercept
@@ -4973,12 +5231,13 @@ def slope_intercept_layer(input,
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param slope: the scale factor.
-    :type slope: float.
-    :param intercept: the offset.
-    :type intercept: float.
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param slope: The scale factor.
+    :type slope: float
+    :param intercept: The offset.
+    :type intercept: float
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5033,12 +5292,13 @@ def linear_comb_layer(weights, vectors, size=None, name=None, layer_attr=None):
     :type weights: LayerOutput
     :param vectors: The vector layer.
     :type vectors: LayerOutput
-    :param size: the dimension of this layer.
+    :param size: The dimension of this layer.
     :type size: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5085,11 +5345,11 @@ def block_expand_layer(input,
 
        outputW = 1 + (2 * padding_x + imgSizeW - block_x + stride_x - 1) / stride_x
 
-    The expand method is the same with ExpandConvLayer, but saved the transposed
+    The expanding method is the same with ExpandConvLayer, but saved the transposed
     value. After expanding, output.sequenceStartPositions will store timeline.
-    The number of time steps are outputH * outputW and the dimension of each
+    The number of time steps is outputH * outputW and the dimension of each
     time step is block_y * block_x * num_channels. This layer can be used after
-    convolution neural network, and before recurrent neural network.
+    convolutional neural network, and before recurrent neural network.
 
     The simple usage is:
 
@@ -5104,8 +5364,10 @@ def block_expand_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_channels: The channel number of input layer.
-    :type num_channels: int | None
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
+    :type num_channels: int
     :param block_x: The width of sub block.
     :type block_x: int
     :param block_y: The width of sub block.
@@ -5119,9 +5381,10 @@ def block_expand_layer(input,
     :param padding_y: The padding size in vertical direction.
     :type padding_y: int
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring.
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :type name: basestring.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5151,12 +5414,19 @@ def block_expand_layer(input,
 @layer_support()
 def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     """
-    A layer to do max out on conv layer output.
-      - Input: output of a conv layer.
-      - Output: feature map size same as input. Channel is (input channel) / groups.
+    A layer to do max out on convolutional layer output.
+      - Input: the output of a convolutional layer.
+      - Output: feature map size same as the input's, and its channel number is
+        (input channel) / groups.
 
     So groups should be larger than 1, and the num of channels should be able
-    to devided by groups.
+    to be devided by groups.
+
+    Reference:
+        `Maxout Networks
+        http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf`_
+        `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
+        https://arxiv.org/pdf/1312.6082v4.pdf`_
 
     .. math::
        y_{si+j} = \max_k x_{gsi + sk + j}
@@ -5166,12 +5436,6 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
        0 \le j < s
        0 \le k < groups
 
-    Please refer to Paper:
-      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
-      - Multi-digit Number Recognition from Street View \
-        Imagery using Deep Convolutional Neural Networks: \
-        https://arxiv.org/pdf/1312.6082v4.pdf
-
     The simple usage is:
 
     .. code-block:: python
@@ -5182,14 +5446,16 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_channels: The channel number of input layer. If None will be set
-                     automatically from previous output.
-    :type num_channels: int | None
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
+    :type num_channels: int
     :param groups: The group number of input layer.
     :type groups: int
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring.
-    :param layer_attr: Extra Layer attribute.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -5221,20 +5487,20 @@ def ctc_layer(input,
               layer_attr=None):
     """
     Connectionist Temporal Classification (CTC) is designed for temporal
-    classication task. That is, for sequence labeling problems where the
+    classication task. e.g. sequence labeling problems where the
     alignment between the inputs and the target labels is unknown.
 
-    More details can be found by referring to `Connectionist Temporal
-    Classification: Labelling Unsegmented Sequence Data with Recurrent
-    Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
-    icml2006_GravesFGS06.pdf>`_
+    Reference:
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        with Recurrent Neural Networks
+        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
 
     Note:
-        Considering the 'blank' label needed by CTC, you need to use
-        (num_classes + 1) as the input size. num_classes is the category number.
-        And the 'blank' is the last category index. So the size of 'input' layer, such as
-        fc_layer with softmax activation, should be num_classes + 1. The size of ctc_layer
-        should also be num_classes + 1.
+        Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
+        as the size of the input, where num_classes is the category number.
+        And the 'blank' is the last category index. So the size of 'input' layer (e.g.
+        fc_layer with softmax activation) should be (num_classes + 1). The size of
+        ctc_layer should also be (num_classes + 1).
 
     The example usage is:
 
@@ -5247,16 +5513,17 @@ def ctc_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param label: The data layer of label with variable length.
+    :param label: The input label.
     :type label: LayerOutput
-    :param size: category numbers + 1.
+    :param size: The dimension of this layer, which must be equal to (category number + 1).
     :type size: int
     :param name: The name of this layer. It is optional.
-    :type name: basestring | None
-    :param norm_by_times: Whether to normalization by times. False by default.
+    :type name: basestring
+    :param norm_by_times: Whether to do normalization by times. False is the default.
     :type norm_by_times: bool
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5297,20 +5564,19 @@ def warp_ctc_layer(input,
     building process, PaddlePaddle will clone the source codes, build and
     install it to :code:`third_party/install/warpctc` directory.
 
-    More details of CTC can be found by referring to `Connectionist Temporal
-    Classification: Labelling Unsegmented Sequence Data with Recurrent
-    Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
-    icml2006_GravesFGS06.pdf>`_.
+    Reference:
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        with Recurrent Neural Networks
+        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_
 
     Note:
-        - Let num_classes represent the category number. Considering the 'blank'
-          label needed by CTC, you need to use (num_classes + 1) as the input size.
-          Thus, the size of both warp_ctc layer and 'input' layer should be set to
-          num_classes + 1.
+        - Let num_classes represents the category number. Considering the 'blank'
+          label needed by CTC, you need to use (num_classes + 1) as the size of
+          warp_ctc layer.
         - You can set 'blank' to any value ranged in [0, num_classes], which
-          should be consistent as that used in your labels.
+          should be consistent with those used in your labels.
         - As a native 'softmax' activation is interated to the warp-ctc library,
-          'linear' activation is expected instead in the 'input' layer.
+          'linear' activation is expected to be used instead in the 'input' layer.
 
     The example usage is:
 
@@ -5324,18 +5590,19 @@ def warp_ctc_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param label: The data layer of label with variable length.
+    :param label: The input label.
     :type label: LayerOutput
-    :param size: category numbers + 1.
+    :param size: The dimension of this layer, which must be equal to (category number + 1).
     :type size: int
     :param name: The name of this layer. It is optional.
-    :type name: basestring | None
-    :param blank: the 'blank' label used in ctc
+    :type name: basestring
+    :param blank: The 'blank' label used in ctc.
     :type blank: int
-    :param norm_by_times: Whether to normalization by times. False by default.
+    :param norm_by_times: Whether to do normalization by times. False is the default.
     :type norm_by_times: bool
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5381,23 +5648,26 @@ def crf_layer(input,
                       label=label,
                       size=label_dim)
 
-    :param input: The first input layer is the feature.
+    :param input: The first input layer.
     :type input: LayerOutput
-    :param label: The second input layer is label.
+    :param label: The input label.
     :type label: LayerOutput
     :param size: The category number.
     :type size: int
-    :param weight: The third layer is "weight" of each sample, which is an
-                  optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param param_attr: Parameter attribute. None means default attribute
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5443,9 +5713,9 @@ def crf_decoding_layer(input,
     """
     A layer for calculating the decoding sequence of sequential conditional
     random field model. The decoding sequence is stored in output.ids.
-    If a second input is provided, it is treated as the ground-truth label, and
-    this layer will also calculate error. output.value[i] is 1 for incorrect
-    decoding or 0 for correct decoding.
+    If the input 'label' is provided, it is treated as the ground-truth label, and
+    this layer will also calculate error. output.value[i] is 1 for an incorrect
+    decoding and 0 for the correct.
 
     The example usage is:
 
@@ -5456,16 +5726,18 @@ def crf_decoding_layer(input,
 
     :param input: The first input layer.
     :type input: LayerOutput
-    :param size: size of this layer.
+    :param size: The dimension of this layer.
     :type size: int
-    :param label: None or ground-truth label.
-    :type label: LayerOutput or None
-    :param param_attr: Parameter attribute. None means default attribute
+    :param label: The input label.
+    :type label: LayerOutput | None
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5492,7 +5764,11 @@ def crf_decoding_layer(input,
     return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
 
 
-@wrap_act_default(act=SigmoidActivation())
+"""
+Following are cost Layers.
+"""
+
+
 @wrap_bias_attr_default(has_bias=True)
 @wrap_param_attr_default()
 @wrap_name_default()
@@ -5500,7 +5776,6 @@ def crf_decoding_layer(input,
 def nce_layer(input,
               label,
               num_classes=None,
-              act=None,
               param_attr=None,
               weight=None,
               num_neg_samples=10,
@@ -5510,8 +5785,10 @@ def nce_layer(input,
               layer_attr=None):
     """
     Noise-contrastive estimation.
-    Implements the method in the following paper:
-    A fast and simple algorithm for training neural probabilistic language models.
+
+    Reference:
+        `A fast and simple algorithm for training neural probabilistic language
+        models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf`_
 
     The example usage is:
 
@@ -5523,32 +5800,40 @@ def nce_layer(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layers. It could be a LayerOutput of list/tuple of LayerOutput.
+    :param input: The first input of this layer.
     :type input: LayerOutput | list | tuple | collections.Sequence
-    :param label: label layer
+    :param label: The input label.
     :type label: LayerOutput
-    :param weight: weight layer, can be None(default)
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param num_classes: number of classes.
+    :param num_classes: The number of classes.
     :type num_classes: int
-    :param act: Activation type. SigmoidActivation is the default.
+    :param act: Activation type. SigmoidActivation is the default activation.
     :type act: BaseActivation
-    :param param_attr: The Parameter Attribute|list.
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param num_neg_samples: number of negative samples. Default is 10.
+    :param num_neg_samples: The number of sampled negative labels. 10 is the
+                            default value.
     :type num_neg_samples: int
-    :param neg_distribution: The distribution for generating the random negative labels.
-                             A uniform distribution will be used if not provided.
-                             If not None, its length must be equal to num_classes.
+    :param neg_distribution: The discrete noisy distribution over the output
+                             space from which num_neg_samples negative labels
+                             are sampled. If this parameter is not set, a
+                             uniform distribution will be used. A user-defined
+                             distribution is a list whose length must be equal
+                             to the num_classes. Each member of the list defines
+                             the probability of a class given input x.
     :type neg_distribution: list | tuple | collections.Sequence | None
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :return: layer name.
+    :return: LayerOutput object.
     :rtype: LayerOutput
     """
     if isinstance(input, LayerOutput):
@@ -5571,8 +5856,6 @@ def nce_layer(input,
         assert isinstance(neg_distribution, collections.Sequence)
         assert len(neg_distribution) == num_classes
         assert abs(sum(neg_distribution) - 1.0) < 1e-5
-    if not isinstance(act, BaseActivation):
-        raise TypeError()
 
     ipts_for_layer = []
     parents = []
@@ -5594,7 +5877,7 @@ def nce_layer(input,
         type=LayerType.NCE_LAYER,
         num_classes=num_classes,
         neg_sampling_dist=neg_distribution,
-        active_type=act.name,
+        active_type=SigmoidActivation().name,
         num_neg_samples=num_neg_samples,
         inputs=ipts_for_layer,
         bias=ParamAttr.to_bias(bias_attr),
@@ -5604,12 +5887,7 @@ def nce_layer(input,
         LayerType.NCE_LAYER,
         parents=parents,
         size=l.config.size,
-        activation=act)
-
-
-"""
-following are cost Layers.
-"""
+        activation=SigmoidActivation())
 
 
 @wrap_name_default()
@@ -5622,11 +5900,11 @@ def rank_cost(left,
               coeff=1.0,
               layer_attr=None):
     """
-    A cost Layer for learning to rank using gradient descent. Details can refer
-    to `papers <http://research.microsoft.com/en-us/um/people/cburges/papers/
-    ICML_ranking.pdf>`_.
-    This layer contains at least three inputs. The weight is an optional
-    argument, which affects the cost.
+    A cost Layer for learning to rank using gradient descent.
+
+    Reference:
+        `Learning to Rank using Gradient Descent
+        http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf`_
 
     .. math::
 
@@ -5657,14 +5935,16 @@ def rank_cost(left,
     :type right: LayerOutput
     :param label: Label is 1 or 0, means positive order and reverse order.
     :type label: LayerOutput
-    :param weight: The weight affects the cost, namely the scale of cost.
-                   It is an optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -5709,25 +5989,25 @@ def lambda_cost(input,
                          NDCG_num=8,
                          max_sort_size=-1)
 
-    :param input: Samples of the same query should be loaded as sequence.
+    :param input: The first input of this layer, which is often a document
+                  samples list of the same query and whose type must be sequence.
     :type input: LayerOutput
-    :param score: The 2nd input. Score of each sample.
+    :param score: The scores of the samples.
     :type input: LayerOutput
     :param NDCG_num: The size of NDCG (Normalized Discounted Cumulative Gain),
                      e.g., 5 for NDCG@5. It must be less than or equal to the
-                     minimum size of lists.
+                     minimum size of the list.
     :type NDCG_num: int
-    :param max_sort_size: The size of partial sorting in calculating gradient.
-                          If max_sort_size = -1, then for each list, the
-                          algorithm will sort the entire list to get gradient.
-                          In other cases, max_sort_size must be greater than or
-                          equal to NDCG_num. And if max_sort_size is greater
-                          than the size of a list, the algorithm will sort the
-                          entire list of get gradient.
+    :param max_sort_size: The size of partial sorting in calculating gradient. If
+                          max_sort_size is equal to -1 or greater than the number
+                          of the samples in the list, then the algorithm will sort
+                          the entire list to compute the gradient. In other cases,
+                          max_sort_size must be greater than or equal to NDCG_num.
     :type max_sort_size: int
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param layer_attr: Extra Layer Attribute.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -5768,20 +6048,20 @@ def cross_entropy(input,
     :param input: The first input layer.
     :type input: LayerOutput.
     :param label: The input label.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring.
-    :param coeff: The cost is multiplied with coeff.
-                  The coefficient affects the gradient in the backward.
-    :type coeff: float.
-    :param weight: The cost of each sample is multiplied with each weight.
-                   The weight should be a layer with size=1. Note that gradient
-                   will not be calculated for weight.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
+    :type coeff: float
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutout
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype: LayerOutput.
+    :rtype: LayerOutput
     """
 
     ipts, parents = __cost_input__(input, label, weight)
@@ -5814,19 +6094,21 @@ def cross_entropy_with_selfnorm(input,
                                           label=label_layer)
 
     :param input: The first input layer.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param label: The input label.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring.
-    :param coeff: The coefficient affects the gradient in the backward.
-    :type coeff: float.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
+    :type coeff: float
     :param softmax_selfnorm_alpha: The scale factor affects the cost.
-    :type softmax_selfnorm_alpha: float.
-    :param layer_attr: Extra Layer Attribute.
+    :type softmax_selfnorm_alpha: float
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype: LayerOutput.
+    :rtype: LayerOutput
     """
     Layer(
         name=name,
@@ -5847,7 +6129,7 @@ def cross_entropy_with_selfnorm(input,
 @layer_support()
 def sum_cost(input, name=None, layer_attr=None):
     """
-    A loss layer which calculate the sum of the input as loss
+    A loss layer which calculates the sum of the input as loss.
 
     The example usage is:
 
@@ -5856,10 +6138,11 @@ def sum_cost(input, name=None, layer_attr=None):
        cost = sum_cost(input=input_layer)
 
     :param input: The input of this layer.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring.
-    :param layer_attr: Extra Layer Attribute.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput.
@@ -5899,16 +6182,18 @@ def huber_regression_cost(input,
        cost = huber_regression_cost(input=input_layer, label=label_layer)
 
     :param input: The first input layer.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param label: The input label.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring.
+    :type name: basestring
     :param delta: The difference between the observed and predicted values.
-    :type delta: float.
-    :param coeff: The coefficient affects the gradient in the backward.
-    :type coeff: float.
-    :param layer_attr: Extra Layer Attribute.
+    :type delta: float
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
+    :type coeff: float
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput.
@@ -5949,17 +6234,19 @@ def huber_classification_cost(input,
        cost = huber_classification_cost(input=input_layer, label=label_layer)
 
     :param input: The first input layer.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param label: The input label.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring.
-    :param coeff: The coefficient affects the gradient in the backward.
-    :type coeff: float.
-    :param layer_attr: Extra Layer Attribute.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
+    :type coeff: float
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype: LayerOutput.
+    :rtype: LayerOutput
     """
     assert isinstance(input, LayerOutput)
     if input.size is not None:
@@ -5996,10 +6283,12 @@ def multi_binary_label_cross_entropy(input,
     :param label: The input label.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -6102,7 +6391,7 @@ def cross_entropy_over_beam(input, name=None):
 
     :param input: Input beams for this layer.
     :type input: BeamInput
-    :param name: The name of this layer.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -6137,7 +6426,7 @@ def cross_entropy_over_beam(input, name=None):
 def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     """
     This is a L1 loss but more smooth. It requires that the
-    size of input and label are equal. The formula is as follows,
+    sizes of input and label are equal. The formula is as follows,
 
     .. math::
 
@@ -6149,8 +6438,9 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
 
         smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
 
-    More details can be found by referring to `Fast R-CNN
-    <https://arxiv.org/pdf/1504.08083v2.pdf>`_
+    Reference:
+        `Fast R-CNN
+        https://arxiv.org/pdf/1504.08083v2.pdf`_
 
     The example usage is:
 
@@ -6164,10 +6454,12 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     :param label: The input label.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -6189,12 +6481,12 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
 @wrap_name_default()
 def multiplex_layer(input, name=None, layer_attr=None):
     """
-    This layer multiplex multiple layers according to the index,
-    which is provided by the first input layer.
-    inputs[0]: the index of the layer to output of size batchSize.
+    This layer multiplex multiple layers according to the indexes,
+    which are provided by the first input layer.
+    inputs[0]: the indexes of the layers to form the output of size batchSize.
     inputs[1:N]; the candidate output data.
-    For each index i from 0 to batchSize -1, the output is the i-th row of the
-    (index[i] + 1)-th layer.
+    For each index i from 0 to batchSize - 1, the i-th row of the output is the
+    the same to the i-th row of the (index[i] + 1)-th layer.
 
     For each i-th row of output:
     .. math::
@@ -6213,7 +6505,8 @@ def multiplex_layer(input, name=None, layer_attr=None):
     :type input: list of LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -6315,16 +6608,16 @@ def row_conv_layer(input,
     :param context_len: The context length equals the lookahead step number
                         plus one.
     :type context_len: int
-    :param act: Activation Type. LinearActivation is the default.
+    :param act: Activation Type. LinearActivation is the default activation.
     :type act: BaseActivation
-    :param param_attr: The Parameter Attribute. If None, the parameter will be
-                       initialized smartly. It's better to set it by yourself.
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param layer_attr: Extra Layer config.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
-
     """
     assert isinstance(input, LayerOutput)
     assert context_len > 0, "the context_len must be greatet than 0."
@@ -6342,18 +6635,19 @@ def row_conv_layer(input,
 
 @layer_support()
 @wrap_name_default()
-@wrap_param_attr_default()
 def prelu_layer(input,
                 name=None,
                 partial_sum=1,
+                channel_shared=None,
+                num_channels=None,
                 param_attr=None,
                 layer_attr=None):
     """
-    The Parameter Relu activation that actives outputs with a learnable weight.
+    The Parametric Relu activation that actives outputs with a learnable weight.
 
     Reference:
-        Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf
+        `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf`_
 
     .. math::
        z_i &\\quad if \\quad z_i > 0 \\\\
@@ -6369,23 +6663,50 @@ def prelu_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param partial_sum: this parameter makes a group of inputs share a same weight.
+    :param partial_sum: this parameter makes a group of inputs share the same weight.
 
         - partial_sum = 1, indicates the element-wise activation: each element has a weight.
-        - partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share a same weight.
-        - partial_sum = number of outputs, indicates all elements share a same weight.
+        - partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share the same weight.
+        - partial_sum = number of outputs, indicates all elements share the same weight.
 
     :type partial_sum: int
+    :param channel_shared: whether or not the parameter are shared across channels.
+
+        - channel_shared = True, we set the partial_sum to the number of outputs.
+        - channel_shared = False, we set the partial_sum to the number of elements in one channel.
+
+    :type channel_shared: bool
+    :param num_channels: number of input channel.
+    :type num_channels: int
     :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute | None
-    :param layer_attr: Extra layer configurations. Default is None.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
 
     assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
-    assert isinstance(param_attr, ParameterAttribute)
+
+    if not param_attr:
+        param_attr = ParamAttr(initial_mean=0.25, initial_std=0.0)
+    else:
+        assert isinstance(param_attr, ParameterAttribute)
+
+    if num_channels is None:
+        assert input.num_filters is not None, \
+                'the input channel cannot be detected, please specify the num_channels parameter'
+        num_channels = input.num_filters
+
+    if channel_shared is not None:
+        assert isinstance(channel_shared, bool)
+        assert (input.height != 0 and input.width != 0), \
+            'input height and widht must be setted'
+        if channel_shared:
+            partial_sum = input.height * input.width * num_channels
+        else:
+            partial_sum = input.height * input.width
 
     l = Layer(
         name=name,
@@ -6397,6 +6718,7 @@ def prelu_layer(input,
         name=name,
         layer_type=LayerType.PRELU,
         parents=input,
+        num_filters=num_channels,
         size=l.config.size)
 
 
@@ -6421,8 +6743,8 @@ def gated_unit_layer(input,
     product between :match:`X'` and :math:`\sigma` is finally returned.
 
     Reference:
-        Language Modeling with Gated Convolutional Networks
-        https://arxiv.org/abs/1612.08083
+        `Language Modeling with Gated Convolutional Networks
+        https://arxiv.org/abs/1612.08083`_
 
     .. math::
        y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
@@ -6434,34 +6756,35 @@ def gated_unit_layer(input,
 
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param size: output size of the gated unit.
+    :param size: The dimension of this layer's output.
     :type size: int
-    :param act: Activation type of the projected input. LinearActivation is the default.
+    :param act: Activation type of the projection. LinearActivation is the default
+                activation.
     :type act: BaseActivation
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param gate_attr: Attributes to tune the gate output, for example, error
-        clipping threshold, dropout and so on. See ExtraLayerAttribute for
-        more details.
+    :param gate_attr: The extra layer attribute of the gate. See ExtraLayerAttribute for
+                      details.
     :type gate_attr: ExtraLayerAttribute | None
-    :param gate_param_attr: Attributes to tune the learnable projected matrix
-        parameter of the gate.
-    :type gate_param_attr: ParameterAttribute | None
-    :param gate_bias_attr: Attributes to tune the learnable bias of the gate.
-    :type gate_bias_attr: ParameterAttribute | None
-    :param inproj_attr: Attributes to the tune the projected input, for
-        example, error clipping threshold, dropout and so on. See
-        ExtraLayerAttribute for more details.
+    :param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
+                            for details.
+    :type gate_param_attr: ParameterAttribute
+    :param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
+                           an object whose type is not ParameterAttribute, no bias is defined.
+                           If this parameter is set to True, the bias is initialized to zero.
+    :type gate_bias_attr: ParameterAttribute | bool | None | Any
+    :param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
+                        details.
     :type inproj_attr: ExtraLayerAttribute | None
-    :param inproj_param_attr: Attributes to tune the learnable parameter of
-        the projection of input.
-    :type inproj_param_attr: ParameterAttribute | None
-    :param inproj_bias_attr: Attributes to tune the learnable bias of
-        projection of the input.
-    :type inproj_bias_attr: ParameterAttribute | None
-    :param layer_attr: Attributes to tune the final output of the gated unit,
-        for example, error clipping threshold, dropout and so on. See
-        ExtraLayerAttribute for more details.
+    :param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
+                              for details.
+    :type inproj_param_attr: ParameterAttribute
+    :param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
+                             or an object whose type is not ParameterAttribute, no bias is defined.
+                             If this parameter is set to True, the bias is initialized to zero.
+    :type inproj_bias_attr: ParameterAttribute | bool | None | Any
+    :param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -6546,26 +6869,28 @@ def switch_order_layer(input,
 @layer_support()
 def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
     """
-    This layer crops images by offset and shape. User can set crop shape by
-    args 'shape' explicitly or by reference input layer.
+    This layer crops images according to the offset and shape. Users can set
+    the crop shape through the argument 'shape' explicitly or by specifying a
+    reference input layer.
 
     The example usage is:
 
     .. code-block:: python
     crop = crop_layer(input=[image_input, reference_input], axis=2, offset=[2, 3])
 
-    :param input: The input of this layer. If two inputs are given, the second input
-                  will be regarded as reference input.
+    :param input: The input of this layer. If two inputs are given, the second one
+                  will be regarded as the reference.
+                  And the input must be 4-dims and in NCHW order.
     :type input: LayerOutput | Sequence
     :param offset: The crop offset.
     :type offset: Sequence
-    :param axis: start axis to be cropped. To image input layer:
+    :param axis: The start axis to be cropped. For image input layer:
         - 0: batch size
         - 1: channels
         - 2: height
         - 3: width
-    :type partial_sum: int
-    :param shape: The shape to be cropped. Default is None.
+    :type axis: int
+    :param shape: The shape to be cropped to. Default is None.
     :type shape: Sequence | None
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -6656,9 +6981,9 @@ def clip_layer(input, min, max, name=None):
     :param input: The input of this layer.
     :type input: LayerOutput.
     :param min: The lower threshold for clipping.
-    :type min: double
+    :type min: float
     :param max: The upper threshold for clipping.
-    :type max: double
+    :type max: float
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -6700,13 +7025,12 @@ def seq_slice_layer(input, starts, ends, name=None):
     :type name: basestring
     :param input: The input of this layer, which should be a sequence.
     :type input: LayerOutput
-    :param starts: start indices to slice the input sequence.
+    :param starts: The start indices to slice the input sequence.
     :type starts: LayerOutput | None
-    :param ends: end indices to slice the input sequence.
+    :param ends: The end indices to slice the input sequence.
     :type ends: LayerOutput | None
     :return: LayerOutput object.
     :rtype: LayerOutput
-
     """
 
     assert isinstance(input, LayerOutput), (
@@ -6742,7 +7066,7 @@ def seq_slice_layer(input, starts, ends, name=None):
 @layer_support()
 def kmax_seq_score_layer(input, name=None, beam_size=1):
     """
-    This layer accepts one input which are scores over a sequence or a nested
+    This layer accepts one input which is scores over a sequence or a nested
     sequence, and returns indices of beam_size sequences with highest scores.
 
     .. code-block:: python
@@ -6752,11 +7076,11 @@ def kmax_seq_score_layer(input, name=None, beam_size=1):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input of this layer. It stores scores over a sequence or a nested
-        sequence and its size must be 1.
+    :param input: The input of this layer. It stores scores over a sequence or
+                  a nested sequence and its size must be 1.
     :type input: LayerOutput
-    :param beam_size: sequence indices with top beam_size scores are returned.
-    :type beam_size: double
+    :param beam_size: The indices of the sequences with top beam_size scores are returned.
+    :type beam_size: int
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -6812,38 +7136,43 @@ def img_conv3d_layer(input,
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel. Or input a list.
+    :param filter_size: The dimensions of the filter kernel along three axises. If the parameter
+                        is set to one integer, the three dimensions will be same.
     :type filter_size: int | tuple | list
-    :param num_filters: Each filter group's number of filter
-    :param act: Activation type. ReluActivation is the default.
+    :param num_filters: The number of filters in each group.
+    :type num_filters: int
+    :param act: Activation type. ReluActivation is the default activation.
     :type act: BaseActivation
-    :param groups: Group size of filters.
+    :param groups: The number of the filter groups.
     :type groups: int
-    :param stride: The x dimension of the stride. Or input a tuple for two image
-                   dimension.
+    :param stride: The strides of the convolution along three axises. If the parameter
+                   is set to one integer, the three strides will be same.
     :type stride: int | tuple | list
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension
+    :param padding: The numbers of padding along three axises. If the parameter is set to
+                    one integer, they will be same.
     :type padding: int | tuple | list
-    :param bias_attr: Convolution bias attribute. None means default bias.
-                      False means no bias.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
-    :param num_channels: number of input channels. If None will be set
-                        automatically from previous output.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
     :type num_channels: int
-    :param param_attr: Convolution param attribute. None means default attribute
+    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param shared_biases: Is biases will be shared between filters or not.
+    :param shared_biases: Whether biases will be shared between filters or not.
     :type shared_biases: bool
-    :param layer_attr: Layer Extra Attribute.
+    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param trans: true if it is a convTransLayer, false if it is a convLayer
+    :param trans: True if it is a convTransLayer, False if it is a convLayer
     :type trans: bool
-    :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt" or "cudnn_convt",
-                       otherwise layer_type has to be either "exconv" or
-                       "cudnn_conv"
-    :type layer_type: String
+    :param layer_type: Specify the layer type. If the parameter is set, it must be "deconv3d"
+                       when trans=True. If not set, it will be automatically set to "deconv3d"
+                       when trans=True and "conv3d" when trans=False.
+    :type layer_type: basestring
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -6925,7 +7254,7 @@ def img_conv3d_layer(input,
 def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
     """
     A layer applies a linear transformation to each element in each row of
-    the input matrix. For each element, the layer first re-scale it and then
+    the input matrix. For each element, the layer first re-scales it and then
     adds a bias to it.
 
     This layer is very like the SlopeInterceptLayer, except the scale and
@@ -6943,12 +7272,12 @@ def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
     :type name: basestring
     :param input: The input of this layer.
     :type input: LayerOutput
-    :param param_attr: The parameter attribute of scaling.
+    :param param_attr: The parameter attribute of scaling. See ParameterAttribute for
+                      details.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The Bias Attribute. If the parameter is set to
-                      False or something not type of ParameterAttribute,
-                      no bias is defined. If the parameter is set to
-                      True, the bias is initialized to zero.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -6980,3 +7309,178 @@ def resize_layer(input, size, name=None):
     """
     Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
     return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
+
+
+@wrap_act_default(act=LinearActivation())
+@wrap_name_default('sub_seq')
+def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
+    """
+    sub_seq_layer will return sub-sequences from the input sequences. For each
+    sequence in the input sequence layer, sub_seq_layer will slice it by given
+    offset and size. Please notice that, number of offset value and size value
+    both are equal to the number of sequence in the input layer.
+
+    .. code-block:: python
+
+        sub_seq = sub_seq_layer(input=input_seq, offsets=offsets, sizes=sizes)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer, which should be sequence.
+    :type input: LayerOutput
+    :param offsets: The offset indices to slice the input sequence, which should
+                    be sequence type.
+    :type offsets: LayerOutput
+    :param sizes: The sizes of the sub-sequences, which should be sequence type.
+    :type sizes: LayerOutput
+    :param act: Activation type, LinearActivation is the default activation.
+    :type act: BaseActivation.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of sub_seq_layer layer must be a PaddlePaddle layer.')
+    assert isinstance(offsets, LayerOutput), (
+        'The offset indices for sub_seq_layer, '
+        'must be a PaddlePaddle layer.')
+    assert isinstance(sizes, LayerOutput), (
+        'The sizes of sub-sequences, must be a PaddlePaddle layer.')
+
+    Layer(
+        name=name,
+        type=LayerType.SUB_SEQ_LAYER,
+        inputs=[input.name, offsets.name, sizes.name],
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr))
+
+    return LayerOutput(
+        name,
+        LayerType.SUB_SEQ_LAYER,
+        parents=[input, offsets, sizes],
+        size=input.size)
+
+
+@wrap_name_default('scale_sub_region')
+def scale_sub_region_layer(input, indices, value, name=None):
+    """
+    Given an image or feature map with CHW information, scale_sub_region_layer
+    can be used to multiply a real value to values of a sub continuous region.
+    You can provide start and end indices of CHW for each instance.
+    Please notice that all start indices are counting from 1.
+    The shape of indices should be [batch_size, 6] and the layout for each row
+    is [C_Start, C_End, H_Start, H_End, W_Start, W_End].
+
+    .. code-block:: python
+
+        scale_sub_region = scale_sub_region_layer(input=input,
+                                                  indices=indices,
+                                                  value=value)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer which should contains CHW information.
+    :type input: LayerOutput
+    :param indices: Start index and end index for C H W, the input value should
+                    be a 2-D matrix with shape [batch_size, 6].
+    :type indices: LayerOutput.
+    :param value: value to multiply.
+    :type value: float
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of scale_sub_region_layer, '
+        'must be a PaddlePaddle layer.')
+    assert isinstance(indices, LayerOutput), (
+        'The start and end indices for CHW, must be a PaddlePaddle layer.')
+    assert isinstance(value, float), (
+        'The value to multiply, must be a real value.')
+
+    Layer(
+        name=name,
+        type=LayerType.SCALE_SUB_REGION_LAYER,
+        inputs=[input.name, indices.name],
+        value=value)
+
+    return LayerOutput(
+        name,
+        LayerType.SCALE_SUB_REGION_LAYER,
+        parents=[input, indices],
+        num_filters=input.num_filters,
+        size=input.size)
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support()
+def factorization_machine(input,
+                          factor_size,
+                          act=None,
+                          name=None,
+                          param_attr=None,
+                          layer_attr=None):
+    """
+    The Factorization Machine models pairwise feature interactions as inner
+    product of the learned latent vectors corresponding to each input feature.
+    The Factorization Machine can effectively capture feature interactions
+    especially when the input is sparse.
+
+    This implementation only consider the 2-order feature interactions using
+    Factorization Machine with the formula:
+
+    .. math::
+        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+
+    Note:
+        X is the input vector with size n. V is the factor matrix. Each row of V
+        is the latent vector corresponding to each input dimesion. The size of
+        each latent vector is k.
+
+    For details of Factorization Machine, please refer to the paper:
+    Factorization machines.
+
+    .. code-block:: python
+        first_order = paddle.layer.fc(input=input,
+                                      size=1,
+                                      act=paddle.activation.Linear())
+        second_order = paddle.layer.factorization_machine(input=input,
+                                                          factor_size=10)
+        fm = paddle.layer.addto(input=[first_order, second_order],
+                                act=paddle.activation.Linear(),
+                                bias_attr=False)
+
+    :param input: The input layer. Supported input types: all input data types
+                  on CPU, and only dense input types on GPU.
+    :type input: LayerOutput
+    :param factor_size: The hyperparameter that defines the dimensionality of
+                        the latent vector size.
+    :type context_len: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert factor_size > 0, "the factor_size must be greater than 0."
+
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        factor_size=factor_size,
+        type=LayerType.FACTORIZATION_MACHINE,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 3821d075cb..9776ae1805 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import math
 
 from activations import LinearActivation, ReluActivation, SoftmaxActivation, \
     IdentityActivation, TanhActivation, SequenceSoftmaxActivation
@@ -26,9 +26,9 @@ __all__ = [
     'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
     "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
     'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
-    'simple_attention', 'dot_product_attention', 'simple_gru2',
-    'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs',
-    'outputs'
+    'simple_attention', 'dot_product_attention', 'multi_head_attention',
+    'simple_gru2', 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm',
+    'inputs', 'outputs'
 ]
 
 ######################################################
@@ -681,34 +681,42 @@ def lstmemory_unit(input,
                                    state_act=TanhActivation())
 
 
-    :param input: input layer.
+    :param input: Input layer.
     :type input: LayerOutput
-    :param out_memory: output of previous time step
+    :param out_memory: The output of previous time step.
     :type out_memory: LayerOutput | None
-    :param name: lstmemory unit name.
+    :param name: The lstmemory unit name.
     :type name: basestring
-    :param size: lstmemory unit size.
+    :param size: The lstmemory unit size.
     :type size: int
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
     :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activiation type of lstm.
     :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activiation type of lstm.
     :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activiation type of lstm.
     :type state_act: BaseActivation
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
     :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param lstm_layer_attr: extra attribute of lstm layer.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|True|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
     :type lstm_layer_attr: ExtraLayerAttribute
-    :return: lstmemory unit name.
+    :return: The lstmemory unit name.
     :rtype: LayerOutput
     """
     if size is None:
@@ -786,34 +794,42 @@ def lstmemory_group(input,
                                     gate_act=SigmoidActivation(),
                                     state_act=TanhActivation())
 
-    :param input: input layer.
+    :param input: Input layer.
     :type input: LayerOutput
-    :param size: lstmemory group size.
+    :param size: The lstmemory group size.
     :type size: int
-    :param name: name of lstmemory group.
+    :param name: The name of lstmemory group.
     :type name: basestring
-    :param out_memory: output of previous time step.
+    :param out_memory: The output of previous time step.
     :type out_memory: LayerOutput | None
-    :param reverse: process the input in a reverse order or not.
+    :param reverse: Process the input in a reverse order or not.
     :type reverse: bool
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
     :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activiation type of lstm.
     :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activiation type of lstm.
     :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activiation type of lstm.
     :type state_act: BaseActivation
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                           False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
     :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_layer_attr: lstm layer's extra attribute.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|True|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
     :type lstm_layer_attr: ExtraLayerAttribute
     :return: the lstmemory group.
     :rtype: LayerOutput
@@ -1460,10 +1476,8 @@ def dot_product_attention(encoded_sequence,
         expand_as=encoded_sequence,
         name='%s_expand' % name)
 
-    m = linear_comb_layer(
-        weights=expanded,
-        vectors=encoded_sequence,
-        name='%s_dot-product' % name)
+    m = dot_prod_layer(
+        input1=expanded, input2=encoded_sequence, name='%s_dot-product' % name)
 
     attention_weight = fc_layer(
         input=m,
@@ -1482,6 +1496,134 @@ def dot_product_attention(encoded_sequence,
         input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
 
 
+@wrap_name_default()
+def multi_head_attention(query,
+                         key,
+                         value,
+                         key_proj_size,
+                         value_proj_size,
+                         head_num,
+                         attention_type,
+                         softmax_param_attr=None,
+                         name=None):
+    """
+    Calculate and return a context vector with dot-product attention mechanism.
+    The dimension of the context vector equals to value_proj_size * head_num.
+
+    Please refer to **Attention Is All You Need** for more details. The link is
+    as follows:
+    https://arxiv.org/abs/1706.03762.
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        context = multi_head_attention(query=decoder_state,
+                                       key=enc_seq,
+                                       value=enc_seq,
+                                       key_proj_size=64,
+                                       value_pro_size=64,
+                                       head_num=8,
+                                       attention_type='dot-product attention')
+
+    :param name: A prefix attached to the name of each layer that defined inside
+                 the multi_head_attention.
+    :type name: basestring
+    :param softmax_param_attr: The parameter attribute of sequence softmax
+                               that is used to produce attention weight.
+    :type softmax_param_attr: ParameterAttribute
+    :param query: query is used to calculate attention weights over values at current step.
+    :type query: LayerOutput
+    :param key: key is used to calculate the attention weight of the corresponding value.
+    :type key: LayerOutput
+    :param value: value is the sequence to be attended.
+    :type value: LayerOutput
+    :param key_proj_size: The dimension of the linear projection performed on key and query.
+    :type key_proj_size: int
+    :param value_proj_size: The dimension of the linear projection performed on value.
+    :type value_proj_size: int
+    :param head_num: The number of attention heads.
+    :type head_num: int
+    :param attention_type: The type of the attention mechanism used in each attention
+                           heads. Now, we only support scaled dot-product attention and
+                           additive attention.
+    :type attention_type: basestring
+    :return: The context vector.
+    :rtype: LayerOutput
+    """
+    assert attention_type in ['dot-product attention', 'additive attention']
+
+    with mixed_layer(
+            size=key_proj_size * head_num,
+            name='%s_query_proj' % name) as query_proj:
+        query_proj += full_matrix_projection(query)
+    query_proj = expand_layer(input=query_proj, expand_as=key)
+
+    with mixed_layer(
+            size=key_proj_size * head_num,
+            name='%s_key_proj' % name) as key_proj:
+        key_proj += full_matrix_projection(key)
+
+    with mixed_layer(
+            size=value_proj_size * head_num,
+            name='%s_value_proj' % name) as value_proj:
+        value_proj += full_matrix_projection(value)
+
+    head_list = []
+    for i in range(head_num):
+        with mixed_layer(size=key_proj_size) as sub_query_proj:
+            sub_query_proj += identity_projection(
+                query_proj, offset=key_proj_size * i, size=key_proj_size)
+
+        with mixed_layer(size=key_proj_size) as sub_key_proj:
+            sub_key_proj += identity_projection(
+                key_proj, offset=key_proj_size * i, size=key_proj_size)
+
+        with mixed_layer(size=value_proj_size) as sub_value_proj:
+            sub_value_proj += identity_projection(
+                value_proj, offset=value_proj_size * i, size=value_proj_size)
+
+        if attention_type == 'dot-product attention':
+            m = dot_prod_layer(
+                input1=sub_query_proj,
+                input2=sub_key_proj,
+                name='%s_dot-product_%d' % (name, i))
+            m = slope_intercept_layer(
+                input=m,
+                slope=math.sqrt(1.0 / key_proj_size),
+                name='%s_dot-product_scaling_%d' % (name, i))
+        else:
+            with mixed_layer(
+                    size=key_proj_size,
+                    act=TanhActivation(),
+                    name='%s_combine_%d' % (name, i)) as m:
+                m += identity_projection(sub_query_proj)
+                m += identity_projection(sub_key_proj)
+
+        attention_weight = fc_layer(
+            input=m,
+            size=1,
+            act=SequenceSoftmaxActivation(),
+            param_attr=softmax_param_attr,
+            name="%s_softmax_%d" % (name, i),
+            bias_attr=False)
+
+        scaled = scaling_layer(
+            weight=attention_weight,
+            input=sub_value_proj,
+            name='%s_scaling_%d' % (name, i))
+        head = pooling_layer(
+            input=scaled,
+            pooling_type=SumPooling(),
+            name="%s_pooling_%d" % (name, i))
+
+        head_list.append(head)
+
+    attended = concat_layer(head_list)
+
+    return attended
+
+
 def inputs(layers, *args):
     """
     Declare the inputs of network. The order of input should be as same as
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index c3495ee110..c3cd4cf8c3 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -116,7 +116,7 @@ class AdamOptimizer(BaseSGDOptimizer):
 
         m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
         v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
 
     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py
index 0c38a8dce5..e0aeb311b3 100644
--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
@@ -15,8 +15,9 @@
 """
 
 __all__ = [
-    "BasePoolingType", "MaxPooling", "AvgPooling", "CudnnMaxPooling",
-    "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
+    "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
+    "CudnnMaxPooling", "CudnnAvgPooling", "CudnnAvgInclPadPooling",
+    "SumPooling", "SquareRootNPooling"
 ]
 
 
@@ -55,6 +56,19 @@ class MaxPooling(BasePoolingType):
         self.output_max_index = output_max_index
 
 
+class MaxWithMaskPooling(BasePoolingType):
+    """
+    MaxWithMask pooling.
+
+    Not only return the very large values for each dimension in sequence or time steps,
+    but also the location indices of found maxinum values.
+
+    """
+
+    def __init__(self):
+        BasePoolingType.__init__(self, "max-pool-with-mask")
+
+
 class CudnnMaxPooling(BasePoolingType):
     """
     Cudnn max pooling only support GPU. Return the maxinum value in the
@@ -75,6 +89,16 @@ class CudnnAvgPooling(BasePoolingType):
         BasePoolingType.__init__(self, "cudnn-avg-pool")
 
 
+class CudnnAvgInclPadPooling(BasePoolingType):
+    """
+    Cudnn average pooling only support GPU. Return the average value in the
+    pooling window taking into account the padding cells.
+    """
+
+    def __init__(self):
+        BasePoolingType.__init__(self, "cudnn-avg-incl-pad-pool")
+
+
 class AvgPooling(BasePoolingType):
     """
     Average pooling.
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 6a4550c209..10c941f707 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -9,7 +9,9 @@ test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
-test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer)
+test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
+test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer
+test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer
+test_factorization_machine)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
index 5ddf6052df..3e0f957648 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
@@ -28,6 +28,8 @@ layers {
       stride_y: 1
       output_y: 227
       img_size_y: 256
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
@@ -63,6 +65,7 @@ layers {
   height: 227
   width: 227
   depth: 1
+  epsilon: 1e-05
 }
 layers {
   name: "__crmnorm_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
index c0252b945b..a18a4652e1 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -28,6 +28,8 @@ layers {
       stride_y: 1
       output_y: 227
       img_size_y: 256
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
@@ -63,6 +65,7 @@ layers {
   height: 256
   width: 256
   depth: 1
+  epsilon: 1e-05
 }
 layers {
   name: "__crmnorm_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
index 832ed24a31..9b69ae4a3b 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
@@ -36,6 +36,7 @@ layers {
   height: 6
   width: 20
   depth: 3
+  epsilon: 1e-05
 }
 parameters {
   name: "___batch_norm_0__.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
index fd5224ca55..25ec632375 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
@@ -28,6 +28,8 @@ layers {
       stride_y: 1
       output_y: 48
       img_size_y: 48
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
new file mode 100644
index 0000000000..f1530c382c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
@@ -0,0 +1,38 @@
+type: "nn"
+layers {
+  name: "vector1"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "vector2"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__dot_prod_layer_0__"
+  type: "dot_prod"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "vector1"
+  }
+  inputs {
+    input_layer_name: "vector2"
+  }
+}
+input_layer_names: "vector1"
+input_layer_names: "vector2"
+output_layer_names: "__dot_prod_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "vector1"
+  layer_names: "vector2"
+  layer_names: "__dot_prod_layer_0__"
+  input_layer_names: "vector1"
+  input_layer_names: "vector2"
+  output_layer_names: "__dot_prod_layer_0__"
+  is_recurrent_layer_group: false
+}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
new file mode 100644
index 0000000000..4f3002b199
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
@@ -0,0 +1,39 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 1024
+  active_type: ""
+}
+layers {
+  name: "__factorization_machine_0__"
+  type: "factorization_machine"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___factorization_machine_0__.w0"
+  }
+  factor_size: 10
+}
+parameters {
+  name: "___factorization_machine_0__.w0"
+  size: 10240
+  initial_mean: 0.0
+  initial_std: 0.03125
+  dims: 1024
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data"
+output_layer_names: "__factorization_machine_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__factorization_machine_0__"
+  input_layer_names: "data"
+  output_layer_names: "__factorization_machine_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
new file mode 100644
index 0000000000..9ba33689ed
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
@@ -0,0 +1,39 @@
+type: "nn"
+layers {
+  name: "x"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "y"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "__l2_distance_layer_0__"
+  type: "l2_distance"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "x"
+  }
+  inputs {
+    input_layer_name: "y"
+  }
+}
+input_layer_names: "x"
+input_layer_names: "y"
+output_layer_names: "__l2_distance_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "x"
+  layer_names: "y"
+  layer_names: "__l2_distance_layer_0__"
+  input_layer_names: "x"
+  input_layer_names: "y"
+  output_layer_names: "__l2_distance_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
index 03f4f3a31d..39dc487146 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
@@ -30,6 +30,8 @@ layers {
       stride_y: 1
       output_y: 48
       img_size_y: 48
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
@@ -105,6 +107,8 @@ layers {
       stride_y: 1
       output_y: 24
       img_size_y: 24
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_1__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
index 15c6ab4dc8..d5d6d31a17 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
@@ -30,6 +30,8 @@ layers {
       stride_y: 1
       output_y: 48
       img_size_y: 48
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
index 94ad56cab0..63fb38c650 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
@@ -4,6 +4,8 @@ layers {
   type: "data"
   size: 300
   active_type: ""
+  height: 10
+  width: 10
 }
 layers {
   name: "__prelu_layer_0__"
@@ -15,6 +17,9 @@ layers {
     input_parameter_name: "___prelu_layer_0__.w0"
   }
   partial_sum: 1
+  height: 10
+  width: 10
+  depth: 1
 }
 layers {
   name: "__prelu_layer_1__"
@@ -26,6 +31,9 @@ layers {
     input_parameter_name: "___prelu_layer_1__.w0"
   }
   partial_sum: 1
+  height: 10
+  width: 10
+  depth: 1
 }
 layers {
   name: "__prelu_layer_2__"
@@ -37,41 +45,100 @@ layers {
     input_parameter_name: "___prelu_layer_2__.w0"
   }
   partial_sum: 5
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_3__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_3__.w0"
+  }
+  partial_sum: 300
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_4__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_4__.w0"
+  }
+  partial_sum: 100
+  height: 10
+  width: 10
+  depth: 1
 }
 parameters {
   name: "___prelu_layer_0__.w0"
   size: 300
-  initial_mean: 0.0
-  initial_std: 0.057735026919
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 300
   initial_strategy: 0
-  initial_smart: true
+  initial_smart: false
 }
 parameters {
   name: "___prelu_layer_1__.w0"
   size: 300
-  initial_mean: 0.0
-  initial_std: 0.057735026919
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 300
   initial_strategy: 0
-  initial_smart: true
+  initial_smart: false
 }
 parameters {
   name: "___prelu_layer_2__.w0"
   size: 60
-  initial_mean: 0.0
-  initial_std: 0.129099444874
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 60
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_3__.w0"
+  size: 1
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_4__.w0"
+  size: 3
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 3
   initial_strategy: 0
-  initial_smart: true
+  initial_smart: false
 }
 input_layer_names: "input"
-output_layer_names: "__prelu_layer_2__"
+output_layer_names: "__prelu_layer_4__"
 sub_models {
   name: "root"
   layer_names: "input"
   layer_names: "__prelu_layer_0__"
   layer_names: "__prelu_layer_1__"
   layer_names: "__prelu_layer_2__"
+  layer_names: "__prelu_layer_3__"
+  layer_names: "__prelu_layer_4__"
   input_layer_names: "input"
-  output_layer_names: "__prelu_layer_2__"
+  output_layer_names: "__prelu_layer_4__"
   is_recurrent_layer_group: false
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
new file mode 100644
index 0000000000..0ec88aa998
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
@@ -0,0 +1,100 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 588
+  active_type: ""
+  height: 14
+  width: 14
+}
+layers {
+  name: "rois"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__conv_0__"
+  type: "exconv"
+  size: 3136
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___conv_0__.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 1
+      padding: 1
+      groups: 1
+      filter_channels: 3
+      output_x: 14
+      img_size: 14
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 1
+      output_y: 14
+      img_size_y: 14
+      dilation: 1
+      dilation_y: 1
+    }
+  }
+  bias_parameter_name: "___conv_0__.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 14
+  width: 14
+}
+layers {
+  name: "__roi_pool_0__"
+  type: "roi_pool"
+  size: 784
+  active_type: ""
+  inputs {
+    input_layer_name: "__conv_0__"
+    roi_pool_conf {
+      pooled_width: 7
+      pooled_height: 7
+      spatial_scale: 0.0625
+    }
+  }
+  inputs {
+    input_layer_name: "rois"
+  }
+  height: 7
+  width: 7
+}
+parameters {
+  name: "___conv_0__.w0"
+  size: 432
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___conv_0__.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+input_layer_names: "rois"
+output_layer_names: "__roi_pool_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "rois"
+  layer_names: "__conv_0__"
+  layer_names: "__roi_pool_0__"
+  input_layer_names: "data"
+  input_layer_names: "rois"
+  output_layer_names: "__roi_pool_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
new file mode 100644
index 0000000000..d20133a10e
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
@@ -0,0 +1,51 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2016
+  active_type: ""
+  height: 48
+  width: 42
+}
+layers {
+  name: "indices"
+  type: "data"
+  size: 6
+  active_type: ""
+}
+layers {
+  name: "__scale_sub_region_0__"
+  type: "scale_sub_region"
+  size: 2016
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    scale_sub_region_conf {
+      image_conf {
+        channels: 1
+        img_size: 42
+        img_size_y: 48
+      }
+      value: 0.0
+    }
+  }
+  inputs {
+    input_layer_name: "indices"
+  }
+  height: 48
+  width: 42
+}
+input_layer_names: "data"
+input_layer_names: "indices"
+output_layer_names: "__scale_sub_region_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "indices"
+  layer_names: "__scale_sub_region_0__"
+  input_layer_names: "data"
+  input_layer_names: "indices"
+  output_layer_names: "__scale_sub_region_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
new file mode 100644
index 0000000000..e52d48dde0
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+vec1 = data_layer(name='vector1', size=10)
+vec2 = data_layer(name='vector2', size=10)
+dot_product = dot_prod_layer(input1=vec1, input2=vec2)
+
+outputs(dot_product)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
new file mode 100644
index 0000000000..b249de0fee
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=1024)
+
+fm = factorization_machine(input=data, factor_size=10)
+
+outputs(fm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
new file mode 100644
index 0000000000..b36a5c6d12
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+outputs(
+    l2_distance_layer(
+        x=data_layer(
+            name='x', size=128), y=data_layer(
+                name='y', size=128)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
index aae90fab32..45b02fbf32 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
@@ -1,8 +1,10 @@
 from paddle.trainer_config_helpers import *
 
-data = data_layer(name='input', size=300)
-prelu = prelu_layer(input=data)
-prelu = prelu_layer(input=data, partial_sum=1)
-prelu = prelu_layer(input=data, partial_sum=5)
+data = data_layer(name='input', size=300, height=10, width=10)
+prelu = prelu_layer(input=data, num_channels=3)
+prelu = prelu_layer(input=data, partial_sum=1, num_channels=3)
+prelu = prelu_layer(input=data, partial_sum=5, num_channels=3)
+prelu = prelu_layer(input=data, channel_shared=True, num_channels=3)
+prelu = prelu_layer(input=data, channel_shared=False, num_channels=3)
 
 outputs(prelu)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
new file mode 100644
index 0000000000..b739a81b85
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
@@ -0,0 +1,23 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=3 * 14 * 14, height=14, width=14)
+
+rois = data_layer(name='rois', size=10)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=3,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+roi_pool = roi_pool_layer(
+    input=conv,
+    rois=rois,
+    pooled_width=7,
+    pooled_height=7,
+    spatial_scale=1. / 16)
+
+outputs(roi_pool)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
new file mode 100644
index 0000000000..8d4bf28bf1
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
@@ -0,0 +1,11 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2016, height=48, width=42)
+indices = data_layer(name='indices', size=6)
+
+scale_sub_region = scale_sub_region_layer(
+    input=data, indices=indices, value=0.0)
+
+outputs(scale_sub_region)
diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py
index 48e5087cc2..421e953d27 100644
--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
@@ -23,32 +23,32 @@ from paddle.v2.topology import Topology
 
 
 def merge_v2_model(net, param_file, output_file):
-    '''Integrate the model config and model parameters into one file.
-    
+    '''Merge the model config and parameters into one file.
+
     The model configuration file describes the model structure which
     ends with .py. The parameters file stores the parameters of the model
     which ends with .tar.gz.
-    
-    @param  net            The output layer of the network.
-    @param  param_file     Path of the model parameters(.tar.gz) which is stored by v2 api.
+
+    @param  net            The output layer of the network for inference.
+    @param  param_file     Path of the parameters (.tar.gz) which is stored by v2 api.
     @param  output_file    Path of the merged file which will be generated.
-    
+
     Usage:
 
-        from paddle.util.merge_model import merge_v2_model
+        from paddle.utils.merge_model import merge_v2_model
         # import your network configuration
-        from mobilenet import mobile_net
-        
-        net = mobile_net(3*224*224, 102)
+        from example_net import net_conf
+
+        net = net_conf(is_predict=True)
         param_file = './param_pass_00000.tar.gz'
         output_file = './output.paddle'
-        
+
         merge_v2_model(net, param_file, output_file)
 
     '''
 
     assert isinstance(net, LayerOutput), \
-            "The net should be the output of the network"
+            "The net should be the output of the network for inference"
     assert os.path.exists(param_file), \
             "The model parameters file %s does not exists " % (param_file)
 
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 1c8d8f4b2f..70f61e8499 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -33,10 +33,11 @@ import networks
 import minibatch
 import plot
 import image
-import model
 import paddle.trainer.config_parser as cp
 
 __all__ = [
+    'default_startup_program',
+    'default_main_program',
     'optimizer',
     'layer',
     'activation',
@@ -56,12 +57,64 @@ __all__ = [
     'evaluator',
     'image',
     'master',
-    'model',
 ]
 
 cp.begin_parse()
 
 
+def set_omp_mkl_env_vars(trainer_count):
+    '''Auto set CPU environment if have not set before.
+       export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status.
+       export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count.
+    '''
+    import platform
+    if not platform.system() in ['Linux', 'Darwin']:
+        return
+
+    def set_env(key, value):
+        '''If the key has not been set in the environment, set it with value.'''
+        assert isinstance(key, str)
+        assert isinstance(value, str)
+        envset = os.environ.get(key)
+        if envset is None:
+            os.environ[key] = value
+
+    def num_physical_cores():
+        '''Get the number of physical cores'''
+        if platform.system() == "Linux":
+            num_sockets = int(
+                os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l")
+                .read())
+            num_cores_per_socket = int(
+                os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
+                .read())
+            return num_sockets * num_cores_per_socket
+        else:
+            cmds = {"Darwin": "sysctl -n hw.physicalcpu"}
+            return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
+
+    def num_logical_processors():
+        '''Get the number of logical processors'''
+        cmds = {
+            "Linux": "grep \"processor\" /proc/cpuinfo|sort -u|wc -l",
+            "Darwin": "sysctl -n hw.logicalcpu"
+        }
+        return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
+
+    num_cores = num_physical_cores()
+    num_processors = num_logical_processors()
+    if num_processors > num_cores:  # Hyper Threading is enabled
+        set_env("OMP_DYNAMIC", "true")
+        set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
+    else:
+        set_env("OMP_DYNAMIC", "false")
+        set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
+    threads = num_processors / trainer_count
+    threads = '1' if threads < 1 else str(threads)
+    set_env("OMP_NUM_THREADS", threads)
+    set_env("MKL_NUM_THREADS", threads)
+
+
 def init(**kwargs):
     import py_paddle.swig_paddle as api
     args = []
@@ -76,6 +129,8 @@ def init(**kwargs):
     for key in args_dict.keys():
         args.append('--%s=%s' % (key, str(args_dict[key])))
 
+    set_omp_mkl_env_vars(kwargs.get('trainer_count', 1))
+
     if 'use_gpu' in kwargs:
         cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
     if 'use_mkldnn' in kwargs:
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index ce60aa21c2..f10bf7e42a 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -22,6 +22,7 @@ parse training set and test set into paddle reader creators.
 import numpy as np
 import os
 import paddle.v2.dataset.common
+from paddle.v2.parameters import Parameters
 
 __all__ = ['train', 'test']
 
@@ -34,6 +35,8 @@ feature_names = [
 
 UCI_TRAIN_DATA = None
 UCI_TEST_DATA = None
+URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
+MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
 
 
 def feature_range(maximums, minimums):
@@ -111,6 +114,14 @@ def test():
     return reader
 
 
+def model():
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
+                                                 MD5_MODEL)
+    with open(tar_file, 'r') as f:
+        parameters = Parameters.from_tar(f)
+    return parameters
+
+
 def fetch():
     paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
 
diff --git a/python/paddle/v2/framework/.gitignore b/python/paddle/v2/fluid/.gitignore
similarity index 100%
rename from python/paddle/v2/framework/.gitignore
rename to python/paddle/v2/fluid/.gitignore
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
new file mode 100644
index 0000000000..59986c9f0c
--- /dev/null
+++ b/python/paddle/v2/fluid/__init__.py
@@ -0,0 +1,44 @@
+# import all class inside framework into fluid module
+import framework
+from framework import *
+# import all class inside executor into fluid module
+import executor
+from executor import *
+
+import io
+import evaluator
+import initializer
+import layers
+import nets
+import optimizer
+import backward
+import regularizer
+from param_attr import ParamAttr
+from data_feeder import DataFeeder
+from core import LoDTensor, CPUPlace, GPUPlace
+
+Tensor = LoDTensor
+__all__ = framework.__all__ + executor.__all__ + [
+    'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
+    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
+    'DataFeeder'
+]
+
+
+def __read_gflags_from_env__():
+    """
+    Enable reading gflags from environment variables.
+
+    Returns:
+        None
+    """
+    import sys
+    import core
+    read_env_flags = ['use_pinned_memory']
+    if core.is_compile_gpu():
+        read_env_flags.append('fraction_of_gpu_memory_to_use')
+    core.init_gflags([sys.argv[0]] +
+                     ["--tryfromenv=" + ",".join(read_env_flags)])
+
+
+__read_gflags_from_env__()
diff --git a/python/paddle/v2/framework/backward.py b/python/paddle/v2/fluid/backward.py
similarity index 76%
rename from python/paddle/v2/framework/backward.py
rename to python/paddle/v2/fluid/backward.py
index 6827792cb3..f188582178 100644
--- a/python/paddle/v2/framework/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -1,4 +1,4 @@
-from paddle.v2.framework import framework as framework
+from paddle.v2.fluid import framework as framework
 
 __all__ = ['append_backward_ops']
 
@@ -19,8 +19,20 @@ def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
     :rtype: list[Variable]
     """
     assert isinstance(loss, framework.Variable)
-    param_grad_map = loss.block.program.append_backward(loss, no_grad_set or
-                                                        set())
+
+    if no_grad_set is None:
+        program = loss.block.program
+        assert isinstance(program, framework.Program)
+        no_grad_set = list()
+        for block in program.blocks:
+            assert isinstance(block, framework.Block)
+            for var in block.vars.itervalues():
+                assert isinstance(var, framework.Variable)
+                if var.stop_gradient:
+                    no_grad_set.append(var.name)
+        no_grad_set = set(no_grad_set)
+
+    param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
     if parameter_list is not None:
         parameters = parameter_list
     else:
diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py
new file mode 100644
index 0000000000..30a542af21
--- /dev/null
+++ b/python/paddle/v2/fluid/data_feeder.py
@@ -0,0 +1,97 @@
+from __future__ import print_function
+import core
+import numpy
+import six.moves as six
+
+from framework import Variable
+
+__all__ = ['DataFeeder']
+
+
+class DataToLoDTensorConverter(object):
+    def __init__(self, place, lod_level, shape, dtype):
+        self.place = place
+        self.lod_level = lod_level
+        self.shape = shape
+        if dtype == core.DataType.FP32:
+            self.dtype = 'float32'
+        elif dtype == core.DataType.INT64:
+            self.dtype = 'int64'
+        elif dtype == core.DataType.FP64:
+            self.dtype = 'float64'
+        elif dtype == core.DataType.INT32:
+            self.dtype = 'int32'
+        else:
+            raise ValueError("dtype must be any of [int32, float32, int64, "
+                             "float64]")
+
+        self.data = []
+        self.lod = []
+
+        for i in six.range(lod_level):
+            self.lod.append([0])
+
+    def feed(self, data):
+        self._feed_impl_(data, self.lod, self.lod_level)
+
+    def _feed_impl_(self, data, lod, lod_level):
+        if lod_level == 0:
+            self.data.append(data)
+        else:
+            cur_lod_len = len(data)
+            lod[-1].append(lod[-1][-1] + cur_lod_len)
+            for each_data in data:
+                self._feed_impl_(each_data, lod[:-1], lod_level - 1)
+
+    def done(self):
+        arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
+        t = core.LoDTensor()
+        t.set(arr, self.place)
+        if self.lod_level > 0:
+            t.set_lod(self.lod)
+        return t
+
+
+class DataFeeder(object):
+    def __init__(self, feed_list, place):
+        self.feed_dtypes = []
+        self.feed_names = []
+        self.feed_shapes = []
+        self.feed_lod_level = []
+        for each_var in feed_list:
+            if not isinstance(each_var, Variable):
+                raise TypeError("Feed list should contain a list of variable")
+            self.feed_dtypes.append(each_var.dtype)
+            self.feed_names.append(each_var.name)
+            shape = each_var.shape
+            batch_size_dim = -1
+            for i, s in enumerate(shape):
+                if s < 0:
+                    batch_size_dim = i
+                    break
+            if batch_size_dim == -1:
+                raise ValueError("Variable {0} must has a batch size dimension",
+                                 each_var.name)
+            self.feed_lod_level.append(each_var.lod_level)
+            self.feed_shapes.append(shape)
+
+        self.place = place
+
+    def feed(self, iterable):
+        converter = []
+        for lod_level, shape, dtype in six.zip(
+                self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
+            converter.append(
+                DataToLoDTensorConverter(
+                    place=self.place,
+                    lod_level=lod_level,
+                    shape=shape,
+                    dtype=dtype))
+
+        for each_sample in iterable:
+            for each_converter, each_slot in six.zip(converter, each_sample):
+                each_converter.feed(each_slot)
+        ret_dict = {}
+        for each_name, each_converter in six.zip(self.feed_names, converter):
+            ret_dict[each_name] = each_converter.done()
+        return ret_dict
diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/fluid/default_scope_funcs.py
similarity index 92%
rename from python/paddle/v2/framework/default_scope_funcs.py
rename to python/paddle/v2/fluid/default_scope_funcs.py
index c07f9a6ab9..60c6165b6b 100644
--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/fluid/default_scope_funcs.py
@@ -13,7 +13,7 @@ A `scoped_function` will take a `function` as input. That function will be
 invoked in a new local scope. 
 """
 
-import paddle.v2.framework.core
+import paddle.v2.fluid.core
 import threading
 
 __tl_scope__ = threading.local()
@@ -27,13 +27,13 @@ __all__ = [
 def get_cur_scope():
     """
     Get current scope.
-    :rtype: paddle.v2.framework.core.Scope
+    :rtype: paddle.v2.fluid.core.Scope
     """
     cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
     if cur_scope_stack is None:
         __tl_scope__.cur_scope = list()
     if len(__tl_scope__.cur_scope) == 0:
-        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope())
+        __tl_scope__.cur_scope.append(paddle.v2.fluid.core.Scope())
     return __tl_scope__.cur_scope[-1]
 
 
diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py
new file mode 100644
index 0000000000..137c573622
--- /dev/null
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -0,0 +1,134 @@
+import numpy as np
+
+import layers
+from framework import Program, unique_name, Variable
+from layer_helper import LayerHelper
+
+__all__ = ['Accuracy']
+
+
+def _clone_var_(block, var):
+    assert isinstance(var, Variable)
+    return block.create_var(
+        name=var.name,
+        shape=var.shape,
+        dtype=var.dtype,
+        type=var.type,
+        lod_level=var.lod_level,
+        persistable=True)
+
+
+class Evaluator(object):
+    """
+    Base Class for all evaluators
+    
+    Args:
+        name(str): The name of evaluator. such as, "accuracy". Used for generate 
+            temporary variable name.
+        main_program(Program, optional): The evaluator should be added to this 
+            main_program. Default default_main_program()
+        startup_program(Program, optional):The parameter should be added to this 
+            startup_program. Default default_startup_program()
+            
+    Attributes:
+        states(list): The list of state variables. states will be reset to zero 
+            when `reset` is invoked.
+        metrics(list): The list of metrics variables. They will be calculate 
+            every mini-batch
+    """
+
+    def __init__(self, name, **kwargs):
+        self.states = []
+        self.metrics = []
+        self.helper = LayerHelper(name, **kwargs)
+
+    def reset(self, executor, reset_program=None):
+        """
+        reset metric states at the begin of each pass/user specified batch
+        """
+        if reset_program is None:
+            reset_program = Program()
+
+        for var in self.states:
+            assert isinstance(var, Variable)
+            g_var = _clone_var_(reset_program.current_block(), var)
+            layers.fill_constant(
+                shape=g_var.shape,
+                value=0.0,
+                dtype=g_var.dtype,
+                out=g_var,
+                main_program=reset_program)
+
+        executor.run(reset_program)
+
+    def eval(self, executor, eval_program=None):
+        """
+        Evaluate the statistics merged by multiple mini-batches.
+        """
+        raise NotImplementedError()
+
+    def create_state(self, suffix, dtype, shape):
+        """
+        Create state variable. 
+        
+        NOTE: It is not a public API.
+        
+        Args:
+            suffix(str): the state suffix. 
+            dtype(str|core.DataType): the state data type 
+            shape(tuple|list): the shape of state 
+
+        Returns: State variable
+
+        """
+        state = self.helper.create_variable(
+            name="_".join([unique_name(self.helper.name), suffix]),
+            persistable=True,
+            dtype=dtype,
+            shape=shape)
+        self.states.append(state)
+        return state
+
+
+class Accuracy(Evaluator):
+    """
+    Average Accuracy for multiple mini-batches.
+    """
+
+    def __init__(self, input, label, k=1, **kwargs):
+        super(Accuracy, self).__init__("accuracy", **kwargs)
+        main_program = self.helper.main_program
+        if main_program.current_block().idx != 0:
+            raise ValueError("You can only invoke Evaluator in root block")
+
+        self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
+        self.correct = self.create_state(
+            dtype='int64', shape=[1], suffix='correct')
+        kwargs = {'main_program': main_program}
+        total = self.helper.create_tmp_variable(dtype='int')
+        correct = self.helper.create_tmp_variable(dtype='int')
+        acc = layers.accuracy(
+            input=input,
+            label=label,
+            k=k,
+            total=total,
+            correct=correct,
+            **kwargs)
+        total = layers.cast(x=total, dtype='int64', **kwargs)
+        correct = layers.cast(x=correct, dtype='int64', **kwargs)
+        layers.sums(input=[self.total, total], out=self.total, **kwargs)
+        layers.sums(input=[self.correct, correct], out=self.correct, **kwargs)
+
+        self.metrics.append(acc)
+
+    def eval(self, executor, eval_program=None):
+        if eval_program is None:
+            eval_program = Program()
+        block = eval_program.current_block()
+        kwargs = {'main_program': eval_program}
+        total = _clone_var_(block, self.total)
+        correct = _clone_var_(block, self.correct)
+        total = layers.cast(total, dtype='float32', **kwargs)
+        correct = layers.cast(correct, dtype='float32', **kwargs)
+        out = layers.elementwise_div(x=correct, y=total, **kwargs)
+        return np.array(executor.run(eval_program, fetch_list=[out])[0])
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
new file mode 100644
index 0000000000..bdc82eede9
--- /dev/null
+++ b/python/paddle/v2/fluid/executor.py
@@ -0,0 +1,152 @@
+import numpy as np
+from . import core
+from framework import Program, default_main_program
+
+__all__ = ['Executor', 'g_scope']
+
+g_scope = core.Scope()
+
+
+def as_numpy(tensor):
+    if isinstance(tensor, list):
+        return [as_numpy(t) for t in tensor]
+    assert isinstance(tensor, core.LoDTensor)
+    lod = tensor.lod()
+    tensor_data = np.array(tensor)
+    if len(lod) == 0:
+        ans = tensor_data
+    else:
+        raise RuntimeError("LoD Calculate lacks unit tests and buggy")
+    # elif len(lod) == 1:
+    #     ans = []
+    #     idx = 0
+    #     while idx < len(lod) - 1:
+    #         ans.append(tensor_data[lod[idx]:lod[idx + 1]])
+    #         idx += 1
+    # else:
+    #     for l in reversed(lod):
+    #         ans = []
+    #         idx = 0
+    #         while idx < len(l) - 1:
+    #             ans.append(tensor_data[l[idx]:l[idx + 1]])
+    #             idx += 1
+    #         tensor_data = ans
+    #     ans = tensor_data
+    return ans
+
+
+class Executor(object):
+    def __init__(self, places):
+        if not isinstance(places, list) and not isinstance(places, tuple):
+            places = [places]
+
+        act_places = []
+        for each in places:
+            p = core.Place()
+            p.set_place(each)
+            act_places.append(p)
+
+        self.executor = core.Executor(act_places)
+        self.places = places
+
+    def aslodtensor(self, data):
+        def accumulate(data):
+            if not isinstance(data, list):
+                return 1
+            return sum([accumulate(sub) for sub in data])
+
+        def parselod(data):
+            seq_lens = [accumulate(seq) for seq in data]
+            cur_len = 0
+            lod = [cur_len]
+            for l in seq_lens:
+                cur_len += l
+                lod.append(cur_len)
+            return lod
+
+        assert len(self.places) != 0
+        if not isinstance(data, list):
+            # pure tensor case
+            tensor = core.LoDTensor()
+            tensor.set(data, self.places[0])
+            return tensor
+        else:
+            raise RuntimeError("Current implementation lacks unittests")
+            # lodtensor case
+            lod = []
+            if not isinstance(data[0], list):
+                lod.append(parselod(data))
+                flattened_data = np.concatenate(data, axis=0).astype("int64")
+            else:
+                while isinstance(data[0], list):
+                    lod.append(parselod(seq))
+                    flattened_data = [item for seq in data for item in seq]
+                    data = flattened_data
+                flattened_data = np.concatenate(data, axis=0).astype("int64")
+            flattened_data = flattened_data.reshape([len(flattened_data), 1])
+            tensor = core.LoDTensor()
+            tensor.set(flattened_data, self.places[0])
+            tensor.set_lod(lod)
+            return tensor
+
+    def run(self,
+            program=None,
+            feed=None,
+            fetch_list=None,
+            feed_var_name='feed',
+            fetch_var_name='fetch',
+            scope=None,
+            return_numpy=True):
+        if feed is None:
+            feed = {}
+        if fetch_list is None:
+            fetch_list = []
+
+        if program is None:
+            program = default_main_program()
+
+        if not isinstance(program, Program):
+            raise TypeError()
+
+        if scope is None:
+            scope = g_scope
+
+        program = program.clone()
+        global_block = program.global_block()
+        feed_var = global_block.create_var(
+            name=feed_var_name,
+            type=core.VarDesc.VarType.FEED_MINIBATCH,
+            persistable=True)
+
+        for i, name in enumerate(feed):
+            out = global_block.var(name)
+            global_block.prepend_op(
+                'feed',
+                inputs={'X': [feed_var]},
+                outputs={'Out': [out]},
+                attrs={'col': i})
+            cur_feed = feed[name]
+            if not isinstance(cur_feed, core.LoDTensor):
+                cur_feed = self.aslodtensor(cur_feed)
+            core.set_feed_variable(scope, cur_feed, feed_var.name, i)
+
+        fetch_var = global_block.create_var(
+            name=fetch_var_name,
+            type=core.VarDesc.VarType.FETCH_LIST,
+            persistable=True)
+        for i, var in enumerate(fetch_list):
+            global_block.append_op(
+                type='fetch',
+                inputs={'X': [var]},
+                outputs={'Out': [fetch_var]},
+                attrs={'col': i})
+
+        self.executor.run(program.desc, scope, 0, True)
+        outs = [
+            core.get_fetch_variable(scope, fetch_var_name, i)
+            for i in xrange(len(fetch_list))
+        ]
+
+        if return_numpy:
+            outs = as_numpy(outs)
+        return outs
diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/fluid/framework.py
similarity index 52%
rename from python/paddle/v2/framework/framework.py
rename to python/paddle/v2/fluid/framework.py
index 4e737549c9..bf0cd275b6 100644
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -1,18 +1,128 @@
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
 import collections
+
 import numpy as np
-import copy
+from . import core
+import proto.framework_pb2 as framework_pb2
+import google.protobuf.message
+import contextlib
 
-__all__ = ['Block', 'Variable', 'Program', 'Operator']
+__all__ = [
+    'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
+    'default_main_program', 'program_guard', 'switch_startup_program',
+    'switch_main_program'
+]
 
 
 def unique_name(prefix):
+    """
+    Generate unique names with prefix
+
+    Args:
+        prefix(str): The prefix of return string
+
+    Returns(str): A unique string with the prefix
+
+    """
     uid = core.unique_integer(prefix)  # unique during whole process.
     return "_".join([prefix, str(uid)])
 
 
+def convert_np_dtype_to_dtype_(np_dtype):
+    """
+    Convert the data type in numpy to the data type in Paddle
+    Args:
+        np_dtype(np.dtype): the data type in numpy
+
+    Returns(core.DataType): the data type in Paddle
+
+    """
+    dtype = np.dtype(np_dtype)
+    if dtype == np.float32:
+        return core.DataType.FP32
+    elif dtype == np.float64:
+        return core.DataType.FP64
+    elif dtype == np.float16:
+        return core.DataType.FP16
+    elif dtype == np.int32:
+        return core.DataType.INT32
+    elif dtype == np.int16:
+        return core.DataType.INT16
+    elif dtype == np.int64:
+        return core.DataType.INT64
+    elif dtype == np.bool:
+        return core.DataType.BOOL
+    else:
+        raise ValueError("Not supported numpy dtype " + str(dtype))
+
+
+def dtype_is_floating(dtype):
+    """
+    Check the data type is floating or not.
+    Args:
+        dtype(np.dtype|core.DataType): data type.
+            Could be numpy format or Paddle format
+
+    Returns(bool): True if data type is a float value
+
+    """
+    if not isinstance(dtype, core.DataType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    return dtype in [core.DataType.FP16, core.DataType.FP32, core.DataType.FP64]
+
+
+def _debug_string_(proto, throw_on_error=True):
+    """
+    Get the debug string of a protobuf message. The message could be not
+    initialized.
+    Args:
+        proto(google.protobuf.message.Message): The protobuf message
+        throw_on_error(bool): True if raise an error when the protobuf message
+            is not initialized.
+
+    Returns(str): The debug string of the protobuf message
+
+    """
+    error_fields = list()
+    if not proto.IsInitialized(error_fields) and throw_on_error:
+        raise ValueError("{0} are not initialized\nThe message is {1}".format(
+            error_fields, proto))
+    return proto.__str__()
+
+
 class Variable(object):
+    """
+    Python variable. Every input and output of an operator is a variable. Every
+    variable belongs to a block. The variable has a name and two variables in
+    different blocks could have the same name.
+
+    There are many kinds of variables. Please reference the framework.proto for
+    details.
+
+    Notes: The constructor of Variable should not be invoked directly. Please
+    use `Block.create_var` to create a variable.
+
+    >>> cur_program = Program()
+    >>> cur_block = cur_program.current_block()
+    >>> new_variable = cur_block.create_var(
+    >>>                    name="X", shape=[-1, 23, 48], dtype='float32')
+
+    Args:
+        block(Block): The associated block. It will be passed by
+            `Block.create_var` automatically.
+        type(core.VarDesc.VarType): Variable type. Please reference the
+            framework.proto for details.
+        shape(tuple|list|None): The shape of variable. -1 means the batch size.
+            Some kinds of variable do not contain shape, just set it to None.
+        dtype(np.dtype|core.DataType|str): The data type of variable.
+        lod_level(int): The level of lod tensor. 0 means there is not a time
+            series data.
+        persistable(bool): True if the variable should be saved as check point.
+            Defaults to False.
+        stop_gradient(bool): True if the variable will stop to calculate
+            gradients when backward. Defaults to False.
+    """
+
     def __init__(self,
                  block,
                  type=core.VarDesc.VarType.LOD_TENSOR,
@@ -21,6 +131,7 @@ class Variable(object):
                  dtype=None,
                  lod_level=None,
                  persistable=None,
+                 stop_gradient=False,
                  **kwargs):
         self.block = block
 
@@ -54,11 +165,11 @@ class Variable(object):
                         "matched.".format(self.name, old_shape, shape))
         if dtype is not None:
             if not isinstance(dtype, core.DataType):
-                dtype = Variable._convert_np_dtype_to_dtype_(dtype)
+                dtype = convert_np_dtype_to_dtype_(dtype)
             if is_new_var:
-                self.desc.set_data_type(dtype)
+                self.desc.set_dtype(dtype)
             else:
-                old_dtype = self.data_type
+                old_dtype = self.dtype
                 if dtype != old_dtype:
                     raise ValueError("Variable {0} has been created before. "
                                      "The previous data type is {1}; the new "
@@ -89,11 +200,25 @@ class Variable(object):
 
         self.block.vars[name] = self
         self.op = None
+        self.stop_gradient = stop_gradient
 
     def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error):
+        """
+        Get debug string.
+
+        Args:
+            throw_on_error(bool): True if raise an exception when self is not
+                intialized.
+
+        Returns(str): The debug string.
+
+        """
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.VarDesc.FromString(str(protostr))
-        return proto.__str__()
+        return _debug_string_(proto, throw_on_error)
 
     __repr__ = __str__
 
@@ -115,8 +240,8 @@ class Variable(object):
         return tuple(self.desc.shape())
 
     @property
-    def data_type(self):
-        return self.desc.data_type()
+    def dtype(self):
+        return self.desc.dtype()
 
     @property
     def lod_level(self):
@@ -132,31 +257,13 @@ class Variable(object):
         uid = core.unique_integer(prefix)  # unique during whole process.
         return "_".join([prefix, str(uid)])
 
-    @staticmethod
-    def _convert_np_dtype_to_dtype_(np_dtype):
-        dtype = np.dtype(np_dtype)
-        if dtype == np.float32:
-            return core.DataType.FP32
-        elif dtype == np.float64:
-            return core.DataType.FP64
-        elif dtype == np.float16:
-            return core.DataType.FP16
-        elif dtype == np.int32:
-            return core.DataType.INT32
-        elif dtype == np.int16:
-            return core.DataType.INT16
-        elif dtype == np.int64:
-            return core.DataType.INT64
-        elif dtype == np.bool:
-            return core.DataType.BOOL
-        else:
-            raise ValueError("Not supported numpy dtype " + str(dtype))
-
 
 def get_all_op_protos():
     """
     Get all registered op proto from PaddlePaddle C++ end.
-    :return: A list of registered OpProto.
+
+    Returns(list): list of OpProto
+
     """
     protostrs = core.get_all_op_protos()
     ret_values = []
@@ -167,6 +274,10 @@ def get_all_op_protos():
 
 
 class OpProtoHolder(object):
+    """
+    A global variable to hold all OpProtos from C++ as a map
+    """
+
     @classmethod
     def instance(cls):
         if not hasattr(cls, '_instance'):
@@ -183,12 +294,26 @@ class OpProtoHolder(object):
             self.op_proto_map[proto.type] = proto
 
     def get_op_proto(self, type):
+        """
+        Get OpProto by a type string.
+        Args:
+            type(str): The type that operator registered in C++ side.
+
+        Returns(framework_pb2.OpProto): The OpProto
+
+        """
         if type not in self.op_proto_map:
             raise ValueError("Operator \"%s\" has not been registered." % type)
         return self.op_proto_map[type]
 
 
 class Operator(object):
+    """
+    Python Operator class. The operator represents the build in instructs in a
+    Block. Users can use the build in instructs to describe their neural
+    network.
+    """
+
     def __init__(self,
                  block,
                  desc,
@@ -196,6 +321,30 @@ class Operator(object):
                  inputs=None,
                  outputs=None,
                  attrs=None):
+        """
+        Constructor.
+
+        Notes: The constructor of operator should not be invoked directly. Use
+        Block.append_op or Block.prepend_op instead.
+
+        >>> cur_program = Program()
+        >>> cur_block = cur_program.current_block()
+        >>> # var1 += var2 + var3
+        >>> cur_block.append_op(type="sum",
+        >>>                     inputs={"X": [var1, var2, var3]},
+        >>>                     outputs={"Out": [var1]})
+
+        Args:
+            block(Block): The block has the current operator
+            desc(core.OpDesc): The protobuf description
+            type(str): The type of operator.
+            inputs(dict): The input dictionary. Key is the input parameter name.
+                Value is a list of variables.
+            outputs(dict): The output dictionary. Has same format with inputs
+            attrs(dict): The attributes dictionary. Key is attribute name. Value
+                is the attribute value. The attribute type should be as same as
+                the type registered in C++
+        """
         self.block = block
         self.desc = desc
         if len(self.desc.type()) != 0:
@@ -208,7 +357,7 @@ class Operator(object):
 
         def find_name(var_list, name):
             for var_name in var_list:
-                if var_name == name:
+                if var_list[var_name] is not None and var_name == name:
                     return True
             return False
 
@@ -219,17 +368,17 @@ class Operator(object):
                     in_proto.name)
 
                 if found:
-                    in_argus = inputs[in_proto.name]
-                    if not isinstance(in_argus, list):
-                        in_argus = [in_argus]
-                    if not in_proto.duplicable and len(in_argus) > 1:
+                    in_args = inputs[in_proto.name]
+                    if not isinstance(in_args, list):
+                        in_args = [in_args]
+                    if not in_proto.duplicable and len(in_args) > 1:
                         raise ValueError(
                             "Input %s expects only one input, but %d are given."
-                            % (in_proto.name, len(in_argus)))
-                    in_argu_names = []
-                    for argu in in_argus:
-                        in_argu_names.append(argu.name)
-                    self.desc.set_input(in_proto.name, in_argu_names)
+                            % (in_proto.name, len(in_args)))
+                    in_arg_names = []
+                    for arg in in_args:
+                        in_arg_names.append(arg.name)
+                    self.desc.set_input(in_proto.name, in_arg_names)
                 else:
                     self.desc.set_input(in_proto.name, [])
 
@@ -247,18 +396,18 @@ class Operator(object):
                         str(e) for e in given)))
 
             for out_proto in proto.outputs:
-                out_argus = outputs[out_proto.name]
-                if not isinstance(out_argus, list):
-                    out_argus = [out_argus]
-                if not out_proto.duplicable and len(out_argus) > 1:
+                out_args = outputs[out_proto.name]
+                if not isinstance(out_args, list):
+                    out_args = [out_args]
+                if not out_proto.duplicable and len(out_args) > 1:
                     raise ValueError(
                         "Output %s expects only one output, but %d are given." %
-                        (out_proto.name, len(out_argus)))
-                out_argu_names = []
-                for argu in out_argus:
-                    out_argu_names.append(argu.name)
-                    argu.op = self
-                self.desc.set_output(out_proto.name, out_argu_names)
+                        (out_proto.name, len(out_args)))
+                out_arg_names = []
+                for arg in out_args:
+                    out_arg_names.append(arg.name)
+                    arg.op = self
+                self.desc.set_output(out_proto.name, out_arg_names)
 
         if attrs is not None:
             if not isinstance(attrs, dict):
@@ -275,16 +424,28 @@ class Operator(object):
         self.desc.check_attrs()
         no_kernel_op_set = {
             'feed', 'fetch', 'save', 'load', 'recurrent',
-            'rnn_memory_helper_grad'
+            'rnn_memory_helper_grad', 'conditional_block', 'while'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
 
-    def __str__(self):
+    def to_string(self, throw_on_error):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+
+        Returns(str): The debug string.
+
+        """
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.OpDesc.FromString(str(protostr))
-        return proto.__str__()
+        return _debug_string_(proto, throw_on_error)
+
+    def __str__(self):
+        return self.to_string(True)
 
     __repr__ = __str__
 
@@ -293,21 +454,55 @@ class Operator(object):
         return self.desc.type()
 
     def input(self, name):
+        """
+        Get input arguments by the input parameter name
+        Args:
+            name(str): The input parameter name
+
+        Returns(list): return the list of argument names associated with the
+            specific parameter name.
+
+        """
         return self.desc.input(name)
 
     @property
     def input_names(self):
+        """
+        Get all input parameter names
+        Returns(list): return a list of input parameter names
+
+        """
         return self.desc.input_names()
 
     def output(self, name):
+        """
+        Get output arguments by the output parameter name
+        Args:
+            name(str): The output parameter name
+
+        Returns(list): return the list of argument names associated with the
+            specific parameter name.
+
+        """
         return self.desc.output(name)
 
     @property
     def output_names(self):
+        """
+        Get all output parameter names
+        Returns(list): return a list of output parameter names
+
+        """
         return self.desc.output_names()
 
     @property
     def idx(self):
+        """
+        Return the array index of current operator.
+        Returns(int): The array index in block.ops array
+        Raises:
+            ValueError: when the operator is not found.
+        """
         for i, op in enumerate(self.block.ops):
             if op == self:
                 return i
@@ -315,19 +510,57 @@ class Operator(object):
             "Can't find op itself in it's block. It could be a bug of Paddle.")
 
     def has_attr(self, name):
+        """
+        operator has the attribute with name or not.
+        Args:
+            name(str): the attribute name
+
+        Returns(bool): True if has this attribute.
+
+        """
         return self.desc.has_attr(name)
 
     def attr_type(self, name):
+        """
+        Get the type of attribute by attribute name
+        Args:
+            name(str): the attribute name
+
+        Returns(core.AttrType): the attribute type
+
+        """
         return self.desc.attr_type(name)
 
     @property
     def attr_names(self):
+        """
+        Get all attribute names
+        Returns(list): The list of attribute name
+
+        """
         return self.desc.attr_names()
 
     def attr(self, name):
+        """
+        Get attribute by name
+        Args:
+            name(str): the attribute name
+
+        Returns(bool|int|str|float|list): The attribute value. The return value
+            can be any valid attribute type.
+
+        """
         return self.desc.attr(name)
 
     def block_attr(self, name):
+        """
+        Get the block attribute by name
+        Args:
+            name(str): the attribute name
+
+        Returns(int): the block index
+
+        """
         return self.desc.block_attr(name)
 
 
@@ -339,9 +572,12 @@ class Block(object):
         self.program = program
 
     def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error):
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.BlockDesc.FromString(str(protostr))
-        return proto.__str__()
+        return _debug_string_(proto, throw_on_error)
 
     __repr__ = __str__
 
@@ -362,7 +598,11 @@ class Block(object):
         return v
 
     def all_parameters(self):
-        return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)}
+        return list(self.iter_parameters())
+
+    def iter_parameters(self):
+        return (item[1] for item in self.vars.iteritems()
+                if isinstance(item[1], Parameter))
 
     def create_var(self, *args, **kwargs):
         var = Variable(self, *args, **kwargs)
@@ -436,23 +676,59 @@ class Block(object):
         for index in range(len(self.ops)):
             assert self.ops[index].desc == ops_in_cpp[index]
 
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from other block
+        Args:
+            other(Block): other block
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Block):
+            raise TypeError("copy_param_info_from should be invoked with Block")
+        for p in other.iter_parameters():
+            assert isinstance(p, Parameter)
+            v = self.vars.get(p.name, None)
+            if v is None:
+                raise ValueError("copy_param_info_from should be invoked with "
+                                 "same topology")
+            assert isinstance(v, Variable)
+            new_p = Parameter(
+                block=self,
+                shape=v.shape,
+                dtype=v.dtype,
+                type=v.type,
+                lod_level=v.lod_level,
+                stop_gradient=p.stop_gradient,
+                trainable=p.trainable,
+                optimize_attr=p.optimize_attr,
+                regularizer=p.regularizer,
+                name=v.name)
+            self.vars[new_p.name] = new_p
+
 
 class Program(object):
     def __init__(self):
         self.desc = core.ProgramDesc()
         self.blocks = [Block(self, 0)]
         self.current_block_idx = 0
+        self._seed = 0
 
     def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error):
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.ProgramDesc.FromString(str(protostr))
-        return proto.__str__()
+        return _debug_string_(proto, throw_on_error)
 
     def clone(self):
         p = Program()
         p.desc = core.ProgramDesc(self.desc)
         p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
         p.sync_with_cpp()
+        p.copy_param_info_from(self)
         return p
 
     def prune(self, targets):
@@ -475,6 +751,13 @@ class Program(object):
         res.sync_with_cpp()
         return res
 
+    def inference_optimize(self):
+        res = Program()
+        res.desc = core.inference_optimize(self.desc)
+        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.sync_with_cpp()
+        return res
+
     @staticmethod
     def parse_from_string(binary_str):
         p = Program()
@@ -483,6 +766,16 @@ class Program(object):
         p.sync_with_cpp()
         return p
 
+    @property
+    def random_seed(self):
+        return self._seed
+
+    @random_seed.setter
+    def random_seed(self, seed):
+        if not isinstance(seed, int):
+            raise ValueError("Seed must be a integer.")
+        self._seed = seed
+
     def __repr__(self):
         return str(self)
 
@@ -502,7 +795,14 @@ class Program(object):
         assert isinstance(target, Variable)
         if no_grad_set is None:
             no_grad_set = set()
-        param_to_grad_info = self.desc.append_backward(target.desc, no_grad_set)
+        try:
+            param_to_grad_info = self.desc.append_backward(target.desc,
+                                                           no_grad_set)
+        except Exception as e:
+            raise core.EnforceNotMet(
+                str(e) + "\nCurrent protobuf is\n{0}".format(
+                    self.to_string(False)))
+
         self.sync_with_cpp()
         return param_to_grad_info
 
@@ -522,6 +822,24 @@ class Program(object):
         for block in self.blocks:
             block.sync_with_cpp()
 
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from other program.
+        Args:
+            other(Program): Other program
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Program):
+            raise TypeError("copy_param_info_from should be invoked with "
+                            "Program")
+
+        if len(self.blocks) != len(other.blocks):
+            raise ValueError("copy_param_info_from should be invoked with two "
+                             "program, with represent the same topology")
+        self.global_block().copy_param_info_from(other.global_block())
+
     def list_vars(self):
         for each_block in self.blocks:
             for each_var in each_block.vars.itervalues():
@@ -550,5 +868,88 @@ class Parameter(Variable):
 
 
 # program is a global instance.
-g_program = Program()
-g_init_program = Program()
+_main_program_ = Program()
+_startup_program_ = Program()
+
+
+def default_startup_program():
+    """
+    Get default startup program. In startup program, Paddle will initialize
+    parameters, initialize nccl handle, etc.
+
+    Returns:
+        Program: startup program
+    """
+    return _startup_program_
+
+
+def default_main_program():
+    """
+    Get default main program. The main program is used for training or testing.
+
+    Returns:
+        Program: main program
+    """
+    return _main_program_
+
+
+def switch_main_program(program):
+    """
+    Switch the main program to a new program.
+
+    Args:
+        program(Program): The new main program
+
+    Returns:
+        Program: The previous main program
+    """
+    global _main_program_
+    prev_program = _main_program_
+    _main_program_ = program
+    return prev_program
+
+
+def switch_startup_program(program):
+    """
+    Switch the startup program to a new program
+    Args:
+        program(Program): The new startup program
+
+    Returns:
+        Program: The previous startup program
+    """
+    global _startup_program_
+    prev_program = _startup_program_
+    _startup_program_ = program
+    return prev_program
+
+
+@contextlib.contextmanager
+def program_guard(main_program, startup_program=None):
+    """
+    Switch program with `with` statement
+
+    Examples:
+        >>> with program_guard(Program()):
+        >>>   data = fluid.layers.data(...)
+        >>>   hidden = fluid.layers.fc(...)
+
+    Args:
+        main_program(Program): New main program inside `with` statement
+        startup_program(Program): New startup program inside `with` statement.
+            None means do not change startup program.
+
+    Returns:
+        None
+    """
+    if not isinstance(main_program, Program):
+        raise TypeError("main_program should be Program")
+    main_program = switch_main_program(main_program)
+    if startup_program is not None:
+        if not isinstance(startup_program, Program):
+            raise TypeError("startup_program should be Program")
+        startup_program = switch_startup_program(startup_program)
+    yield
+    switch_main_program(main_program)
+    if startup_program is not None:
+        switch_startup_program(startup_program)
diff --git a/python/paddle/v2/framework/initializer.py b/python/paddle/v2/fluid/initializer.py
similarity index 69%
rename from python/paddle/v2/framework/initializer.py
rename to python/paddle/v2/fluid/initializer.py
index 98a87bfa86..c0839caaf2 100644
--- a/python/paddle/v2/framework/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
@@ -1,10 +1,7 @@
-import paddle.v2.framework.framework as framework
+import framework
 import numpy as np
 
-__all__ = [
-    'ConstantInitializer', 'UniformInitializer', 'NormalInitializer',
-    'XavierInitializer'
-]
+__all__ = ['Constant', 'Uniform', 'Normal', 'Xavier']
 
 
 class Initializer(object):
@@ -93,7 +90,7 @@ class ConstantInitializer(Initializer):
             outputs={"Out": var},
             attrs={
                 "shape": var.shape,
-                "data_type": int(var.data_type),
+                "dtype": int(var.dtype),
                 "value": self._value
             })
         var.op = op
@@ -135,12 +132,14 @@ class UniformInitializer(Initializer):
         assert isinstance(var, framework.Variable)
         assert isinstance(block, framework.Block)
         # Initialization Ops should be prepended and not appended
+        if self._seed == 0:
+            self._seed = block.program.random_seed
         op = block.prepend_op(
             type="uniform_random",
             outputs={"Out": var},
             attrs={
                 "shape": var.shape,
-                "data_type": int(var.data_type),
+                "dtype": int(var.dtype),
                 "min": self._low,
                 "max": self._high,
                 "seed": self._seed
@@ -183,12 +182,14 @@ class NormalInitializer(Initializer):
         assert isinstance(var, framework.Variable)
         assert isinstance(block, framework.Block)
         # Initialization Ops should be prepended and not appended
+        if self._seed == 0:
+            self._seed = block.program.random_seed
         op = block.prepend_op(
             type="gaussian_random",
             outputs={"Out": var},
             attrs={
                 "shape": var.shape,
-                "data_type": int(var.data_type),
+                "dtype": int(var.dtype),
                 "mean": self._mean,
                 "std": self._std_dev,
                 "seed": self._seed
@@ -258,6 +259,9 @@ class XavierInitializer(Initializer):
         fan_in = f_in if self._fan_in is None else self._fan_in
         fan_out = f_out if self._fan_out is None else self._fan_out
 
+        if self._seed == 0:
+            self._seed = block.program.random_seed
+
         if self._uniform:
             limit = np.sqrt(6.0 / float(fan_in + fan_out))
             op = block.prepend_op(
@@ -265,7 +269,7 @@ class XavierInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={
                     "shape": var.shape,
-                    "data_type": int(var.data_type),
+                    "dtype": int(var.dtype),
                     "min": -limit,
                     "max": limit,
                     "seed": self._seed
@@ -278,10 +282,112 @@ class XavierInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={
                     "shape": var.shape,
-                    "data_type": int(var.data_type),
+                    "dtype": int(var.dtype),
                     "mean": 0.0,
                     "std": std,
                     "seed": self._seed
                 })
         var.op = op
         return op
+
+
+class MSRAInitializer(Initializer):
+    """Implements the MSRA initializer a.k.a. Kaiming Initializer
+
+    This class implements the weight initialization from the paper
+    Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren
+    and Jian Sun. This is a robust initialization method that particularly
+    considers the rectifier nonlinearities. In case of Uniform distribution,
+    the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal
+    distribution, the mean is 0 and the standard deviation
+    is sqrt(2/ fan_in).
+
+    References:
+        [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance
+            on ImageNet Classification
+            (https://arxiv.org/abs/1502.01852)
+    """
+
+    def __init__(self, uniform=True, fan_in=None, seed=0):
+        """Constructor for MSRAInitializer
+
+        Args:
+            uniform: whether to use uniform or normal distribution
+            fan_in: fan_in for MSRAInitializer. If None, it is
+                    inferred from the variable.
+            seed: random seed
+
+        Note: It is recommended to set fan_in to None for most cases.
+        """
+        assert uniform is not None
+        assert seed is not None
+        super(MSRAInitializer, self).__init__()
+        self._uniform = uniform
+        self._fan_in = fan_in
+        self._seed = seed
+
+    def __call__(self, var, block):
+        """Add MSRA initialization ops for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        f_in, f_out = self._compute_fans(var)
+
+        # If fan_in is passed, use it
+        fan_in = f_in if self._fan_in is None else self._fan_in
+
+        if self._seed == 0:
+            self._seed = block.program.random_seed
+
+        if self._uniform:
+            limit = np.sqrt(6.0 / float(fan_in))
+            op = block.prepend_op(
+                type="uniform_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "dtype": int(var.dtype),
+                    "min": -limit,
+                    "max": limit,
+                    "seed": self._seed
+                })
+
+        else:
+            std = np.sqrt(2.0 / float(fan_in))
+            op = block.prepend_op(
+                type="gaussian_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "dtype": int(var.dtype),
+                    "mean": 0.0,
+                    "std": std,
+                    "seed": self._seed
+                })
+        var.op = op
+        return op
+
+
+# We short the class name, since users will use the initializer with the package
+# name. The sample code:
+#
+# import paddle.fluid as fluid
+#
+# hidden = fluid.layers.fc(...,
+#                          param_attr=ParamAttr(fluid.initializer.Xavier()))
+#
+# It is no need to add an `Initializer` as the class suffix
+Constant = ConstantInitializer
+Uniform = UniformInitializer
+Normal = NormalInitializer
+Xavier = XavierInitializer
+MSRA = MSRAInitializer
diff --git a/python/paddle/v2/framework/io.py b/python/paddle/v2/fluid/io.py
similarity index 61%
rename from python/paddle/v2/framework/io.py
rename to python/paddle/v2/fluid/io.py
index f3ba719bde..e147ac22ad 100644
--- a/python/paddle/v2/framework/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -1,12 +1,12 @@
 import os
 import cPickle as pickle
 
-from paddle.v2.framework.framework import Program, Parameter, g_program, \
-    Variable
+from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
 
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
-    'load_persistables', "save_inference_model", "load_inference_model"
+    'load_persistables', "save_inference_model", "load_inference_model",
+    "get_inference_program"
 ]
 
 
@@ -23,19 +23,19 @@ def _clone_var_in_block_(block, var):
     return block.create_var(
         name=var.name,
         shape=var.shape,
-        dtype=var.data_type,
+        dtype=var.dtype,
         type=var.type,
         lod_level=var.lod_level,
         persistable=True)
 
 
-def save_vars(executor, dirname, program=None, vars=None, predicate=None):
+def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     Save variables to directory by executor.
 
     :param executor: executor that save variable
     :param dirname: directory path
-    :param program: program. If vars is None, then filter all variables in this 
+    :param main_program: program. If vars is None, then filter all variables in this
     program which fit `predicate`. Default g_program.
     :param predicate: The Predicate describes a callable that returns a variable
     as a bool. If it returns true, the variables will be saved.
@@ -44,15 +44,15 @@ def save_vars(executor, dirname, program=None, vars=None, predicate=None):
     :return: None
     """
     if vars is None:
-        if program is None:
-            program = g_program
-        if not isinstance(program, Program):
+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
             raise TypeError("program should be as Program type or None")
 
         save_vars(
             executor,
             dirname=dirname,
-            vars=filter(predicate, program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()))
     else:
         save_program = Program()
         save_block = save_program.global_block()
@@ -66,54 +66,54 @@ def save_vars(executor, dirname, program=None, vars=None, predicate=None):
         executor.run(save_program)
 
 
-def save_params(executor, dirname, program=None):
+def save_params(executor, dirname, main_program=None):
     """
     Save all parameters to directory with executor.
     """
     save_vars(
         executor,
         dirname=dirname,
-        program=program,
+        main_program=main_program,
         vars=None,
         predicate=is_parameter)
 
 
-def save_persistables(executor, dirname, program=None):
+def save_persistables(executor, dirname, main_program=None):
     """
     Save all persistables to directory with executor.
     """
     save_vars(
         executor,
         dirname=dirname,
-        program=program,
+        main_program=main_program,
         vars=None,
         predicate=is_persistable)
 
 
-def load_vars(executor, dirname, program=None, vars=None, predicate=None):
+def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     """
     Load variables from directory by executor.
 
     :param executor: executor that save variable
     :param dirname: directory path
-    :param program: program. If vars is None, then filter all variables in this 
-    program which fit `predicate`. Default g_program.
+    :param main_program: program. If vars is None, then filter all variables in this
+    program which fit `predicate`. Default default_main_program().
     :param predicate: The Predicate describes a callable that returns a variable
     as a bool. If it returns true, the variables will be loaded.
-    :param vars: variables need to be loaded. If specify vars, program & 
+    :param vars: variables need to be loaded. If specify vars, program &
     predicate will be ignored
     :return: None
     """
     if vars is None:
-        if program is None:
-            program = g_program
-        if not isinstance(program, Program):
+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
             raise TypeError("program's type should be Program")
 
         load_vars(
             executor,
             dirname=dirname,
-            vars=filter(predicate, program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()))
     else:
         load_prog = Program()
         load_block = load_prog.global_block()
@@ -129,63 +129,81 @@ def load_vars(executor, dirname, program=None, vars=None, predicate=None):
         executor.run(load_prog)
 
 
-def load_params(executor, dirname, program=None):
+def load_params(executor, dirname, main_program=None):
     """
     load all parameters from directory by executor.
     """
     load_vars(
-        executor, dirname=dirname, program=program, predicate=is_parameter)
+        executor,
+        dirname=dirname,
+        main_program=main_program,
+        predicate=is_parameter)
 
 
-def load_persistables(executor, dirname, program=None):
+def load_persistables(executor, dirname, main_program=None):
     """
     load all persistables from directory by executor.
     """
     load_vars(
-        executor, dirname=dirname, program=program, predicate=is_persistable)
+        executor,
+        dirname=dirname,
+        main_program=main_program,
+        predicate=is_persistable)
+
+
+def get_inference_program(target_vars, main_program=None):
+    if main_program is None:
+        main_program = default_main_program()
+    if not isinstance(target_vars, list):
+        target_vars = [target_vars]
+
+    pruned_program = main_program.prune(targets=target_vars)
+    inference_program = pruned_program.inference_optimize()
+    return inference_program
 
 
 def save_inference_model(dirname,
                          feeded_var_names,
                          target_vars,
                          executor,
-                         program=None):
+                         main_program=None):
     """
-    Build a model especially for inference, 
+    Build a model especially for inference,
     and save it to directory by the executor.
 
     :param dirname: directory path
     :param feeded_var_names: Names of variables that need to be feeded data during inference
     :param target_vars: Variables from which we can get inference results.
     :param executor: executor that save inference model
-    :param program: original program, which will be pruned to build the inference model. 
-    Default g_program.
+    :param main_program: original program, which will be pruned to build the inference model.
+            Default default_main_program().
 
     :return: None
     """
-    if program is None:
-        program = g_program
+    if main_program is None:
+        main_program = default_main_program()
     if not isinstance(target_vars, list):
         target_vars = [target_vars]
 
     if not os.path.isdir(dirname):
         os.makedirs(dirname)
 
-    pruned_program = program.prune(target_vars)
+    pruned_program = main_program.prune(targets=target_vars)
+    inference_program = pruned_program.inference_optimize()
     fetch_var_names = [v.name for v in target_vars]
 
     model_file_name = dirname + "/__model__"
     with open(model_file_name, "w") as f:
         pickle.dump({
-            "program_desc_str": pruned_program.desc.serialize_to_string(),
+            "program_desc_str": inference_program.desc.serialize_to_string(),
             "feed_var_names": feeded_var_names,
             "fetch_var_names": fetch_var_names
         }, f, -1)
 
-    save_params(executor, dirname, program)
+    save_params(executor, dirname, main_program)
 
 
-def load_persistables_if_exist(executor, dirname, program=None):
+def load_persistables_if_exist(executor, dirname, main_program=None):
     filenames = next(os.walk(dirname))[2]
     filenames = set(filenames)
 
@@ -198,7 +216,7 @@ def load_persistables_if_exist(executor, dirname, program=None):
     load_vars(
         executor,
         dirname,
-        program=program,
+        main_program=main_program,
         vars=None,
         predicate=_is_presistable_and_exist_)
 
@@ -228,3 +246,35 @@ def load_inference_model(dirname, executor):
     fetch_vars = [program.global_block().var(name) for name in fetch_var_names]
 
     return [program, feed_var_names, fetch_vars]
+
+
+def get_parameter_value(para, executor):
+    """
+    Get the LoDTensor for the parameter
+
+    :param executor: executor for retrieving the value
+    :param para: the given parameter
+    :return: the LoDTensor for the parameter
+    """
+    assert is_parameter(para)
+
+    get_program = Program()
+    block = get_program.global_block()
+    new_var = _clone_var_in_block_(block, para)
+    return executor.run(get_program, feed={}, fetch_list=[new_var])[0]
+
+
+def get_parameter_value_by_name(name, executor, program=None):
+    """
+    Get the LoDTensor for paramter with the given name
+
+    :param executor: executor for retrieving the value
+    :param name: the name of the parameter
+    :param program: the program where the variable is found
+            Default default_main_program().
+    :return: the LoDTensor for the variable
+    """
+    if program is None:
+        program = default_main_program()
+    var = program.global_block().var(name)
+    return get_parameter_value(var, executor)
diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
similarity index 53%
rename from python/paddle/v2/framework/layer_helper.py
rename to python/paddle/v2/fluid/layer_helper.py
index 9e80eaa647..3963e13222 100644
--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -1,10 +1,10 @@
 import copy
 import itertools
 
-from paddle.v2.framework.framework import Variable, g_program, \
-    g_init_program, unique_name, Program
-from paddle.v2.framework.initializer import ConstantInitializer, \
-    UniformInitializer
+from framework import Variable, Parameter, default_main_program, default_startup_program, \
+    unique_name, dtype_is_floating
+from paddle.v2.fluid.initializer import Constant, Xavier
+from param_attr import ParamAttr
 
 
 class LayerHelper(object):
@@ -20,23 +20,23 @@ class LayerHelper(object):
         return self.kwargs['name']
 
     @property
-    def program(self):
-        prog = self.kwargs.get('program', None)
+    def main_program(self):
+        prog = self.kwargs.get('main_program', None)
         if prog is None:
-            return g_program
+            return default_main_program()
         else:
             return prog
 
     @property
-    def init_program(self):
-        prog = self.kwargs.get('init_program', None)
+    def startup_program(self):
+        prog = self.kwargs.get('startup_program', None)
         if prog is None:
-            return g_init_program
+            return default_startup_program()
         else:
             return prog
 
     def append_op(self, *args, **kwargs):
-        return self.program.current_block().append_op(*args, **kwargs)
+        return self.main_program.current_block().append_op(*args, **kwargs)
 
     def multiple_input(self, input_param_name='input'):
         inputs = self.kwargs.get(input_param_name, [])
@@ -61,30 +61,15 @@ class LayerHelper(object):
 
     @property
     def param_attr(self):
-        default = {'name': None, 'initializer': UniformInitializer()}
-        actual = self.kwargs.get('param_attr', None)
-        if actual is None:
-            actual = default
-        for default_field in default.keys():
-            if default_field not in actual:
-                actual[default_field] = default[default_field]
-        return actual
+        return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
 
+    @property
     def bias_attr(self):
-        default = {'name': None, 'initializer': ConstantInitializer()}
-        bias_attr = self.kwargs.get('bias_attr', None)
-        if bias_attr is True:
-            bias_attr = default
-
-        if isinstance(bias_attr, dict):
-            for default_field in default.keys():
-                if default_field not in bias_attr:
-                    bias_attr[default_field] = default[default_field]
-        return bias_attr
+        return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
 
     def multiple_param_attr(self, length):
         param_attr = self.param_attr
-        if isinstance(param_attr, dict):
+        if isinstance(param_attr, ParamAttr):
             param_attr = [param_attr]
 
         if len(param_attr) != 1 and len(param_attr) != length:
@@ -107,77 +92,100 @@ class LayerHelper(object):
         dtype = None
         for each in inputs:
             if dtype is None:
-                dtype = each.data_type
-            elif dtype != each.data_type:
+                dtype = each.dtype
+            elif dtype != each.dtype:
                 raise ValueError("Data Type mismatch")
         return dtype
 
-    def create_parameter(self, attr, shape, dtype, suffix='w',
-                         initializer=None):
+    def create_parameter(self,
+                         attr,
+                         shape,
+                         dtype,
+                         is_bias=False,
+                         default_initializer=None):
         # Deepcopy the attr so that parameters can be shared in program
-        attr_copy = copy.deepcopy(attr)
-        if initializer is not None:
-            attr_copy['initializer'] = initializer
-        if attr_copy['name'] is None:
-            attr_copy['name'] = unique_name(".".join([self.name, suffix]))
-        self.init_program.global_block().create_parameter(
-            dtype=dtype, shape=shape, **attr_copy)
-        return self.program.global_block().create_parameter(
-            name=attr_copy['name'], dtype=dtype, shape=shape)
+        assert isinstance(attr, ParamAttr)
+        suffix = 'b' if is_bias else 'w'
+
+        if default_initializer is None:
+            if is_bias:
+                attr.set_default_bias_initializer()
+            else:
+                attr.set_default_param_initializer()
+        else:
+            attr.set_default_initializer(default_initializer)
+        if attr.name is None:
+            attr.name = unique_name(".".join([self.name, suffix]))
+
+        self.startup_program.global_block().create_parameter(
+            dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
+        return self.main_program.global_block().create_parameter(
+            dtype=dtype, shape=shape, **attr.to_kwargs())
+
+    def get_parameter(self, name):
+        param = self.main_program.global_block().var(name)
+        if not isinstance(param, Parameter):
+            raise ValueError("no Parameter name %s found" % name)
+        return param
 
     def create_tmp_variable(self, dtype):
-        return self.program.current_block().create_var(
+        return self.main_program.current_block().create_var(
             name=unique_name(".".join([self.name, 'tmp'])),
             dtype=dtype,
             persistable=False)
 
     def create_variable(self, *args, **kwargs):
-        return self.program.current_block().create_var(*args, **kwargs)
+        return self.main_program.current_block().create_var(*args, **kwargs)
 
     def create_global_variable(self, persistable=False, *args, **kwargs):
-        return self.program.global_block().create_var(
+        return self.main_program.global_block().create_var(
             *args, persistable=persistable, **kwargs)
 
     def set_variable_initializer(self, var, initializer):
         assert isinstance(var, Variable)
-        self.init_program.global_block().create_var(
+        self.startup_program.global_block().create_var(
             name=var.name,
             type=var.type,
-            dtype=var.data_type,
+            dtype=var.dtype,
             shape=var.shape,
             persistable=True,
             initializer=initializer)
 
-    def append_bias_op(self, input_var, num_flatten_dims=None):
+    @property
+    def to_kwargs(self):
+        return {
+            'main_program': self.main_program,
+            'startup_program': self.startup_program
+        }
+
+    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
         """
-        Append bias operator and return its output. If the user does not set 
+        Append bias operator and return its output. If the user does not set
         bias_attr, append_bias_op will return input_var
-         
-        :param input_var: the input variable. The len(input_var.shape) is larger
-        or equal than 2.
-        :param num_flatten_dims: The input tensor will be flatten as a matrix 
-        when adding bias.
-        `matrix.shape = product(input_var.shape[0:num_flatten_dims]), product(
-                input_var.shape[num_flatten_dims:])`
-        """
-        if num_flatten_dims is None:
-            num_flatten_dims = self.kwargs.get('num_flatten_dims', None)
-            if num_flatten_dims is None:
-                num_flatten_dims = 1
 
-        size = list(input_var.shape[num_flatten_dims:])
-        bias_attr = self.bias_attr()
+        :param input_var: the input variable. The len(input_var.shape) is
+        larger or equal than 2.
+        :bias_initializer: an instance of a subclass of Initializer used to
+        initialize the bias
+        :param dim_start:
+        :param dim_end: the shape of the bias will be
+        input_var.shape[dim_start:dim_end]. The bias is broadcasted to other
+        dimensions and added to input_var to get the output
+        """
+        size = list(input_var.shape[dim_start:dim_end])
+        bias_attr = self.bias_attr
         if not bias_attr:
             return input_var
 
         b = self.create_parameter(
-            attr=bias_attr, shape=size, dtype=input_var.data_type, suffix='b')
-        tmp = self.create_tmp_variable(dtype=input_var.data_type)
+            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
+        tmp = self.create_tmp_variable(dtype=input_var.dtype)
         self.append_op(
             type='elementwise_add',
             inputs={'X': [input_var],
                     'Y': [b]},
-            outputs={'Out': [tmp]})
+            outputs={'Out': [tmp]},
+            attrs={'axis': dim_start})
         return tmp
 
     def append_activation(self, input_var):
@@ -186,7 +194,7 @@ class LayerHelper(object):
             return input_var
         if isinstance(act, basestring):
             act = {'type': act}
-        tmp = self.create_tmp_variable(dtype=input_var.data_type)
+        tmp = self.create_tmp_variable(dtype=input_var.dtype)
         act_type = act.pop('type')
         self.append_op(
             type=act_type,
@@ -194,3 +202,10 @@ class LayerHelper(object):
             outputs={"Y": [tmp]},
             attrs=act)
         return tmp
+
+    def _get_default_initializer(self, dtype):
+        if dtype is None or dtype_is_floating(dtype) is True:
+            return Xavier()
+        else:
+            # For integer and boolean types, initialize with all zeros
+            return Constant()
diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py
new file mode 100644
index 0000000000..fb444f2d86
--- /dev/null
+++ b/python/paddle/v2/fluid/layers.py
@@ -0,0 +1,2174 @@
+import core
+import proto.framework_pb2 as framework_pb2
+from framework import OpProtoHolder, Variable, Program, Operator
+from initializer import Constant, Normal, Xavier, Initializer
+from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
+import re
+import cStringIO
+from param_attr import ParamAttr
+import contextlib
+
+__all__ = [
+    'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
+    'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim',
+    'batch_norm', 'accuracy', 'split_lod_tensor', 'While'
+]
+
+
+def fc(input,
+       size,
+       num_flatten_dims=1,
+       param_attr=None,
+       bias_attr=None,
+       act=None,
+       name=None,
+       main_program=None,
+       startup_program=None):
+    """
+    Fully Connected Layer.
+
+    Args:
+       input: The input tensor to the function
+       size: The size of the layer
+       num_flatten_dims: Number of columns in input
+       param_attr: The parameters/weights to the FC Layer
+       param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used
+       bias_attr: The bias parameter for the FC layer
+       bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used
+       act: Activation to be applied to the output of FC layer
+       name: Name/alias of the function
+       main_program: Name of the main program that calls this
+       startup_program: Name of the startup program
+
+    This function can take in multiple inputs and performs the Fully Connected
+    function (linear transformation) on top of each of them.
+    So for input x, the output will be : Wx + b. Where W is the parameter,
+    b the bias and x is the input.
+
+    The function also applies an activation (non-linearity) on top of the
+    output, if activation is passed in the input.
+
+    All the input variables of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    """
+    helper = LayerHelper('fc', **locals())
+
+    dtype = helper.input_dtype()
+
+    mul_results = []
+    for input_var, param_attr in helper.iter_inputs_and_params():
+        input_shape = input_var.shape
+        param_shape = [
+            reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
+        ] + [size]
+        w = helper.create_parameter(
+            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
+        tmp = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="mul",
+            inputs={
+                "X": input_var,
+                "Y": w,
+            },
+            outputs={"Out": tmp},
+            attrs={'x_num_col_dims': num_flatten_dims,
+                   'y_num_col_dims': 1})
+        mul_results.append(tmp)
+
+    # sum
+    if len(mul_results) == 1:
+        pre_bias = mul_results[0]
+    else:
+        pre_bias = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
+    # add bias
+    pre_activation = helper.append_bias_op(pre_bias)
+    # add activation
+    return helper.append_activation(pre_activation)
+
+
+def embedding(input,
+              size,
+              is_sparse=False,
+              param_attr=None,
+              dtype='float32',
+              main_program=None,
+              startup_program=None):
+    """
+    Embedding Layer.
+
+    Args:
+       param_initializer:
+       input: The input to the function
+       size: The size of the layer
+       is_sparse: A flag that decleares whether the input is sparse
+       param_attr: Parameters for this layer
+       dtype: The type of data : float32, float_16, int etc
+       main_program: Name of the main program that calls this
+       startup_program: Name of the startup program
+
+    This function can take in the input (which is a vector of IDs) and
+    performs a lookup in the lookup_table using these IDs, to result into
+    the embedding of each ID in the input.
+
+    All the input variables of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    """
+
+    helper = LayerHelper('embedding', **locals())
+    w = helper.create_parameter(
+        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
+    tmp = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='lookup_table',
+        inputs={'Ids': input,
+                'W': w},
+        outputs={'Out': tmp},
+        attrs={'is_sparse': is_sparse})
+    return tmp
+
+
+# TODO(qijun): expose H0 and C0
+def dynamic_lstm(input,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_peepholes=True,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 cell_activation='tanh',
+                 candidate_activation='tanh',
+                 dtype='float32',
+                 main_program=None,
+                 startup_program=None):
+    helper = LayerHelper('lstm', **locals())
+    size = size / 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
+    bias_size = [1, 7 * size]
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    hidden = helper.create_tmp_variable(dtype)
+    cell = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstm',
+        inputs={'Input': input,
+                'Weight': weight,
+                'Bias': bias},
+        outputs={
+            'Hidden': hidden,
+            'Cell': cell,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation
+        })
+    return hidden, cell
+
+
+def gru_unit(input,
+             hidden,
+             size,
+             weight=None,
+             bias=None,
+             activation='tanh',
+             gate_activation='sigmoid',
+             main_program=None,
+             startup_program=None):
+    """
+    GRUUnit Operator implements partial calculations of the GRU unit as following:
+
+    $$
+    update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+    reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+    output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+    output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+    $$
+
+    which is same as one time step of GRU Operator.
+
+    @note To implement the complete GRU unit, fully-connected operator must be
+    used before to feed xu, xr and xc as the Input of GRUUnit operator.
+
+    TODO(ChunweiYan) add more document here
+    """
+    activation_dict = dict(
+        identity=0,
+        sigmoid=1,
+        tanh=2,
+        relu=3, )
+    activation = activation_dict[activation]
+    gate_activation = activation_dict[gate_activation]
+
+    helper = LayerHelper('gru_unit', **locals())
+    dtype = helper.input_dtype()
+    size = size / 3
+
+    # create weight
+    if weight is None:
+        weight = helper.create_parameter(
+            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+
+    # create bias
+    if bias is None:
+        bias_size = [1, 3 * size]
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    gate = helper.create_tmp_variable(dtype)
+    reset_hidden_pre = helper.create_tmp_variable(dtype)
+    updated_hidden = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='gru_unit',
+        inputs={'Input': input,
+                'HiddenPrev': hidden,
+                'Weight': weight},
+        outputs={
+            'Gate': gate,
+            'ResetHiddenPrev': reset_hidden_pre,
+            'Hidden': updated_hidden,
+        },
+        attrs={
+            'activation': 0,
+            'gate_activation': 1,
+        })
+
+    return updated_hidden, reset_hidden_pre, gate
+
+
+def data(name,
+         shape,
+         append_batch_size=True,
+         dtype='float32',
+         lod_level=0,
+         type=core.VarDesc.VarType.LOD_TENSOR,
+         main_program=None,
+         startup_program=None,
+         stop_gradient=True):
+    """
+    Data Layer.
+
+    Args:
+       name: The name/alias of the function
+       shape: Tuple declaring the shape.
+       append_batch_size: Whether or not to append the data as a batch.
+       dtype: The type of data : float32, float_16, int etc
+       type: The output type. By default it is LOD_TENSOR.
+       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
+       main_program: Name of the main program that calls this
+       startup_program: Name of the startup program
+       stop_gradient: A boolean that mentions whether gradient should flow.
+
+    This function takes in input and based on whether data has
+    to be returned back as a minibatch, it creates the global variable using
+    the helper functions. The global variables can be accessed by all the
+    following operations and layers in the graph.
+
+    All the input variables of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    """
+    helper = LayerHelper('data', **locals())
+    shape = list(shape)
+    for i in xrange(len(shape)):
+        if shape[i] is None:
+            shape[i] = -1
+            append_batch_size = False
+        elif shape[i] < 0:
+            append_batch_size = False
+
+    if append_batch_size:
+        shape = [-1] + shape  # append batch size as -1
+
+    return helper.create_global_variable(
+        name=name,
+        shape=shape,
+        dtype=dtype,
+        type=type,
+        stop_gradient=stop_gradient,
+        lod_level=lod_level)
+
+
+def create_tensor(dtype, name=None, main_program=None, startup_program=None):
+    helper = LayerHelper("create_tensor", **locals())
+    return helper.create_variable(name=helper.name, dtype=dtype)
+
+
+def _convert_(name):
+    """
+    Formatting.
+
+    Args:
+       name: The name/alias
+
+    This function takes in a name and converts it to a standard format of
+    group1_group2. Where as per the regular expression, group1 can have
+    alphabets and numbers and group2 has capital alphabets.
+
+    """
+    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
+    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
+
+
+def _generate_doc_string_(op_proto):
+    """
+    Generate docstring by OpProto
+
+    Args:
+        op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
+
+    Returns:
+        str: the document string
+    """
+
+    def _type_to_str_(tp):
+        return framework_pb2.AttrType.Name(tp)
+
+    if not isinstance(op_proto, framework_pb2.OpProto):
+        raise TypeError("OpProto should be `framework_pb2.OpProto`")
+
+    buf = cStringIO.StringIO()
+    buf.write(op_proto.comment)
+    buf.write('\nArgs:\n')
+    for each_input in op_proto.inputs:
+        line_begin = '    {0}: '.format(_convert_(each_input.name))
+        buf.write(line_begin)
+        buf.write(each_input.comment)
+        buf.write('\n')
+        buf.write(' ' * len(line_begin))
+        buf.write('Duplicable: ')
+        buf.write(str(each_input.duplicable))
+        buf.write('  Optional: ')
+        buf.write(str(each_input.dispensable))
+        buf.write('\n')
+
+    for each_attr in op_proto.attrs:
+        buf.write('    ')
+        buf.write(each_attr.name)
+        buf.write(' (')
+        buf.write(_type_to_str_(each_attr.type))
+        buf.write('): ')
+        buf.write(each_attr.comment)
+        buf.write('\n')
+
+    if len(op_proto.outputs) != 0:
+        buf.write('\nReturns:\n')
+        buf.write('    ')
+        for each_opt in op_proto.outputs:
+            if not each_opt.intermediate:
+                break
+        buf.write(each_opt.comment)
+
+    return buf.getvalue()
+
+
+def _create_op_func_(op_type):
+    """
+    Create an Operator for a Function.
+
+    Args:
+       op_type: The name of the operator to be created
+
+    This function takes in the operator type (sigmoid, mean , average etc) and
+    creates the operator functionality.
+
+    """
+    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
+    not_intermediate_outputs = \
+        filter(lambda output: not output.intermediate, op_proto.outputs)
+    intermediate_outputs = \
+        filter(lambda output: output.intermediate, op_proto.outputs)
+
+    if len(not_intermediate_outputs) != 1:
+        raise ValueError("Only one non intermediate output operator can be",
+                         "automatically generated")
+
+    if not_intermediate_outputs[0].duplicable:
+        raise ValueError(
+            "Only non duplicable op can be automatically generated")
+
+    for output in intermediate_outputs:
+        if output.duplicable:
+            raise ValueError("The op can be automatically generated only when ",
+                             "all intermediate ops are not duplicable")
+
+    o_name = not_intermediate_outputs[0].name
+    intermediate_output_names = [output.name for output in intermediate_outputs]
+
+    def infer_and_check_dtype(op_proto, **kwargs):
+        """
+        This function performs the sanity check for dtype and
+        instance type.
+        """
+        dtype = None
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
+            for each in val:
+                if not isinstance(each, Variable):
+                    raise ValueError("input of {0} must be variable".format(
+                        op_type))
+
+                if dtype is None:
+                    dtype = each.dtype
+                elif dtype != each.dtype:
+                    raise ValueError(
+                        "operator {0} must input same dtype".format(op_type))
+
+        return dtype
+
+    def func(**kwargs):
+        helper = LayerHelper(op_type, **kwargs)
+
+        dtype = infer_and_check_dtype(op_proto, **kwargs)
+
+        inputs = dict()
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
+            inputs[ipt.name] = val
+
+        outputs = dict()
+        out = helper.create_tmp_variable(dtype=dtype)
+        outputs[o_name] = [out]
+        for name in intermediate_output_names:
+            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
+        return helper.append_activation(out)
+
+    func.__name__ = op_type
+    globals()[op_type] = func
+    func.__doc__ = _generate_doc_string_(op_proto)
+    global __all__
+    __all__.append(op_type)
+
+
+_create_op_func_('mean')
+_create_op_func_('mul')
+_create_op_func_('elementwise_add')
+_create_op_func_('elementwise_div')
+_create_op_func_('dropout')
+_create_op_func_('reshape')
+_create_op_func_('sigmoid')
+_create_op_func_('scale')
+_create_op_func_('reshape')
+_create_op_func_('transpose')
+_create_op_func_('sigmoid_cross_entropy_with_logits')
+
+
+def cast(x, dtype, main_program=None):
+    """
+    This function takes in the input with input_dtype
+    and casts it to the output_dtype as the output.
+    """
+    helper = LayerHelper('cast', **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='cast',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={'in_dtype': x.dtype,
+               'out_dtype': out.dtype})
+    return out
+
+
+def concat(input, axis, main_program=None, startup_program=None):
+    """
+    This function concats the input along the axis mentioned
+    and returns that as the output.
+    """
+    helper = LayerHelper('concat', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='concat',
+        inputs={'X': input},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
+def sums(input, out=None, main_program=None, startup_program=None):
+    """
+    This function takes in the input and performs the sum operation on it
+    and returns that as the output.
+    """
+    helper = LayerHelper('sum', **locals())
+    if out is None:
+        out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
+    return out
+
+
+def linear_chain_crf(input,
+                     label,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('linear_chain_crf', **locals())
+    size = input.shape[1]
+    transition = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[size + 2, size],
+        dtype=helper.input_dtype())
+    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
+    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='linear_chain_crf',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={
+            "Alpha": [alpha],
+            "EmissionExps": [emission_exps],
+            "TransitionExps": transition_exps,
+            "LogLikelihood": log_likelihood
+        })
+
+    return log_likelihood
+
+
+def crf_decoding(input,
+                 param_attr,
+                 label=None,
+                 main_program=None,
+                 startup_program=None):
+    helper = LayerHelper('crf_decoding', **locals())
+    transition = helper.get_parameter(param_attr.name)
+    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='crf_decoding',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={"ViterbiPath": [viterbi_path]})
+
+    return viterbi_path
+
+
+def assign(input, output, main_program=None, startup_program=None):
+    helper = LayerHelper('assign', **locals())
+    helper.append_op(
+        type='scale',
+        inputs={'X': [input]},
+        outputs={'Out': [output]},
+        attrs={'scale': 1.0})
+    return output
+
+
+def split_lod_tensor(input,
+                     mask,
+                     level=0,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('split_lod_tensor', **locals())
+    out_true = helper.create_tmp_variable(dtype=input.dtype)
+    out_false = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='split_lod_tensor',
+        inputs={
+            'X': input,
+            'Mask': mask,
+        },
+        outputs={'OutTrue': out_true,
+                 'OutFalse': out_false},
+        attrs={'level': level})
+    return out_true, out_false
+
+
+def merge_lod_tensor(in_true,
+                     in_false,
+                     x,
+                     mask,
+                     level=0,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('merge_lod_tensor', **locals())
+    out = helper.create_tmp_variable(dtype=in_true.dtype)
+    helper.append_op(
+        type='merge_lod_tensor',
+        inputs={'X': x,
+                'Mask': mask,
+                'InTrue': in_true,
+                'InFalse': in_false},
+        outputs={'Out': out},
+        attrs={'level': level})
+    return out
+
+
+def cos_sim(X, Y, **kwargs):
+    """
+    This function performs the cosine similarity between two tensors
+    X and Y and returns that as the output.
+    """
+    helper = LayerHelper('cos_sim', **kwargs)
+    out = helper.create_tmp_variable(dtype=X.dtype)
+    xnorm = helper.create_tmp_variable(dtype=X.dtype)
+    ynorm = helper.create_tmp_variable(dtype=X.dtype)
+    helper.append_op(
+        type='cos_sim',
+        inputs={'X': [X],
+                'Y': [Y]},
+        outputs={'Out': [out],
+                 'XNorm': [xnorm],
+                 'YNorm': [ynorm]})
+    return out
+
+
+def cross_entropy(input, label, **kwargs):
+    """
+    This function computes cross_entropy using the input and label.
+    """
+    helper = LayerHelper('cross_entropy', **kwargs)
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='cross_entropy',
+        inputs={'X': [input],
+                'Label': [label]},
+        outputs={'Y': [out]},
+        attrs=kwargs)
+    return out
+
+
+def square_error_cost(input, label, **kwargs):
+    """
+    This functions returns the squared error cost using the input and label.
+    The output is appending the op to do the above.
+    """
+    helper = LayerHelper('square_error_cost', **kwargs)
+    minus_out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='elementwise_sub',
+        inputs={'X': [input],
+                'Y': [label]},
+        outputs={'Out': [minus_out]})
+
+    square_out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='square', inputs={'X': [minus_out]}, outputs={'Y': [square_out]})
+    return square_out
+
+
+def accuracy(input, label, k=1, correct=None, total=None, **kwargs):
+    """
+    This function computes the accuracy using the input and label.
+    The output is the top_k inputs and their indices.
+    """
+    helper = LayerHelper("accuracy", **kwargs)
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [topk_out],
+                 "Indices": [topk_indices]},
+        attrs={"k": k})
+    acc_out = helper.create_tmp_variable(dtype="float32")
+    if correct is None:
+        correct = helper.create_tmp_variable(dtype="int64")
+    if total is None:
+        total = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="accuracy",
+        inputs={
+            "Out": [topk_out],
+            "Indices": [topk_indices],
+            "Label": [label]
+        },
+        outputs={
+            "Accuracy": [acc_out],
+            "Correct": [correct],
+            "Total": [total],
+        })
+    return acc_out
+
+
+def chunk_eval(input,
+               label,
+               chunk_scheme,
+               num_chunk_types,
+               excluded_chunk_types=None,
+               **kwargs):
+    """
+    This function computes the accuracy using the input and label.
+    The output is the top_k inputs and their indices.
+    """
+    helper = LayerHelper("chunk_eval", **kwargs)
+
+    # prepare output
+    precision = helper.create_tmp_variable(dtype="float32")
+    recall = helper.create_tmp_variable(dtype="float32")
+    f1_score = helper.create_tmp_variable(dtype="float32")
+
+    helper.append_op(
+        type="chunk_eval",
+        inputs={"Inference": [input],
+                "Label": [label]},
+        outputs={
+            "Precision": [precision],
+            "Recall": [recall],
+            "F1-Score": [f1_score]
+        },
+        attrs={
+            "num_chunk_types": num_chunk_types,
+            'chunk_scheme': chunk_scheme,
+            'excluded_chunk_types': excluded_chunk_types or []
+        })
+    return precision, recall, f1_score
+
+
+def sequence_conv(input,
+                  num_filters,
+                  filter_size=3,
+                  filter_stride=1,
+                  padding=None,
+                  bias_attr=None,
+                  param_attr=None,
+                  act=None,
+                  main_program=None,
+                  startup_program=None):
+    """
+    This function creates the op for sequence_conv, using the inputs and
+    other convolutional configurations for the filters and stride as given
+    in the input parameters to the function.
+    """
+
+    # FIXME(dzh) : want to unify the argument of python layer
+    # function. So we ignore some unecessary attributes.
+    # such as, padding_trainable, context_start.
+
+    helper = LayerHelper('sequence_conv', **locals())
+    dtype = helper.input_dtype()
+    filter_shape = [filter_size * input.shape[1], num_filters]
+    filter = helper.create_parameter(
+        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='sequence_conv',
+        inputs={
+            'X': [input],
+            'Filter': [filter],
+        },
+        outputs={"Out": pre_bias},
+        attrs={
+            'contextStride': filter_stride,
+            'contextStart': -int(filter_size / 2),
+            'contextLength': filter_size
+        })
+    pre_act = helper.append_bias_op(pre_bias)
+    return helper.append_activation(pre_act)
+
+
+def conv2d(input,
+           num_filters,
+           filter_size,
+           stride=[1, 1],
+           padding=None,
+           groups=None,
+           param_attr=None,
+           bias_attr=None,
+           act=None,
+           name=None,
+           main_program=None,
+           startup_program=None):
+    """
+    This function creates the op for a 2-dimensional Convolution.
+    This is performed using the parameters of filters(size, dimensionality etc)
+    , stride and other configurations for a Convolution operation.
+    This funciton can also append an activation on top of the
+    conv-2d output, if mentioned in the input parameters.
+    """
+
+    helper = LayerHelper('conv2d', **locals())
+    dtype = helper.input_dtype()
+
+    num_channels = input.shape[1]
+    if groups is None:
+        num_filter_channels = num_channels
+    else:
+        if num_channels % groups != 0:
+            raise ValueError("num_channels must be divisible by groups.")
+        num_filter_channels = num_channels / groups
+
+    if isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+    if isinstance(stride, int):
+        stride = [stride, stride]
+    if isinstance(padding, int):
+        padding = [padding, padding]
+
+    input_shape = input.shape
+    filter_shape = [num_filters, num_filter_channels] + filter_size
+
+    def _get_default_param_initializer():
+        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+        return Normal(0.0, std, 0)
+
+    filter = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=filter_shape,
+        dtype=dtype,
+        default_initializer=_get_default_param_initializer())
+
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='conv2d_cudnn',
+        inputs={
+            'Input': input,
+            'Filter': filter,
+        },
+        outputs={"Output": pre_bias},
+        attrs={'strides': stride,
+               'paddings': padding,
+               'groups': groups})
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+
+    return helper.append_activation(pre_act)
+
+
+def sequence_pool(input, pool_type, **kwargs):
+    """
+    This function add the operator for sequence pooling.
+    This is applied on top of the input using pool_type mentioned
+    in the parameters.
+    """
+    helper = LayerHelper('sequence_pool', input=input, **kwargs)
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+    max_index = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="sequence_pool",
+        inputs={"X": input},
+        outputs={"Out": pool_out,
+                 "MaxIndex": max_index},
+        attrs={"pooltype": pool_type.upper()})
+
+    return pool_out
+
+
+def pool2d(input,
+           pool_size,
+           pool_type,
+           pool_stride=[1, 1],
+           pool_padding=[0, 0],
+           global_pooling=False,
+           main_program=None,
+           startup_program=None):
+    """
+    This function adds the operator for pooling in 2 dimensions, using the
+    pooling configurations mentioned in input parameters.
+    """
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
+            str(pool_type))
+    if isinstance(pool_size, int):
+        pool_size = [pool_size, pool_size]
+    if isinstance(pool_stride, int):
+        pool_stride = [pool_stride, pool_stride]
+    if isinstance(pool_padding, int):
+        pool_padding = [pool_padding, pool_padding]
+
+    helper = LayerHelper('pool2d', **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="pool2d",
+        inputs={"X": input},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "global_pooling": global_pooling,
+            "strides": pool_stride,
+            "paddings": pool_padding
+        })
+
+    return pool_out
+
+
+def batch_norm(input,
+               act=None,
+               is_test=False,
+               momentum=0.9,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               data_layout='NCHW',
+               main_program=None,
+               startup_program=None):
+    """
+    This function helps create an operator to implement
+    the BatchNorm layer using the configurations from the input parameters.
+    """
+    helper = LayerHelper('batch_norm', **locals())
+    dtype = helper.input_dtype()
+
+    input_shape = input.shape
+    if data_layout == 'NCHW':
+        channel_num = input_shape[1]
+    else:
+        if data_layout == 'NHWC':
+            channel_num = input_shape[-1]
+        else:
+            raise ValueError("unsupported data layout:" + data_layout)
+
+    param_shape = [channel_num]
+
+    # create parameter
+    scale = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=param_shape,
+        dtype=dtype,
+        default_initializer=Constant(1.0))
+
+    bias = helper.create_parameter(
+        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
+
+    mean = helper.create_global_variable(
+        dtype=input.dtype, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
+
+    variance = helper.create_global_variable(
+        dtype=input.dtype, shape=param_shape, persistable=True)
+    helper.set_variable_initializer(var=variance, initializer=Constant(1.0))
+
+    # create output
+    # mean and mean_out share the same memory
+    mean_out = mean
+    # variance and variance out share the same memory
+    variance_out = variance
+    saved_mean = helper.create_tmp_variable(dtype)
+    saved_variance = helper.create_tmp_variable(dtype)
+
+    batch_norm_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="batch_norm",
+        inputs={
+            "X": input,
+            "Scale": scale,
+            "Bias": bias,
+            "Mean": mean,
+            "Variance": variance
+        },
+        outputs={
+            "Y": batch_norm_out,
+            "MeanOut": mean_out,
+            "VarianceOut": variance_out,
+            "SavedMean": saved_mean,
+            "SavedVariance": saved_variance
+        },
+        attrs={"momentum": momentum,
+               "epsilon": epsilon,
+               "is_test": is_test})
+
+    return helper.append_activation(batch_norm_out)
+
+
+def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+    helper = LayerHelper('beam_search_decode', **locals())
+    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
+    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
+
+    helper.append_op(
+        type="beam_search_decode",
+        inputs={"Ids": ids,
+                "Scores": scores},
+        outputs={
+            "SentenceIds": sentence_ids,
+            "SentenceScores": sentence_scores
+        })
+
+    return sentence_ids, sentence_scores
+
+
+class BlockGuard(object):
+    """
+    BlockGuard class.
+
+    BlockGuard class is used to create a sub-block in a program by
+    using the Python `with` keyword.
+    """
+
+    def __init__(self, main_program):
+        if not isinstance(main_program, Program):
+            raise TypeError("BlockGuard takes a program")
+        self.main_program = main_program
+
+    def __enter__(self):
+        self.main_program.create_block()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.main_program.rollback()
+        if exc_type is not None:
+            return False  # re-raise exception
+        return True
+
+
+class StaticRNNGuard(BlockGuard):
+    """
+    StaticRNNGuard class.
+
+    StaticRNNGuard class is used to create a StaticRNN block in a program.
+    """
+
+    def __init__(self, rnn):
+        if not isinstance(rnn, StaticRNN):
+            raise TypeError("StaticRNNGuard takes a StaticRNN")
+        super(StaticRNNGuard, self).__init__(rnn.helper.main_program)
+        self.rnn = rnn
+
+    def __enter__(self):
+        self.rnn.status = StaticRNN.IN_RNN_BLOCK
+        return super(StaticRNNGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
+        self.rnn.complete_rnn_op()
+        return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class StaticRNNMemoryLink(object):
+    """
+    StaticRNNMemoryLink class.
+
+    Args:
+        init: the initial variable for Memory
+        init: Variable
+        pre_mem: the memory variable in previous time step
+        pre_mem: Variable
+        mem: the memory variable in current time step
+        mem: Variable
+
+    StaticRNNMemoryLink class is used to create a link between two
+    memory cells of a StaticRNN.
+    """
+
+    def __init__(self, init, pre_mem, mem=None):
+        self.init = init
+        self.pre_mem = pre_mem
+        self.mem = mem
+
+
+class StaticRNN(object):
+    """
+    StaticRNN class.
+
+    StaticRNN class is used to create a StaticRNN. The RNN will have its
+    own parameters like inputs, outputs, memories, status and length.
+    """
+    BEFORE_RNN_BLOCK = 0
+    IN_RNN_BLOCK = 1
+    AFTER_RNN_BLOCK = 2
+
+    def __init__(self, name=None, main_program=None):
+        self.helper = LayerHelper(
+            "static_rnn", name=name, main_program=main_program)
+        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
+        self.inputs = []  # input variable list in current block
+        self.outputs = []  # output variable list in parent block
+        self.status = StaticRNN.BEFORE_RNN_BLOCK  # status flag.
+        # sequence length, since it is a static RNN, sequence length are fixed.
+        self.seq_len = None
+
+    def step(self):
+        return StaticRNNGuard(self)
+
+    def _assert_in_rnn_block_(self, method):
+        if self.status != StaticRNN.IN_RNN_BLOCK:
+            raise ValueError("You must invoke {0} in rnn block".format(method))
+
+    def memory(self,
+               init=None,
+               shape=None,
+               batch_ref=None,
+               init_value=0.0,
+               init_batch_dim_idx=0,
+               ref_batch_dim_idx=1):
+        """
+        Args:
+            init: boot memory, if not set, a shape, batch_ref must be provided
+            shape: shape of the boot memory
+            batch_ref: batch size reference variable
+            init_value: the init value of boot memory
+            init_batch_dim_idx: the index of batch size in init's dimension
+            ref_batch_dim_idx: the index of batch size in batch_ref's dimension
+        """
+        self._assert_in_rnn_block_('memory')
+        if init is None:
+            if shape is None or batch_ref is None:
+                raise ValueError(
+                    "if init is None, memory at least need shape and batch_ref")
+            parent_block = self.parent_block()
+            var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
+            boot_var = parent_block.create_var(
+                name=var_name,
+                shape=shape,
+                dtype=batch_ref.dtype,
+                persistable=False)
+
+            parent_block.append_op(
+                type="fill_constant_batch_size_like",
+                inputs={'Input': [batch_ref]},
+                outputs={'Out': [boot_var]},
+                attrs={
+                    'value': init_value,
+                    'shape': boot_var.shape,
+                    'dtype': boot_var.dtype,
+                    'input_dim_idx': ref_batch_dim_idx,
+                    'output_dim_idx': init_batch_dim_idx
+                })
+
+            return self.memory(init=boot_var)
+        else:
+            pre_mem = self.helper.create_variable(
+                name=unique_name("@".join([self.helper.name, "mem"])),
+                dtype=init.dtype,
+                shape=init.shape)
+            self.memories[pre_mem.name] = StaticRNNMemoryLink(
+                init=init, pre_mem=pre_mem)
+            return pre_mem
+
+    def step_input(self, x):
+        self._assert_in_rnn_block_('step_input')
+        if not isinstance(x, Variable):
+            raise TypeError("step input takes a Variable")
+        if self.seq_len is None:
+            self.seq_len = x.shape[0]
+        elif self.seq_len != x.shape[0]:
+            raise ValueError("Static RNN only take fix seq_len input")
+
+        ipt = self.helper.create_variable(
+            name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type)
+        self.inputs.append(ipt)
+        return ipt
+
+    def step_output(self, o):
+        self._assert_in_rnn_block_('step_output')
+        if not isinstance(o, Variable):
+            raise TypeError("step output takes a Variable")
+
+        tmp_o = self.helper.create_tmp_variable(dtype=o.dtype)
+        self.helper.append_op(
+            type='rnn_memory_helper',
+            inputs={'X': [o]},
+            outputs={'Out': tmp_o},
+            attrs={'dtype': o.dtype})
+
+        out_var = self.parent_block().create_var(
+            name=tmp_o.name,
+            shape=[self.seq_len] + list(tmp_o.shape),
+            dtype=tmp_o.dtype)
+
+        self.outputs.append(out_var)
+
+    def output(self, *outputs):
+        for each in outputs:
+            self.step_output(each)
+
+    def update_memory(self, mem, var):
+        if not isinstance(mem, Variable) or not isinstance(var, Variable):
+            raise TypeError("update memory should take variables")
+        self.memories[mem.name].mem = var
+
+    def parent_block(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+        return parent_block
+
+    def __call__(self, *args, **kwargs):
+        if self.status != StaticRNN.AFTER_RNN_BLOCK:
+            raise ValueError("RNN output can only be retrieved after rnn block")
+        if len(self.outputs) == 0:
+            raise ValueError("RNN has no output")
+        elif len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def complete_rnn_op(self):
+        main_program = self.helper.main_program
+        rnn_block = main_program.current_block()
+        parent_block = self.parent_block()
+
+        local_inputs = set()
+
+        for op in rnn_block.ops:
+            assert isinstance(op, Operator)
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    local_inputs.add(out_var_name)
+
+        for var in self.inputs:
+            local_inputs.add(var.name)
+        for m in self.memories:
+            local_inputs.add(m)
+
+        params = list()
+        for op in rnn_block.ops:
+            assert isinstance(op, Operator)
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in local_inputs:
+                        params.append(in_var_name)
+
+        parameters = [parent_block.var(name) for name in params]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        inlinks = [parent_block.var(i.name) for i in self.inputs]
+        outlinks = self.outputs
+
+        boot_memories = []
+        pre_memories = []
+        memories = []
+        for _, mem in self.memories.iteritems():
+            boot_memories.append(mem.init)
+            pre_memories.append(mem.pre_mem.name)
+            mem_var = rnn_block.var(mem.mem.name)
+            assert isinstance(mem_var, Variable)
+            new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype)
+
+            rnn_block.append_op(
+                type='rnn_memory_helper',
+                inputs={'X': [mem_var]},
+                outputs={'Out': [new_mem]},
+                attrs={'dtype': mem_var.dtype})
+
+            memories.append(new_mem.name)
+
+        parent_block.append_op(
+            type='recurrent',
+            inputs={
+                'inputs': inlinks,
+                'initial_states': boot_memories,
+                'parameters': parameters
+            },
+            outputs={'outputs': outlinks,
+                     'step_scopes': [step_scope]},
+            attrs={
+                'ex_states': pre_memories,
+                'states': memories,
+                'step_block': rnn_block
+            })
+
+
+class WhileGuard(BlockGuard):
+    def __init__(self, while_op):
+        if not isinstance(while_op, While):
+            raise TypeError("WhileGuard takes a while op")
+        super(WhileGuard, self).__init__(while_op.helper.main_program)
+        self.while_op = while_op
+
+    def __enter__(self):
+        self.while_op.status = While.IN_WHILE_BLOCK
+        return super(WhileGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        self.while_op.status = While.AFTER_WHILE_BLOCK
+        self.while_op.complete()
+        return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class While(object):
+    BEFORE_WHILE_BLOCK = 0
+    IN_WHILE_BLOCK = 1
+    AFTER_WHILE_BLOCK = 2
+
+    def __init__(self, cond, name=None, main_program=None):
+        self.helper = LayerHelper("while", name=name, main_program=main_program)
+        self.status = While.BEFORE_WHILE_BLOCK
+        if not isinstance(cond, Variable):
+            raise TypeError("condition should be a variable")
+        assert isinstance(cond, Variable)
+        if cond.dtype != core.DataType.BOOL:
+            raise TypeError("condition should be a bool variable")
+        if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
+            raise TypeError("condition should be a bool scalar")
+        self.cond_var = cond
+
+    def block(self):
+        return WhileGuard(self)
+
+    def complete(self):
+        main_program = self.helper.main_program
+        while_block = main_program.current_block()
+        parent_block = main_program.block(main_program.current_block()
+                                          .parent_idx)
+
+        inner_outputs = {self.cond_var.name}
+        x_name_list = set()
+        for op in while_block.ops:
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in inner_outputs:
+                        x_name_list.add(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    inner_outputs.add(out_var_name)
+
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in parent_block.vars:
+                out_vars.append(parent_block.var(inner_out_name))
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        parent_block.append_op(
+            type='while',
+            inputs={
+                'X': [parent_block.var(x_name) for x_name in x_name_list],
+                'Condition': [self.cond_var]
+            },
+            outputs={'Out': out_vars,
+                     'StepScopes': [step_scope]},
+            attrs={'step_block': while_block})
+
+
+def lstm(x,
+         c_pre_init,
+         hidden_dim,
+         forget_bias=None,
+         main_program=None,
+         startup_program=None):
+    """
+    This function helps create an operator for the LSTM (Long Short Term
+    Memory) cell that can be used inside an RNN.
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+    rnn = StaticRNN()
+    with rnn.step():
+        c_pre = rnn.memory(init=c_pre_init)
+        x_t = rnn.step_input(x)
+
+        before_fc = concat(
+            input=[x_t, c_pre],
+            axis=1,
+            main_program=main_program,
+            startup_program=startup_program)
+        after_fc = fc(input=before_fc,
+                      size=hidden_dim * 4,
+                      main_program=main_program,
+                      startup_program=startup_program)
+
+        dtype = x.dtype
+        c = helper.create_tmp_variable(dtype)
+        h = helper.create_tmp_variable(dtype)
+
+        helper.append_op(
+            type='lstm_unit',
+            inputs={"X": after_fc,
+                    "C_prev": c_pre},
+            outputs={"C": c,
+                     "H": h},
+            attrs={"forget_bias": forget_bias})
+
+        rnn.update_memory(c_pre, c)
+        rnn.output(h)
+
+    return rnn()
+
+
+def lod_rank_table(x, level=0, main_program=None):
+    """
+    This function creates an operator for creating a LOD_RANK_TABLE
+    using the input x.
+    """
+    helper = LayerHelper("lod_rank_table", **locals())
+    table = helper.create_variable(
+        type=core.VarDesc.VarType.LOD_RANK_TABLE,
+        name=unique_name("lod_rank_table"))
+    helper.append_op(
+        type='lod_rank_table',
+        inputs={'X': x},
+        outputs={'Out': table},
+        attrs={'level': level})
+    return table
+
+
+def max_sequence_len(rank_table, main_program=None):
+    """
+    This function creates an operator to calculate the length of
+    max seqence through input rank_table(should be a lod_rank_table)
+    """
+    helper = LayerHelper("max_seqence_len", **locals())
+    res = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="max_sequence_len",
+        inputs={"RankTable": rank_table},
+        outputs={"Out": res})
+    return res
+
+
+def topk(input, k, main_program=None, startup_program=None):
+    helper = LayerHelper('topk', **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.data_type)
+    topk_indices = helper.create_tmp_variable(dtype='int64')
+    helper.append_op(
+        type='top_k',
+        inputs={'X': [input]},
+        outputs={'Out': [topk_out],
+                 'Indices': [topk_indices]},
+        attrs={'k': k})
+    return topk_out, topk_indices
+
+
+def lod_tensor_to_array(x, table, main_program=None):
+    """
+    This function creates an operator to convert an LOD_Tensor to
+    an array.
+    """
+    helper = LayerHelper("lod_tensor_to_array", **locals())
+    array = helper.create_variable(
+        name=unique_name("lod_tensor_to_array"),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=x.dtype)
+    helper.append_op(
+        type='lod_tensor_to_array',
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': array})
+    return array
+
+
+def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
+    """
+    This function creates an operator to convert an array to a
+    LOD_Tensor.
+    """
+    helper = LayerHelper("array_to_lod_tensor", **locals())
+    tmp = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="array_to_lod_tensor",
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': tmp})
+    return tmp
+
+
+def fill_constant(shape,
+                  dtype,
+                  value,
+                  out=None,
+                  main_program=None,
+                  startup_program=None):
+    """
+    This function creates a tensor , with shape as mentioned in the input and
+    specified dtype and fills this up with a constant value that
+    comes in the input. It also sets the stop_gradient to be True.
+    """
+    helper = LayerHelper("fill_constant", **locals())
+    if out is None:
+        out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant',
+        inputs={},
+        outputs={'Out': [out]},
+        attrs={'shape': shape,
+               'dtype': out.dtype,
+               'value': float(value)})
+    out.stop_gradient = True
+    return out
+
+
+def fill_constant_batch_size_like(input,
+                                  shape,
+                                  dtype,
+                                  value,
+                                  input_dim_idx=0,
+                                  output_dim_idx=0,
+                                  main_program=None,
+                                  startup_program=None):
+    helper = LayerHelper("fill_constant_batch_size_like", **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': [out]},
+        attrs={
+            'shape': shape,
+            'dtype': out.dtype,
+            'value': float(value),
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx
+        })
+    out.stop_gradient = True
+    return out
+
+
+def ones(shape, dtype, main_program=None):
+    """
+    This function performs the same function as fill_constant() declared above
+    with the constant value being 1.0.
+    """
+    return fill_constant(value=1.0, **locals())
+
+
+def zeros(shape, dtype, main_program=None):
+    """
+    This function performs the same function as fill_constant() declared above
+    with the constant value being 0.0.
+    """
+    return fill_constant(value=0.0, **locals())
+
+
+def increment(x,
+              value=1.0,
+              in_place=True,
+              main_program=None,
+              startup_program=None):
+    """
+    This function creates an operator to increment each value in the input
+    `x` by an amount: `value` as mentioned in the input parameter. This
+    operation is performed in-place by default.
+    """
+    helper = LayerHelper("increment", **locals())
+    if not in_place:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = x
+    helper.append_op(
+        type='increment',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={'step': float(value)})
+    return out
+
+
+def array_write(x, i, array=None, main_program=None, startup_program=None):
+    """
+    This function creates an operator to write the data out as a
+    LOD_TENSOR_ARRAY.
+    """
+    helper = LayerHelper('array_write', **locals())
+    if array is None:
+        array = helper.create_variable(
+            name="{0}.out".format(helper.name),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=x.dtype)
+    helper.append_op(
+        type='write_to_array',
+        inputs={'X': [x],
+                'I': [i]},
+        outputs={'Out': [array]})
+    return array
+
+
+def create_array(dtype, main_program=None):
+    helper = LayerHelper("array", **locals())
+    return helper.create_variable(
+        name="{0}.out".format(helper.name),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=dtype)
+
+
+def less_than(x, y, cond=None, main_program=None, **ignored):
+    helper = LayerHelper("less_than", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+
+    helper.append_op(
+        type='less_than', inputs={'X': [x],
+                                  'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
+def array_read(array, i, main_program=None, startup_program=None):
+    """
+    This function creates an operator to read the data in as a
+    LOD_TENSOR_ARRAY.
+    """
+    helper = LayerHelper('array_read', **locals())
+    if not isinstance(
+            array,
+            Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+        raise TypeError("array should be tensor array vairable")
+    out = helper.create_tmp_variable(dtype=array.dtype)
+    helper.append_op(
+        type='read_from_array',
+        inputs={'X': [array],
+                'I': [i]},
+        outputs={'Out': [out]})
+    return out
+
+
+def shrink_memory(x, i, table, main_program=None, startup_program=None):
+    """
+    This function creates an operator to shrink_rnn_memory using the RankTable
+    as mentioned in the input parameter.
+    """
+    helper = LayerHelper('shrink_memory', **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='shrink_rnn_memory',
+        inputs={'X': [x],
+                'I': [i],
+                'RankTable': [table]},
+        outputs={'Out': [out]},
+        attrs={})
+    return out
+
+
+def array_length(array, main_program=None):
+    """
+    This function creates an operator to find the length of the
+    LOD_TENSOR_ARRAY.
+    """
+    helper = LayerHelper('array_length', **locals())
+    tmp = helper.create_tmp_variable(dtype='int64')
+    tmp.stop_gradient = True
+    helper.append_op(
+        type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
+    return tmp
+
+
+def conv2d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=None,
+                     stride=None,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    """
+    The transpose of conv2d layer.
+
+    This layer is also known as deconvolution layer.
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.  None if use output size to
+            calculate filter_size
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride.
+        param_attr: Parameter Attribute.
+        main_program(Program): the main program
+        startup_program(Program): the startup program
+
+    Returns:
+        Variable: Output image.
+    """
+    helper = LayerHelper("conv2d_transpose", **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv2d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    op_attr = dict()
+
+    if isinstance(padding, int):
+        op_attr['paddings'] = [padding, padding]
+    elif padding is not None:
+        op_attr['paddings'] = padding
+
+    if isinstance(stride, int):
+        op_attr['strides'] = stride
+    elif stride is not None:
+        op_attr['strides'] = stride
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        padding = op_attr.get('paddings', [0, 0])
+        stride = op_attr.get('strides', [1, 1])
+
+        h_in = input.shape[2]
+        w_in = input.shape[3]
+        filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0]
+        filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1]
+        filter_size = [filter_size_h, filter_size_w]
+    elif isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+
+    filter_shape = [input_channel, num_filters] + filter_size
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='conv2d_transpose',
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': out},
+        attrs=op_attr)
+
+    return out
+
+
+class ConditionalBlockGuard(BlockGuard):
+    def __init__(self, block):
+        if not isinstance(block, ConditionalBlock):
+            raise TypeError("block should be conditional block")
+        super(ConditionalBlockGuard, self).__init__(block.helper.main_program)
+        self.block = block
+
+    def __enter__(self):
+        return super(ConditionalBlockGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.block.complete()
+        return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val,
+                                                           exc_tb)
+
+
+class ConditionalBlock(object):
+    def __init__(self,
+                 inputs,
+                 name=None,
+                 main_program=None,
+                 startup_program=None):
+        for each_input in inputs:
+            if not isinstance(each_input, Variable):
+                raise TypeError("Each input should be variable")
+        self.inputs = inputs
+        self.helper = LayerHelper(
+            'conditional_block',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+
+    def block(self):
+        return ConditionalBlockGuard(self)
+
+    def complete(self):
+        inside_block = self.helper.main_program.current_block()
+        parent_block = self.helper.main_program.block(inside_block.parent_idx)
+
+        intermediate = set()
+        params = set()
+
+        for each_op in inside_block.ops:
+            assert isinstance(each_op, Operator)
+            for iname in each_op.input_names:
+                for in_var_name in each_op.input(iname):
+                    if in_var_name not in intermediate:
+                        params.add(in_var_name)
+
+            for oname in each_op.output_names:
+                for out_var_name in each_op.output(oname):
+                    intermediate.add(out_var_name)
+        input_set = set([ipt.name for ipt in self.inputs])
+
+        param_list = [
+            parent_block.var(each_name) for each_name in params
+            if each_name not in input_set
+        ]
+
+        out_list = [
+            parent_block.var(var_name) for var_name in parent_block.vars
+            if var_name not in intermediate
+        ]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+        parent_block.append_op(
+            type='conditional_block',
+            inputs={
+                'X': self.inputs,
+                'Params': param_list,
+            },
+            outputs={'Out': out_list,
+                     'Scope': [step_scope]},
+            attrs={'block': inside_block})
+
+
+class IfElseBlockGuard(object):
+    def __init__(self, is_true, ifelse):
+        if not isinstance(ifelse, IfElse):
+            raise TypeError("ifelse must be an instance of IfElse class")
+
+        if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("You cannot invoke IfElse.block() inside a block")
+
+        self.is_true = is_true
+        self.ie = ifelse
+        if is_true:
+            self.cond_block = ifelse.conditional_true_block
+        else:
+            self.cond_block = ifelse.conditional_false_block
+
+        if not isinstance(self.cond_block, ConditionalBlock):
+            raise TypeError("Unexpected situation")
+
+        self.cond_block = self.cond_block.block()
+
+    def __enter__(self):
+        self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS
+        self.cond_block.__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.cond_block.__exit__(exc_type, exc_val, exc_tb):
+            # re-raise inside exception
+            return False
+        if len(self.ie.output_table[1 if self.is_true else 0]) == 0:
+            raise ValueError("Must set output inside block")
+        self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS
+
+
+class IfElse(object):
+    OUT_IF_ELSE_BLOCKS = 0
+    IN_IF_ELSE_TRUE_BLOCKS = 1
+    IN_IF_ELSE_FALSE_BLOCKS = 2
+
+    def __init__(self, cond, name=None, main_program=None,
+                 startup_program=None):
+        if not isinstance(cond, Variable):
+            raise TypeError("cond must be a Variable")
+        self.helper = LayerHelper(
+            'ifelse',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+        self.cond = cond
+        self.input_table = {}
+        self.status = IfElse.OUT_IF_ELSE_BLOCKS
+        self.conditional_true_block = ConditionalBlock(inputs=[self.cond])
+        self.conditional_false_block = ConditionalBlock(inputs=[self.cond])
+        self.output_table = ([], [])  # (true_outs, false_outs)
+
+    def input(self, x):
+        if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("input must in true/false blocks")
+        if id(x) not in self.input_table:
+            parent_block = self.parent_block()
+            out_true = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+
+            out_false = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+            parent_block.append_op(
+                type='split_lod_tensor',
+                inputs={
+                    'X': x,
+                    'Mask': self.cond,
+                },
+                outputs={'OutTrue': out_true,
+                         'OutFalse': out_false},
+                attrs={'level': 0})
+            self.input_table[id(x)] = (out_true, out_false)
+        else:
+            out_true, out_false = self.input_table[id(x)]
+
+        if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS:
+            return out_true
+        else:
+            return out_false
+
+    def parent_block(self):
+        current_block = self.helper.main_program.current_block()
+        return self.helper.main_program.block(current_block.parent_idx)
+
+    def true_block(self):
+        return IfElseBlockGuard(True, self)
+
+    def false_block(self):
+        return IfElseBlockGuard(False, self)
+
+    def output(self, *outs):
+        if self.status == self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("output can only be invoked in the sub-block")
+
+        out_table = self.output_table[1 if self.status ==
+                                      self.IN_IF_ELSE_TRUE_BLOCKS else 0]
+        parent_block = self.parent_block()
+        for each_out in outs:
+            if not isinstance(each_out, Variable):
+                raise TypeError("Each output should be a variable")
+            # create outside tensor
+            outside_out = parent_block.create_var(
+                name=unique_name("_".join([self.helper.name, 'output'])),
+                dtype=each_out.dtype)
+            out_table.append(outside_out)
+
+            # assign local var to outside
+            assign(
+                input=each_out,
+                output=outside_out,
+                main_program=self.helper.main_program,
+                startup_program=self.helper.startup_program)
+
+    def __call__(self):
+        if self.status != self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("IfElse::__call__ must be out of sub-block")
+        false_len, true_len = map(len, self.output_table)
+        if false_len == 0 and true_len == 0:
+            raise ValueError("Must invoke true_block/false_block before "
+                             "__call__")
+        elif false_len != true_len and false_len != 0 and true_len != 0:
+            raise ValueError("The output side must be same")
+        elif false_len == 0 or true_len == 0:
+            return self.output_table[0 if false_len != 0 else 1]
+
+        # else none of false_len/true_len is zero
+        # merge together
+        rlist = []
+        for false_var, true_var in zip(*self.output_table):
+            rlist.append(
+                merge_lod_tensor(
+                    in_true=true_var,
+                    in_false=false_var,
+                    mask=self.cond,
+                    x=self.cond,
+                    level=0,
+                    main_program=self.helper.main_program,
+                    startup_program=self.helper.startup_program))
+        return rlist
+
+
+class DynamicRNN(object):
+    BEFORE_RNN = 0
+    IN_RNN = 1
+    AFTER_RNN = 2
+
+    def __init__(self, name=None, main_program=None, startup_program=None):
+        self.helper = LayerHelper(
+            'dynamic_rnn',
+            name=name,
+            main_program=main_program,
+            startup_program=startup_program)
+        self.status = DynamicRNN.BEFORE_RNN
+        self.lod_rank_table = None
+        self.max_seq_len = None
+        self.step_idx = None
+        self.zero_idx = fill_constant(shape=[1], value=0, dtype='int64')
+        self.mem_dict = dict()
+        self.output_array = []
+        self.outputs = []
+        self.cond = self.helper.create_tmp_variable(dtype='bool')
+        self.cond.stop_gradient = False
+        self.while_op = While(self.cond)
+        self.input_array = []
+        self.mem_link = []
+
+    def step_input(self, x):
+        self._assert_in_rnn_block_("step_input")
+        if not isinstance(x, Variable):
+            raise TypeError(
+                "step_input() can only take a Variable as its input")
+        parent_block = self._parent_block_()
+        if self.lod_rank_table is None:
+            self.lod_rank_table = parent_block.create_var(
+                name=unique_name('lod_rank_table'),
+                type=core.VarDesc.VarType.LOD_RANK_TABLE)
+            self.lod_rank_table.stop_gradient = True
+            parent_block.append_op(
+                type='lod_rank_table',
+                inputs={"X": x},
+                outputs={"Out": self.lod_rank_table})
+            self.max_seq_len = parent_block.create_var(
+                name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64')
+            self.max_seq_len.stop_gradient = False
+            parent_block.append_op(
+                type='max_sequence_len',
+                inputs={'RankTable': self.lod_rank_table},
+                outputs={"Out": self.max_seq_len})
+            self.cond.stop_gradient = True
+            parent_block.append_op(
+                type='less_than',
+                inputs={'X': self.step_idx,
+                        'Y': self.max_seq_len},
+                outputs={'Out': self.cond})
+
+        input_array = parent_block.create_var(
+            name=unique_name('dynamic_rnn_input_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=x.dtype)
+        self.input_array.append((input_array, x.dtype))
+        parent_block.append_op(
+            type='lod_tensor_to_array',
+            inputs={'X': x,
+                    'RankTable': self.lod_rank_table},
+            outputs={'Out': input_array})
+        return array_read(
+            array=input_array, i=self.step_idx, **self.helper.to_kwargs)
+
+    @contextlib.contextmanager
+    def block(self):
+        if self.status != DynamicRNN.BEFORE_RNN:
+            raise ValueError("rnn.block() can only be invoke once")
+        self.step_idx = fill_constant(shape=[1], dtype='int64', value=0)
+        self.step_idx.stop_gradient = False
+        self.status = DynamicRNN.IN_RNN
+        with self.while_op.block():
+            yield
+            increment(
+                x=self.step_idx,
+                value=1.0,
+                in_place=True,
+                **self.helper.to_kwargs)
+
+            for new_mem, mem_array in self.mem_link:
+                array_write(
+                    x=new_mem,
+                    i=self.step_idx,
+                    array=mem_array,
+                    **self.helper.to_kwargs)
+
+            less_than(
+                x=self.step_idx,
+                y=self.max_seq_len,
+                cond=self.cond,
+                **self.helper.to_kwargs)
+
+        self.status = DynamicRNN.AFTER_RNN
+        for each_array in self.output_array:
+            self.outputs.append(
+                array_to_lod_tensor(
+                    x=each_array,
+                    table=self.lod_rank_table,
+                    **self.helper.to_kwargs))
+
+    def __call__(self, *args, **kwargs):
+        if self.status != DynamicRNN.AFTER_RNN:
+            raise ValueError(
+                "Dynamic RNN outputs can only be retrieved after rnn block")
+        if len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def memory(self, init=None, shape=None, value=0.0, dtype='float32'):
+        self._assert_in_rnn_block_('memory')
+        if init is not None:
+            if not isinstance(init, Variable):
+                raise TypeError(
+                    "The input arg `init` of memory() must be a Variable")
+            parent_block = self._parent_block_()
+            mem_array = parent_block.create_var(
+                name=unique_name('dynamic_rnn_mem_array'),
+                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                dtype=init.dtype)
+            parent_block.append_op(
+                type='write_to_array',
+                inputs={'X': init,
+                        'I': self.zero_idx},
+                outputs={'Out': mem_array})
+            retv = array_read(
+                array=mem_array, i=self.step_idx, **self.helper.to_kwargs)
+            retv = shrink_memory(
+                x=retv,
+                i=self.step_idx,
+                table=self.lod_rank_table,
+                **self.helper.to_kwargs)
+            self.mem_dict[retv.name] = mem_array
+            return retv
+        else:
+            if len(self.input_array) == 0:
+                raise ValueError(
+                    "step_input should be invoked before memory(shape=..., value=...)"
+                )
+            parent_block = self._parent_block_()
+            init = parent_block.create_var(
+                name=unique_name('mem_init'), dtype=dtype)
+            arr, dtype = self.input_array[0]
+            in0 = parent_block.create_var(name=unique_name('in0'), dtype=dtype)
+            parent_block.append_op(
+                type='read_from_array',
+                inputs={'X': [arr],
+                        'I': [self.zero_idx]},
+                outputs={'Out': [in0]})
+            parent_block.append_op(
+                type='fill_constant_batch_size_like',
+                inputs={'Input': [in0]},
+                outputs={'Out': [init]},
+                attrs={
+                    'shape': [-1] + shape,
+                    'value': float(value),
+                    'dtype': init.dtype
+                })
+            return self.memory(init=init)
+
+    def update_memory(self, ex_mem, new_mem):
+        self._assert_in_rnn_block_('update_memory')
+        if not isinstance(ex_mem, Variable):
+            raise TypeError("The input arg `ex_mem` of update_memory() must "
+                            "be a Variable")
+        if not isinstance(new_mem, Variable):
+            raise TypeError("The input arg `new_mem` of update_memory() must "
+                            "be a Variable")
+
+        mem_array = self.mem_dict.get(ex_mem.name, None)
+        if mem_array is None:
+            raise ValueError("Please invoke memory before update_memory")
+        if self.lod_rank_table is None:
+            raise ValueError("Please invoke step_input before update_memory")
+
+        self.mem_link.append((new_mem, mem_array))
+
+    def output(self, *outputs):
+        self._assert_in_rnn_block_('output')
+        parent_block = self._parent_block_()
+        for each in outputs:
+            outside_array = parent_block.create_var(
+                name=unique_name("_".join(
+                    [self.helper.name, "output_array", each.name])),
+                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                dtype=each.dtype)
+            array_write(x=each, i=self.step_idx, array=outside_array)
+            self.output_array.append(outside_array)
+
+    def _parent_block_(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+
+        return parent_block
+
+    def _assert_in_rnn_block_(self, method):
+        if self.status != DynamicRNN.IN_RNN:
+            raise ValueError("{0} can only be invoked inside rnn block.".format(
+                method))
diff --git a/python/paddle/v2/framework/net_drawer.py b/python/paddle/v2/fluid/net_drawer.py
similarity index 87%
rename from python/paddle/v2/framework/net_drawer.py
rename to python/paddle/v2/fluid/net_drawer.py
index aa30e2a6ca..94fdd5e389 100644
--- a/python/paddle/v2/framework/net_drawer.py
+++ b/python/paddle/v2/fluid/net_drawer.py
@@ -3,8 +3,8 @@ import json
 import logging
 from collections import defaultdict
 
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -66,10 +66,13 @@ def parse_graph(program, graph, var_dict, **kwargs):
             if not var_dict.has_key(var):
                 var_dict[var] = "Feed"
 
+    temp_id = 0
     proto = framework_pb2.ProgramDesc.FromString(
         program.desc.serialize_to_string())
     for block in proto.blocks:
         for op in block.ops:
+            op.type = op.type + "_" + str(temp_id)
+            temp_id += 1
             graph.node(**draw_node(op))
             for o in op.outputs:
                 for arg in o.arguments:
@@ -78,9 +81,10 @@ def parse_graph(program, graph, var_dict, **kwargs):
                 for arg in e.arguments:
                     if var_dict.has_key(arg):
                         graph.edge(**draw_edge(var_dict, op, e, arg))
+        break  # only plot the first block
 
 
-def draw_graph(init_program, program, **kwargs):
+def draw_graph(startup_program, main_program, **kwargs):
     if kwargs.has_key("graph_attr"):
         GRAPH_STYLE.update(kwargs[graph_attr])
     if kwargs.has_key("node_attr"):
@@ -101,8 +105,8 @@ def draw_graph(init_program, program, **kwargs):
         **kwargs)
 
     var_dict = {}
-    parse_graph(init_program, g, var_dict)
-    parse_graph(program, g, var_dict)
+    parse_graph(startup_program, g, var_dict)
+    parse_graph(main_program, g, var_dict)
 
     if filename != None:
         g.save()
diff --git a/python/paddle/v2/framework/nets.py b/python/paddle/v2/fluid/nets.py
similarity index 75%
rename from python/paddle/v2/framework/nets.py
rename to python/paddle/v2/fluid/nets.py
index f5a2c27676..05728ad75a 100644
--- a/python/paddle/v2/framework/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.layers as layers
+import layers
 
 __all__ = ["simple_img_conv_pool", "sequence_conv_pool"]
 
@@ -10,23 +10,23 @@ def simple_img_conv_pool(input,
                          pool_stride,
                          act,
                          pool_type='max',
-                         program=None,
-                         init_program=None):
+                         main_program=None,
+                         startup_program=None):
     conv_out = layers.conv2d(
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
         act=act,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     pool_out = layers.pool2d(
         input=conv_out,
         pool_size=pool_size,
         pool_type=pool_type,
         pool_stride=pool_stride,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     return pool_out
 
 
@@ -40,8 +40,8 @@ def img_conv_group(input,
                    conv_batchnorm_drop_rate=None,
                    pool_stride=1,
                    pool_type=None,
-                   program=None,
-                   init_program=None):
+                   main_program=None,
+                   startup_program=None):
     """
     Image Convolution Group, Used for vgg net.
     """
@@ -71,30 +71,30 @@ def img_conv_group(input,
             filter_size=conv_filter_size[i],
             padding=conv_padding[i],
             act=local_conv_act,
-            program=program,
-            init_program=init_program)
+            main_program=main_program,
+            startup_program=startup_program)
 
         if conv_with_batchnorm[i]:
             tmp = layers.batch_norm(
                 input=tmp,
                 act=conv_act,
-                program=program,
-                init_program=init_program)
+                main_program=main_program,
+                startup_program=startup_program)
             drop_rate = conv_batchnorm_drop_rate[i]
             if abs(drop_rate) > 1e-5:
                 tmp = layers.dropout(
                     x=tmp,
                     dropout_prob=drop_rate,
-                    program=program,
-                    init_program=init_program)
+                    main_program=main_program,
+                    startup_program=startup_program)
 
     pool_out = layers.pool2d(
         input=tmp,
         pool_size=pool_size,
         pool_type=pool_type,
         pool_stride=pool_stride,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     return pool_out
 
 
@@ -103,19 +103,19 @@ def sequence_conv_pool(input,
                        filter_size,
                        act="sigmoid",
                        pool_type="max",
-                       program=None,
-                       init_program=None):
+                       main_program=None,
+                       startup_program=None):
     conv_out = layers.sequence_conv(
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
         act=act,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
 
     pool_out = layers.sequence_pool(
         input=conv_out,
         pool_type=pool_type,
-        program=program,
-        init_program=init_program)
+        main_program=main_program,
+        startup_program=startup_program)
     return pool_out
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/fluid/op.py
similarity index 98%
rename from python/paddle/v2/framework/op.py
rename to python/paddle/v2/fluid/op.py
index bc771a964a..5828803497 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/fluid/op.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 
 def get_all_op_protos():
diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/fluid/optimizer.py
similarity index 80%
rename from python/paddle/v2/framework/optimizer.py
rename to python/paddle/v2/fluid/optimizer.py
index 902442297e..719e3b2563 100644
--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -1,16 +1,13 @@
 from collections import defaultdict
 
-import paddle.v2.framework.framework as framework
-from paddle.v2.framework.framework import unique_name, Program
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.initializer import ConstantInitializer
-from paddle.v2.framework.regularizer import append_regularization_ops
-from paddle.v2.framework.layer_helper import LayerHelper
+import framework
+from backward import append_backward_ops
+from framework import unique_name
+from initializer import Constant
+from layer_helper import LayerHelper
+from regularizer import append_regularization_ops
 
-__all__ = [
-    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
-    'AdamaxOptimizer'
-]
+__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
 
 
 class Optimizer(object):
@@ -35,15 +32,21 @@ class Optimizer(object):
         """
         raise NotImplementedError()
 
-    def _initialize_tensors(self, block):
-        """Create all necessary tensors, that will be shared for all parameter updates.
-
-        Tensors like learning rate should be initialized here.
-
-        Args:
-            block: the block in which the loss variable is present
-        """
-        pass
+    def _create_param_lr(self, param_and_grad):
+        # create learning rate variable for every parameter
+        param = param_and_grad[0]
+        param_lr = param.optimize_attr['learning_rate']
+        param_lr_shape = [1]
+        param_lr_var = self.helper.create_global_variable(
+            name=unique_name("learning_rate"),
+            dtype='float32',
+            shape=param_lr_shape,
+            lod_level=1,
+            persistable=True)
+        param_lr = param_lr * self._learning_rate
+        self.helper.set_variable_initializer(
+            var=param_lr_var, initializer=Constant(param_lr))
+        return param_lr_var
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -79,18 +82,18 @@ class Optimizer(object):
         """
         if (name in self._accumulators and
                 param.name in self._accumulators[name]):
-            raise Exception("Accumulator {} already exists for parmeter {}".
+            raise Exception("Accumulator {} already exists for parameter {}".
                             format(name, param.name))
 
         assert isinstance(self.helper, LayerHelper)
         var = self.helper.create_global_variable(
             name=unique_name(name),
             persistable=True,
-            dtype=dtype or param.data_type,
+            dtype=dtype or param.dtype,
             type=param.type,
             shape=param.shape)
         self.helper.set_variable_initializer(
-            var, initializer=ConstantInitializer(value=float(fill_value)))
+            var, initializer=Constant(value=float(fill_value)))
         self._accumulators[name][param.name] = var
 
     def _get_accumulator(self, name, param):
@@ -132,7 +135,7 @@ class Optimizer(object):
     def create_optimization_pass(self,
                                  parameters_and_grads,
                                  loss,
-                                 init_program=None):
+                                 startup_program=None):
         """Add optimization operators to update gradients to variables.
 
         Args:
@@ -144,7 +147,7 @@ class Optimizer(object):
           optimization. This will include parameter update ops, global step
           update ops and any other custom ops required by subclasses to manage
           their internal state.
-          :param init_program: 
+          :param startup_program: 
         """
         # This is a default implementation of create_optimization_pass that
         # can be shared by most optimizers. This implementation assumes that
@@ -156,15 +159,16 @@ class Optimizer(object):
         # Create any accumulators
         program = loss.block.program
         self.helper = LayerHelper(
-            self.__class__.__name__, program=program, init_program=init_program)
+            self.__class__.__name__,
+            main_program=program,
+            startup_program=startup_program)
         self._create_accumulators(loss.block,
                                   [p[0] for p in parameters_and_grads])
-        # Create any necessary tensors
-        self._initialize_tensors(loss.block)
 
         optimize_ops = []
         for param_and_grad in parameters_and_grads:
-            if param_and_grad[1] is not None:
+            if param_and_grad[0].trainable is True and param_and_grad[
+                    1] is not None:
                 optimize_op = self._append_optimize_op(loss.block,
                                                        param_and_grad)
                 optimize_ops.append(optimize_op)
@@ -185,7 +189,7 @@ class Optimizer(object):
 
     def minimize(self,
                  loss,
-                 init_program=None,
+                 startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
         """Add operations to minimize `loss` by updating `parameter_list`.
@@ -193,12 +197,11 @@ class Optimizer(object):
         This method combines interface `append_backward_ops()` and
         `create_optimization_pass()` into one.
         """
-        params_grads = append_backward_ops(loss, parameter_list, no_grad_set or
-                                           set())
-        # Add regularization if any 
+        params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
+        # Add regularization if any
         params_grads = append_regularization_ops(params_grads)
         optimize_ops = self.create_optimization_pass(params_grads, loss,
-                                                     init_program)
+                                                     startup_program)
         return optimize_ops
 
 
@@ -212,27 +215,16 @@ class SGDOptimizer(Optimizer):
         self.type = "sgd"
         self._learning_rate = learning_rate
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
+
         # create the optimize op
         sgd_op = block.append_op(
             type=self.type,
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
             },
             outputs={"ParamOut": param_and_grad[0]})
 
@@ -257,19 +249,6 @@ class MomentumOptimizer(Optimizer):
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
 
-    def _initialize_tensors(self, block):
-        assert isinstance(block, framework.Block)
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
@@ -288,14 +267,14 @@ class MomentumOptimizer(Optimizer):
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
                 "Velocity": velocity_acc,
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
             },
             outputs={
                 "ParamOut": param_and_grad[0],
                 "VelocityOut": velocity_acc
             },
             attrs={"mu": self._momentum,
-                   "useNesterov": self._use_nesterov})
+                   "use_nesterov": self._use_nesterov})
 
         return momentum_op
 
@@ -313,18 +292,6 @@ class AdagradOptimizer(Optimizer):
         self._learning_rate = learning_rate
         self._epsilon = epsilon
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
@@ -337,14 +304,14 @@ class AdagradOptimizer(Optimizer):
         moment_acc = self._get_accumulator(self._moment_acc_str,
                                            param_and_grad[0])
 
-        # create the adagrad optimizer op
+        # Create the adagrad optimizer op
         adagrad_op = block.append_op(
             type=self.type,
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
                 "Moment": moment_acc,
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
             },
             outputs={"ParamOut": param_and_grad[0],
                      "MomentOut": moment_acc},
@@ -376,18 +343,6 @@ class AdamOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
@@ -401,7 +356,7 @@ class AdamOptimizer(Optimizer):
             lod_level=0,
             persistable=True)
         self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1))
+            self._beta1_pow_acc, initializer=Constant(self._beta1))
 
         self._beta2_pow_acc = self.helper.create_global_variable(
             name=unique_name('beta2_pow_acc'),
@@ -411,7 +366,7 @@ class AdamOptimizer(Optimizer):
             persistable=True)
 
         self.helper.set_variable_initializer(
-            self._beta2_pow_acc, initializer=ConstantInitializer(self._beta2))
+            self._beta2_pow_acc, initializer=Constant(self._beta2))
 
         # Create accumulator tensors for first and second moments
         for p in parameters:
@@ -431,7 +386,7 @@ class AdamOptimizer(Optimizer):
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": self._lr,
+                "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment1": moment1,
                 "Moment2": moment2,
                 "Beta1Pow": self._beta1_pow_acc,
@@ -493,18 +448,6 @@ class AdamaxOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         # Create beta1 power accumulator tensor
         beta_shape = [1]
@@ -515,7 +458,7 @@ class AdamaxOptimizer(Optimizer):
             lod_level=0,
             persistable=True)
         self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1))
+            self._beta1_pow_acc, initializer=Constant(self._beta1))
 
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
@@ -534,7 +477,7 @@ class AdamaxOptimizer(Optimizer):
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": self._lr,
+                "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment": moment,
                 "InfNorm": inf_norm,
                 "Beta1Pow": self._beta1_pow_acc
@@ -564,3 +507,67 @@ class AdamaxOptimizer(Optimizer):
             attrs={"scale": self._beta1})
 
         return [scale_beta1]
+
+
+class DecayedAdagradOptimizer(Optimizer):
+    """Simple Decayed Adagrad optimizer with moment state
+    """
+    _moment_acc_str = "moment"
+
+    def __init__(self,
+                 learning_rate,
+                 decay=0.95,
+                 epsilon=1.0e-6,
+                 global_step=None):
+        assert learning_rate is not None
+        assert decay is not None
+        assert epsilon is not None
+
+        super(DecayedAdagradOptimizer, self).__init__(global_step)
+        self.type = "decayed_adagrad"
+        self._learning_rate = learning_rate
+        self._decay = decay
+        self._epsilon = epsilon
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(self._moment_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment_acc = self._get_accumulator(self._moment_acc_str,
+                                           param_and_grad[0])
+
+        # Create the decayed adagrad optimizer op
+        decayed_adagrad_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": moment_acc,
+                "LearningRate": self._create_param_lr(param_and_grad)
+            },
+            outputs={"ParamOut": param_and_grad[0],
+                     "MomentOut": moment_acc},
+            attrs={"epsilon": self._epsilon})
+
+        return decayed_adagrad_op
+
+
+# We short the class name, since users will use the optimizer with the package
+# name. The sample code:
+#
+# import paddle.fluid as fluid
+#
+# sgd = fluid.optimizer.SGD(...)
+#
+# It is no need to add an `Optimizer` as the class suffix
+SGD = SGDOptimizer
+Momentum = MomentumOptimizer
+Adagrad = AdagradOptimizer
+Adam = AdamOptimizer
+Adamax = AdamaxOptimizer
+DecayedAdagrad = DecayedAdagradOptimizer
diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py
new file mode 100644
index 0000000000..86088fdd7c
--- /dev/null
+++ b/python/paddle/v2/fluid/param_attr.py
@@ -0,0 +1,61 @@
+from initializer import Initializer, Xavier, Constant
+from regularizer import WeightDecayRegularizer
+
+
+class ParamAttr(object):
+    def __init__(self,
+                 name=None,
+                 initializer=None,
+                 learning_rate=1.0,
+                 regularizer=None,
+                 trainable=True):
+        self.name = name
+        self.initializer = initializer
+        self.learning_rate = learning_rate
+        self.regularizer = regularizer
+        self.trainable = trainable
+
+    def set_default_initializer(self, initializer):
+        if initializer is None:
+            if self.initializer is None:
+                raise ValueError("ParamAttr.initializer is not set")
+            return
+
+        if self.initializer is not None:
+            return
+
+        self.initializer = initializer
+
+    def set_default_param_initializer(self):
+        self.set_default_initializer(Xavier())
+
+    def set_default_bias_initializer(self):
+        self.set_default_initializer(Constant(0.0))
+
+    @staticmethod
+    def to_attr(arg):
+        if arg is None:
+            return ParamAttr()
+        elif isinstance(arg, ParamAttr):
+            return arg
+        elif isinstance(arg, str) or isinstance(arg, unicode):
+            return ParamAttr(name=arg)
+        elif isinstance(arg, Initializer):
+            return ParamAttr(initializer=arg)
+        elif isinstance(arg, WeightDecayRegularizer):
+            return ParamAttr(regularizer=arg)
+        elif isinstance(arg, bool):
+            return ParamAttr.to_attr(None) if arg else False
+        else:
+            raise TypeError("{0} cast to ParamAttr".format(type(arg)))
+
+    def to_kwargs(self, with_initializer=False):
+        kwargs = {
+            'name': self.name,
+            'learning_rate': self.learning_rate,
+            'regularizer': self.regularizer,
+            'trainable': self.trainable
+        }
+        if with_initializer:
+            kwargs['initializer'] = self.initializer
+        return kwargs
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
new file mode 100644
index 0000000000..2069b713fa
--- /dev/null
+++ b/python/paddle/v2/fluid/profiler.py
@@ -0,0 +1,46 @@
+import paddle.v2.fluid.core as core
+from contextlib import contextmanager
+
+__all__ = ['CudaProfiler']
+
+NVPROF_CONFIG = [
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "streamid",
+    "enableonstart 0",
+    "conckerneltrace",
+]
+
+
+@contextmanager
+def cuda_profiler(output_file, output_mode=None, config=None):
+    """The CUDA profiler.
+    This fuctions is used to profile CUDA program by CUDA runtime application
+    programming interface. The profiling result will be written into
+    `output_file` with Key-Value pair format or Comma separated values format.
+    The user can set the output mode by `output_mode` argument and set the
+    counters/options for profiling by `config` argument. The default config
+    is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
+    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
+
+    Args:
+        output_file (string) : The output file name, the result will be
+            written into this file.
+        output_mode (string) : The output mode has Key-Value pair format and
+            Comma separated values format. It should be 'kvp' or 'csv'.
+        config (string) : The profiler options and counters can refer to
+            "Compute Command Line Profiler User Guide".
+    """
+    if output_mode is None:
+        output_mode = 'csv'
+    if output_mode not in ['kvp', 'csv']:
+        raise ValueError("The output mode must be 'kvp' or 'csv'.")
+    config = NVPROF_CONFIG if config is None else config
+    core.nvprof_init(output_file, output_mode, config)
+    # Enables profiler collection by the active CUDA profiling tool.
+    core.nvprof_start()
+    yield
+    # Disables profiler collection.
+    core.nvprof_stop()
diff --git a/python/paddle/v2/framework/regularizer.py b/python/paddle/v2/fluid/regularizer.py
similarity index 91%
rename from python/paddle/v2/framework/regularizer.py
rename to python/paddle/v2/fluid/regularizer.py
index 5111ac5566..bb1ac8911e 100644
--- a/python/paddle/v2/framework/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
@@ -1,8 +1,6 @@
-import paddle.v2.framework.framework as framework
+import framework
 
-__all__ = [
-    'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer'
-]
+__all__ = ['append_regularization_ops', 'L1Decay', 'L2Decay']
 
 
 def append_regularization_ops(parameters_and_grads):
@@ -139,3 +137,16 @@ class L1DecayRegularizer(WeightDecayRegularizer):
             attrs={"scale": self._regularization_coeff})
 
         return decay
+
+
+# We short the class name, since users will use the regulaizer with the package
+# name. The sample code:
+#
+# import paddle.fluid as fluid
+#
+# hidden = fluid.layers.fc(...,
+#                          param_attr=fluid.regularizer.Xavier())
+#
+# It is no need to add a `Regularizer` as the class suffix
+L1Decay = L1DecayRegularizer
+L2Decay = L2DecayRegularizer
diff --git a/python/paddle/v2/framework/tests/.gitignore b/python/paddle/v2/fluid/tests/.gitignore
similarity index 86%
rename from python/paddle/v2/framework/tests/.gitignore
rename to python/paddle/v2/fluid/tests/.gitignore
index fcc52c0488..a648f2b387 100644
--- a/python/paddle/v2/framework/tests/.gitignore
+++ b/python/paddle/v2/fluid/tests/.gitignore
@@ -1,2 +1,3 @@
 image/
 fit_a_line.model/
+tmp
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/CMakeLists.txt
similarity index 88%
rename from python/paddle/v2/framework/tests/CMakeLists.txt
rename to python/paddle/v2/fluid/tests/CMakeLists.txt
index 4d7664469e..e795627bfe 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
@@ -3,3 +3,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
+
+add_subdirectory(book)
diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
new file mode 100644
index 0000000000..a35abe3e0c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
@@ -0,0 +1,11 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+list(REMOVE_ITEM TEST_OPS test_image_classification_train)
+py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
+py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
new file mode 100644
index 0000000000..fbf46ac6cb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -0,0 +1,41 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
new file mode 100644
index 0000000000..4e71b6f345
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -0,0 +1,130 @@
+from __future__ import print_function
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import sys
+
+
+def resnet_cifar10(input, depth=32):
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    def shortcut(input, ch_in, ch_out, stride):
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) / 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+classdim = 10
+data_shape = [3, 32, 32]
+
+images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+net_type = "vgg"
+if len(sys.argv) >= 2:
+    net_type = sys.argv[1]
+
+if net_type == "vgg":
+    print("train vgg net")
+    net = vgg16_bn_drop(images)
+elif net_type == "resnet":
+    print("train resnet")
+    net = resnet_cifar10(images, 32)
+else:
+    raise ValueError("%s network is not supported" % net_type)
+
+predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+
+optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+opts = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+BATCH_SIZE = 128
+PASS_NUM = 1
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.cifar.train10(), buf_size=128 * 10),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+exe.run(fluid.default_startup_program())
+
+for pass_id in range(PASS_NUM):
+    accuracy.reset(exe)
+    for data in train_reader():
+        loss, acc = exe.run(fluid.default_main_program(),
+                            feed=feeder.feed(data),
+                            fetch_list=[avg_cost] + accuracy.metrics)
+        pass_acc = accuracy.eval(exe)
+        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
+            pass_acc))
+        # this model is slow, so if we can train two mini batch, we think it works properly.
+        exit(0)
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
new file mode 100644
index 0000000000..d2693b602e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -0,0 +1,201 @@
+import math
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid as fluid
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_len = len(verb_dict)
+
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+
+IS_SPARSE = True
+PASS_NUM = 10
+BATCH_SIZE = 20
+
+embedding_name = 'emb'
+
+
+def load_parameter(file_name, h, w):
+    with open(file_name, 'rb') as f:
+        f.read(16)  # skip header.
+        return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            **ignored):
+    # 8 features
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        size=[pred_len, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        size=[mark_dict_len, mark_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[word_dict_len, word_dim],
+            input=x,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+    ])
+
+    return feature_out
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    # define network topology
+    word = fluid.layers.data(
+        name='word_data', shape=[1], dtype='int64', lod_level=1)
+    predicate = fluid.layers.data(
+        name='verb_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n2 = fluid.layers.data(
+        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n1 = fluid.layers.data(
+        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_0 = fluid.layers.data(
+        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p1 = fluid.layers.data(
+        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p2 = fluid.layers.data(
+        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(
+        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+    feature_out = db_lstm(**locals())
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
+    crf_cost = fluid.layers.linear_chain_crf(
+        input=feature_out,
+        label=target,
+        param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=mix_hidden_lr))
+    avg_cost = fluid.layers.mean(x=crf_cost)
+
+    # TODO(qiao)
+    # check other optimizers and check why out will be NAN
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+    sgd_optimizer.minimize(avg_cost)
+
+    # TODO(qiao)
+    # add dependency track and move this config before optimizer
+    crf_decode = fluid.layers.crf_decoding(
+        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
+
+    precision, recall, f1_score = fluid.layers.chunk_eval(
+        input=crf_decode,
+        label=target,
+        chunk_scheme="IOB",
+        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.conll05.test(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
+        ],
+        place=place)
+    exe = fluid.Executor(place)
+
+    exe.run(fluid.default_startup_program())
+
+    embedding_param = fluid.g_scope.find_var(embedding_name).get_tensor()
+    embedding_param.set(
+        load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)
+
+    batch_id = 0
+    for pass_id in xrange(PASS_NUM):
+        for data in train_data():
+            outs = exe.run(fluid.default_main_program(),
+                           feed=feeder.feed(data),
+                           fetch_list=[avg_cost, precision, recall, f1_score])
+            avg_cost_val = np.array(outs[0])
+            precision_val = np.array(outs[1])
+            recall_val = np.array(outs[2])
+            f1_score_val = np.array(outs[3])
+
+            if batch_id % 10 == 0:
+                print("avg_cost=" + str(avg_cost_val))
+                print("precision_val=" + str(precision_val))
+                print("recall_val:" + str(recall_val))
+                print("f1_score_val:" + str(f1_score_val))
+
+            # exit early for CI
+            exit(0)
+
+            batch_id = batch_id + 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
new file mode 100644
index 0000000000..5bc7e1b59d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -0,0 +1,103 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor, g_scope
+from paddle.v2.fluid.optimizer import SGDOptimizer
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as pd
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 512
+word_dim = 512
+IS_SPARSE = True
+batch_size = 50
+max_length = 50
+topk_size = 50
+trg_dic_size = 10000
+
+src_word_id = layers.data(name="src_word_id", shape=[1], dtype='int64')
+src_embedding = layers.embedding(
+    input=src_word_id,
+    size=[dict_size, word_dim],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr=fluid.ParamAttr(name='vemb'))
+
+
+def encoder():
+
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(
+        input=src_embedding,
+        size=hidden_dim,
+        candidate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    lstm_hidden1, lstm_1 = layers.dynamic_lstm(
+        input=src_embedding,
+        size=hidden_dim,
+        candidate_activation='sigmoid',
+        cell_activation='sigmoid',
+        is_reverse=True)
+
+    bidirect_lstm_out = layers.concat([lstm_hidden0, lstm_hidden1], axis=0)
+
+    return bidirect_lstm_out
+
+
+def decoder_trainer(context):
+    '''
+    decoder with trainer
+    '''
+    pass
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    encoder_out = encoder()
+    # TODO(jacquesqiao) call here
+    decoder_trainer(encoder_out)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(8000), buf_size=1000),
+        batch_size=batch_size)
+
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    batch_id = 0
+    for pass_id in xrange(2):
+        print 'pass_id', pass_id
+        for data in train_data():
+            print 'batch', batch_id
+            batch_id += 1
+            if batch_id > 10: break
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            outs = exe.run(framework.default_main_program(),
+                           feed={'src_word_id': word_data, },
+                           fetch_list=[encoder_out])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
new file mode 100644
index 0000000000..35bf8da924
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
@@ -0,0 +1,60 @@
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    input=images,
+    filter_size=5,
+    num_filters=20,
+    pool_size=2,
+    pool_stride=2,
+    act="relu")
+conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    input=conv_pool_1,
+    filter_size=5,
+    num_filters=50,
+    pool_size=2,
+    pool_stride=2,
+    act="relu")
+
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+BATCH_SIZE = 50
+PASS_NUM = 3
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+exe.run(fluid.default_startup_program())
+
+for pass_id in range(PASS_NUM):
+    accuracy.reset(exe)
+    for data in train_reader():
+        loss, acc = exe.run(fluid.default_main_program(),
+                            feed=feeder.feed(data),
+                            fetch_list=[avg_cost] + accuracy.metrics)
+        pass_acc = accuracy.eval(exe)
+        print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" +
+              str(pass_acc))
+        # print loss, acc
+        if loss < 10.0 and pass_acc > 0.9:
+            # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
+            exit(0)
+
+    pass_acc = accuracy.eval(exe)
+    print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
+
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
new file mode 100644
index 0000000000..4dc2c50e1c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -0,0 +1,76 @@
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+BATCH_SIZE = 128
+image = fluid.layers.data(name='x', shape=[784], dtype='float32')
+
+regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
+
+hidden1 = fluid.layers.fc(input=image,
+                          size=128,
+                          act='relu',
+                          param_attr=regularizer)
+hidden2 = fluid.layers.fc(input=hidden1,
+                          size=64,
+                          act='relu',
+                          param_attr=regularizer)
+
+predict = fluid.layers.fc(input=hidden2,
+                          size=10,
+                          act='softmax',
+                          param_attr=regularizer)
+
+label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+
+optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+opts = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+inference_program = fluid.default_main_program().clone()
+test_accuracy = fluid.evaluator.Accuracy(
+    input=predict, label=label, main_program=inference_program)
+test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
+inference_program = fluid.io.get_inference_program(
+    test_target, main_program=inference_program)
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+
+test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    accuracy.reset(exe)
+    for data in train_reader():
+        out, acc = exe.run(fluid.default_main_program(),
+                           feed=feeder.feed(data),
+                           fetch_list=[avg_cost] + accuracy.metrics)
+        pass_acc = accuracy.eval(exe)
+
+        test_accuracy.reset(exe)
+        for data in test_reader():
+            out, acc = exe.run(inference_program,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_cost] + test_accuracy.metrics)
+
+        test_pass_acc = test_accuracy.eval(exe)
+        print("pass_id=" + str(pass_id) + " train_cost=" + str(
+            out) + " train_acc=" + str(acc) + " train_pass_acc=" + str(pass_acc)
+              + " test_acc=" + str(test_pass_acc))
+
+        if test_pass_acc > 0.7:
+            exit(0)
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
new file mode 100644
index 0000000000..db91ca4f9c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -0,0 +1,207 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.optimizer import SGDOptimizer
+
+IS_SPARSE = True
+USE_GPU = False
+BATCH_SIZE = 256
+
+
+def get_usr_combined_features():
+    # FIXME(dzh) : old API integer_value(10) may has range check.
+    # currently we don't have user configurated check.
+
+    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
+
+    uid = layers.data(name='user_id', shape=[1], dtype='int64')
+
+    usr_emb = layers.embedding(
+        input=uid,
+        dtype='float32',
+        size=[USR_DICT_SIZE, 32],
+        param_attr='user_table',
+        is_sparse=IS_SPARSE)
+
+    usr_fc = layers.fc(input=usr_emb, size=32)
+
+    USR_GENDER_DICT_SIZE = 2
+
+    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
+
+    usr_gender_emb = layers.embedding(
+        input=usr_gender_id,
+        size=[USR_GENDER_DICT_SIZE, 16],
+        param_attr='gender_table',
+        is_sparse=IS_SPARSE)
+
+    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
+
+    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
+    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
+
+    usr_age_emb = layers.embedding(
+        input=usr_age_id,
+        size=[USR_AGE_DICT_SIZE, 16],
+        is_sparse=IS_SPARSE,
+        param_attr='age_table')
+
+    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
+
+    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
+    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
+
+    usr_job_emb = layers.embedding(
+        input=usr_job_id,
+        size=[USR_JOB_DICT_SIZE, 16],
+        param_attr='job_table',
+        is_sparse=IS_SPARSE)
+
+    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
+
+    concat_embed = layers.concat(
+        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
+
+    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return usr_combined_features
+
+
+def get_mov_combined_features():
+
+    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
+
+    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
+
+    mov_emb = layers.embedding(
+        input=mov_id,
+        dtype='float32',
+        size=[MOV_DICT_SIZE, 32],
+        param_attr='movie_table',
+        is_sparse=IS_SPARSE)
+
+    mov_fc = layers.fc(input=mov_emb, size=32)
+
+    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
+
+    category_id = layers.data(name='category_id', shape=[1], dtype='int64')
+
+    mov_categories_emb = layers.embedding(
+        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_categories_hidden = layers.sequence_pool(
+        input=mov_categories_emb, pool_type="sum")
+
+    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
+
+    mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
+
+    mov_title_emb = layers.embedding(
+        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_title_conv = nets.sequence_conv_pool(
+        input=mov_title_emb,
+        num_filters=32,
+        filter_size=3,
+        act="tanh",
+        pool_type="sum")
+
+    concat_embed = layers.concat(
+        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
+
+    # FIXME(dzh) : need tanh operator
+    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return mov_combined_features
+
+
+def model():
+    usr_combined_features = get_usr_combined_features()
+    mov_combined_features = get_mov_combined_features()
+
+    # need cos sim
+    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+
+    label = layers.data(name='score', shape=[1], dtype='float32')
+
+    square_cost = layers.square_error_cost(input=inference, label=label)
+
+    avg_cost = layers.mean(x=square_cost)
+
+    return avg_cost
+
+
+def main():
+    cost = model()
+    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
+    opts = sgd_optimizer.minimize(cost)
+
+    if USE_GPU:
+        place = core.GPUPlace(0)
+    else:
+        place = core.CPUPlace()
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.movielens.train(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+
+    feeding = {
+        'user_id': 0,
+        'gender_id': 1,
+        'age_id': 2,
+        'job_id': 3,
+        'movie_id': 4,
+        'category_id': 5,
+        'movie_title': 6,
+        'score': 7
+    }
+
+    def func_feed(feeding, data):
+        feed_tensors = {}
+        for (key, idx) in feeding.iteritems():
+            tensor = core.LoDTensor()
+            if key != "category_id" and key != "movie_title":
+                if key == "score":
+                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
+                        "float32")
+                else:
+                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
+                        "int64")
+            else:
+                numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
+                                 data)
+                lod_info = [len(item) for item in numpy_data]
+                offset = 0
+                lod = [offset]
+                for item in lod_info:
+                    offset += item
+                    lod.append(offset)
+                numpy_data = np.concatenate(numpy_data, axis=0)
+                tensor.set_lod([lod])
+
+            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
+            tensor.set(numpy_data, place)
+            feed_tensors[key] = tensor
+        return feed_tensors
+
+    PASS_NUM = 100
+    for pass_id in range(PASS_NUM):
+        for data in train_reader():
+            outs = exe.run(framework.default_main_program(),
+                           feed=func_feed(feeding, data),
+                           fetch_list=[cost])
+            out = np.array(outs[0])
+            if out[0] < 6.0:
+                # if avg cost less than 6.0, we think our code is good.
+                exit(0)
+
+
+main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
new file mode 100644
index 0000000000..f103358edc
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
@@ -0,0 +1,87 @@
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+                    hid_dim=32):
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, accuracy.metrics[0]
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost, accuracy, acc_out = convolution_net(
+        data, label, input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in xrange(PASS_NUM):
+        accuracy.reset(exe)
+        for data in train_data():
+            cost_val, acc_val = exe.run(fluid.default_main_program(),
+                                        feed=feeder.feed(data),
+                                        fetch_list=[cost, acc_out])
+            pass_acc = accuracy.eval(exe)
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                  " pass_acc=" + str(pass_acc))
+            if cost_val < 1.0 and pass_acc > 0.8:
+                exit(0)
+    exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
new file mode 100644
index 0000000000..cd28f04b85
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
@@ -0,0 +1,99 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def stacked_lstm_net(data,
+                     label,
+                     input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    assert stacked_num % 2 == 1
+
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    # add bias attr
+
+    # TODO(qijun) linear act
+    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+    inputs = [fc1, lstm1]
+
+    for i in range(2, stacked_num + 1):
+        fc = fluid.layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = fluid.layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        inputs = [fc, lstm]
+
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+
+    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+                                 size=class_dim,
+                                 act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, accuracy.metrics[0]
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    print "load word dict successfully"
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost, accuracy, acc_out = stacked_lstm_net(
+        data, label, input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in xrange(PASS_NUM):
+        accuracy.reset(exe)
+        for data in train_data():
+            cost_val, acc_val = exe.run(fluid.default_main_program(),
+                                        feed=feeder.feed(data),
+                                        fetch_list=[cost, acc_out])
+            pass_acc = accuracy.eval(exe)
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                  " pass_acc=" + str(pass_acc))
+            if cost_val < 1.0 and acc_val > 0.8:
+                exit(0)
+    exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
new file mode 100644
index 0000000000..80f8599679
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -0,0 +1,114 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
+    data = fluid.layers.data(
+        name="words",
+        shape=[seq_len * batch_size, 1],
+        append_batch_size=False,
+        dtype="int64",
+        lod_level=1)
+    label = fluid.layers.data(
+        name="label",
+        shape=[batch_size, 1],
+        append_batch_size=False,
+        dtype="int64")
+
+    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
+    emb = fluid.layers.transpose(x=emb, axis=[1, 0, 2])
+
+    c_pre_init = fluid.layers.fill_constant(
+        dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
+    c_pre_init.stop_gradient = False
+    layer_1_out = fluid.layers.lstm(
+        emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
+    layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2])
+
+    prediction = fluid.layers.fc(input=layer_1_out,
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+
+    return avg_cost, acc
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def chop_data(data, chop_len=80, batch_size=50):
+    data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len]
+
+    return data[:batch_size]
+
+
+def prepare_feed_data(data, place):
+    tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
+
+    label = np.array(map(lambda x: x[1], data)).astype("int64")
+    label = label.reshape([len(label), 1])
+    tensor_label = fluid.LoDTensor()
+    tensor_label.set(label, place)
+
+    return tensor_words, tensor_label
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    print "load word dict successfully"
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in xrange(PASS_NUM):
+        for data in train_data():
+            chopped_data = chop_data(data)
+            tensor_words, tensor_label = prepare_feed_data(chopped_data, place)
+
+            outs = exe.run(fluid.default_main_program(),
+                           feed={"words": tensor_words,
+                                 "label": tensor_label},
+                           fetch_list=[cost, acc])
+            cost_val = np.array(outs[0])
+            acc_val = np.array(outs[1])
+
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+            if acc_val > 0.7:
+                exit(0)
+    exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
new file mode 100644
index 0000000000..8b928ff9ee
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -0,0 +1,73 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+PASS_NUM = 100
+EMBED_SIZE = 32
+HIDDEN_SIZE = 256
+N = 5
+BATCH_SIZE = 32
+IS_SPARSE = True
+
+word_dict = paddle.dataset.imikolov.build_dict()
+dict_size = len(word_dict)
+
+first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
+next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+
+embed_first = fluid.layers.embedding(
+    input=first_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+embed_second = fluid.layers.embedding(
+    input=second_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+embed_third = fluid.layers.embedding(
+    input=third_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+embed_forth = fluid.layers.embedding(
+    input=forth_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+
+concat_embed = fluid.layers.concat(
+    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
+predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
+avg_cost = fluid.layers.mean(x=cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+    paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(
+    feed_list=[first_word, second_word, third_word, forth_word, next_word],
+    place=place)
+
+exe.run(fluid.default_startup_program())
+
+for pass_id in range(PASS_NUM):
+    for data in train_reader():
+        avg_cost_np = exe.run(fluid.default_main_program(),
+                              feed=feeder.feed(data),
+                              fetch_list=[avg_cost])
+        if avg_cost_np[0] < 5.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/demo/fc_gan.py b/python/paddle/v2/fluid/tests/demo/fc_gan.py
new file mode 100644
index 0000000000..cae959593e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/demo/fc_gan.py
@@ -0,0 +1,157 @@
+import errno
+import math
+import os
+
+import matplotlib
+import numpy
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+
+NOISE_SIZE = 100
+NUM_PASS = 1000
+NUM_REAL_IMGS_IN_BATCH = 121
+NUM_TRAIN_TIMES_OF_DG = 3
+LEARNING_RATE = 2e-5
+
+
+def D(x):
+    hidden = fluid.layers.fc(input=x,
+                             size=200,
+                             act='relu',
+                             param_attr='D.w1',
+                             bias_attr='D.b1')
+    logits = fluid.layers.fc(input=hidden,
+                             size=1,
+                             act=None,
+                             param_attr='D.w2',
+                             bias_attr='D.b2')
+    return logits
+
+
+def G(x):
+    hidden = fluid.layers.fc(input=x,
+                             size=200,
+                             act='relu',
+                             param_attr='G.w1',
+                             bias_attr='G.b1')
+    img = fluid.layers.fc(input=hidden,
+                          size=28 * 28,
+                          act='tanh',
+                          param_attr='G.w2',
+                          bias_attr='G.b2')
+    return img
+
+
+def plot(gen_data):
+    gen_data.resize(gen_data.shape[0], 28, 28)
+    n = int(math.ceil(math.sqrt(gen_data.shape[0])))
+    fig = plt.figure(figsize=(n, n))
+    gs = gridspec.GridSpec(n, n)
+    gs.update(wspace=0.05, hspace=0.05)
+
+    for i, sample in enumerate(gen_data):
+        ax = plt.subplot(gs[i])
+        plt.axis('off')
+        ax.set_xticklabels([])
+        ax.set_yticklabels([])
+        ax.set_aspect('equal')
+        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
+
+    return fig
+
+
+def main():
+    try:
+        os.makedirs("./out")
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
+    startup_program = fluid.Program()
+    d_program = fluid.Program()
+    dg_program = fluid.Program()
+
+    with fluid.program_guard(d_program, startup_program):
+        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
+        d_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=D(img),
+            label=fluid.layers.data(
+                name='label', shape=[1], dtype='float32'))
+        d_loss = fluid.layers.mean(x=d_loss)
+
+    with fluid.program_guard(dg_program, startup_program):
+        noise = fluid.layers.data(
+            name='noise', shape=[NOISE_SIZE], dtype='float32')
+        g_img = G(x=noise)
+        g_program = dg_program.clone()
+        dg_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=D(g_img),
+            label=fluid.layers.fill_constant_batch_size_like(
+                input=noise, dtype='float32', shape=[-1, 1], value=1.0))
+        dg_loss = fluid.layers.mean(x=dg_loss)
+
+    opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE)
+
+    opt.minimize(loss=d_loss, startup_program=startup_program)
+    opt.minimize(
+        loss=dg_loss,
+        startup_program=startup_program,
+        parameter_list=[
+            p.name for p in g_program.global_block().all_parameters()
+        ])
+    exe = fluid.Executor(fluid.CPUPlace())
+    exe.run(startup_program)
+
+    num_true = NUM_REAL_IMGS_IN_BATCH
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=60000),
+        batch_size=num_true)
+
+    for pass_id in range(NUM_PASS):
+        for batch_id, data in enumerate(train_reader()):
+            num_true = len(data)
+            n = numpy.random.uniform(
+                low=-1.0, high=1.0,
+                size=[num_true * NOISE_SIZE]).astype('float32').reshape(
+                    [num_true, NOISE_SIZE])
+            generated_img = exe.run(g_program,
+                                    feed={'noise': n},
+                                    fetch_list={g_img})[0]
+            real_data = numpy.array(map(lambda x: x[0], data)).astype('float32')
+            real_data = real_data.reshape(num_true, 784)
+            total_data = numpy.concatenate([real_data, generated_img])
+            total_label = numpy.concatenate([
+                numpy.ones(
+                    shape=[real_data.shape[0], 1], dtype='float32'),
+                numpy.zeros(
+                    shape=[real_data.shape[0], 1], dtype='float32')
+            ])
+            d_loss_np = exe.run(d_program,
+                                feed={'img': total_data,
+                                      'label': total_label},
+                                fetch_list={d_loss})[0]
+            for _ in xrange(NUM_TRAIN_TIMES_OF_DG):
+                n = numpy.random.uniform(
+                    low=-1.0, high=1.0,
+                    size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape(
+                        [2 * num_true, NOISE_SIZE, 1, 1])
+                dg_loss_np = exe.run(dg_program,
+                                     feed={'noise': n},
+                                     fetch_list={dg_loss})[0]
+            print("Pass ID={0}, Batch ID={1}, D-Loss={2}, DG-Loss={3}".format(
+                pass_id, batch_id, d_loss_np, dg_loss_np))
+        # generate image each batch
+        fig = plot(generated_img)
+        plt.savefig(
+            'out/{0}.png'.format(str(pass_id).zfill(3)), bbox_inches='tight')
+        plt.close(fig)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
similarity index 93%
rename from python/paddle/v2/framework/tests/op_test.py
rename to python/paddle/v2/fluid/tests/op_test.py
index 2e6710b5fc..e83c4a0622 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -2,12 +2,12 @@ import unittest
 import numpy as np
 import random
 import itertools
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import collections
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.op import Operator
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import Program, OpProtoHolder
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import Program, OpProtoHolder
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
@@ -215,7 +215,11 @@ class OpTest(unittest.TestCase):
             if isinstance(input_vars[var_name], list):
                 for name, np_value in self.inputs[var_name]:
                     tensor = core.LoDTensor()
-                    tensor.set(np_value, place)
+                    if isinstance(np_value, tuple):
+                        tensor.set(np_value[0], place)
+                        tensor.set_lod(np_value[1])
+                    else:
+                        tensor.set(np_value, place)
                     feed_map[name] = tensor
             else:
                 tensor = core.LoDTensor()
@@ -236,7 +240,6 @@ class OpTest(unittest.TestCase):
 
         inputs = append_input_output(block, op_proto, self.inputs, True)
         outputs = append_input_output(block, op_proto, self.outputs, False)
-
         op = block.append_op(
             type=self.op_type,
             inputs=inputs,
@@ -258,7 +261,10 @@ class OpTest(unittest.TestCase):
         feed_map = self.feed_var(inputs, place)
 
         exe = Executor(place)
-        outs = exe.run(program, feed=feed_map, fetch_list=fetch_list)
+        outs = exe.run(program,
+                       feed=feed_map,
+                       fetch_list=fetch_list,
+                       return_numpy=False)
 
         for out_name, out_dup in Operator.get_op_outputs(self.op_type):
             if out_name not in self.outputs:
@@ -397,9 +403,11 @@ class OpTest(unittest.TestCase):
                 if not isinstance(item[0], basestring):
                     item = [[param_name] + list(item)]
                 if len(item) == 2:
-                    # only set var name and value, set lod to None
-                    var[i] = list(item) + [None]
-
+                    if isinstance(item[1], tuple):
+                        var[i] = [item[0], item[1][0], item[1][1]]
+                    else:
+                        # only set var name and value, set lod to None
+                        var[i] = list(item) + [None]
             var_descs = [(block.create_var(
                 name=name, shape=each.shape, dtype=each.dtype), each, lod)
                          for name, each, lod in var]
@@ -453,7 +461,7 @@ class OpTest(unittest.TestCase):
         mean_inputs = map(block.var, output_names)
 
         if len(mean_inputs) == 1:
-            loss = block.create_var(dtype=mean_inputs[0].data_type, shape=[1])
+            loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
             op = block.append_op(
                 inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
             op.desc.infer_var_type(block.desc)
@@ -461,8 +469,7 @@ class OpTest(unittest.TestCase):
         else:
             avg_sum = []
             for cur_loss in mean_inputs:
-                cur_avg_loss = block.create_var(
-                    dtype=cur_loss.data_type, shape=[1])
+                cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
                 op = block.append_op(
                     inputs={"X": [cur_loss]},
                     outputs={"Out": [cur_avg_loss]},
@@ -471,13 +478,13 @@ class OpTest(unittest.TestCase):
                 op.desc.infer_shape(block.desc)
                 avg_sum.append(cur_avg_loss)
 
-            loss_sum = block.create_var(dtype=avg_sum[0].data_type, shape=[1])
+            loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
             op_sum = block.append_op(
                 inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
             op_sum.desc.infer_var_type(block.desc)
             op_sum.desc.infer_shape(block.desc)
 
-            loss = block.create_var(dtype=loss_sum.data_type, shape=[1])
+            loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
             op_loss = block.append_op(
                 inputs={"X": loss_sum},
                 outputs={"Out": loss},
@@ -496,5 +503,6 @@ class OpTest(unittest.TestCase):
 
         fetch_list = [g for p, g in param_grad_list]
         executor = Executor(place)
-        result = executor.run(prog, feed_dict, fetch_list)
-        return map(np.array, result)
+        return map(
+            np.array,
+            executor.run(prog, feed_dict, fetch_list, return_numpy=False))
diff --git a/python/paddle/v2/framework/tests/test_accuracy_op.py b/python/paddle/v2/fluid/tests/test_accuracy_op.py
similarity index 86%
rename from python/paddle/v2/framework/tests/test_accuracy_op.py
rename to python/paddle/v2/fluid/tests/test_accuracy_op.py
index 6536c297e8..6f72918b71 100644
--- a/python/paddle/v2/framework/tests/test_accuracy_op.py
+++ b/python/paddle/v2/fluid/tests/test_accuracy_op.py
@@ -18,7 +18,9 @@ class TestAccuracyOp(OpTest):
                     num_correct += 1
                     break
         self.outputs = {
-            'Accuracy': np.array([num_correct / float(n)]).astype("float32")
+            'Accuracy': np.array([num_correct / float(n)]).astype("float32"),
+            'Correct': np.array([num_correct]).astype("int32"),
+            'Total': np.array([n]).astype("int32")
         }
 
     def test_check_output(self):
diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py
similarity index 87%
rename from python/paddle/v2/framework/tests/test_activation_op.py
rename to python/paddle/v2/fluid/tests/test_activation_op.py
index 7649e60a38..b052374dc7 100644
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/fluid/tests/test_activation_op.py
@@ -1,6 +1,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+from scipy.special import expit
 
 
 class TestExp(OpTest):
@@ -152,6 +153,49 @@ class TestAbs(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.007)
 
 
+class TestCeil(OpTest):
+    def setUp(self):
+        self.op_type = "ceil"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': x}
+        self.outputs = {'Y': np.ceil(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
+class TestFloor(OpTest):
+    def setUp(self):
+        self.op_type = "floor"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': x}
+        # numpy floor need +1
+        self.outputs = {'Y': np.floor(self.inputs['X']) + 1.0}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
+class TestRound(OpTest):
+    def setUp(self):
+        self.op_type = "round"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': x}
+        self.outputs = {'Y': np.round(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
 class TestRelu(OpTest):
     def setUp(self):
         self.op_type = "relu"
@@ -412,5 +456,20 @@ class TestHardSigmoid(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.002)
 
 
+class TestSwish(OpTest):
+    def setUp(self):
+        self.op_type = "swish"
+        X = np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        self.inputs = {'X': X}
+        self.attrs = {'beta': 2.3}
+        self.outputs = {'Y': X * expit(self.attrs['beta'] * X)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_adadelta_op.py b/python/paddle/v2/fluid/tests/test_adadelta_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adadelta_op.py
rename to python/paddle/v2/fluid/tests/test_adadelta_op.py
diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py
new file mode 100644
index 0000000000..903e84c328
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py
@@ -0,0 +1,177 @@
+import unittest
+import numpy as np
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+from op_test import OpTest
+import math
+
+
+class TestAdagradOp1(OpTest):
+    ''' Test Adagrad operator with explicit attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        epsilon = 1e-8
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon}
+
+        moment_out = moment + grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdagradOp2(OpTest):
+    ''' Test Adagrad operator with default attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        epsilon = 1e-6
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon}
+
+        moment_out = moment + grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSparseAdagradOp(unittest.TestCase):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Grad Variable   
+        height = 10
+        rows = [0, 4, 7, 4]
+        row_numel = 12
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array, place)
+
+        # create and initialize Param Variable
+        param = scope.var('Param').get_tensor()
+        param_array = np.full((height, row_numel), 5.0).astype("float32")
+        param.set(param_array, place)
+
+        # create and initialize LeraningRate Variable
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.full((1), 2.0).astype("float32")
+        lr.set(lr_array, place)
+
+        # create and initialize moment Variable
+        moment = scope.var('Moment').get_tensor()
+        moment_np_array = np.full((height, row_numel), 2.0).astype("float32")
+        moment.set(moment_np_array, place)
+
+        # create and run sgd operator
+        adagrad_op = Operator(
+            "adagrad",
+            Param='Param',
+            Grad='Grad',
+            ParamOut='Param',
+            Moment='Moment',
+            MomentOut='Moment',
+            LearningRate='LearningRate',
+            epsilon=2.0)
+
+        ctx = core.DeviceContext.create(place)
+        adagrad_op.run(scope, ctx)
+
+        # get and compare moment result
+        moment_result_array = np.array(moment)
+
+        self.assertAlmostEqual(6.0, moment_result_array[rows[0], 0])
+        self.assertAlmostEqual(3.0, moment_result_array[rows[0], 2])
+        self.assertAlmostEqual(2.0, moment_result_array[1, 0])
+        # 2.0 + (1.0 + 1.0)^2
+        self.assertAlmostEqual(6.0, moment_result_array[rows[1], 10])
+        self.assertAlmostEqual(6.0, moment_result_array[rows[3], 4])
+
+        self.assertAlmostEqual(2.0, moment_result_array[5, 8])
+        self.assertAlmostEqual(3.0, moment_result_array[rows[2], 1])
+        self.assertAlmostEqual(18.0, moment_result_array[rows[2], 8])
+
+        # get and compare param result
+        result_array = np.array(param)
+
+        def get_out(param, lr, grad, m, epsilon):
+            return param - lr * grad / (math.sqrt(m) + epsilon)
+
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 2.0, 6.0, 2.0),
+            result_array[rows[0], 0],
+            places=5)
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 1.0, 3.0, 2.0),
+            result_array[rows[0], 2],
+            places=5)
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[1, 0], places=5)
+
+        # grad_merge = 1.0 + 1.0
+        # m = 6.0
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 2.0, 6.0, 2.0),
+            result_array[rows[1], 10],
+            places=5)
+
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[5, 8], places=5)
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 1.0, 3.0, 2.0),
+            result_array[rows[2], 1],
+            places=5)
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 4.0, 18.0, 2.0),
+            result_array[rows[2], 8],
+            places=5)
+
+    def test_sparse_adagrad(self):
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.GPUPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adam_op.py
rename to python/paddle/v2/fluid/tests/test_adam_op.py
diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/fluid/tests/test_adamax_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adamax_op.py
rename to python/paddle/v2/fluid/tests/test_adamax_op.py
diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
new file mode 100644
index 0000000000..f6120aedec
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -0,0 +1,88 @@
+import unittest
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.framework import default_main_program
+import numpy
+
+
+class TestArrayReadWrite(unittest.TestCase):
+    def test_read_write(self):
+        x = [
+            layers.data(
+                name='x0', shape=[100]), layers.data(
+                    name='x1', shape=[100]), layers.data(
+                        name='x2', shape=[100])
+        ]
+
+        for each_x in x:
+            each_x.stop_gradient = False
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = False
+        arr = layers.array_write(x=x[0], i=i)
+        i = layers.increment(x=i)
+        arr = layers.array_write(x=x[1], i=i, array=arr)
+        i = layers.increment(x=i)
+        arr = layers.array_write(x=x[2], i=i, array=arr)
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = False
+        a0 = layers.array_read(array=arr, i=i)
+        i = layers.increment(x=i)
+        a1 = layers.array_read(array=arr, i=i)
+        i = layers.increment(x=i)
+        a2 = layers.array_read(array=arr, i=i)
+
+        mean_a0 = layers.mean(x=a0)
+        mean_a1 = layers.mean(x=a1)
+        mean_a2 = layers.mean(x=a2)
+
+        a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])
+
+        mean_x0 = layers.mean(x=x[0])
+        mean_x1 = layers.mean(x=x[1])
+        mean_x2 = layers.mean(x=x[2])
+
+        x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])
+
+        scope = core.Scope()
+        cpu = core.CPUPlace()
+
+        exe = Executor(cpu)
+
+        tensor = numpy.random.random(size=(100, 100)).astype('float32')
+
+        outs = exe.run(feed={'x0': tensor,
+                             'x1': tensor,
+                             'x2': tensor},
+                       fetch_list=[a_sum, x_sum],
+                       scope=scope)
+        self.assertEqual(outs[0], outs[1])
+
+        total_sum = layers.sums(input=[a_sum, x_sum])
+        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
+
+        append_backward_ops(total_sum_scaled)
+
+        g_vars = map(default_main_program().global_block().var,
+                     [each_x.name + "@GRAD" for each_x in x])
+        g_out = [
+            item.sum()
+            for item in exe.run(
+                feed={'x0': tensor,
+                      'x1': tensor,
+                      'x2': tensor},
+                fetch_list=g_vars)
+        ]
+        g_out_sum = numpy.array(g_out).sum()
+
+        # since our final gradient is 1 and the neural network are all linear
+        # with mean_op.
+        # the input gradient should also be 1
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_assign_op.py b/python/paddle/v2/fluid/tests/test_assign_op.py
new file mode 100644
index 0000000000..1b0c145f1a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_assign_op.py
@@ -0,0 +1,21 @@
+import op_test
+import numpy
+import unittest
+
+
+class TestAssignOp(op_test.OpTest):
+    def setUp(self):
+        self.op_type = "assign"
+        x = numpy.random.random(size=(100, 10))
+        self.inputs = {'X': x}
+        self.outputs = {'Out': x}
+
+    def test_forward(self):
+        self.check_output()
+
+    def test_backward(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_auc_op.py b/python/paddle/v2/fluid/tests/test_auc_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_auc_op.py
rename to python/paddle/v2/fluid/tests/test_auc_op.py
diff --git a/python/paddle/v2/framework/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
similarity index 86%
rename from python/paddle/v2/framework/tests/test_batch_norm_op.py
rename to python/paddle/v2/fluid/tests/test_batch_norm_op.py
index dee339f43c..e766a68c0e 100644
--- a/python/paddle/v2/framework/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -1,8 +1,8 @@
 import unittest
 import numpy as np
 from op_test import OpTest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 
 
 def grad_var_name(var_name):
@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set):
 
 
 def _reference_training(x, scale, offset, epsilon, data_format):
+    x_shape = x.shape
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+
     if data_format == "NCHW":
         n, c, h, w = x.shape
         x_square = x * x
@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         offset_tile = np.reshape(offset, (1, c, 1, 1))
         offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
         y = normalized * scale_tile + offset_tile
+        if len(x_shape) == 2:
+            y = np.reshape(y, (y.shape[0], y.shape[1]))
         return y, mean, var
     elif data_format == "NHWC":
         x_square = x * x
@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         mean = x_sum / element_count
         var = x_square_sum / element_count - mean * mean
         normalized = (x - mean) / np.sqrt(var + epsilon)
-        return (normalized * scale + offset), mean, var
+        y = normalized * scale + offset
+        if len(x_shape) == 2:
+            y = np.reshape(y, x_shape)
+        return y, mean, var
     else:
         raise ValueError("Unknown data order.")
 
@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
     #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
 
     # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
+    x_shape = x.shape
+
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
+
     if data_format == "NCHW":
         x = np.transpose(x, (0, 2, 3, 1))
         grad_y = np.transpose(grad_y, (0, 2, 3, 1))
@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
         grad_x = np.transpose(grad_x, (0, 3, 1, 2))
         x = np.transpose(x, (0, 3, 1, 2))
         grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+
+    if len(x_shape) == 2:
+        grad_x = np.reshape(grad_x, x_shape)
     return grad_x, grad_scale, grad_offset
 
 
@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest):
         momentum = 0.9
 
         # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 2
+        n, h, w, c = 2, 3, 4, 5
         x_shape = [n, h, w, c]
         scale_shape = [c]
 
@@ -184,20 +211,23 @@ class TestBatchNormOp(OpTest):
         print 'python: NHWC, NCHW, backward checking passed'
 
     def test_forward_backward(self):
-        def test_with_place(place, tensor_format):
+        def test_with_place(place, tensor_format, shape):
             # attr
             epsilon = 0.00001
             momentum = 0.9
 
-            # N, H, W, C: 12, 3, 4, 2
-            n, h, w, c = 2, 3, 4, 2
-
-            if data_format == "NHWC":
-                x_shape = [n, h, w, c]
-            elif data_format == "NCHW":
-                x_shape = [n, c, h, w]
+            if len(shape) == 2:
+                x_shape = shape
+                c = shape[1]
             else:
-                raise ValueError("Unknown data type.")
+                # n, h, w, c = 2, 3, 4, 2
+                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
+                if data_format == "NHWC":
+                    x_shape = [n, h, w, c]
+                elif data_format == "NCHW":
+                    x_shape = [n, c, h, w]
+                else:
+                    raise ValueError("Unknown data type.")
             scale_shape = [c]
 
             x_val = np.random.random_sample(x_shape).astype(np.float32)
@@ -219,7 +249,10 @@ class TestBatchNormOp(OpTest):
             #  for gradient test
             # y_grad = np.ones(x_shape).astype(np.float32)
             y_grad = np.zeros(x_shape).astype(np.float32)
-            y_grad[0, 0, 0, 0] = 1.
+            if len(y_grad.shape) == 2:
+                y_grad[0, 0] = 1.
+            else:
+                y_grad[0, 0, 0, 0] = 1.
             # y_grad = np.random.random_sample(x_shape).astype(np.float32)
             x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
                 x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest):
             places.append(core.GPUPlace(0))
         for place in places:
             for data_format in ["NCHW", "NHWC"]:
-                test_with_place(place, data_format)
+                test_with_place(place, data_format, [2, 3, 4, 5])
+                test_with_place(place, data_format, [2, 3])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
new file mode 100644
index 0000000000..5fad7d8cce
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
@@ -0,0 +1,75 @@
+import unittest
+
+import numpy as np
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+
+
+class TestBeamSearchDecodeOp(unittest.TestCase):
+    def setUp(self):
+        self.scope = core.Scope()
+        self.cpu_place = core.CPUPlace()
+
+    def append_lod_tensor(self, tensor_array, lod, data):
+        lod_tensor = core.LoDTensor()
+        lod_tensor.set_lod(lod)
+        lod_tensor.set(data, self.cpu_place)
+        tensor_array.append(lod_tensor)
+
+    def test_get_set(self):
+        ids = self.scope.var("ids").get_lod_tensor_array()
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="int64"))
+
+        scores = self.scope.var("scores").get_lod_tensor_array()
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="float64"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="float64"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="float64"))
+
+        sentence_ids = self.scope.var("sentence_ids").get_tensor()
+        sentence_scores = self.scope.var("sentence_scores").get_tensor()
+
+        beam_search_decode_op = Operator(
+            "beam_search_decode",
+            # inputs
+            Ids="ids",
+            Scores="scores",
+            # outputs
+            SentenceIds="sentence_ids",
+            SentenceScores="sentence_scores")
+
+        ctx = core.DeviceContext.create(self.cpu_place)
+        beam_search_decode_op.run(self.scope, ctx)
+
+        expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
+        self.assertEqual(sentence_ids.lod(), expected_lod)
+        self.assertEqual(sentence_scores.lod(), expected_lod)
+
+        expected_data = np.array(
+            [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64")
+        self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
+        self.assertTrue(
+            np.array_equal(np.array(sentence_scores), expected_data))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_op.py b/python/paddle/v2/fluid/tests/test_beam_search_op.py
new file mode 100644
index 0000000000..cc7c09bb59
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py
@@ -0,0 +1,65 @@
+import logging
+from paddle.v2.fluid.op import Operator, DynamicRecurrentOp
+import paddle.v2.fluid.core as core
+import unittest
+import numpy as np
+
+
+def create_tensor(scope, name, np_data):
+    tensor = scope.var(name).get_tensor()
+    tensor.set(np_data, core.CPUPlace())
+    return tensor
+
+
+class BeamSearchOpTester(unittest.TestCase):
+    def setUp(self):
+        self.scope = core.Scope()
+        self.ctx = core.DeviceContext.create(core.CPUPlace())
+        self._create_ids()
+        self._create_scores()
+        self._create_pre_ids()
+        self.scope.var('selected_ids')
+        self.scope.var('selected_scores')
+
+    def test_run(self):
+        op = Operator(
+            'beam_search',
+            pre_ids="pre_ids",
+            ids='ids',
+            scores='scores',
+            selected_ids='selected_ids',
+            selected_scores='selected_scores',
+            level=0,
+            beam_size=2,
+            end_id=0, )
+        op.run(self.scope, self.ctx)
+        selected_ids = self.scope.find_var("selected_ids").get_tensor()
+        print 'selected_ids', np.array(selected_ids)
+        print 'lod', selected_ids.lod()
+
+    def _create_pre_ids(self):
+        np_data = np.array([[1, 2, 3, 4]], dtype='int32')
+        tensor = create_tensor(self.scope, "pre_ids", np_data)
+
+    def _create_ids(self):
+        self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]]
+        np_data = np.array(
+            [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int32')
+        tensor = create_tensor(self.scope, "ids", np_data)
+        tensor.set_lod(self.lod)
+
+    def _create_scores(self):
+        np_data = np.array(
+            [
+                [0.5, 0.3, 0.2],
+                [0.6, 0.3, 0.1],
+                [0.9, 0.5, 0.1],
+                [0.7, 0.5, 0.1],
+            ],
+            dtype='float32')
+        tensor = create_tensor(self.scope, "scores", np_data)
+        tensor.set_lod(self.lod)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
new file mode 100644
index 0000000000..080ca43b82
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
@@ -0,0 +1,37 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestBilinearTensorProductOp(OpTest):
+    def setUp(self):
+        self.op_type = "bilinear_tensor_product"
+        batch_size = 6
+        size0 = 3
+        size1 = 4
+        size2 = 5
+        a = np.random.random((batch_size, size0)).astype("float32")
+        b = np.random.random((batch_size, size1)).astype("float32")
+        w = np.random.random((size2, size0, size1)).astype("float32")
+        bias = np.random.random((1, size2)).astype("float32")
+        output = np.zeros((batch_size, size2)).astype("float32")
+        for i in range(size2):
+            w_i = w[i, :, :]
+            output[:, i] = np.sum(np.matmul(a, w_i) * b, axis=1)
+        self.inputs = {
+            'X': a,
+            'Y': b,
+            'Weight': w,
+            'Bias': bias,
+        }
+        self.outputs = {'Out': output + bias}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y', 'Weight', 'Bias'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_cast_op.py b/python/paddle/v2/fluid/tests/test_cast_op.py
similarity index 77%
rename from python/paddle/v2/framework/tests/test_cast_op.py
rename to python/paddle/v2/fluid/tests/test_cast_op.py
index 52ee71a8a4..4e431bb88d 100644
--- a/python/paddle/v2/framework/tests/test_cast_op.py
+++ b/python/paddle/v2/fluid/tests/test_cast_op.py
@@ -1,7 +1,7 @@
 import op_test
 import unittest
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestCastOp(op_test.OpTest):
@@ -10,8 +10,8 @@ class TestCastOp(op_test.OpTest):
         self.inputs = {'X': ipt.astype('float32')}
         self.outputs = {'Out': ipt.astype('float64')}
         self.attrs = {
-            'in_data_type': int(core.DataType.FP32),
-            'out_data_type': int(core.DataType.FP64)
+            'in_dtype': int(core.DataType.FP32),
+            'out_dtype': int(core.DataType.FP64)
         }
         self.op_type = 'cast'
 
diff --git a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
new file mode 100644
index 0000000000..819e65a653
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
@@ -0,0 +1,179 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class Segment(object):
+    def __init__(self, chunk_type, start_idx, end_idx):
+        self.chunk_type = chunk_type
+        self.start_idx = start_idx
+        self.end_idx = end_idx
+
+    def __str__(self):
+        return '(Segment: %s, %s, %s)' % (self.chunk_type, self.start_idx,
+                                          self.end_idx)
+
+    __repr__ = __str__
+
+
+class TestChunkEvalOp(OpTest):
+    num_sequences = 5
+    batch_size = 50
+
+    def parse_scheme(self):
+        if self.scheme == 'IOB':
+            self.num_tag_types = 2
+        elif self.scheme == 'IOE':
+            self.num_tag_types = 2
+
+    def fill_with_chunks(self, data, chunks):
+        for chunk in chunks:
+            if self.scheme == 'IOB':
+                data[chunk.start_idx] = chunk.chunk_type * self.num_tag_types
+                data[chunk.start_idx + 1:
+                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                         self.num_tag_types - 1)
+                data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                    self.num_tag_types - 1
+                ) if chunk.start_idx < chunk.end_idx else data[chunk.start_idx]
+            elif self.scheme == 'IOE':
+                data[chunk.start_idx:
+                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types
+                data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                    self.num_tag_types - 1)
+
+    def rand_chunks(self, starts, num_chunks):
+        if num_chunks < 0:
+            num_chunks = np.random.randint(starts[-1])
+        chunks = []
+        # generate chunk beginnings
+        chunk_begins = sorted(
+            np.random.choice(
+                range(starts[-1]), num_chunks, replace=False))
+        seq_chunk_begins = []
+        begin_idx = 0
+        # divide chunks into sequences
+        for i in range(len(starts) - 1):
+            tmp_chunk_begins = []
+            while begin_idx < len(chunk_begins) and chunk_begins[
+                    begin_idx] < starts[i + 1]:
+                tmp_chunk_begins.append(chunk_begins[begin_idx])
+                begin_idx += 1
+            seq_chunk_begins.append(tmp_chunk_begins)
+        # generate chunk ends
+        chunk_ends = []
+        for i in range(len(seq_chunk_begins)):
+            for j in range(len(seq_chunk_begins[i])):
+                low = seq_chunk_begins[i][j]
+                high = seq_chunk_begins[i][j + 1] if j < len(seq_chunk_begins[
+                    i]) - 1 else starts[i + 1]
+                chunk_ends.append(np.random.randint(low, high))
+        # generate chunks
+        for chunk_pos in zip(chunk_begins, chunk_ends):
+            chunk_type = np.random.randint(self.num_chunk_types)
+            chunks.append(Segment(chunk_type, *chunk_pos))
+        return chunks
+
+    def gen_chunks(self, infer, label, starts):
+        chunks = self.rand_chunks(starts,
+                                  self.num_infer_chunks + self.num_label_chunks
+                                  - self.num_correct_chunks)
+        correct_chunks = np.random.choice(
+            range(len(chunks)), self.num_correct_chunks, replace=False)
+        infer_chunks = np.random.choice(
+            [x for x in range(len(chunks)) if x not in correct_chunks],
+            self.num_infer_chunks - self.num_correct_chunks,
+            replace=False)
+        infer_chunks = sorted(correct_chunks.tolist() + infer_chunks.tolist())
+        label_chunks = np.random.choice(
+            [x for x in range(len(chunks)) if x not in infer_chunks],
+            self.num_label_chunks - self.num_correct_chunks,
+            replace=False)
+        label_chunks = sorted(correct_chunks.tolist() + label_chunks.tolist())
+        self.fill_with_chunks(infer, [chunks[idx] for idx in infer_chunks])
+        self.fill_with_chunks(label, [chunks[idx] for idx in label_chunks])
+        # exclude types in excluded_chunk_types
+        if len(self.excluded_chunk_types) > 0:
+            for idx in correct_chunks:
+                if chunks[idx].chunk_type in self.excluded_chunk_types:
+                    self.num_correct_chunks -= 1
+            for idx in infer_chunks:
+                if chunks[idx].chunk_type in self.excluded_chunk_types:
+                    self.num_infer_chunks -= 1
+            for idx in label_chunks:
+                if chunks[idx].chunk_type in self.excluded_chunk_types:
+                    self.num_label_chunks -= 1
+        return self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks
+
+    def set_confs(self):
+        # Use the IOB scheme and labels with 2 chunk types
+        self.scheme = 'IOB'
+        self.num_chunk_types = 2
+        self.excluded_chunk_types = []
+        self.other_chunk_type = self.num_chunk_types
+        self.attrs = {
+            'num_chunk_types': self.num_chunk_types,
+            'chunk_scheme': self.scheme,
+            'excluded_chunk_types': self.excluded_chunk_types
+        }
+        self.parse_scheme()
+        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9
+
+    def set_data(self):
+        infer = np.zeros((self.batch_size, )).astype('int64')
+        infer.fill(self.num_chunk_types * self.num_tag_types)
+        label = np.copy(infer)
+        starts = np.random.choice(
+            range(1, self.batch_size), self.num_sequences - 1,
+            replace=False).tolist()
+        starts.extend([0, self.batch_size])
+        starts = sorted(starts)
+        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
+            infer, label, starts)
+        self.inputs = {
+            'Inference': (infer, [starts]),
+            'Label': (label, [starts])
+        }
+        precision = float(
+            self.num_correct_chunks
+        ) / self.num_infer_chunks if self.num_infer_chunks else 0
+        recall = float(self.num_correct_chunks
+                       ) / self.num_label_chunks if self.num_label_chunks else 0
+        f1 = float(2 * precision * recall) / (
+            precision + recall) if self.num_correct_chunks else 0
+        self.outputs = {
+            'Precision': np.asarray(
+                [precision], dtype='float32'),
+            'Recall': np.asarray(
+                [recall], dtype='float32'),
+            'F1-Score': np.asarray(
+                [f1], dtype='float32')
+        }
+
+    def setUp(self):
+        self.op_type = 'chunk_eval'
+        self.set_confs()
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestChunkEvalOpWithExclude(TestChunkEvalOp):
+    def set_confs(self):
+        # Use the IOE scheme and labels with 3 chunk types
+        self.scheme = 'IOE'
+        self.num_chunk_types = 3
+        self.excluded_chunk_types = [1]
+        self.other_chunk_type = self.num_chunk_types
+        self.attrs = {
+            'num_chunk_types': self.num_chunk_types,
+            'chunk_scheme': self.scheme,
+            'excluded_chunk_types': self.excluded_chunk_types
+        }
+        self.parse_scheme()
+        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 15, 18, 20
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py b/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
new file mode 100644
index 0000000000..02f6108a3a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
@@ -0,0 +1,50 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestClipByNormOp(OpTest):
+    def setUp(self):
+        self.max_relative_error = 0.006
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        input[np.abs(input) < self.max_relative_error] = 0.5
+        self.op_type = "clip_by_norm"
+        self.inputs = {'X': input, }
+        self.attrs = {}
+        self.attrs['max_norm'] = self.max_norm
+        norm = np.sqrt(np.sum(np.square(input)))
+        if norm > self.max_norm:
+            output = self.max_norm * input / norm
+        else:
+            output = input
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1.0
+
+
+class TestCase1(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1e20
+
+
+class TestCase2(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (16, 16)
+        self.max_norm = 0.1
+
+
+class TestCase3(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max_norm = 1.0
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_clip_op.py b/python/paddle/v2/fluid/tests/test_clip_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_clip_op.py
rename to python/paddle/v2/fluid/tests/test_clip_op.py
diff --git a/python/paddle/v2/fluid/tests/test_compare_op.py b/python/paddle/v2/fluid/tests/test_compare_op.py
new file mode 100644
index 0000000000..5d0dfab6ff
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_compare_op.py
@@ -0,0 +1,32 @@
+import op_test
+import unittest
+import numpy
+
+
+def create_test_class(op_type, typename, callback):
+    class Cls(op_test.OpTest):
+        def setUp(self):
+            a = numpy.random.random(size=(10, 7)).astype(typename)
+            b = numpy.random.random(size=(10, 7)).astype(typename)
+            c = callback(a, b)
+            self.inputs = {'X': a, 'Y': b}
+            self.outputs = {'Out': c}
+            self.op_type = op_type
+
+        def test_output(self):
+            self.check_output()
+
+    cls_name = "{0}_{1}".format(op_type, typename)
+    Cls.__name__ = cls_name
+    globals()[cls_name] = Cls
+
+
+for _type_name in {'float32', 'float64', 'int32', 'int64'}:
+    create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
+    create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b)
+    create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b)
+    create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b)
+    create_test_class('equal', _type_name, lambda _a, _b: _a == _b)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_concat_op.py b/python/paddle/v2/fluid/tests/test_concat_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_concat_op.py
rename to python/paddle/v2/fluid/tests/test_concat_op.py
diff --git a/python/paddle/v2/framework/tests/test_cond_op.py b/python/paddle/v2/fluid/tests/test_cond_op.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_cond_op.py
rename to python/paddle/v2/fluid/tests/test_cond_op.py
index 09a3f5dc97..9d1df44b90 100644
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/fluid/tests/test_cond_op.py
@@ -1,8 +1,8 @@
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
-from paddle.v2.framework.op import Operator, CondOp
+from paddle.v2.fluid.op import Operator, CondOp
 
 
 class PySimpleCond(object):
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
new file mode 100644
index 0000000000..2b9d8f351a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -0,0 +1,39 @@
+import unittest
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import default_startup_program, default_main_program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+import numpy
+
+
+class ConditionalBlock(unittest.TestCase):
+    def test_forward(self):
+        data = layers.data(name='X', shape=[1], dtype='float32')
+        data.stop_gradient = False
+        cond = layers.ConditionalBlock(inputs=[data])
+        out = layers.create_tensor(dtype='float32')
+        with cond.block():
+            hidden = layers.fc(input=data, size=10)
+            layers.assign(hidden, out)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        exe.run(default_startup_program())
+
+        x = numpy.random.random(size=(10, 1)).astype('float32')
+
+        outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
+        print outs
+        loss = layers.mean(x=out)
+        append_backward_ops(loss=loss)
+        outs = exe.run(
+            feed={'X': x},
+            fetch_list=[
+                default_main_program().block(0).var(data.name + "@GRAD")
+            ])[0]
+        print outs
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
new file mode 100644
index 0000000000..e82e3ab0c9
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -0,0 +1,199 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv2d_forward_naive(input, filter, group, conv_param):
+    in_n, in_c, in_h, in_w = input.shape
+    out_c, f_c, f_h, f_w = filter.shape
+    assert f_c * group == in_c
+    assert np.mod(out_c, group) == 0
+    sub_out_c = out_c / group
+
+    stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
+        'dilation']
+    out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) / stride[0]
+    out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1]
+    out = np.zeros((in_n, out_c, out_h, out_w))
+
+    d_bolck_h = (dilation[0] * (f_h - 1) + 1)
+    d_bolck_w = (dilation[1] * (f_w - 1) + 1)
+
+    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
+                       mode='constant',
+                       constant_values=0)
+
+    filter_dilation = np.zeros((out_c, f_c, d_bolck_h, d_bolck_w))
+    filter_dilation[:, :, 0:d_bolck_h:dilation[0], 0:d_bolck_w:dilation[
+        1]] = filter
+
+    for i in range(out_h):
+        for j in range(out_w):
+            for g in range(group):
+                input_pad_masked = \
+                    input_pad[:, g * f_c:(g + 1) * f_c,
+                    i * stride[0]:i * stride[0] + d_bolck_h,
+                    j * stride[1]:j * stride[1] + d_bolck_w]
+
+                f_sub = filter_dilation[g * sub_out_c:(g + 1) *
+                                        sub_out_c, :, :, :]
+                for k in range(sub_out_c):
+                    out[:, g * sub_out_c + k, i, j] = \
+                        np.sum(input_pad_masked * f_sub[k, :, :, :],
+                               axis=(1, 2, 3))
+
+    return out
+
+
+class TestConv2dOp(OpTest):
+    def setUp(self):
+        self.init_op_type()
+        self.init_group()
+        self.init_dilation()
+        self.init_test_case()
+
+        conv2d_param = {
+            'stride': self.stride,
+            'pad': self.pad,
+            'dilation': self.dilations
+        }
+        input = np.random.random(self.input_size).astype("float32")
+        filter = np.random.random(self.filter_size).astype("float32")
+        output = conv2d_forward_naive(input, filter, self.groups,
+                                      conv2d_param).astype('float32')
+
+        self.inputs = {'Input': input, 'Filter': filter}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups,
+            'dilations': self.dilations
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(
+            ['Input'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        self.check_grad(
+            ['Filter'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv2d"
+
+
+class TestWithPad(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithStride(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithGroup(TestConv2dOp):
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWith1x1(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWithDilation(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [2, 2]
+
+    def init_group(self):
+        self.groups = 3
+
+
+#----------------Conv2dCudnn----------------
+class TestCudnn(TestConv2dOp):
+    def init_op_type(self):
+        self.op_type = "conv2d_cudnn"
+
+
+class TestCudnnWithPad(TestWithPad):
+    def init_op_type(self):
+        self.op_type = "conv2d_cudnn"
+
+
+class TestCudnnWithStride(TestWithStride):
+    def init_op_type(self):
+        self.op_type = "conv2d_cudnn"
+
+
+class TestCudnnWithGroup(TestWithGroup):
+    def init_op_type(self):
+        self.op_type = "conv2d_cudnn"
+
+
+class TestCudnnWith1x1(TestWith1x1):
+    def init_op_type(self):
+        self.op_type = "conv2d_cudnn"
+
+
+#  cudnn v5 does not support dilation conv.
+# class TestCudnnWithDilation(TestWithDilation):
+#     def init_op_type(self):
+#         self.op_type = "conv_cudnn"
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
similarity index 78%
rename from python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
rename to python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
index 999a0bdc62..d7b1f2f2a3 100644
--- a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
@@ -4,9 +4,7 @@ from op_test import OpTest
 
 
 def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
-    # [2, 3, 5, 5]
     in_n, in_c, in_h, in_w = input_.shape
-    # [3, 6, 3, 3]
     f_c, out_c, f_h, f_w = filter_.shape
     assert in_c == f_c
 
@@ -29,6 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
                     j1, j2 = j * stride[0], j * stride[0] + f_w
                     out[n, k, i1:i2, j1:j2] += tmp_out
 
+    out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
     return out
 
 
@@ -36,8 +35,6 @@ class TestConv2dTransposeOp(OpTest):
     def setUp(self):
         # init as conv transpose
         self.init_op_type()
-
-        # [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7]
         self.init_test_case()
 
         conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
@@ -55,39 +52,59 @@ class TestConv2dTransposeOp(OpTest):
         self.outputs = {'Output': output}
 
     def test_check_output(self):
-        print 'check output here for', self.op_type
         self.check_output()
 
-    def init_test_case(self):
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        f_c = self.input_size[1]
-        self.filter_size = [f_c, 6, 3, 3]
-
-    def init_op_type(self):
-        self.op_type = "conv2d_transpose"
-
     def test_check_grad_no_input(self):
         self.check_grad(
             ['Filter'],
             'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
             no_grad_set=set(['Input']))
 
     def test_check_grad_no_filter(self):
         self.check_grad(
             ['Input'],
             'Output',
-            max_relative_error=0.05,
+            max_relative_error=0.02,
             no_grad_set=set(['Filter']))
 
     def test_check_grad(self):
         self.check_grad(
-            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv2d_transpose"
+
+
+class TestWithPad(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
+class TestWithStride(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
 
 
+# ------------ test_cudnn ------------
 class TestCudnn(TestConv2dTransposeOp):
     def init_op_type(self):
         self.op_type = "conv2d_transpose_cudnn"
diff --git a/python/paddle/v2/fluid/tests/test_conv3d_op.py b/python/paddle/v2/fluid/tests/test_conv3d_op.py
new file mode 100644
index 0000000000..8593dff20b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conv3d_op.py
@@ -0,0 +1,199 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv3d_forward_naive(input, filter, group, conv_param):
+    in_n, in_c, in_d, in_h, in_w = input.shape
+    out_c, f_c, f_d, f_h, f_w = filter.shape
+    assert f_c * group == in_c
+    assert np.mod(out_c, group) == 0
+    sub_out_c = out_c / group
+
+    stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
+        'dilations']
+
+    out_d = 1 + (in_d + 2 * pad[0] - (dilation[0] * (f_d - 1) + 1)) / stride[0]
+    out_h = 1 + (in_h + 2 * pad[1] - (dilation[1] * (f_h - 1) + 1)) / stride[1]
+    out_w = 1 + (in_w + 2 * pad[2] - (dilation[2] * (f_w - 1) + 1)) / stride[2]
+
+    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
+
+    d_bolck_d = (dilation[0] * (f_d - 1) + 1)
+    d_bolck_h = (dilation[1] * (f_h - 1) + 1)
+    d_bolck_w = (dilation[2] * (f_w - 1) + 1)
+
+    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], ),
+                               (pad[2], )),
+                       mode='constant',
+                       constant_values=0)
+
+    filter_dilation = np.zeros((out_c, f_c, d_bolck_d, d_bolck_h, d_bolck_w))
+    filter_dilation[:, :, 0:d_bolck_d:dilation[0], 0:d_bolck_h:dilation[1], 0:
+                    d_bolck_w:dilation[2]] = filter
+
+    for d in range(out_d):
+        for i in range(out_h):
+            for j in range(out_w):
+                for g in range(group):
+                    input_pad_masked = \
+                        input_pad[:, g * f_c:(g + 1) * f_c,
+                        d * stride[0]:d * stride[0] + d_bolck_d,
+                        i * stride[1]:i * stride[1] + d_bolck_h,
+                        j * stride[2]:j * stride[2] + d_bolck_w]
+
+                    f_sub = filter_dilation[g * sub_out_c:(g + 1) *
+                                            sub_out_c, :, :, :, :]
+                    for k in range(sub_out_c):
+                        out[:, g * sub_out_c + k, d, i, j] = \
+                            np.sum(input_pad_masked * f_sub[k, :, :, :, :],
+                                   axis=(1, 2, 3, 4))
+
+    return out
+
+
+class TestConv3dOp(OpTest):
+    def setUp(self):
+        self.init_group()
+        self.init_op_type()
+        self.init_dilation()
+        self.init_test_case()
+
+        conv3d_param = {
+            'stride': self.stride,
+            'pad': self.pad,
+            'dilations': self.dilations
+        }
+        input = np.random.random(self.input_size).astype("float32")
+        filter = np.random.random(self.filter_size).astype("float32")
+        output = conv3d_forward_naive(input, filter, self.groups,
+                                      conv3d_param).astype("float32")
+
+        self.inputs = {'Input': input, 'Filter': filter}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups,
+            'dilations': self.dilations
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(
+            ['Input'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        self.check_grad(
+            ['Filter'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [1, 1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+class TestCase1(TestConv3dOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3, 3]
+
+
+class TestWithGroup1(TestConv3dOp):
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWithGroup2(TestCase1):
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWith1x1(TestConv3dOp):
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 1, 1, 1]
+
+    def init_dilation(self):
+        self.dilations = [1, 1, 1]
+
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWithDilation(TestConv3dOp):
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 6, 6, 6]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 2, 2, 2]
+
+    def init_dilation(self):
+        self.dilations = [2, 2, 2]
+
+    def init_group(self):
+        self.groups = 3
+
+
+class TestCudnn(TestConv3dOp):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+class TestWithGroup1Cudnn(TestWithGroup1):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+class TestWithGroup2Cudnn(TestWithGroup2):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+class TestWith1x1Cudnn(TestWith1x1):
+    def init_op_type(self):
+        self.op_type = "conv3d_cudnn"
+
+
+# FIXME(typhoonzero): find a way to determine if
+# using cudnn > 6 in python
+# class TestWithDilationCudnn(TestWithDilation):
+#     def init_op_type(self):
+#         self.op_type = "conv3d_cudnn"
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
new file mode 100644
index 0000000000..8fd34b87bf
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
@@ -0,0 +1,118 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
+    in_n, in_c, in_d, in_h, in_w = input_.shape
+    f_c, out_c, f_d, f_h, f_w = filter_.shape
+    assert in_c == f_c
+
+    stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad']
+    out_d = (in_d - 1) * stride[0] + f_d
+    out_h = (in_h - 1) * stride[1] + f_h
+    out_w = (in_w - 1) * stride[2] + f_w
+    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
+
+    for n in range(in_n):
+        for d in range(in_d):
+            for i in range(in_h):
+                for j in range(in_w):
+                    input_masked = input_[n, :, d, i, j]  # (c)
+                    input_masked = np.reshape(input_masked, (in_c, 1, 1, 1))
+                    input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
+
+                    for k in range(out_c):
+                        tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
+                                         axis=0)
+                        d1, d2 = d * stride[0], d * stride[0] + f_d
+                        i1, i2 = i * stride[1], i * stride[1] + f_h
+                        j1, j2 = j * stride[2], j * stride[2] + f_w
+                        out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
+
+    out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
+              pad[2]]
+    return out
+
+
+class TestConv3dTransposeOp(OpTest):
+    def setUp(self):
+        # init as conv transpose
+        self.init_op_type()
+        self.init_test_case()
+
+        conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
+        input_ = np.random.random(self.input_size).astype("float32")
+        filter_ = np.random.random(self.filter_size).astype("float32")
+        output = conv3dtranspose_forward_naive(
+            input_, filter_, conv3dtranspose_param).astype("float32")
+
+        self.inputs = {'Input': input_, 'Filter': filter_}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            # 'dilations': self.dilations
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(
+            ['Input'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        self.check_grad(
+            ['Filter'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose"
+
+
+class TestWithPad(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+class TestWithStride(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [2, 2, 2]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+# ------------ test_cudnn ------------
+class TestCudnn(TestConv3dTransposeOp):
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose_cudnn"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_conv_shift_op.py b/python/paddle/v2/fluid/tests/test_conv_shift_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv_shift_op.py
rename to python/paddle/v2/fluid/tests/test_conv_shift_op.py
diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/fluid/tests/test_cos_sim_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_cos_sim_op.py
rename to python/paddle/v2/fluid/tests/test_cos_sim_op.py
diff --git a/python/paddle/v2/fluid/tests/test_create_op_doc_string.py b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
new file mode 100644
index 0000000000..42b6f7a361
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
@@ -0,0 +1,11 @@
+import unittest
+import paddle.v2.fluid.layers as layers
+
+
+class TestDocString(unittest.TestCase):
+    def test_layer_doc_string(self):
+        print layers.dropout.__doc__
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_crf_decoding_op.py b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
similarity index 94%
rename from python/paddle/v2/framework/tests/test_crf_decoding_op.py
rename to python/paddle/v2/fluid/tests/test_crf_decoding_op.py
index ee2b996bf4..ab573da31d 100644
--- a/python/paddle/v2/framework/tests/test_crf_decoding_op.py
+++ b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
@@ -20,14 +20,14 @@ class CRFDecoding(object):
         self.w = transition_weights[2:, :]
 
         self.track = np.zeros(
-            (seq_start_positions[-1], self.tag_num), dtype="int32")
+            (seq_start_positions[-1], self.tag_num), dtype="int64")
         self.decoded_path = np.zeros(
-            (seq_start_positions[-1], 1), dtype="int32")
+            (seq_start_positions[-1], 1), dtype="int64")
 
     def _decode_one_sequence(self, decoded_path, x):
         seq_len, tag_num = x.shape
         alpha = np.zeros((seq_len, tag_num), dtype="float64")
-        track = np.zeros((seq_len, tag_num), dtype="int32")
+        track = np.zeros((seq_len, tag_num), dtype="int64")
 
         for i in range(tag_num):
             alpha[0, i] = self.a[i] + x[0, i]
@@ -125,10 +125,10 @@ class TestCRFDecodingOp2(OpTest):
             axis=0)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
         predicted_labels = np.ones(
-            (lod[-1][-1], 1), dtype="int32") * (TAG_NUM - 1)
-        expected_output = (labels == predicted_labels).astype("int32")
+            (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1)
+        expected_output = (labels == predicted_labels).astype("int64")
 
         self.inputs = {
             "Emission": (emission, lod),
diff --git a/python/paddle/v2/framework/tests/test_crop_op.py b/python/paddle/v2/fluid/tests/test_crop_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_crop_op.py
rename to python/paddle/v2/fluid/tests/test_crop_op.py
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_cross_entropy_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_cross_entropy_op.py
rename to python/paddle/v2/fluid/tests/test_cross_entropy_op.py
diff --git a/python/paddle/v2/fluid/tests/test_data_feeder.py b/python/paddle/v2/fluid/tests/test_data_feeder.py
new file mode 100644
index 0000000000..4549693203
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_data_feeder.py
@@ -0,0 +1,13 @@
+import paddle.v2.fluid as fluid
+
+
+def test_converter():
+    img = fluid.layers.data(name='image', shape=[1, 28, 28])
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
+    result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]])
+    print(result)
+
+
+if __name__ == '__main__':
+    test_converter()
diff --git a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py b/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
rename to python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
similarity index 94%
rename from python/paddle/v2/framework/tests/test_default_scope_funcs.py
rename to python/paddle/v2/fluid/tests/test_default_scope_funcs.py
index 09a9850d05..738e69529e 100644
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
@@ -1,4 +1,4 @@
-from paddle.v2.framework.default_scope_funcs import *
+from paddle.v2.fluid.default_scope_funcs import *
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py
similarity index 84%
rename from python/paddle/v2/framework/tests/test_dropout_op.py
rename to python/paddle/v2/fluid/tests/test_dropout_op.py
index b14a366fca..4f5ea836b4 100644
--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
@@ -7,7 +7,7 @@ class TestDropoutOp(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_training': True}
+        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
             'Mask': np.ones((32, 64)).astype('float32')
@@ -24,7 +24,7 @@ class TestDropoutOp2(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 1.0, 'is_training': True}
+        self.attrs = {'dropout_prob': 1.0, 'is_test': False}
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
             'Mask': np.zeros((32, 64)).astype('float32')
@@ -35,7 +35,7 @@ class TestDropoutOp3(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_training': True}
+        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
             'Mask': np.ones((32, 64, 2)).astype('float32')
@@ -46,7 +46,7 @@ class TestDropoutOp4(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.35, 'is_training': False}
+        self.attrs = {'dropout_prob': 0.35, 'is_test': True}
         self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
 
     def test_check_output(self):
@@ -57,7 +57,7 @@ class TestDropoutOp5(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.75, 'is_training': False}
+        self.attrs = {'dropout_prob': 0.75, 'is_test': True}
         self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
 
     def test_check_output(self):
diff --git a/python/paddle/v2/fluid/tests/test_dyn_rnn.py b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
new file mode 100644
index 0000000000..034266c26f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
@@ -0,0 +1,130 @@
+import paddle.v2.fluid as fluid
+import paddle.v2 as paddle
+import unittest
+import numpy
+
+
+class TestDynRNN(unittest.TestCase):
+    def setUp(self):
+        self.word_dict = paddle.dataset.imdb.word_dict()
+        self.BATCH_SIZE = 2
+        self.train_data = paddle.batch(
+            paddle.dataset.imdb.train(self.word_dict),
+            batch_size=self.BATCH_SIZE)
+
+    def test_plain_while_op(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+
+        with fluid.program_guard(main_program, startup_program):
+            sentence = fluid.layers.data(
+                name='word', shape=[1], dtype='int64', lod_level=1)
+            sent_emb = fluid.layers.embedding(
+                input=sentence, size=[len(self.word_dict), 32], dtype='float32')
+
+            label = fluid.layers.data(name='label', shape=[1], dtype='float32')
+
+            rank_table = fluid.layers.lod_rank_table(x=sent_emb)
+
+            sent_emb_array = fluid.layers.lod_tensor_to_array(
+                x=sent_emb, table=rank_table)
+
+            seq_len = fluid.layers.max_sequence_len(rank_table=rank_table)
+            i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
+            i.stop_gradient = False
+
+            boot_mem = fluid.layers.fill_constant_batch_size_like(
+                input=fluid.layers.array_read(
+                    array=sent_emb_array, i=i),
+                value=0,
+                shape=[-1, 100],
+                dtype='float32')
+            boot_mem.stop_gradient = False
+
+            mem_array = fluid.layers.array_write(x=boot_mem, i=i)
+
+            cond = fluid.layers.less_than(x=i, y=seq_len)
+            cond.stop_gradient = False
+            while_op = fluid.layers.While(cond=cond)
+            out = fluid.layers.create_array(dtype='float32')
+
+            with while_op.block():
+                mem = fluid.layers.array_read(array=mem_array, i=i)
+                ipt = fluid.layers.array_read(array=sent_emb_array, i=i)
+
+                mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table)
+
+                hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh')
+
+                fluid.layers.array_write(x=hidden, i=i, array=out)
+                fluid.layers.increment(x=i, in_place=True)
+                fluid.layers.array_write(x=hidden, i=i, array=mem_array)
+                fluid.layers.less_than(x=i, y=seq_len, cond=cond)
+
+            all_timesteps = fluid.layers.array_to_lod_tensor(
+                x=out, table=rank_table)
+            last = fluid.layers.sequence_pool(
+                input=all_timesteps, pool_type='last')
+            logits = fluid.layers.fc(input=last, size=1, act=None)
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                x=logits, label=label)
+            loss = fluid.layers.mean(x=loss)
+            sgd = fluid.optimizer.SGD(1e-4)
+            sgd.minimize(loss=loss)
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        exe.run(startup_program)
+        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
+
+        data = next(self.train_data())
+        val = exe.run(main_program, feed=feeder.feed(data),
+                      fetch_list=[loss])[0]
+        self.assertEqual((1, ), val.shape)
+        print(val)
+        self.assertFalse(numpy.isnan(val))
+
+    def test_train_dyn_rnn(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            sentence = fluid.layers.data(
+                name='word', shape=[1], dtype='int64', lod_level=1)
+            sent_emb = fluid.layers.embedding(
+                input=sentence, size=[len(self.word_dict), 32], dtype='float32')
+
+            rnn = fluid.layers.DynamicRNN()
+
+            with rnn.block():
+                in_ = rnn.step_input(sent_emb)
+                mem = rnn.memory(shape=[100], dtype='float32')
+                out_ = fluid.layers.fc(input=[in_, mem], size=100, act='tanh')
+                rnn.update_memory(mem, out_)
+                rnn.output(out_)
+
+            last = fluid.layers.sequence_pool(input=rnn(), pool_type='last')
+            logits = fluid.layers.fc(input=last, size=1, act=None)
+            label = fluid.layers.data(name='label', shape=[1], dtype='float32')
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                x=logits, label=label)
+            loss = fluid.layers.mean(x=loss)
+            sgd = fluid.optimizer.Adam(1e-3)
+            sgd.minimize(loss=loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        exe.run(startup_program)
+        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
+        data = next(self.train_data())
+        loss_0 = exe.run(main_program,
+                         feed=feeder.feed(data),
+                         fetch_list=[loss])[0]
+        for _ in xrange(100):
+            val = exe.run(main_program,
+                          feed=feeder.feed(data),
+                          fetch_list=[loss])[0]
+        # loss should be small after 100 mini-batch
+        self.assertLess(val[0], loss_0[0])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_elementwise_add_op.py b/python/paddle/v2/fluid/tests/test_elementwise_add_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_add_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_add_op.py
diff --git a/python/paddle/v2/framework/tests/test_elementwise_div_op.py b/python/paddle/v2/fluid/tests/test_elementwise_div_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_div_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_div_op.py
diff --git a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py b/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_mul_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
diff --git a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py b/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_sub_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
diff --git a/python/paddle/v2/framework/tests/test_exception.py b/python/paddle/v2/fluid/tests/test_exception.py
similarity index 89%
rename from python/paddle/v2/framework/tests/test_exception.py
rename to python/paddle/v2/fluid/tests/test_exception.py
index 5ae048817c..b871f40c4a 100644
--- a/python/paddle/v2/framework/tests/test_exception.py
+++ b/python/paddle/v2/fluid/tests/test_exception.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 
 
diff --git a/python/paddle/v2/fluid/tests/test_executor_and_mul.py b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
new file mode 100644
index 0000000000..b1ef87c5cb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
@@ -0,0 +1,30 @@
+import unittest
+
+import numpy
+import paddle.v2.fluid.core as core
+
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.layers import mul, data
+
+
+class TestExecutor(unittest.TestCase):
+    def test_mul(self):
+        a = data(name='a', shape=[784], dtype='float32')
+        b = data(
+            name='b',
+            shape=[784, 100],
+            dtype='float32',
+            append_batch_size=False)
+        out = mul(x=a, y=b)
+        place = core.CPUPlace()
+        a_np = numpy.random.random((100, 784)).astype('float32')
+        b_np = numpy.random.random((784, 100)).astype('float32')
+        exe = Executor(place)
+        outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
+        out = outs[0]
+        self.assertEqual((100, 100), out.shape)
+        self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_expand_op.py b/python/paddle/v2/fluid/tests/test_expand_op.py
new file mode 100644
index 0000000000..0440f7a2bb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_expand_op.py
@@ -0,0 +1,97 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestExpandOpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random(12).astype("float32")}
+        self.attrs = {'expand_times': [2]}
+        output = np.tile(self.inputs['X'], 2)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank2_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank2(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [2, 3]}
+        output = np.tile(self.inputs['X'], (2, 3))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank4(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5, 7)).astype("float32")}
+        self.attrs = {'expand_times': [3, 2, 1, 2]}
+        output = np.tile(self.inputs['X'], (3, 2, 1, 2))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_feed_fetch_method.py b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_feed_fetch_method.py
rename to python/paddle/v2/fluid/tests/test_feed_fetch_method.py
index fbd659ece0..178c85b0dd 100644
--- a/python/paddle/v2/framework/tests/test_feed_fetch_method.py
+++ b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
similarity index 80%
rename from python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
rename to python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
index 319ae52fb3..99de6b5d05 100644
--- a/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
@@ -21,9 +21,14 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
     def setUp(self):
         self.op_type = "fill_constant_batch_size_like"
         self.inputs = {'Input': np.random.random((219, 232)).astype("float32")}
-        self.attrs = {'value': 3.5, 'shape': [132, -1, 7], 'dim_idx': 1}
-
-        out = np.random.random((132, 232, 7)).astype("float32")
+        self.attrs = {
+            'value': 3.5,
+            'shape': [132, -1, 7],
+            'input_dim_idx': 0,
+            'output_dim_idx': 1
+        }
+
+        out = np.random.random((132, 219, 7)).astype("float32")
         out.fill(3.5)
         self.outputs = {'Out': out}
 
diff --git a/python/paddle/v2/framework/tests/test_fill_constant_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_fill_constant_op.py
rename to python/paddle/v2/fluid/tests/test_fill_constant_op.py
diff --git a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
rename to python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
diff --git a/python/paddle/v2/fluid/tests/test_framework_debug_str.py b/python/paddle/v2/fluid/tests/test_framework_debug_str.py
new file mode 100644
index 0000000000..a4cbabdb36
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_framework_debug_str.py
@@ -0,0 +1,13 @@
+import unittest
+from paddle.v2.fluid.framework import Program
+
+
+class TestDebugStringFramework(unittest.TestCase):
+    def test_debug_str(self):
+        p = Program()
+        p.current_block().create_var(name='t', shape=[0, 1])
+        self.assertRaises(ValueError, callableObj=p.__str__)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_ftrl_op.py b/python/paddle/v2/fluid/tests/test_ftrl_op.py
new file mode 100644
index 0000000000..f77ac4659a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_ftrl_op.py
@@ -0,0 +1,62 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestFTRLOp(OpTest):
+    def setUp(self):
+        self.op_type = "ftrl"
+        w = np.random.random((102, 105)).astype("float32")
+        g = np.random.random((102, 105)).astype("float32")
+        sq_accum = np.full((102, 105), 0.1).astype("float32")
+        linear_accum = np.full((102, 105), 0.1).astype("float32")
+        lr = np.array([0.01]).astype("float32")
+        l1 = 0.1
+        l2 = 0.2
+        lr_power = -0.5
+
+        self.inputs = {
+            'Param': w,
+            'SquaredAccumulator': sq_accum,
+            'LinearAccumulator': linear_accum,
+            'Grad': g,
+            'LearningRate': lr
+        }
+        self.attrs = {
+            'l1': l1,
+            'l2': l2,
+            'lr_power': lr_power,
+            'learning_rate': lr
+        }
+        new_accum = sq_accum + g * g
+        if lr_power == -0.5:
+            linear_out = linear_accum + g - (
+                (np.sqrt(new_accum) - np.sqrt(sq_accum)) / lr) * w
+        else:
+            linear_out = linear_accum + g - ((np.power(
+                new_accum, -lr_power) - np.power(sq_accum, -lr_power)) / lr) * w
+
+        x = (l1 * np.sign(linear_out) - linear_out)
+        if lr_power == -0.5:
+            y = (np.sqrt(new_accum) / lr) + (2 * l2)
+            pre_shrink = x / y
+            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
+        else:
+            y = (np.power(new_accum, -lr_power) / lr) + (2 * l2)
+            pre_shrink = x / y
+            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
+
+        sq_accum_out = sq_accum + g * g
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'SquaredAccumOut': sq_accum_out,
+            'LinearAccumOut': linear_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_gather_op.py b/python/paddle/v2/fluid/tests/test_gather_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_gather_op.py
rename to python/paddle/v2/fluid/tests/test_gather_op.py
diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
similarity index 91%
rename from python/paddle/v2/framework/tests/test_gaussian_random_op.py
rename to python/paddle/v2/fluid/tests/test_gaussian_random_op.py
index 0dc7e091a5..627ab4e235 100644
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
@@ -1,6 +1,6 @@
 import unittest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 import numpy
 
 
diff --git a/python/paddle/v2/fluid/tests/test_gru_op.py b/python/paddle/v2/fluid/tests/test_gru_op.py
new file mode 100644
index 0000000000..fa2c5a53ec
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_gru_op.py
@@ -0,0 +1,158 @@
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+from test_lstm_op import identity, sigmoid, tanh, relu
+
+
+class TestGRUOp(OpTest):
+    lod = [[0, 2, 6, 9]]
+    batch_size = lod[0][-1]
+    frame_size = 5
+    activate = {
+        'identity': identity,
+        'sigmoid': sigmoid,
+        'tanh': tanh,
+        'relu': relu
+    }
+
+    @staticmethod
+    def seq_to_batch(lod, is_reverse):
+        idx_in_seq_list = []
+        seq_starts = lod[0]
+        seq_lens = []
+        for i in range(len(seq_starts) - 1):
+            seq_lens.append(seq_starts[i + 1] - seq_starts[i])
+        sorted_seqs = sorted(
+            range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
+        num_batch = seq_lens[sorted_seqs[0]]
+        for batch_idx in range(num_batch):
+            idx_in_seq = []
+            for i in range(len(seq_lens)):
+                if seq_lens[sorted_seqs[i]] <= batch_idx:
+                    break
+                idx = (seq_starts[sorted_seqs[i] + 1] - 1 - batch_idx
+                       ) if is_reverse else (
+                           seq_starts[sorted_seqs[i]] + batch_idx)
+                idx_in_seq.append(idx)
+            idx_in_seq_list.append(idx_in_seq)
+        return idx_in_seq_list, sorted_seqs
+
+    def gru_step(self, x, h_p, w, b):
+        batch_size = x.shape[0]
+        frame_size = w.shape[0]
+        g = x + np.tile(b, (batch_size, 1))
+        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
+            (frame_size, frame_size * 2))
+        u_r = self.activate[self.attrs['gate_activation']](np.dot(
+            h_p, w_u_r) + g[:, :frame_size * 2])
+        u = u_r[:, :frame_size]
+        r = u_r[:, frame_size:frame_size * 2]
+        r_h_p = r * h_p
+        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
+            (frame_size, frame_size))
+        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
+                                                    g[:, frame_size * 2:])
+        g = np.hstack((u_r, c))
+        h = u * c + (1 - u) * h_p
+        return g, r_h_p, h
+
+    def gru(self):
+        input, lod = self.inputs['Input']
+        w = self.inputs['Weight']
+        b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
+            (1, self.frame_size * 3))
+        batch_gate = self.outputs['BatchGate']
+        batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
+        batch_hidden = self.outputs['BatchHidden']
+        hidden = self.outputs['Hidden']
+        idx_in_seq_list = self.idx_in_seq_list
+        h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key(
+            'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size))
+        num_batch = len(idx_in_seq_list)
+        end_idx = 0
+        for batch_idx in range(num_batch):
+            x = input[idx_in_seq_list[batch_idx]]
+            g, r_h_p, h = self.gru_step(x, h_p, w, b)
+            if batch_idx < (num_batch - 1):
+                h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
+            start_idx = end_idx
+            end_idx = start_idx + len(idx_in_seq_list[batch_idx])
+            batch_gate[start_idx:end_idx] = g
+            batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
+            batch_hidden[start_idx:end_idx] = h
+            hidden[idx_in_seq_list[batch_idx]] = h
+        return batch_gate, batch_reset_hidden_prev, hidden
+
+    def set_data(self):
+        lod = self.lod
+        self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch(
+            lod, self.is_reverse)
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        input = np.random.rand(batch_size, frame_size * 3).astype('float64')
+        h0 = np.random.rand(len(self.idx_in_seq_list[0]),
+                            frame_size).astype('float64')
+        weight = np.random.rand(frame_size, frame_size * 3).astype('float64')
+        bias = np.random.rand(1, frame_size * 3).astype('float64')
+
+        self.inputs = {
+            'Input': (input, lod),
+            'H0': h0,
+            'Weight': weight,
+            'Bias': bias
+        }
+
+        self.outputs = {
+            'BatchGate': np.zeros(
+                (batch_size, frame_size * 3), dtype='float64'),
+            'BatchResetHiddenPrev': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'BatchHidden': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'Hidden': np.zeros(
+                (batch_size, frame_size), dtype='float64')
+        }
+
+    def set_confs(self):
+        self.is_reverse = False
+        self.attrs = {
+            'activation': 'tanh',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+    def setUp(self):
+        self.op_type = "gru"
+        self.set_confs()
+        self.set_data()
+        self.gru()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpNoInitial(TestGRUOp):
+    def set_data(self):
+        super(TestGRUOpNoInitial, self).set_data()
+        self.inputs.pop('H0')
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpReverse(TestGRUOp):
+    def set_confs(self):
+        self.is_reverse = True
+        self.attrs = {
+            'activation': 'tanh',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_gru_unit_op.py b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
similarity index 86%
rename from python/paddle/v2/framework/tests/test_gru_unit_op.py
rename to python/paddle/v2/fluid/tests/test_gru_unit_op.py
index f356f6e9ec..501d5aa579 100644
--- a/python/paddle/v2/framework/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
@@ -28,8 +28,8 @@ def relu(x):
 
 
 class TestGRUUnitOp(OpTest):
-    batch_size = 3
-    frame_size = 5
+    batch_size = 5
+    frame_size = 10
     activate = {
         GRUActivationType.identity: identity,
         GRUActivationType.sigmoid: sigmoid,
@@ -77,7 +77,7 @@ class TestGRUUnitOp(OpTest):
         c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
                                                     g[:, frame_size * 2:])
         g = np.hstack((u_r, c))
-        h = u * h_p + (1 - u) * c
+        h = u * c + (1 - u) * h_p
         self.outputs = {
             'Gate': g.astype('float64'),
             'ResetHiddenPrev': r_h_p.astype('float64'),
@@ -92,10 +92,7 @@ class TestGRUUnitOp(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Input', 'HiddenPrev', 'Weight'],
-            ['Hidden', 'ResetHiddenPrev', 'Gate'],
-            max_relative_error=0.007)
+        self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])
 
 
 class TestGRUUnitOpWithBias(TestGRUUnitOp):
@@ -104,18 +101,20 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
         frame_size = self.frame_size
         super(TestGRUUnitOpWithBias, self).set_inputs()
         self.inputs['Bias'] = np.random.uniform(
-            -0.1, 0.1, (1, frame_size * 3)).astype('float32')
+            -0.1, 0.1, (1, frame_size * 3)).astype('float64')
         self.attrs = {
             'activation': GRUActivationType.identity,
             'gate_activation': GRUActivationType.sigmoid
         }
 
     def test_check_grad(self):
+        self.check_grad(['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'])
+
+    def test_check_grad_ingore_input(self):
         self.check_grad(
-            ['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
-            max_relative_error=0.007)
+            ['HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
+            no_grad_set=set('Input'))
 
 
 if __name__ == '__main__':
-    exit(0)  # FIXME(yuyang18): This unittest is not pass. Fix it later
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_hinge_loss_op.py b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
new file mode 100644
index 0000000000..a8757a891f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
@@ -0,0 +1,28 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestHingeLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'hinge_loss'
+        samples_num = 64
+        logits = np.random.uniform(-10, 10, (samples_num, 1)).astype('float32')
+        labels = np.random.randint(0, 2, (samples_num, 1)).astype('float32')
+
+        self.inputs = {
+            'Logits': logits,
+            'Labels': labels,
+        }
+        loss = np.maximum(1.0 - (2 * labels - 1) * logits, 0)
+        self.outputs = {'Loss': loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Logits'], 'Loss', max_relative_error=0.008)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_huber_loss_op.py
similarity index 87%
rename from python/paddle/v2/framework/tests/test_huber_loss_op.py
rename to python/paddle/v2/fluid/tests/test_huber_loss_op.py
index 003e7d7ed7..a24fcbec6c 100644
--- a/python/paddle/v2/framework/tests/test_huber_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_huber_loss_op.py
@@ -21,7 +21,8 @@ class TestHuberLossOp(OpTest):
             'Y': np.random.uniform(0, 1., (samples_num, 1)).astype('float32'),
         }
         residual = self.inputs['Y'] - self.inputs['X']
-        loss = np.vectorize(huber_loss_forward)(residual, delta)
+        loss = np.vectorize(huber_loss_forward)(residual,
+                                                delta).astype('float32')
         self.attrs = {'delta': delta}
         self.outputs = {
             'Residual': residual,
@@ -43,6 +44,5 @@ class TestHuberLossOp(OpTest):
             ['X'], 'Out', max_relative_error=0.008, no_grad_set=set('residual'))
 
 
-# TODO(typhoonzero): should add this back till we fix it
-#if __name__ == '__main__':
-#    unittest.main()
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
new file mode 100644
index 0000000000..2fd609d447
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -0,0 +1,110 @@
+import unittest
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program
+
+
+def conv_block(input,
+               num_filter,
+               groups,
+               dropouts,
+               main_program=None,
+               startup_program=None):
+    return nets.img_conv_group(
+        input=input,
+        pool_size=2,
+        pool_stride=2,
+        conv_num_filter=[num_filter] * groups,
+        conv_filter_size=3,
+        conv_act='relu',
+        conv_with_batchnorm=True,
+        conv_batchnorm_drop_rate=dropouts,
+        pool_type='max',
+        main_program=main_program,
+        startup_program=startup_program)
+
+
+class TestLayer(unittest.TestCase):
+    def test_batch_norm_layer(self):
+        main_program = Program()
+        startup_program = Program()
+        images = fluid.layers.data(
+            name='pixel',
+            shape=[3, 48, 48],
+            dtype='float32',
+            main_program=main_program)
+        hidden1 = fluid.layers.batch_norm(
+            input=images,
+            main_program=main_program,
+            startup_program=startup_program)
+        hidden2 = fluid.layers.fc(input=hidden1,
+                                  size=128,
+                                  act='relu',
+                                  main_program=main_program)
+        hidden3 = fluid.layers.batch_norm(
+            input=hidden2,
+            main_program=main_program,
+            startup_program=startup_program)
+
+        print str(main_program)
+
+    def test_dropout_layer(self):
+        main_program = Program()
+        startup_program = Program()
+        images = fluid.layers.data(
+            name='pixel',
+            shape=[3, 48, 48],
+            dtype='float32',
+            main_program=main_program)
+        fluid.layers.dropout(
+            x=images,
+            dropout_prob=0.5,
+            main_program=main_program,
+            startup_program=startup_program)
+
+        # print str(main_program)
+
+    def test_img_conv_group(self):
+        main_program = Program()
+        startup_program = Program()
+
+        images = fluid.layers.data(
+            name='pixel',
+            shape=[3, 48, 48],
+            dtype='float32',
+            main_program=main_program,
+            startup_program=startup_program)
+        conv1 = conv_block(images, 64, 2, [0.3, 0], main_program,
+                           startup_program)
+        conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], main_program,
+                           startup_program)
+
+        # print str(main_program)
+
+    def test_elementwise_add_with_act(self):
+        main_program = Program()
+        startup_program = Program()
+        image1 = fluid.layers.data(
+            name='pixel1',
+            shape=[3, 48, 48],
+            dtype='float32',
+            main_program=main_program,
+            startup_program=startup_program)
+        image2 = fluid.layers.data(
+            name='pixel2',
+            shape=[3, 48, 48],
+            dtype='float32',
+            main_program=main_program,
+            startup_program=startup_program)
+        out = fluid.layers.elementwise_add(
+            x=image1,
+            y=image2,
+            act='relu',
+            main_program=main_program,
+            startup_program=startup_program)
+        # print(main_program)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_infer_shape.py b/python/paddle/v2/fluid/tests/test_infer_shape.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_infer_shape.py
rename to python/paddle/v2/fluid/tests/test_infer_shape.py
index 2b2995f5e2..9f6695ce02 100644
--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/fluid/tests/test_infer_shape.py
@@ -1,6 +1,6 @@
 import unittest
 
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestInferShape(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_inference_model_io.py b/python/paddle/v2/fluid/tests/test_inference_model_io.py
similarity index 56%
rename from python/paddle/v2/framework/tests/test_inference_model_io.py
rename to python/paddle/v2/fluid/tests/test_inference_model_io.py
index e9c9cd27d9..60aed62ead 100644
--- a/python/paddle/v2/framework/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
@@ -1,13 +1,13 @@
-import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program, g_program
-from paddle.v2.framework.io import save_inference_model, load_inference_model
-import paddle.v2.framework.executor as executor
 import unittest
+
 import numpy as np
+import paddle.v2.fluid.core as core
+
+import paddle.v2.fluid.executor as executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.io import save_inference_model, load_inference_model
 
 
 class TestBook(unittest.TestCase):
@@ -19,32 +19,32 @@ class TestBook(unittest.TestCase):
         x = layers.data(
             name='x',
             shape=[2],
-            data_type='float32',
-            program=program,
-            init_program=init_program)
+            dtype='float32',
+            main_program=program,
+            startup_program=init_program)
         y = layers.data(
             name='y',
             shape=[1],
-            data_type='float32',
-            program=program,
-            init_program=init_program)
+            dtype='float32',
+            main_program=program,
+            startup_program=init_program)
 
         y_predict = layers.fc(input=x,
                               size=1,
                               act=None,
-                              program=program,
-                              init_program=init_program)
+                              main_program=program,
+                              startup_program=init_program)
 
         cost = layers.square_error_cost(
             input=y_predict,
             label=y,
-            program=program,
-            init_program=init_program)
+            main_program=program,
+            startup_program=init_program)
         avg_cost = layers.mean(
-            x=cost, program=program, init_program=init_program)
+            x=cost, main_program=program, startup_program=init_program)
 
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-        opts = sgd_optimizer.minimize(avg_cost, init_program)
+        sgd_optimizer.minimize(avg_cost, init_program)
 
         place = core.CPUPlace()
         exe = executor.Executor(place)
@@ -52,25 +52,20 @@ class TestBook(unittest.TestCase):
         exe.run(init_program, feed={}, fetch_list=[])
 
         for i in xrange(100):
-            x_data = np.array(
+            tensor_x = np.array(
                 [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32")
-            y_data = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
+            tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
 
-            tensor_x = core.LoDTensor()
-            tensor_x.set(x_data, place)
-            tensor_y = core.LoDTensor()
-            tensor_y.set(y_data, place)
             exe.run(program,
                     feed={'x': tensor_x,
                           'y': tensor_y},
                     fetch_list=[avg_cost])
 
         save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program)
-        outs = exe.run(program,
-                       feed={'x': tensor_x,
-                             'y': tensor_y},
-                       fetch_list=[avg_cost])
-        expected = np.array(outs[0])
+        expected = exe.run(program,
+                           feed={'x': tensor_x,
+                                 'y': tensor_y},
+                           fetch_list=[avg_cost])[0]
 
         reload(executor)  # reload to build a new scope
         exe = executor.Executor(place)
@@ -83,7 +78,7 @@ class TestBook(unittest.TestCase):
             feed={feed_var_names[0]: tensor_x,
                   feed_var_names[1]: tensor_y},
             fetch_list=fetch_vars)
-        actual = np.array(outs[0])
+        actual = outs[0]
 
         self.assertEqual(feed_var_names, ["x", "y"])
         self.assertEqual(len(fetch_vars), 1)
diff --git a/python/paddle/v2/framework/tests/test_initializer.py b/python/paddle/v2/fluid/tests/test_initializer.py
similarity index 62%
rename from python/paddle/v2/framework/tests/test_initializer.py
rename to python/paddle/v2/fluid/tests/test_initializer.py
index bd4d2e39d7..3175010f48 100644
--- a/python/paddle/v2/framework/tests/test_initializer.py
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
@@ -1,8 +1,8 @@
 import numpy as np
 import unittest
 
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.initializer as initializer
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.initializer as initializer
 
 DELTA = 0.00001
 
@@ -60,6 +60,29 @@ class TestUniformInitializer(unittest.TestCase):
         self.assertAlmostEqual(init_op.attr('max'), 1.0, delta=DELTA)
         self.assertEqual(init_op.attr('seed'), 0)
 
+    def test_uniform_initializer_random_seed(self):
+        """Test the uniform initializer with manually setting seed
+        """
+        program = framework.Program()
+        program.random_seed = 123
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.UniformInitializer())
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.UniformInitializer(seed=456))
+        init_op = block.ops[1]
+        self.assertEqual(init_op.attr("seed"), 123)
+        init_op1 = block.ops[0]
+        self.assertEqual(init_op1.attr("seed"), 456)
+
     def test_uniform_initializer(self):
         """Test uniform initializer with supplied attributes
         """
@@ -223,5 +246,109 @@ class TestXavierInitializer(unittest.TestCase):
         self.assertEqual(init_op.attr('seed'), 134)
 
 
+class TestMSRAInitializer(unittest.TestCase):
+    def test_uniform_msra_initializer(self):
+        """Test MSRA initializer with uniform distribution on
+           for matrix multiply.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        limit = np.sqrt(6.0 / param.shape[0])
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_uniform_msra_initializer_conv(self):
+        """Test MSRA initializer with uniform distribution on
+           for convolutions.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10, 15, 20],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        receptive_field_size = float(15 * 20)
+        limit = np.sqrt(6.0 / (param.shape[1] * receptive_field_size))
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_normal_msra_initializer(self):
+        """Test MSRA initializer with normal distribution on
+           for matrix multiply.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer(uniform=False))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        std = np.sqrt(2.0 / param.shape[0])
+        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_normal_msra_initializer_conv(self):
+        """Test MSRA initializer with normal distribution on
+           for convolutions.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10, 15, 20],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer(uniform=False))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        receptive_field_size = float(15 * 20)
+        std = np.sqrt(2.0 / (param.shape[1] * receptive_field_size))
+        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_msra_initializer_supplied_arguments(self):
+        """Test the MSRA initializer with supplied arguments
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer(
+                fan_in=12, seed=134))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        limit = np.sqrt(6.0 / 12)
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 134)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py
new file mode 100644
index 0000000000..ed6e3fe24f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py
@@ -0,0 +1,43 @@
+import unittest
+import numpy as np
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
+
+
+def create_tensor(scope, name, np_data):
+    tensor = scope.var(name).get_tensor()
+    tensor.set_dims(np_data.shape)
+    tensor.set(np_data, core.CPUPlace())
+    return tensor
+
+
+class TestIsEmptyOp(unittest.TestCase):
+    def setUp(self):
+        self.scope = core.Scope()
+        # create input variables
+        np_data0 = np.array([0, 1, 2])
+        create_tensor(self.scope, "X0", np_data0)
+
+        np_data1 = np.array([1])
+        t = create_tensor(self.scope, "X1", np_data1)
+        t.set_dims([0])
+
+        # create output variables
+        self.scope.var("out")
+
+    def test_no_empty(self):
+        self.one_case("X0", False)
+
+    def test_empty(self):
+        self.one_case("X1", True)
+
+    def one_case(self, input, target):
+        op = Operator(type="is_empty", X=input, Out="out")
+        ctx = core.DeviceContext.create(core.CPUPlace())
+        op.run(self.scope, ctx)
+        out = self.scope.var("out").get_tensor()
+        self.assertEqual(np.array(out)[0], target)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_l1_norm_op.py b/python/paddle/v2/fluid/tests/test_l1_norm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_l1_norm_op.py
rename to python/paddle/v2/fluid/tests/test_l1_norm_op.py
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
new file mode 100644
index 0000000000..57f6a362de
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -0,0 +1,163 @@
+from __future__ import print_function
+import unittest
+
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program, program_guard
+from paddle.v2.fluid.param_attr import ParamAttr
+
+
+class TestBook(unittest.TestCase):
+    def test_fit_a_line(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = layers.fc(input=x, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+            program.append_backward(avg_cost)
+
+        print(str(program))
+
+    def test_recognize_digits_mlp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            # Change g_program, so the rest layers use `g_program`
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            hidden1 = layers.fc(input=images, size=128, act='relu')
+            hidden2 = layers.fc(input=hidden1, size=64, act='relu')
+            predict = layers.fc(input=hidden2, size=10, act='softmax')
+            cost = layers.cross_entropy(input=predict, label=label)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+
+        print(str(program))
+
+    def test_simple_conv2d(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32')
+            layers.conv2d(input=images, num_filters=3, filter_size=[4, 4])
+
+        print(str(program))
+
+    def test_conv2d_transpose(self):
+        program = Program()
+        with program_guard(program):
+            img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
+            layers.conv2d_transpose(input=img, num_filters=10, output_size=28)
+        print(str(program))
+
+    def test_recognize_digits_conv(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(
+                name='pixel', shape=[1, 28, 28], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            conv_pool_1 = nets.simple_img_conv_pool(
+                input=images,
+                filter_size=5,
+                num_filters=2,
+                pool_size=2,
+                pool_stride=2,
+                act="relu")
+            conv_pool_2 = nets.simple_img_conv_pool(
+                input=conv_pool_1,
+                filter_size=5,
+                num_filters=4,
+                pool_size=2,
+                pool_stride=2,
+                act="relu")
+
+            predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
+            cost = layers.cross_entropy(input=predict, label=label)
+            avg_cost = layers.mean(x=cost)
+
+            program.append_backward(avg_cost)
+
+        print(str(program))
+
+    def test_word_embedding(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            dict_size = 10000
+            embed_size = 32
+            first_word = layers.data(name='firstw', shape=[1], dtype='int64')
+            second_word = layers.data(name='secondw', shape=[1], dtype='int64')
+            third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
+            forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
+            next_word = layers.data(name='nextw', shape=[1], dtype='int64')
+
+            embed_first = layers.embedding(
+                input=first_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+            embed_second = layers.embedding(
+                input=second_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+
+            embed_third = layers.embedding(
+                input=third_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+            embed_forth = layers.embedding(
+                input=forth_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+
+            concat_embed = layers.concat(
+                input=[embed_first, embed_second, embed_third, embed_forth],
+                axis=1)
+
+            hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
+            predict_word = layers.fc(input=hidden1,
+                                     size=dict_size,
+                                     act='softmax')
+            cost = layers.cross_entropy(input=predict_word, label=next_word)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+
+        print(str(program))
+
+    def test_linear_chain_crf(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            label_dict_len = 10
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            hidden = layers.fc(input=images, size=128)
+            crf = layers.linear_chain_crf(
+                input=hidden, label=label, param_attr=ParamAttr(name="crfw"))
+            crf_decode = layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
+            layers.chunk_eval(
+                input=crf_decode,
+                label=label,
+                chunk_scheme="IOB",
+                num_chunk_types=(label_dict_len - 1) / 2)
+            self.assertNotEqual(crf, None)
+            self.assertNotEqual(crf_decode, None)
+
+        print(str(program))
+
+    def test_sigmoid_cross_entropy(self):
+        program = Program()
+        with program_guard(program):
+            dat = layers.data(name='data', shape=[10], dtype='float32')
+            lbl = layers.data(name='label', shape=[10], dtype='float32')
+            self.assertIsNotNone(
+                layers.sigmoid_cross_entropy_with_logits(
+                    x=dat, label=lbl))
+        print(str(program))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
similarity index 99%
rename from python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
rename to python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
index 6f06a66c82..c26634ff20 100644
--- a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
@@ -104,7 +104,7 @@ class TestLinearChainCrfOp(OpTest):
         transition_exps = np.exp(transition)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
 
         self.inputs = {
             "Emission": (emission, lod),
diff --git a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
new file mode 100644
index 0000000000..8a4be545ed
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
@@ -0,0 +1,21 @@
+import unittest
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
+import numpy
+
+
+class TestLoDArrayLength(unittest.TestCase):
+    def test_array_length(self):
+        tmp = layers.zeros(shape=[10], dtype='int32')
+        i = layers.fill_constant(shape=[1], dtype='int64', value=10)
+        arr = layers.array_write(tmp, i=i)
+        arr_len = layers.array_length(arr)
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        result = exe.run(fetch_list=[arr_len])[0]
+        self.assertEqual(11, result[0])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
similarity index 72%
rename from python/paddle/v2/framework/tests/test_lod_rank_table.py
rename to python/paddle/v2/fluid/tests/test_lod_rank_table.py
index f635e716bc..30d619fe31 100644
--- a/python/paddle/v2/framework/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
@@ -1,7 +1,6 @@
-from paddle.v2.framework.layers import lod_rank_table, data
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.layers import lod_rank_table, data
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
 import numpy
 import unittest
 
@@ -18,8 +17,7 @@ class TestLoDRankTable(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
         tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
-
-        exe.run(g_program, scope=scope, feed={'x': tensor})
+        exe.run(scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
         self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
diff --git a/python/paddle/v2/fluid/tests/test_lod_reset_op.py b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
new file mode 100644
index 0000000000..652ccecfa4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
@@ -0,0 +1,64 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLodResetOpByAttr(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0 = [0, 7, 10]
+        self.inputs = {'X': (x, lod)}
+        self.attrs = {'target_lod': target_lod_0}
+        self.outputs = {'Out': (x, [target_lod_0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestLodResetOpByInput(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0 = [0, 4, 7, 10]
+        self.inputs = {
+            'X': (x, lod),
+            'TargetLoD': np.array([target_lod_0]).astype('int32')
+        }
+        self.outputs = {'Out': (x, [target_lod_0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+
+
+class TestLodResetOpBoth(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0_attr = [0, 7, 10]
+        target_lod_0_in = [0, 4, 7, 10]
+        self.inputs = {
+            'X': (x, lod),
+            'TargetLoD': np.array(target_lod_0_in).astype('int32')
+        }
+        self.attrs = {'target_lod': target_lod_0_attr}
+        self.outputs = {'Out': (x, [target_lod_0_in])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
new file mode 100644
index 0000000000..d6d3e23fd8
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
@@ -0,0 +1,38 @@
+import unittest
+import paddle.v2.fluid.core as core
+import numpy
+
+
+class TestLoDTensorArray(unittest.TestCase):
+    def test_get_set(self):
+        scope = core.Scope()
+        arr = scope.var('tmp_lod_tensor_array')
+        tensor_array = arr.get_lod_tensor_array()
+        self.assertEqual(0, len(tensor_array))
+        cpu = core.CPUPlace()
+        for i in xrange(10):
+            t = core.LoDTensor()
+            t.set(numpy.array([i], dtype='float32'), cpu)
+            t.set_lod([[0, 1]])
+            tensor_array.append(t)
+
+        self.assertEqual(10, len(tensor_array))
+
+        for i in xrange(10):
+            t = tensor_array[i]
+            self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
+            self.assertEqual([[0, 1]], t.lod())
+
+            t = core.LoDTensor()
+            t.set(numpy.array([i + 10], dtype='float32'), cpu)
+            t.set_lod([[0, 2]])
+            tensor_array[i] = t
+            t = tensor_array[i]
+            self.assertEqual(
+                numpy.array(t), numpy.array(
+                    [i + 10], dtype='float32'))
+            self.assertEqual([[0, 2]], t.lod())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
new file mode 100644
index 0000000000..0a916a55bc
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -0,0 +1,197 @@
+import unittest
+import paddle.v2.fluid.core as core
+import numpy
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+
+
+class TestCPULoDTensorArrayOps(unittest.TestCase):
+    def place(self):
+        return core.CPUPlace()
+
+    def test_lod_tensor_to_array_level_0(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 10]])
+        expect = map(lambda x: numpy.array(x).astype('int32'),
+                     [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=[] * 6,
+            expect_max_len=6)
+
+    def test_lod_tensor_to_array_level_0_empty_seq(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 9, 10]])
+        expect = map(lambda x: numpy.array(x).astype('int32'),
+                     [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=[] * 6,
+            expect_max_len=6)
+
+    def test_lod_tensor_to_array_level_1(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
+
+        expect = [
+            numpy.array(
+                [9, 10, 0, 1, 2], dtype='int32'), numpy.array(
+                    [11, 12, 13, 14, 15, 16, 3, 4, 5, 6, 7, 8], dtype='int32'),
+            numpy.array(
+                [17, 18, 19], dtype='int32')
+        ]
+
+        lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=3)
+
+    def test_lod_tensor_to_array_level_1_empty_seq(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(31).reshape(31, 1).astype('int32'), self.place())
+
+        tensor.set_lod([[0, 3, 5, 9, 11],
+                        [0, 3, 7, 11, 11, 12, 17, 19, 21, 23, 30, 31]])
+
+        expect = [
+            numpy.array(
+                item, dtype='int32')
+            for item in [[
+                12, 13, 14, 15, 16, 0, 1, 2, 23, 24, 25, 26, 27, 28, 29
+            ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]]
+        ]
+
+        lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]]
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=4)
+
+    def test_lod_tensor_to_array_level_2(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
+                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+
+        expect = [
+            numpy.array(
+                item, dtype='int32')
+            for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range(
+                22, 39) + range(7, 21), range(39, 46)]
+        ]
+        lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]],
+               [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]]
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=3)
+
+    def test_lod_tensor_to_array_level_2_skip_level(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
+                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+        self.main(
+            tensor=tensor,
+            expect_array=None,
+            expect_lod=None,
+            expect_max_len=4,
+            level=1)
+
+    def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
+        place = self.place()
+        program = Program()
+        x = layers.data(name='x', shape=[10], main_program=program)
+        x.persistable = True
+        table = layers.lod_rank_table(x, level=level, main_program=program)
+        max_len = layers.max_sequence_len(table, main_program=program)
+        max_len.persistable = True
+        array = layers.lod_tensor_to_array(x, table, main_program=program)
+        array.persistable = True
+
+        result = layers.array_to_lod_tensor(array, table, main_program=program)
+        result.persistable = True
+        exe = Executor(place)
+        scope = core.Scope()
+        exe.run(program, feed={'x': tensor}, scope=scope)
+        var = scope.find_var(array.name)
+        array = var.get_lod_tensor_array()
+        if expect_array is not None and expect_lod is not None:
+            self.check_array_same(array, expect_array, expect_lod)
+        self.check_tensor_same(scope.find_var(result.name).get_tensor(), tensor)
+
+        self.assertEqual(
+            numpy.array(scope.find_var(max_len.name).get_tensor())[0],
+            expect_max_len)
+
+    def check_array_same(self, array, expect_tensor, expect_lod):
+        self.assertEqual(len(expect_tensor), len(array))
+        for i, exp in enumerate(zip(expect_tensor, expect_lod)):
+            exp_tensor, exp_lod = exp
+            exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
+            self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
+            self.assertEqual(exp_lod, array[i].lod())
+
+    def check_tensor_same(self, actual, expect):
+        self.assertTrue(
+            numpy.allclose(numpy.array(actual), numpy.array(expect)))
+        self.assertEqual(actual.lod(), expect.lod())
+
+
+class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
+    def test_grad(self):
+        place = core.CPUPlace()
+        program = Program()
+
+        x = layers.data(
+            name='x',
+            shape=[1],
+            dtype='float32',
+            main_program=program,
+            stop_gradient=False)
+        table = layers.lod_rank_table(x, level=0, main_program=program)
+        array = layers.lod_tensor_to_array(x, table, main_program=program)
+        result = layers.array_to_lod_tensor(array, table, main_program=program)
+
+        mean = layers.mean(x=result, main_program=program)
+
+        append_backward_ops(mean)
+
+        tensor = core.LoDTensor()
+        tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
+        tensor.set_lod([[0, 3, 9, 10]])
+
+        g_vars = program.global_block().var(x.name + "@GRAD")
+
+        exe = Executor(place)
+        g_out = [
+            numpy.array(item).sum()
+            for item in exe.run(program,
+                                feed={'x': tensor},
+                                fetch_list=[g_vars],
+                                return_numpy=False)
+        ]
+        g_out_sum = numpy.array(g_out).sum()
+
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_log_loss_op.py b/python/paddle/v2/fluid/tests/test_log_loss_op.py
new file mode 100644
index 0000000000..2eeaa90758
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py
@@ -0,0 +1,33 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLogLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'log_loss'
+        samples_num = 32
+
+        predicted = np.random.uniform(0.1, 1.0,
+                                      (samples_num, 1)).astype("float32")
+        labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32")
+        epsilon = 1e-4
+        self.inputs = {
+            'Predicted': predicted,
+            'Labels': labels,
+        }
+
+        self.attrs = {'epsilon': epsilon}
+        loss = -labels * np.log(predicted + epsilon) - (
+            1 - labels) * np.log(1 - predicted + epsilon)
+        self.outputs = {'Loss': loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_logical_op.py b/python/paddle/v2/fluid/tests/test_logical_op.py
new file mode 100644
index 0000000000..ac90bf839c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_logical_op.py
@@ -0,0 +1,35 @@
+import op_test
+import unittest
+import numpy as np
+
+
+def create_test_class(op_type, callback, binary_op=True):
+    class Cls(op_test.OpTest):
+        def setUp(self):
+            a = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
+            if binary_op:
+                b = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
+                c = callback(a, b)
+            else:
+                c = callback(a)
+            self.outputs = {'Out': c}
+            self.op_type = op_type
+            if binary_op:
+                self.inputs = {'X': a, 'Y': b}
+            else:
+                self.inputs = {'X': a}
+
+        def test_output(self):
+            self.check_output()
+
+    Cls.__name__ = op_type
+    globals()[op_type] = Cls
+
+
+create_test_class('logical_and', lambda _a, _b: np.logical_and(_a, _b))
+create_test_class('logical_or', lambda _a, _b: np.logical_or(_a, _b))
+create_test_class('logical_not', lambda _a: np.logical_not(_a), False)
+create_test_class('logical_xor', lambda _a, _b: np.logical_xor(_a, _b))
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lookup_table_op.py b/python/paddle/v2/fluid/tests/test_lookup_table_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lookup_table_op.py
rename to python/paddle/v2/fluid/tests/test_lookup_table_op.py
diff --git a/python/paddle/v2/framework/tests/test_lrn_op.py b/python/paddle/v2/fluid/tests/test_lrn_op.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_lrn_op.py
rename to python/paddle/v2/fluid/tests/test_lrn_op.py
index 7e34b3c91c..9abb09e53a 100644
--- a/python/paddle/v2/framework/tests/test_lrn_op.py
+++ b/python/paddle/v2/fluid/tests/test_lrn_op.py
@@ -23,7 +23,7 @@ class TestLRNOp(OpTest):
         start = -(self.n - 1) / 2
         end = start + self.n
 
-        mid = np.empty((self.N, self.C, self.H, self.W), dtype=float)
+        mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32")
         mid.fill(self.k)
         for m in range(0, self.N):
             for i in range(0, self.C):
@@ -74,5 +74,4 @@ class TestLRNOp(OpTest):
 
 
 if __name__ == "__main__":
-    exit(0)  # LRN grad implement wrong
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lstm_op.py b/python/paddle/v2/fluid/tests/test_lstm_op.py
similarity index 56%
rename from python/paddle/v2/framework/tests/test_lstm_op.py
rename to python/paddle/v2/fluid/tests/test_lstm_op.py
index ff75160083..77f062e8c8 100644
--- a/python/paddle/v2/framework/tests/test_lstm_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstm_op.py
@@ -117,8 +117,9 @@ class TestLstmOp(OpTest):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
 
-        self.has_initial_state = True
+        self.has_initial_state = False
         self.is_reverse = False
+        self.use_peepholes = True
 
     def setUp(self):
         self.set_argument()
@@ -128,18 +129,28 @@ class TestLstmOp(OpTest):
         N = len(self.lod[0]) - 1
 
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
-        h0 = np.zeros((N, self.D)).astype('float64')
-        c0 = np.zeros((N, self.D)).astype('float64')
+        if self.has_initial_state:
+            h0 = np.random.normal(size=(N, self.D)).astype('float64')
+            c0 = np.random.normal(size=(N, self.D)).astype('float64')
+        else:
+            h0 = np.zeros((N, self.D)).astype('float64')
+            c0 = np.zeros((N, self.D)).astype('float64')
         w = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
-        b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        if self.use_peepholes:
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        else:
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
 
         w_b = b[:, 0:4 * self.D]
-        w_c = b[:, 4 * self.D:]
+        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
         h, c = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse,
                     ACTVATION[self.act_gate], ACTVATION[self.act_cell],
                     ACTVATION[self.act_cand])
 
-        self.inputs = {'Input': (x, self.lod), 'Weight': w, 'Bias': b}
+        self.inputs = {'Input': (x, self.lod), 'Weight': w}
+
+        self.inputs['Bias'] = b
+
         if self.has_initial_state:
             self.inputs['H0'] = h0
             self.inputs['C0'] = c0
@@ -149,17 +160,16 @@ class TestLstmOp(OpTest):
             'Cell': (c, self.lod),
         }
         self.attrs = {
-            'usePeepholes': True,
-            'isReverse': self.is_reverse,
-            'gateActivation': self.act_gate,
-            'cellActivation': self.act_cell,
-            'candidateActivation': self.act_cand
+            'use_peepholes': self.use_peepholes,
+            'is_reverse': self.is_reverse,
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand
         }
 
     def test_check_output(self):
         self.check_output(atol=1e-8)
 
-    #TODO(qingqing) add more unit testing case
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0]) - 1
@@ -170,7 +180,7 @@ class TestLstmOp(OpTest):
             ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
 
 
-class TestLstmOpHasNoInitial(TestLstmOp):
+class TestLstmOpHasInitial(TestLstmOp):
     def set_argument(self):
         self.lod = [[0, 2, 5, 7]]
         self.D = 16
@@ -179,8 +189,69 @@ class TestLstmOpHasNoInitial(TestLstmOp):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
 
-        self.has_initial_state = False
+        self.has_initial_state = True
         self.is_reverse = True
+        self.use_peepholes = True
+
+    def test_check_grad(self):
+        # TODO(qingqing) remove folowing lines after the check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
+            max_relative_error=5e-4)
+
+    def test_check_grad_ingore_bias(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Bias'))
+
+    def test_check_grad_ingore_weight(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Bias'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Weight'))
+
+    def test_check_grad_ingore_input(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Weight', 'Bias'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Input'))
+
+    def test_check_grad_ingore_h0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('H0'))
+
+    def test_check_grad_ingore_c0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('C0'))
 
 
 class TestLstmOpRerverse(TestLstmOp):
@@ -192,8 +263,23 @@ class TestLstmOpRerverse(TestLstmOp):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
 
-        self.has_initial_state = True
+        self.has_initial_state = False
+        self.is_reverse = True
+        self.use_peepholes = True
+
+
+class TestLstmOpNotUsePeepholes(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = False
         self.is_reverse = True
+        self.use_peepholes = False
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/framework/tests/test_lstm_unit_op.py b/python/paddle/v2/fluid/tests/test_lstm_unit_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lstm_unit_op.py
rename to python/paddle/v2/fluid/tests/test_lstm_unit_op.py
diff --git a/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
rename to python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_matmul_op.py b/python/paddle/v2/fluid/tests/test_matmul_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_matmul_op.py
rename to python/paddle/v2/fluid/tests/test_matmul_op.py
diff --git a/python/paddle/v2/framework/tests/test_maxout_op.py b/python/paddle/v2/fluid/tests/test_maxout_op.py
similarity index 52%
rename from python/paddle/v2/framework/tests/test_maxout_op.py
rename to python/paddle/v2/fluid/tests/test_maxout_op.py
index 4ea1e3c29c..5fbed43e25 100644
--- a/python/paddle/v2/framework/tests/test_maxout_op.py
+++ b/python/paddle/v2/fluid/tests/test_maxout_op.py
@@ -3,31 +3,21 @@ import numpy as np
 from op_test import OpTest
 
 
-
-def maxout_forward_naive_2sweetsky(input, groups, num_channels):
-    s0, s1, s2, s3 = input.shape
-    return np.ndarray([s0, s1 / groups, groups, s2, s3], \
-        buffer = input, dtype=input.dtype).max(axis=(2))
-
-
-def maxout_forward_naive(input, groups,num_channels):
+def maxout_forward_naive(input, groups):
     s0, s1, s2, s3 = input.shape
     return np.ndarray([s0, s1 / groups, groups, s2, s3], \
         buffer = input, dtype=input.dtype).max(axis=(2))
 
 
-
-
-class TestMaxOut_Op(OpTest):
+class TestMaxOutOp(OpTest):
     def setUp(self):
         self.op_type = "maxout"
         self.init_test_case()
         input = np.random.random(self.shape).astype("float32")
-        output = self.MaxOut_forward_naive(input, self.groups,
-                self.num_channels).astype("float32")
+        output = self.MaxOut_forward_naive(input, self.groups).astype("float32")
 
         self.inputs = {'X': input}
-        self.attrs = {'groups': self.groups, 'num_channels': self.num_channels}
+        self.attrs = {'groups': self.groups}
 
         self.outputs = {'Out': output.astype('float32')}
 
@@ -35,17 +25,12 @@ class TestMaxOut_Op(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        print self.inputs
-        print self.outputs
-        self.check_grad(['X'], 'Out', max_relative_error=0.5)
+        self.check_grad(['X'], 'Out')
 
     def init_test_case(self):
         self.MaxOut_forward_naive = maxout_forward_naive
         self.shape = [100, 6, 2, 2]
-        self.groups=2
-        self.num_channels=6
-
-
+        self.groups = 2
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/framework/tests/test_mean_op.py b/python/paddle/v2/fluid/tests/test_mean_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_mean_op.py
rename to python/paddle/v2/fluid/tests/test_mean_op.py
diff --git a/python/paddle/v2/framework/tests/test_minus_op.py b/python/paddle/v2/fluid/tests/test_minus_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_minus_op.py
rename to python/paddle/v2/fluid/tests/test_minus_op.py
diff --git a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
new file mode 100644
index 0000000000..50fcc4a72d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
@@ -0,0 +1,138 @@
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.optimizer import MomentumOptimizer
+import paddle.v2.fluid.core as core
+import paddle.v2 as paddle
+import unittest
+import numpy as np
+
+
+class TestMNISTIfElseOp(unittest.TestCase):
+    def test_raw_api(self):
+        kwargs = {'startup_program': Program(), 'main_program': Program()}
+        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+
+        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
+
+        limit = layers.fill_constant_batch_size_like(
+            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
+
+        cond = layers.less_than(x=label, y=limit, **kwargs)
+        true_image, false_image = layers.split_lod_tensor(
+            input=image, mask=cond, **kwargs)
+
+        true_out = layers.create_tensor(dtype='float32', **kwargs)
+        true_cond = layers.ConditionalBlock([true_image], **kwargs)
+
+        with true_cond.block():
+            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
+            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+            layers.assign(input=prob, output=true_out, **kwargs)
+
+        false_out = layers.create_tensor(dtype='float32', **kwargs)
+        false_cond = layers.ConditionalBlock([false_image], **kwargs)
+
+        with false_cond.block():
+            hidden = layers.fc(input=false_image,
+                               size=200,
+                               act='tanh',
+                               **kwargs)
+            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+            layers.assign(input=prob, output=false_out, **kwargs)
+
+        prob = layers.merge_lod_tensor(
+            in_true=true_out, in_false=false_out, mask=cond, x=image, **kwargs)
+        loss = layers.cross_entropy(input=prob, label=label, **kwargs)
+        avg_loss = layers.mean(x=loss, **kwargs)
+
+        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+        optimizer.minimize(avg_loss, kwargs['startup_program'])
+
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=200)
+
+        place = core.CPUPlace()
+        exe = Executor(place)
+
+        exe.run(kwargs['startup_program'])
+        PASS_NUM = 100
+        for pass_id in range(PASS_NUM):
+            for data in train_reader():
+                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = np.expand_dims(y_data, axis=1)
+
+                outs = exe.run(kwargs['main_program'],
+                               feed={'x': x_data,
+                                     'y': y_data},
+                               fetch_list=[avg_loss])
+                print outs[0]
+                if outs[0] < 1.0:
+                    return
+        self.assertFalse(True)
+
+    def test_ifelse(self):
+        kwargs = {'startup_program': Program(), 'main_program': Program()}
+        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+
+        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
+
+        limit = layers.fill_constant_batch_size_like(
+            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
+
+        cond = layers.less_than(x=label, y=limit, **kwargs)
+
+        ie = layers.IfElse(cond, **kwargs)
+
+        with ie.true_block():
+            true_image = ie.input(image)
+            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
+            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+            ie.output(prob)
+
+        with ie.false_block():
+            false_image = ie.input(image)
+            hidden = layers.fc(input=false_image,
+                               size=200,
+                               act='tanh',
+                               **kwargs)
+            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+            ie.output(prob)
+
+        prob = ie()
+        loss = layers.cross_entropy(input=prob[0], label=label, **kwargs)
+        avg_loss = layers.mean(x=loss, **kwargs)
+
+        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+        optimizer.minimize(avg_loss, kwargs['startup_program'])
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=200)
+
+        place = core.CPUPlace()
+        exe = Executor(place)
+
+        exe.run(kwargs['startup_program'])
+        PASS_NUM = 100
+        for pass_id in range(PASS_NUM):
+            for data in train_reader():
+                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = y_data.reshape((y_data.shape[0], 1))
+
+                outs = exe.run(kwargs['main_program'],
+                               feed={'x': x_data,
+                                     'y': y_data},
+                               fetch_list=[avg_loss])
+                print outs[0]
+                if outs[0] < 1.0:
+                    return
+        self.assertFalse(True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
rename to python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_momentum_op.py b/python/paddle/v2/fluid/tests/test_momentum_op.py
similarity index 94%
rename from python/paddle/v2/framework/tests/test_momentum_op.py
rename to python/paddle/v2/fluid/tests/test_momentum_op.py
index 654d31975a..638095f756 100644
--- a/python/paddle/v2/framework/tests/test_momentum_op.py
+++ b/python/paddle/v2/fluid/tests/test_momentum_op.py
@@ -37,7 +37,7 @@ class TestMomentumOp1(OpTest):
 
 
 class TestMomentumOp2(OpTest):
-    '''Test Momentum with defaukt values for attributes
+    '''Test Momentum with default values for attributes
     '''
 
     def setUp(self):
@@ -57,7 +57,7 @@ class TestMomentumOp2(OpTest):
             'LearningRate': learning_rate
         }
 
-        self.attrs = {'mu': mu, 'useNesterov': use_nesterov}
+        self.attrs = {'mu': mu, 'use_nesterov': use_nesterov}
 
         velocity_out = mu * velocity + grad
         if use_nesterov:
diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/fluid/tests/test_mul_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_mul_op.py
rename to python/paddle/v2/fluid/tests/test_mul_op.py
diff --git a/python/paddle/v2/framework/tests/test_multiplex_op.py b/python/paddle/v2/fluid/tests/test_multiplex_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_multiplex_op.py
rename to python/paddle/v2/fluid/tests/test_multiplex_op.py
diff --git a/python/paddle/v2/fluid/tests/test_nce.py b/python/paddle/v2/fluid/tests/test_nce.py
new file mode 100644
index 0000000000..8aeba69769
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_nce.py
@@ -0,0 +1,98 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def nce(input, weight, bias, sample_weight, labels, num_classes,
+        num_sample_class):
+    samples = []
+    sample_labels = []
+    batch_size = input.shape[0]
+    num_true_class = labels.shape[1]
+    for i in range(batch_size):
+        w = 1 if sample_weight is None else sample_weight[i]
+        for label in labels[i]:
+            samples.append((i, label, True, w))
+            sample_labels.append(label)
+        for num in range(num_sample_class):
+            samples.append((i, num, False, w))
+            sample_labels.append(num)
+    # forward bias
+    sample_out = np.zeros(len(samples)).astype(np.float32)
+    if bias is not None:
+        for i in range(len(samples)):
+            sample_out[i] = bias[samples[i][1]]
+    # forward weight
+    for i in range(len(samples)):
+        sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]])
+
+    # forward activation
+    sample_out = 1.0 / (1.0 + np.exp(-sample_out))
+    # forward cost
+    out = np.zeros(batch_size).astype(np.float32)
+    b = 1.0 / num_classes * num_sample_class
+    for i in range(len(samples)):
+        o = sample_out[i]
+        cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b))
+        out[samples[i][0]] += cost * samples[i][3]
+    return (out[:, np.newaxis], np.array(sample_out).reshape(
+        batch_size, num_sample_class + num_true_class),
+            np.array(sample_labels).reshape(batch_size,
+                                            num_sample_class + num_true_class))
+
+
+class TestNCE(OpTest):
+    def generate_data(self, dim, batch_size, num_classes, num_true_class,
+                      num_neg_samples):
+        input = np.random.randn(batch_size, dim).astype(np.float32)
+        weight = np.random.randn(num_classes, dim).astype(np.float32)
+        bias = np.random.randn(num_classes).astype(np.float32)
+        sample_weight = np.random.randn(batch_size).astype(np.float32)
+        labels = np.random.randint(0, num_classes, (batch_size, num_true_class))
+        self.attrs = {
+            'num_total_classes': num_classes,
+            'num_neg_samples': num_neg_samples,
+            'custom_neg_classes': range(num_neg_samples)
+        }
+        self.inputs = {
+            'Input': input,
+            'Label': labels,
+            'Weight': weight,
+            'Bias': bias,
+            'SampleWeight': sample_weight
+        }
+
+    def set_data(self):
+        self.generate_data(5, 5, 4, 1, 2)
+
+    def compute(self):
+        out = nce(self.inputs['Input'], self.inputs['Weight'],
+                  self.inputs['Bias'], self.inputs['SampleWeight'],
+                  self.inputs['Label'], self.attrs['num_total_classes'],
+                  self.attrs['num_neg_samples'])
+        self.outputs = {
+            'Cost': out[0],
+            'SampleLogits': out[1],
+            'SampleLabels': out[2]
+        }
+
+    def setUp(self):
+        self.op_type = 'nce'
+        self.set_data()
+        self.compute()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02)
+
+
+class TestNCECase1(TestNCE):
+    def set_data(self):
+        self.generate_data(10, 20, 10, 2, 5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/fluid/tests/test_net.py
similarity index 93%
rename from python/paddle/v2/framework/tests/test_net.py
rename to python/paddle/v2/fluid/tests/test_net.py
index 8503257feb..318df08a9e 100644
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/fluid/tests/test_net.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_op_support_gpu.py b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
similarity index 84%
rename from python/paddle/v2/framework/tests/test_op_support_gpu.py
rename to python/paddle/v2/fluid/tests/test_op_support_gpu.py
index dd36c666c4..a0eb4bd5fd 100644
--- a/python/paddle/v2/framework/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
@@ -1,5 +1,5 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestOpSupportGPU(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/fluid/tests/test_operator.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_operator.py
rename to python/paddle/v2/fluid/tests/test_operator.py
index 98f6b2f5ee..4aa022ef90 100644
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/fluid/tests/test_operator.py
@@ -1,7 +1,7 @@
 import unittest
-import paddle.v2.framework.op as op
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.op as op
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 
 class TestGetAllProtos(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py
similarity index 94%
rename from python/paddle/v2/framework/tests/test_operator_desc.py
rename to python/paddle/v2/fluid/tests/test_operator_desc.py
index 7355f72455..ce34d95ac8 100644
--- a/python/paddle/v2/framework/tests/test_operator_desc.py
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
@@ -1,11 +1,15 @@
 import unittest
-from paddle.v2.framework.framework import Variable, Program, g_program
-import paddle.v2.framework.core as core
+
+import paddle.v2.fluid.core as core
+
+from paddle.v2.fluid.framework import Program, default_startup_program
+
+main_program = default_startup_program()
 
 
 class TestOperator(unittest.TestCase):
     def test_error_type(self):
-        block = g_program.create_block()
+        block = main_program.create_block()
         try:
             block.append_op()
             self.assertFail()
diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
similarity index 74%
rename from python/paddle/v2/framework/tests/test_optimizer.py
rename to python/paddle/v2/fluid/tests/test_optimizer.py
index 9333df8f7f..2459dfd664 100644
--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -1,8 +1,8 @@
 import unittest
 
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.backward import append_backward_ops
 
 
 class TestOptimizer(unittest.TestCase):
@@ -16,14 +16,18 @@ class TestOptimizer(unittest.TestCase):
             dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
         mul_out = block.create_var(
             dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
         block.append_op(
             type="mul",
             inputs={"X": mul_x,
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
-        opts = sgd_optimizer.minimize(mul_out, init_program)
+        opts = sgd_optimizer.minimize(mean_out, init_program)
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
@@ -44,12 +48,16 @@ class TestOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         global_step = block.create_var(
             dtype="float32", shape=[1], lod_level=0, name="step")
         learning_rate = 0.01
         sgd_optimizer = optimizer.SGDOptimizer(
             learning_rate=learning_rate, global_step=global_step)
-        opts = sgd_optimizer.minimize(mul_out, init_program)
+        opts = sgd_optimizer.minimize(mean_out, init_program)
         self.assertEqual(len(opts), 2)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
@@ -90,7 +98,11 @@ class TestMomentumOptimizer(unittest.TestCase):
         learning_rate = 0.01
         momentum_optimizer = self.MockMomentum(
             learning_rate=learning_rate, momentum=0.2)
-        params_grads = append_backward_ops(mul_out)
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -98,7 +110,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "momentum")
-        self.assertFalse(sgd_op.attr('useNesterov'))
+        self.assertFalse(sgd_op.attr('use_nesterov'))
 
         # Check accumulators
         accumulators = momentum_optimizer.get_accumulators()
@@ -132,10 +144,14 @@ class TestMomentumOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         learning_rate = 0.01
         momentum_optimizer = self.MockMomentum(
             learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
-        params_grads = append_backward_ops(mul_out)
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
         opts = momentum_optimizer.create_optimization_pass(
@@ -143,7 +159,7 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "momentum")
-        self.assertTrue(sgd_op.attr('useNesterov'))
+        self.assertTrue(sgd_op.attr('use_nesterov'))
 
         # Check accumulators
         accumulators = momentum_optimizer.get_accumulators()
@@ -186,10 +202,14 @@ class TestAdagradOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         learning_rate = 0.01
         adagrad_optimizer = self.MockAdagrad(
             learning_rate=learning_rate, epsilon=1.0e-6)
-        params_grads = append_backward_ops(mul_out)
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
         opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -198,7 +218,7 @@ class TestAdagradOptimizer(unittest.TestCase):
         adagrad_op = opts[0]
         self.assertEqual(adagrad_op.type, "adagrad")
 
-        # check accumulators
+        # Check accumulators
         accumulators = adagrad_optimizer.get_accumulators()
         self.assertEqual(len(accumulators), 1)
         self.assertTrue(adagrad_optimizer.get_moment_str() in accumulators)
@@ -242,10 +262,14 @@ class TestAdamOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         learning_rate = 0.01
         adam_optimizer = self.MockAdam(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mul_out)
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
         opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -300,10 +324,14 @@ class TestAdamaxOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
         learning_rate = 0.01
         adamax_optimizer = self.MockAdamax(
             learning_rate=learning_rate, beta1=0.9, beta2=0.999)
-        params_grads = append_backward_ops(mul_out)
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
         opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
@@ -331,5 +359,63 @@ class TestAdamaxOptimizer(unittest.TestCase):
         self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
 
 
+class TestDecayedAdagradOptimizer(unittest.TestCase):
+    class MockDecayedAdagrad(optimizer.DecayedAdagradOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_moment_str(self):
+            return self._moment_acc_str
+
+    def test_decayed_adagrad_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        decayed_adagrad_optimizer = self.MockDecayedAdagrad(
+            learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
+        params_grads = append_backward_ops(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
+        opts = decayed_adagrad_optimizer.create_optimization_pass(
+            params_grads, mul_out, init_program)
+        self.assertEqual(len(opts), 1)
+        decayed_adagrad_op = opts[0]
+        self.assertEqual(decayed_adagrad_op.type, "decayed_adagrad")
+
+        # Check accumulators
+        accumulators = decayed_adagrad_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertTrue(
+            decayed_adagrad_optimizer.get_moment_str() in accumulators)
+        moment_acc = accumulators[decayed_adagrad_optimizer.get_moment_str()]
+        self.assertEqual(len(moment_acc), 1)
+        self.assertTrue(mul_x.name in moment_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pad_op.py b/python/paddle/v2/fluid/tests/test_pad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_pad_op.py
rename to python/paddle/v2/fluid/tests/test_pad_op.py
diff --git a/python/paddle/v2/fluid/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py
new file mode 100644
index 0000000000..694344acbb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
@@ -0,0 +1,35 @@
+import unittest
+from paddle.v2.fluid.framework import default_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.io as io
+from paddle.v2.fluid.initializer import ConstantInitializer
+import numpy as np
+
+main_program = default_main_program()
+
+
+class TestParameter(unittest.TestCase):
+    def test_param(self):
+        shape = [784, 100]
+        val = 1.0625
+        b = main_program.global_block()
+        param = b.create_parameter(
+            name='fc.w',
+            shape=shape,
+            dtype='float32',
+            initializer=ConstantInitializer(val))
+        self.assertIsNotNone(param)
+        self.assertEqual('fc.w', param.name)
+        self.assertEqual((784, 100), param.shape)
+        self.assertEqual(core.DataType.FP32, param.dtype)
+        self.assertEqual(0, param.block.idx)
+        exe = Executor(core.CPUPlace())
+        p = exe.run(main_program, fetch_list=[param])[0]
+        self.assertTrue(np.allclose(p, np.ones(shape) * val))
+        p = io.get_parameter_value_by_name('fc.w', exe, main_program)
+        self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pool2d_op.py b/python/paddle/v2/fluid/tests/test_pool2d_op.py
similarity index 61%
rename from python/paddle/v2/framework/tests/test_pool2d_op.py
rename to python/paddle/v2/fluid/tests/test_pool2d_op.py
index c93469e119..5dff6270f4 100644
--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool2d_op.py
@@ -3,8 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
-
+def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
@@ -23,8 +22,7 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
     return out
 
 
-def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
-
+def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
@@ -47,6 +45,7 @@ def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
 class TestPool2d_Op(OpTest):
     def setUp(self):
         self.init_test_case()
+        self.init_global_pool()
         self.init_op_type()
         self.init_pool_type()
         if self.global_pool:
@@ -61,8 +60,8 @@ class TestPool2d_Op(OpTest):
             'strides': self.strides,
             'paddings': self.paddings,
             'ksize': self.ksize,
-            'poolingType': self.pool_type,
-            'globalPooling': self.global_pool,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
         }
 
         self.outputs = {'Out': output.astype('float32')}
@@ -75,8 +74,6 @@ class TestPool2d_Op(OpTest):
             self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
 
     def init_test_case(self):
-        self.global_pool = True
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
         self.shape = [2, 3, 5, 5]
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -87,12 +84,14 @@ class TestPool2d_Op(OpTest):
 
     def init_pool_type(self):
         self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = True
 
 
 class TestCase1(TestPool2d_Op):
     def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -103,12 +102,14 @@ class TestCase1(TestPool2d_Op):
 
     def init_pool_type(self):
         self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
 
 
 class TestCase2(TestPool2d_Op):
     def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
         self.strides = [1, 1]
@@ -119,152 +120,69 @@ class TestCase2(TestPool2d_Op):
 
     def init_pool_type(self):
         self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
 
+    def init_global_pool(self):
+        self.global_pool = False
 
-class TestCase3(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+class TestCase3(TestPool2d_Op):
     def init_op_type(self):
         self.op_type = "pool2d"
 
     def init_pool_type(self):
         self.pool_type = "max"
-
-
-class TestCase4(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
         self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+
+class TestCase4(TestCase1):
     def init_op_type(self):
         self.op_type = "pool2d"
 
     def init_pool_type(self):
         self.pool_type = "max"
-
-
-class TestCase5(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
         self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
 
+
+class TestCase5(TestCase2):
     def init_op_type(self):
         self.op_type = "pool2d"
 
     def init_pool_type(self):
         self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 #--------------------test pool2d_cudnn--------------------
-class TestCaseCudnn1(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
-
+class TestCudnnCase1(TestPool2d_Op):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "avg"
-
-
-class TestCaseCudnn2(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+class TestCudnnCase2(TestCase1):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "avg"
-
-
-class TestCaseCudnn3(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = avg_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
 
+class TestCudnnCase3(TestCase2):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "avg"
-
-
-class TestCaseCudnn4(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+class TestCudnnCase4(TestCase3):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "max"
-
-
-class TestCaseCudnn5(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [0, 0]
 
+class TestCudnnCase5(TestCase4):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "max"
-
-
-class TestCaseCudnn6(TestPool2d_Op):
-    def init_test_case(self):
-        self.global_pool = False
-        self.pool2D_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 7, 7]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
 
+class TestCudnnCase6(TestCase5):
     def init_op_type(self):
         self.op_type = "pool2d_cudnn"
 
-    def init_pool_type(self):
-        self.pool_type = "max"
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pool3d_op.py b/python/paddle/v2/fluid/tests/test_pool3d_op.py
similarity index 74%
rename from python/paddle/v2/framework/tests/test_pool3d_op.py
rename to python/paddle/v2/fluid/tests/test_pool3d_op.py
index 416f0df7cd..2ba86665a7 100644
--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool3d_op.py
@@ -3,8 +3,7 @@ import numpy as np
 from op_test import OpTest
 
 
-def max_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
-
+def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
@@ -27,8 +26,7 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
     return out
 
 
-def avg_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
-
+def avg_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
@@ -55,6 +53,10 @@ def avg_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
 class TestPool3d_Op(OpTest):
     def setUp(self):
         self.init_test_case()
+        self.init_global_pool()
+        self.init_op_type()
+        self.init_pool_type()
+
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype("float32")
@@ -67,8 +69,8 @@ class TestPool3d_Op(OpTest):
             'strides': self.strides,
             'paddings': self.paddings,
             'ksize': self.ksize,
-            'poolingType': self.pool_type,
-            'globalPooling': self.global_pool,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
         }
 
         self.outputs = {'Out': output.astype('float32')}
@@ -81,74 +83,115 @@ class TestPool3d_Op(OpTest):
             self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
 
     def init_test_case(self):
-        self.global_pool = True
-        self.op_type = "pool3d"
-        self.pool_type = "avg"
-        self.pool3D_forward_naive = avg_pool3D_forward_naive
         self.shape = [2, 3, 5, 5, 5]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [0, 0, 0]
 
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = True
+
 
 class TestCase1(TestPool3d_Op):
     def init_test_case(self):
-        self.global_pool = False
         self.op_type = "pool3d"
-        self.pool_type = "avg"
-        self.pool3D_forward_naive = avg_pool3D_forward_naive
         self.shape = [2, 3, 7, 7, 7]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [0, 0, 0]
 
-
-class TestCase2(TestPool3d_Op):
-    def init_test_case(self):
-        self.global_pool = False
+    def init_op_type(self):
         self.op_type = "pool3d"
+
+    def init_pool_type(self):
         self.pool_type = "avg"
         self.pool3D_forward_naive = avg_pool3D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+
+class TestCase2(TestPool3d_Op):
+    def init_test_case(self):
         self.shape = [2, 3, 7, 7, 7]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [1, 1, 1]
 
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
 
 class TestCase3(TestPool3d_Op):
-    def init_test_case(self):
-        self.global_pool = True
+    def init_op_type(self):
         self.op_type = "pool3d"
+
+    def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [0, 0, 0]
 
 
-class TestCase4(TestPool3d_Op):
-    def init_test_case(self):
-        self.global_pool = False
+class TestCase4(TestCase1):
+    def init_op_type(self):
         self.op_type = "pool3d"
+
+    def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [0, 0, 0]
 
 
-class TestCase5(TestPool3d_Op):
-    def init_test_case(self):
-        self.global_pool = False
+class TestCase5(TestCase2):
+    def init_op_type(self):
         self.op_type = "pool3d"
+
+    def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
+
+
+#--------------------test pool3d_cudnn--------------------
+class TestCudnnCase1(TestPool3d_Op):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase2(TestCase1):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase3(TestCase2):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase4(TestCase3):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase5(TestCase4):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
+
+
+class TestCudnnCase6(TestCase5):
+    def init_op_type(self):
+        self.op_type = "pool3d_cudnn"
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/framework/tests/test_pool_max_op.py b/python/paddle/v2/fluid/tests/test_pool_max_op.py
similarity index 70%
rename from python/paddle/v2/framework/tests/test_pool_max_op.py
rename to python/paddle/v2/fluid/tests/test_pool_max_op.py
index cc1a867761..9d2d61c438 100644
--- a/python/paddle/v2/framework/tests/test_pool_max_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool_max_op.py
@@ -3,11 +3,13 @@ import numpy as np
 from op_test import OpTest
 
 
-def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
 
     N, C, D, H, W = x.shape
-    if global_pool == 1:
+    if global_pool:
         ksize = [D, H, W]
+        paddings = [0, 0, 0]
+
     D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
     H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
     W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
@@ -40,11 +42,13 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     return out, mask
 
 
-def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
 
     N, C, H, W = x.shape
-    if global_pool == 1:
+    if global_pool:
         ksize = [H, W]
+        paddings = [0, 0]
+
     H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
     W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
@@ -74,19 +78,19 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
 class TestMaxPoolWithIndex_Op(OpTest):
     def setUp(self):
         self.init_test_case()
-        if self.global_pool:
-            self.paddings = [0 for _ in range(len(self.paddings))]
+        self.init_global()
+
         input = np.random.random(self.shape).astype("float32")
         output, mask = self.pool_forward_naive(input, self.ksize, self.strides,
                                                self.paddings, self.global_pool)
         output = output.astype("float32")
-        mask = mask.astype("float32")
+        mask = mask.astype("int32")
 
         self.attrs = {
             'strides': self.strides,
             'paddings': self.paddings,
             'ksize': self.ksize,
-            'globalPooling': self.global_pool,
+            'global_pooling': self.global_pool,
         }
 
         self.inputs = {'X': input}
@@ -99,41 +103,24 @@ class TestMaxPoolWithIndex_Op(OpTest):
     #     self.check_grad(set(['X']), ['Out'], max_relative_error=0.07)
 
     def init_test_case(self):
-        self.global_pool = True
-        self.index = "max_pool3d_with_index"
-        self.op_type = "%s" % self.index
+        self.op_type = "max_pool3d_with_index"
         self.pool_forward_naive = max_pool3D_forward_naive
         self.shape = [2, 3, 5, 5, 5]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [1, 1, 1]
 
+    def init_global(self):
+        self.global_pool = False
+
 
 class TestCase1(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+    def init_global(self):
         self.global_pool = True
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
 
 
 class TestCase2(TestMaxPoolWithIndex_Op):
     def init_test_case(self):
-        self.global_pool = False
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 7, 7, 7]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
-
-
-class TestCase3(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.global_pool = False
         self.op_type = "max_pool3d_with_index"
         self.pool_forward_naive = max_pool3D_forward_naive
         self.shape = [2, 3, 7, 7, 7]
@@ -141,32 +128,18 @@ class TestCase3(TestMaxPoolWithIndex_Op):
         self.strides = [2, 2, 2]
         self.paddings = [0, 0, 0]
 
-
-class TestCase4(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+    def init_global(self):
         self.global_pool = True
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [1, 1, 1]
-        self.paddings = [1, 1, 1]
 
 
-class TestCase5(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.op_type = "max_pool3d_with_index"
-        self.pool_forward_naive = max_pool3D_forward_naive
-        self.shape = [2, 3, 5, 5, 5]
-        self.ksize = [3, 3, 3]
-        self.strides = [2, 2, 2]
-        self.paddings = [0, 0, 0]
+class TestCase3(TestCase2):
+    def init_global(self):
+        self.global_pool = False
 
 
-class TestCase6(TestMaxPoolWithIndex_Op):
+#----------------max_pool2d_with_index----------------
+class TestCase4(TestMaxPoolWithIndex_Op):
     def init_test_case(self):
-        self.global_pool = False
         self.op_type = "max_pool2d_with_index"
         self.pool_forward_naive = max_pool2D_forward_naive
         self.shape = [2, 3, 7, 7]
@@ -174,10 +147,17 @@ class TestCase6(TestMaxPoolWithIndex_Op):
         self.strides = [1, 1]
         self.paddings = [1, 1]
 
+    def init_global(self):
+        self.global_pool = True
+
 
-class TestCase7(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+class TestCase5(TestCase4):
+    def init_global(self):
         self.global_pool = False
+
+
+class TestCase6(TestMaxPoolWithIndex_Op):
+    def init_test_case(self):
         self.op_type = "max_pool2d_with_index"
         self.pool_forward_naive = max_pool2D_forward_naive
         self.shape = [2, 3, 7, 7]
@@ -185,27 +165,13 @@ class TestCase7(TestMaxPoolWithIndex_Op):
         self.strides = [2, 2]
         self.paddings = [0, 0]
 
-
-class TestCase8(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
+    def init_global(self):
         self.global_pool = True
-        self.op_type = "max_pool2d_with_index"
-        self.pool_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [1, 1]
-        self.paddings = [1, 1]
 
 
-class TestCase9(TestMaxPoolWithIndex_Op):
-    def init_test_case(self):
-        self.global_pool = True
-        self.op_type = "max_pool2d_with_index"
-        self.pool_forward_naive = max_pool2D_forward_naive
-        self.shape = [2, 3, 5, 5]
-        self.ksize = [3, 3]
-        self.strides = [2, 2]
-        self.paddings = [0, 0]
+class TestCase7(TestCase6):
+    def init_global(self):
+        self.global_pool = False
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py b/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
new file mode 100644
index 0000000000..f6a6c428a2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
@@ -0,0 +1,106 @@
+import unittest
+import itertools
+import numpy as np
+from op_test import OpTest
+
+
+def py_pnpair_op(score, label, query, column=-1, weight=None):
+    # group by query id
+    predictions = {}
+    batch_size = label.shape[0]
+    if weight is None:
+        weight = np.ones(shape=(batch_size, 1)).astype('float32')
+    for s, l, q, w in zip(score, label, query, weight):
+        s, l, q, w = s[column], l[0], q[0], w[0]
+        if q not in predictions:
+            predictions[q] = []
+        predictions[q].append((s, l, w))
+
+    # accumulate statistics
+    pos, neg, neu = 0, 0, 0
+    for _, ranks in predictions.items():
+        for e1, e2 in itertools.combinations(ranks, 2):
+            s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2]
+            w = (w1 + w2) * 0.5
+            if l1 == l2:
+                continue
+            if s1 == s2:
+                neu += w
+            elif (s1 - s2) * (l1 - l2) > 0:
+                pos += w
+            else:
+                neg += w
+
+    return np.array(pos).astype('float32'), np.array(neg).astype(
+        'float32'), np.array(neu).astype('float32')
+
+
+class TestPositiveNegativePairOp(OpTest):
+    def setUp(self):
+        self.op_type = 'positive_negative_pair'
+        batch_size = 20
+        max_query_id = 5
+        score = np.random.normal(size=(batch_size, 1)).astype('float32')
+        label = np.random.normal(size=(batch_size, 1)).astype('float32')
+        query = np.array(
+            [np.random.randint(max_query_id) for i in range(batch_size)])
+        query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
+
+        pos, neg, neu = py_pnpair_op(score, label, query)
+        self.inputs = {'Score': score, 'Label': label, 'QueryID': query}
+        self.attrs = {'column': -1}
+        self.outputs = {
+            'PositivePair': pos,
+            'NegativePair': neg,
+            'NeutralPair': neu
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestPositiveNegativePairOpAccumulateWeight(OpTest):
+    def setUp(self):
+        self.op_type = 'positive_negative_pair'
+        batch_size = 20
+        max_query_id = 5
+        max_random_num = 2 << 15
+        score_dim = 2
+        score = np.random.normal(size=(batch_size, 2)).astype('float32')
+        label = np.random.normal(size=(batch_size, 1)).astype('float32')
+        weight = np.random.normal(size=(batch_size, 1)).astype('float32')
+        query = np.array(
+            [np.random.randint(max_query_id) for i in range(batch_size)])
+        query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
+        acc_pos = np.reshape(
+            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        acc_neg = np.reshape(
+            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        acc_neu = np.reshape(
+            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        column = np.random.randint(score_dim)
+
+        pos, neg, neu = py_pnpair_op(
+            score, label, query, column=column, weight=weight)
+        self.inputs = {
+            'Score': score,
+            'Label': label,
+            'QueryID': query,
+            'AccumulatePositivePair': acc_pos,
+            'AccumulateNegativePair': acc_neg,
+            'AccumulateNeutralPair': acc_neu,
+            'Weight': weight
+        }
+        self.attrs = {'column': column}
+        self.outputs = {
+            'PositivePair': pos + acc_pos,
+            'NegativePair': neg + acc_neg,
+            'NeutralPair': neu + acc_neu
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_precision_recall_op.py b/python/paddle/v2/fluid/tests/test_precision_recall_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_precision_recall_op.py
rename to python/paddle/v2/fluid/tests/test_precision_recall_op.py
diff --git a/python/paddle/v2/framework/tests/test_prelu_op.py b/python/paddle/v2/fluid/tests/test_prelu_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_prelu_op.py
rename to python/paddle/v2/fluid/tests/test_prelu_op.py
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
new file mode 100644
index 0000000000..395d0dc36a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -0,0 +1,28 @@
+import unittest
+import numpy as np
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.profiler as profiler
+import paddle.v2.fluid.layers as layers
+
+
+class TestProfiler(unittest.TestCase):
+    def test_nvprof(self):
+        if not fluid.core.is_compile_gpu():
+            return
+        epoc = 8
+        dshape = [4, 3, 28, 28]
+        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+        place = fluid.GPUPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            for i in range(epoc):
+                input = np.random.random(dshape).astype('float32')
+                exe.run(fluid.default_main_program(), feed={'data': input})
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py
similarity index 68%
rename from python/paddle/v2/framework/tests/test_program.py
rename to python/paddle/v2/fluid/tests/test_program.py
index be020573b7..1a9313c68a 100644
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
@@ -1,36 +1,38 @@
+from __future__ import print_function
 import unittest
 
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.framework import g_program
+from paddle.v2.fluid.framework import Program, default_main_program
+import paddle.v2.fluid.layers as layers
+
+main_program = default_main_program()
 
 
 class TestProgram(unittest.TestCase):
     def test_program(self):
-        b = g_program.current_block()
+        b = main_program.current_block()
         self.assertEqual(-1, b.parent_idx)
         self.assertEqual(0, b.idx)
 
-        b = g_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(2, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_program.rollback()
+        main_program.rollback()
 
-        b = g_program.current_block()
+        b = main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
-        b = g_program.create_block()
+        b = main_program.create_block()
         self.assertEqual(3, b.idx)
         self.assertEqual(1, b.parent_idx)
 
-        g_program.rollback()
-        b = g_program.current_block()
+        main_program.rollback()
+        b = main_program.current_block()
         self.assertEqual(1, b.idx)
         self.assertEqual(0, b.parent_idx)
 
@@ -49,8 +51,8 @@ class TestProgram(unittest.TestCase):
 
         # FIXME(yuyang18): We manual compare the output string, since the order
         # of variable could be changed.
-        print prog
-        print prog.clone()
+        print(prog)
+        print(prog.clone())
 
     def test_parse_program_from_string(self):
         prog = Program()
@@ -68,8 +70,8 @@ class TestProgram(unittest.TestCase):
         binary_str = prog.desc.serialize_to_string()
         prog_restored = Program.parse_from_string(binary_str)
 
-        print prog
-        print prog_restored
+        print(prog)
+        print(prog_restored)
 
     def test_append_backward(self):
         prog = Program()
@@ -98,27 +100,46 @@ class TestProgram(unittest.TestCase):
                     "Y": add_y},
             outputs={"Out": add_out},
             attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": add_out}, outputs={"Out": mean_out})
 
         self.assertEqual(mul_op.idx, 0)
         self.assertEqual(add_op.idx, 1)
-        param_to_grad = prog.append_backward(add_out, set())
+        param_to_grad = prog.append_backward(mean_out, set())
 
         def grad_name(name):
             return name + "@GRAD"
 
-        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out"):
+        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
+                         "mean.out"):
             self.assertEqual(param_to_grad[var_name][0], grad_name(var_name))
             self.assertEqual(param_to_grad[var_name][1], 0)
 
         expect_ops = [
-            "mul", "elementwise_add", "fill_constant", "elementwise_add_grad",
-            "mul_grad"
+            "mul", "elementwise_add", "mean", "fill_constant", "mean_grad",
+            "elementwise_add_grad", "mul_grad"
         ]
         actual_ops = []
         for op in block.ops:
             actual_ops.append(op.type)
         self.assertEqual(actual_ops, expect_ops)
 
+    def test_program_clone_with_parameter(self):
+        main_program = Program()
+        startup_program = Program()
+        kwargs = {
+            'main_program': main_program,
+            'startup_program': startup_program
+        }
+        d = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+        hidden = layers.fc(input=d, size=100, **kwargs)
+        layers.fc(input=hidden, size=100, **kwargs)
+
+        new_program = main_program.clone()
+        self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/fluid/tests/test_protobuf.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_protobuf.py
rename to python/paddle/v2/fluid/tests/test_protobuf.py
index 848a396b3b..e064374176 100644
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_protobuf_descs.py
rename to python/paddle/v2/fluid/tests/test_protobuf_descs.py
index 2fd3d5d165..d8abe17606 100644
--- a/python/paddle/v2/framework/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -1,5 +1,5 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestOpDesc(unittest.TestCase):
@@ -101,13 +101,13 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(src_shape, res_shape)
         self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type())
 
-    def test_data_type(self):
+    def test_dtype(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
         var = block.var('my_var')
         var.set_type(core.VarDesc.VarType.LOD_TENSOR)
-        var.set_data_type(core.DataType.INT32)
-        self.assertEqual(core.DataType.INT32, var.data_type())
+        var.set_dtype(core.DataType.INT32)
+        self.assertEqual(core.DataType.INT32, var.dtype())
         self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type())
 
 
diff --git a/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py b/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
rename to python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
diff --git a/python/paddle/v2/framework/tests/test_proximal_gd_op.py b/python/paddle/v2/fluid/tests/test_proximal_gd_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_proximal_gd_op.py
rename to python/paddle/v2/fluid/tests/test_proximal_gd_op.py
diff --git a/python/paddle/v2/framework/tests/test_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_rank_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_rank_loss_op.py
rename to python/paddle/v2/fluid/tests/test_rank_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py
similarity index 63%
rename from python/paddle/v2/framework/tests/test_recurrent_op.py
rename to python/paddle/v2/fluid/tests/test_recurrent_op.py
index 157befd2ef..694ff0d8dd 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -1,14 +1,11 @@
 import unittest
 
-import logging
-
-from op_test import get_numeric_gradient
-from paddle.v2.framework.layers import *
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class PyRNNBase(object):
@@ -16,8 +13,8 @@ class PyRNNBase(object):
         self.x = np.ones(shape=input_shape).astype("float32")
         self.y = np.zeros(shape=output_shape).astype("float32")
 
-    def step(self):
-        pass
+    def step(self, step_id, x):
+        raise NotImplementedError
 
     def forward(self):
         for step_id in range(self.x.shape[0]):
@@ -99,45 +96,47 @@ class RecurrentOpTest1(unittest.TestCase):
     batch_size = 1
     sent_len = 1
 
-    def init_program(self):
-        self.program = Program()
-        self.init_program = Program()
+    def setup_program(self):
+        self.main_program = Program()
+        self.startup_program = Program()
         self.p_info = {
-            "program": self.program,
-            "init_program": self.init_program
+            "main_program": self.main_program,
+            "startup_program": self.startup_program
         }
         self.place = core.CPUPlace()
 
     def setUp(self):
-        self.init_program()
+        self.setup_program()
         self.data_field = {"x", "h_boot"}
 
         self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
         self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
         self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)
 
-        self.output = mean(x=self.create_rnn_op(), **self.p_info)
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
 
     def create_rnn_op(self):
-        x = data(
+        x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='x',
             append_batch_size=False,
             **self.p_info)
-        h_boot = data(
+        x.stop_gradient = False
+        h_boot = layers.data(
             shape=[self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='h_boot',
             **self.p_info)
+        h_boot.stop_gradient = False
 
-        rnn = StaticRNN(program=self.program)
+        rnn = layers.StaticRNN(main_program=self.main_program)
         with rnn.step():
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)
 
-            h = scale(
-                x=elementwise_add(
+            h = layers.scale(
+                x=layers.elementwise_add(
                     x=h_pre, y=x_t, **self.p_info),
                 scale=self.py_rnn.scale,
                 **self.p_info)
@@ -153,11 +152,11 @@ class RecurrentOpTest1(unittest.TestCase):
             for x in self.data_field
         }
         exe = Executor(self.place)
-        out = exe.run(self.program,
+        out = exe.run(self.main_program,
                       feed=self.feed_map,
                       fetch_list=[self.output])
 
-        return np.array(out[0])
+        return out[0]
 
     def backward(self):
         self.feed_map = {
@@ -165,12 +164,15 @@ class RecurrentOpTest1(unittest.TestCase):
             for x in self.data_field
         }
         fetch_list = [
-            self.program.global_block().var(x + "@GRAD")
+            self.main_program.global_block().var(x + "@GRAD")
             for x in self.data_field
         ]
 
         exe = Executor(self.place)
-        return exe.run(self.program, feed=self.feed_map, fetch_list=fetch_list)
+        return exe.run(self.main_program,
+                       feed=self.feed_map,
+                       fetch_list=fetch_list,
+                       return_numpy=False)
 
     def test_backward(self):
         self.check_forward()
@@ -237,7 +239,7 @@ class RecurrentOpTest2(RecurrentOpTest1):
     sent_len = 2
 
     def setUp(self):
-        self.init_program()
+        self.setup_program()
 
         self.data_field = {"x", "h_boot", "W", "U"}
 
@@ -245,39 +247,41 @@ class RecurrentOpTest2(RecurrentOpTest1):
         self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
         self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape)
 
-        self.output = mean(x=self.create_rnn_op(), **self.p_info)
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
 
     def create_rnn_op(self):
-        x = data(
+        x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='x',
             append_batch_size=False,
             **self.p_info)
-        h_boot = data(
+        x.stop_gradient = False
+        h_boot = layers.data(
             shape=[self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='h_boot',
             **self.p_info)
+        h_boot.stop_gradient = False
 
-        rnn = StaticRNN(program=self.program)
+        rnn = layers.StaticRNN(main_program=self.main_program)
         with rnn.step():
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)
 
-            temp_l = fc(input=x_t,
-                        size=self.input_dim,
-                        param_attr={'name': 'W'},
-                        bias_attr=False,
-                        **self.p_info)
-            temp_r = fc(input=h_pre,
-                        size=self.input_dim,
-                        param_attr={'name': 'U'},
-                        bias_attr=False,
-                        **self.p_info)
-
-            h = sigmoid(
-                x=elementwise_add(
+            temp_l = layers.fc(input=x_t,
+                               size=self.input_dim,
+                               param_attr='W',
+                               bias_attr=False,
+                               **self.p_info)
+            temp_r = layers.fc(input=h_pre,
+                               size=self.input_dim,
+                               param_attr='U',
+                               bias_attr=False,
+                               **self.p_info)
+
+            h = layers.sigmoid(
+                x=layers.elementwise_add(
                     x=temp_l, y=temp_r, **self.p_info),
                 **self.p_info)
 
@@ -287,7 +291,7 @@ class RecurrentOpTest2(RecurrentOpTest1):
         return rnn()
 
 
-class RecurrentOpTest3(RecurrentOpTest1):
+class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):
     '''
     Test RNNOp with two memories
     equation:
@@ -304,8 +308,8 @@ class RecurrentOpTest3(RecurrentOpTest1):
 
     class PySimpleRNN3(PyRNNBase):
         def __init__(self, input_shape, output_shape):
-            super(RecurrentOpTest3.PySimpleRNN3, self).__init__(input_shape,
-                                                                output_shape)
+            super(RecurrentOpMultipleMemoryTest.PySimpleRNN3, self).__init__(
+                input_shape, output_shape)
 
             seq_len, batch_size, input_dim = input_shape
             self.h_boot1 = np.random.normal(size=(batch_size,
@@ -333,46 +337,49 @@ class RecurrentOpTest3(RecurrentOpTest1):
     sent_len = 2
 
     def setUp(self):
-        self.init_program()
+        self.setup_program()
 
         self.data_field = {"x", "h_boot1", "h_boot2"}
 
         self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
         self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
-        self.py_rnn = RecurrentOpTest3.PySimpleRNN3(self.input_shape,
-                                                    self.output_shape)
+        self.py_rnn = RecurrentOpMultipleMemoryTest.PySimpleRNN3(
+            self.input_shape, self.output_shape)
 
-        self.output = mean(x=self.create_rnn_op(), **self.p_info)
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
 
     def create_rnn_op(self):
-        x = data(
+        x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='x',
             append_batch_size=False,
             **self.p_info)
-        h_boot1 = data(
+        x.stop_gradient = False
+        h_boot1 = layers.data(
             shape=[self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='h_boot1',
             append_batch_size=False,
             **self.p_info)
-        h_boot2 = data(
+        h_boot1.stop_gradient = False
+        h_boot2 = layers.data(
             shape=[self.batch_size, self.input_dim],
-            data_type='float32',
+            dtype='float32',
             name='h_boot2',
             append_batch_size=False,
             **self.p_info)
+        h_boot2.stop_gradient = False
 
-        rnn = StaticRNN(program=self.program)
+        rnn = layers.StaticRNN(main_program=self.main_program)
         with rnn.step():
             h_pre1 = rnn.memory(init=h_boot1)
             h_pre2 = rnn.memory(init=h_boot2)
             x_t = rnn.step_input(x)
 
-            mem1 = scale(x=h_pre1, scale=1.0, **self.p_info)
-            mem2 = scale(x=h_pre2, scale=1.0, **self.p_info)
-            out = sums(input=[mem1, x_t, mem2], **self.p_info)
+            mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info)
+            mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info)
+            out = layers.sums(input=[mem1, x_t, mem2], **self.p_info)
 
             rnn.update_memory(h_pre1, mem1)
             rnn.update_memory(h_pre2, mem2)
@@ -381,5 +388,72 @@ class RecurrentOpTest3(RecurrentOpTest1):
         return rnn()
 
 
+class RecurrentOpNoMemBootTest(RecurrentOpTest1):
+    '''
+    Test RNNOp with two memories
+    equation:
+        mem = x + mem_pre
+        y = mem
+    vars:
+        - x
+    memories:
+        - mem
+    outputs:
+       - y
+    '''
+
+    class PySimpleRNN4(PyRNNBase):
+        def __init__(self, input_shape, output_shape):
+            super(RecurrentOpNoMemBootTest.PySimpleRNN4, self).__init__(
+                input_shape, output_shape)
+            men_dim = input_shape
+            self.mems = np.zeros(shape=men_dim).astype("float32")
+
+        def step(self, step_id, x):
+            if step_id == 0:
+                pre_mem = np.zeros_like(x)
+            else:
+                pre_mem = self.mems[step_id - 1]
+            self.mems[step_id] = pre_mem + x
+            self.y[step_id] = self.mems[step_id]
+
+    input_dim = 1
+    batch_size = 1
+    sent_len = 2
+
+    def setUp(self):
+        self.setup_program()
+
+        self.data_field = {"x"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape,
+                                                            self.output_shape)
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
+        print self.main_program
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False,
+            **self.p_info)
+        x.stop_gradient = False
+
+        rnn = layers.StaticRNN(main_program=self.main_program)
+        with rnn.step():
+            mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x)
+            x_t = rnn.step_input(x)
+            mem = layers.elementwise_add(x=mem_pre, y=x_t, **self.p_info)
+            rnn.update_memory(mem_pre, mem)
+            rnn.output(mem)
+
+        return rnn()
+
+
 if __name__ == '__main__':
+    # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
+    exit(0)
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_reduce_op.py b/python/paddle/v2/fluid/tests/test_reduce_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_reduce_op.py
rename to python/paddle/v2/fluid/tests/test_reduce_op.py
diff --git a/python/paddle/v2/framework/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py
similarity index 79%
rename from python/paddle/v2/framework/tests/test_regularizer.py
rename to python/paddle/v2/fluid/tests/test_regularizer.py
index b21dceb584..24baf55e90 100644
--- a/python/paddle/v2/framework/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -1,9 +1,9 @@
 import unittest
 
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.optimizer as optimizer
-import paddle.v2.framework.regularizer as regularizer
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.regularizer as regularizer
+from paddle.v2.fluid.backward import append_backward_ops
 
 
 class TestL2DecayRegularizer(unittest.TestCase):
@@ -29,7 +29,11 @@ class TestL2DecayRegularizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
-        params_grads = append_backward_ops(mul_out)
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
@@ -62,7 +66,11 @@ class TestL1DecayRegularizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
-        params_grads = append_backward_ops(mul_out)
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        params_grads = append_backward_ops(mean_out)
         self.assertEqual(len(params_grads), 1)
         count_ops = len(block.ops)
         params_grads = optimizer.append_regularization_ops(params_grads)
diff --git a/python/paddle/v2/framework/tests/test_reshape_op.py b/python/paddle/v2/fluid/tests/test_reshape_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_reshape_op.py
rename to python/paddle/v2/fluid/tests/test_reshape_op.py
diff --git a/python/paddle/v2/framework/tests/test_rmsprop_op.py b/python/paddle/v2/fluid/tests/test_rmsprop_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_rmsprop_op.py
rename to python/paddle/v2/fluid/tests/test_rmsprop_op.py
diff --git a/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
similarity index 79%
rename from python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
rename to python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
index 731beff17c..9999165ed5 100644
--- a/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
@@ -1,16 +1,10 @@
 import unittest
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
-import paddle.v2.framework.core as core
-
-
-def create_tensor(np_data, place):
-    tensor = core.LoDTensor()
-    tensor.set(np_data, place)
-    return tensor
+import paddle.v2.fluid.core as core
 
 
 class RNNMemoryHelperOpTest(unittest.TestCase):
@@ -30,13 +24,13 @@ class RNNMemoryHelperOpTest(unittest.TestCase):
 
     def test_forward(self):
         x_np = np.random.normal(size=(2, 3)).astype("float32")
-        self.feed_map = {'X': create_tensor(x_np, self.place)}
+        self.feed_map = {'X': x_np}
         self.fetch_list = [self.Out]
         exe = Executor(self.place)
         out = exe.run(self.program,
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
-        np.isclose(np.array(out[0]), x_np, rtol=1e-5)
+        self.assertTrue(np.allclose(out[0], x_np, rtol=1e-5))
 
 
 class RNNMemoryHelperGradOpTest(unittest.TestCase):
@@ -66,8 +60,7 @@ class RNNMemoryHelperGradOpTest(unittest.TestCase):
 
     def test_backward(self):
         self.feed_map = {
-            name: create_tensor(
-                np.random.normal(size=(2, 3)).astype("float32"), self.place)
+            name: np.random.normal(size=(2, 3)).astype("float32")
             for name in self.input_names
         }
         self.fetch_list = [self.output_vars['X@GRAD']]
@@ -76,7 +69,7 @@ class RNNMemoryHelperGradOpTest(unittest.TestCase):
         out = exe.run(self.program,
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
-        np.isclose(np.array(out[0]), self.feed_map['Out@GRAD'], rtol=1e-5)
+        np.isclose(out[0], self.feed_map['Out@GRAD'], rtol=1e-5)
 
 
 class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
@@ -110,8 +103,7 @@ class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
 
     def test_backward(self):
         self.feed_map = {
-            name: create_tensor(
-                np.random.normal(size=(2, 3)).astype("float32"), self.place)
+            name: np.random.normal(size=(2, 3)).astype("float32")
             for name in ['X', 'Out']
         }
         self.fetch_list = [self.output_vars['X@GRAD']]
@@ -120,10 +112,9 @@ class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
         out = exe.run(self.program,
                       feed=self.feed_map,
                       fetch_list=self.fetch_list)
-        np.isclose(
-            np.array(out[0]),
-            np.zeros(shape=(2, 3)).astype("float32"),
-            rtol=1e-5)
+        self.assertTrue(
+            np.allclose(
+                out[0], np.zeros(shape=(2, 3)).astype("float32"), rtol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_roi_pool_op.py b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
new file mode 100644
index 0000000000..a28d9c7f82
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
@@ -0,0 +1,123 @@
+import unittest
+import numpy as np
+import math
+import sys
+from op_test import OpTest
+
+
+class TestROIPoolOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.calc_roi_pool()
+
+        self.inputs = {'X': self.x, 'ROIs': self.rois}
+
+        self.attrs = {
+            'spatial_scale': self.spatial_scale,
+            'pooled_height': self.pooled_height,
+            'pooled_width': self.pooled_width
+        }
+
+        self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes}
+
+    def init_test_case(self):
+        self.batch_size = 5
+        self.channels = 3
+        self.height = 6
+        self.width = 4
+
+        # n, c, h, w
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
+
+        self.spatial_scale = 1.0 / 4.0
+        self.pooled_height = 2
+        self.pooled_width = 2
+        self.rois_num = 2
+
+        self.x = np.random.random(self.x_dim).astype('float32')
+
+    def calc_roi_pool(self):
+        out_data = np.zeros((self.rois_num, self.channels, self.pooled_height,
+                             self.pooled_width))
+        argmax_data = np.zeros((self.rois_num, self.channels,
+                                self.pooled_height, self.pooled_width))
+
+        for i in range(self.rois_num):
+            roi = self.rois[i]
+            roi_batch_id = roi[0]
+            roi_start_w = int(round(roi[1] * self.spatial_scale))
+            roi_start_h = int(round(roi[2] * self.spatial_scale))
+            roi_end_w = int(round(roi[3] * self.spatial_scale))
+            roi_end_h = int(round(roi[4] * self.spatial_scale))
+
+            roi_height = int(max(roi_end_h - roi_start_h + 1, 1))
+            roi_width = int(max(roi_end_w - roi_start_w + 1, 1))
+
+            x_i = self.x[roi_batch_id]
+
+            bin_size_h = float(roi_height) / float(self.pooled_height)
+            bin_size_w = float(roi_width) / float(self.pooled_width)
+
+            for c in range(self.channels):
+                for ph in range(self.pooled_height):
+                    for pw in range(self.pooled_width):
+                        hstart = int(math.floor(ph * bin_size_h))
+                        wstart = int(math.floor(pw * bin_size_w))
+                        hend = int(math.ceil((ph + 1) * bin_size_h))
+                        wend = int(math.ceil((pw + 1) * bin_size_w))
+
+                        hstart = min(max(hstart + roi_start_h, 0), self.height)
+                        hend = min(max(hend + roi_start_h, 0), self.height)
+                        wstart = min(max(wstart + roi_start_w, 0), self.width)
+                        wend = min(max(wend + roi_start_w, 0), self.width)
+
+                        is_empty = (hend <= hstart) or (wend <= wstart)
+                        if is_empty:
+                            out_data[i, c, ph, pw] = 0
+                        else:
+                            out_data[i, c, ph, pw] = -sys.float_info.max
+
+                        argmax_data[i, c, ph, pw] = -1
+
+                        for h in range(hstart, hend):
+                            for w in range(wstart, wend):
+                                if x_i[c, h, w] > out_data[i, c, ph, pw]:
+                                    out_data[i, c, ph, pw] = x_i[c, h, w]
+                                    argmax_data[i, c, ph, pw] = h * \
+                                        self.width + w
+
+        self.outs = out_data.astype('float32')
+        self.argmaxes = argmax_data.astype('int64')
+
+    def make_rois(self):
+        rois = []
+        batch_ids = np.random.randint(0, self.batch_size, size=self.rois_num)
+        for i in range(self.rois_num):
+            x1 = np.random.random_integers(
+                0, self.width / self.spatial_scale - self.pooled_width)
+            y1 = np.random.random_integers(
+                0, self.height / self.spatial_scale - self.pooled_height)
+
+            x2 = np.random.random_integers(x1 + self.pooled_width,
+                                           self.width / self.spatial_scale)
+            y2 = np.random.random_integers(y1 + self.pooled_height,
+                                           self.height / self.spatial_scale)
+
+            roi = [batch_ids[i], x1, y1, x2, y2]
+            rois.append(roi)
+        self.rois = np.array(rois).astype("int64")
+
+    def setUp(self):
+        self.op_type = "roi_pool"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_scale_op.py b/python/paddle/v2/fluid/tests/test_scale_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_scale_op.py
rename to python/paddle/v2/fluid/tests/test_scale_op.py
diff --git a/python/paddle/v2/framework/tests/test_scatter_op.py b/python/paddle/v2/fluid/tests/test_scatter_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_scatter_op.py
rename to python/paddle/v2/fluid/tests/test_scatter_op.py
diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/fluid/tests/test_scope.py
similarity index 81%
rename from python/paddle/v2/framework/tests/test_scope.py
rename to python/paddle/v2/fluid/tests/test_scope.py
index 1474365479..e4857b590a 100644
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/fluid/tests/test_scope.py
@@ -1,22 +1,22 @@
-import paddle.v2.framework.core
+import paddle.v2.fluid.core
 import unittest
 
 
 class TestScope(unittest.TestCase):
     def test_create_destroy(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         self.assertIsNotNone(scope)
         scope_with_parent = scope.new_scope()
         self.assertIsNotNone(scope_with_parent)
 
     def test_none_variable(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         self.assertIsNone(scope.find_var("test"))
 
     def test_create_var_get_var(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         var_a = scope.var("var_a")
         self.assertIsNotNone(var_a)
@@ -25,7 +25,7 @@ class TestScope(unittest.TestCase):
         self.assertIsNotNone(scope2.find_var('var_a'))
 
     def test_var_get_int(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         var = scope.var("test_int")
         var.set_int(10)
diff --git a/python/paddle/v2/framework/tests/test_selected_rows.py b/python/paddle/v2/fluid/tests/test_selected_rows.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_selected_rows.py
rename to python/paddle/v2/fluid/tests/test_selected_rows.py
index e8a930cb08..93daf37aa2 100644
--- a/python/paddle/v2/framework/tests/test_selected_rows.py
+++ b/python/paddle/v2/fluid/tests/test_selected_rows.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/fluid/tests/test_seq_concat_op.py b/python/paddle/v2/fluid/tests/test_seq_concat_op.py
new file mode 100644
index 0000000000..dccc6ed8af
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_seq_concat_op.py
@@ -0,0 +1,104 @@
+import unittest
+import numpy as np
+import sys
+from op_test import OpTest
+exit(0)
+
+
+def to_abs_lod(lod):
+    if len(lod) == 0 or len(lod) == 1:
+        return lod
+    import copy
+    new_lod = copy.deepcopy(lod)
+    for idx, val in enumerate(lod[0]):
+        new_lod[0][idx] = lod[1][val]
+    return new_lod
+
+
+def seq_concat(inputs, level):
+    lod0 = inputs['X'][0][1][1]
+    lod1 = inputs['X'][1][1][1]
+    x0 = inputs['X'][0][1][0]
+    x1 = inputs['X'][1][1][0]
+    level_idx = len(lod0) - level - 1
+    outs = []
+    for i in range(len(lod0[level_idx]) - 1):
+        sub_x0 = x0[to_abs_lod(lod0)[level_idx][i]:to_abs_lod(lod0)[level_idx][
+            i + 1], :]
+        sub_x1 = x1[to_abs_lod(lod1)[level_idx][i]:to_abs_lod(lod1)[level_idx][
+            i + 1], :]
+        outs.append(np.concatenate((sub_x0, sub_x1), axis=0))
+    return np.concatenate(outs, axis=0)
+
+
+class TestSeqConcatOp(OpTest):
+    def set_data(self):
+        # two level, batch size is 3
+        x0 = np.random.random((4, 6, 3)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((4, 8, 3)).astype('float32')
+        lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        axis = 1
+        level = 1
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        self.outputs = {'Out': (np.concatenate([x0, x1], axis=1), lod0)}
+
+    def setUp(self):
+        self.op_type = "sequence_concat"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['x0'], 'Out')
+
+
+class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp):
+    def set_data(self):
+        # two level, batch size is 3
+        x0 = np.random.random((4, 6, 3)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((7, 6, 3)).astype('float32')
+        lod1 = [[0, 2, 4], [0, 1, 3, 5, 7]]
+        axis = 0
+        level = 0
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        out_lod = [[0, 2, 4], [0, 2, 5, 8, 11]]
+        self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
+
+
+class TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp):
+    def set_data(self):
+        # two level, batch size is 3
+        x0 = np.random.random((4, 6, 3)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((7, 6, 3)).astype('float32')
+        lod1 = [[0, 3, 4], [0, 1, 3, 5, 7]]
+        axis = 0
+        level = 1
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        out_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]]
+        self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
+
+
+class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp):
+    def set_data(self):
+        # two level, batch size is 3
+        x0 = np.random.random((4, 3, 4)).astype('float32')
+        lod0 = [[0, 1, 2, 3, 4]]
+        x1 = np.random.random((7, 3, 4)).astype('float32')
+        lod1 = [[0, 1, 3, 5, 7]]
+        axis = 0
+        level = 0
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        out_lod = [[0, 2, 5, 8, 11]]
+        self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_seq_conv.py b/python/paddle/v2/fluid/tests/test_seq_conv.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_seq_conv.py
rename to python/paddle/v2/fluid/tests/test_seq_conv.py
diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/fluid/tests/test_seq_expand.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_seq_expand.py
rename to python/paddle/v2/fluid/tests/test_seq_expand.py
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/fluid/tests/test_seq_pool.py
similarity index 77%
rename from python/paddle/v2/framework/tests/test_seq_pool.py
rename to python/paddle/v2/fluid/tests/test_seq_pool.py
index efc4920124..512d8b315f 100644
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/fluid/tests/test_seq_pool.py
@@ -29,6 +29,9 @@ class TestSeqAvgPool(OpTest):
         self.check_output()
 
     def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
         self.check_grad(["X"], "Out")
 
 
@@ -85,31 +88,53 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D):
             out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))
 
     def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
         self.check_grad(["X"], "Out", max_relative_error=0.06)
 
 
 class TestSeqMaxPool(TestSeqAvgPool):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        for i in range(4):
+            l = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(l), :] += 2.0
+
+        self.inputs = {'X': (x, lod)}
+
+        out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
     def compute(self, x, lod, out):
         self.attrs = {'pooltype': "MAX"}
         for i in range(4):
             sub_x = x[lod[0][i]:lod[0][i + 1], :]
             out[i] = np.amax(sub_x, axis=0)
 
-    def test_check_grad(self):
-        # Remove MaxPool2D from gradient check to confirm the success of CI.
-        return
-
 
 class TestSeqMaxPool2D(TestSeqAvgPool2D):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        self.inputs = {'X': (x, lod)}
+        for i in range(4):
+            l = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(l), :] += 1.0
+
+        out = np.zeros((4, 3, 11)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
     def compute(self, x, lod, out):
         self.attrs = {'pooltype': "MAX"}
         for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
-            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 17))
-
-    def test_check_grad(self):
-        # Remove MaxPool2D from gradient check to confirm the success of CI.
-        return
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
 
 
 class TestSeqLastPool(TestSeqAvgPool):
diff --git a/python/paddle/v2/fluid/tests/test_sequence_slice_op.py b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
new file mode 100644
index 0000000000..ccd9a05343
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
@@ -0,0 +1,47 @@
+import unittest
+import numpy as np
+import sys
+from op_test import OpTest
+
+
+class TestSequenceSliceOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        # only supprot one level LoD
+        x = np.random.random(self.x_dim).astype('float32')
+        lod = self.x_lod
+        offset = np.array(self.offset).astype("int64")
+        length = np.array(self.length).astype("int64")
+
+        self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length}
+        outs = []  #np.zeros((100, 3, 2)).astype('float32')
+        out_lod = [[0]]
+        out_lod_offset = 0
+        for i in range(len(offset)):
+            sub_x = x[lod[0][i] + offset[i, 0]:lod[0][i] + offset[i, 0] +
+                      length[i, 0], :]
+            out_lod_offset = out_lod_offset + len(sub_x)
+            outs.append(sub_x)
+            out_lod[0].append(out_lod_offset)
+        outs = np.concatenate(outs, axis=0)
+        self.outputs = {'Out': (outs, out_lod)}
+
+    def init_test_case(self):
+        self.x_dim = (100, 3, 2)
+        self.x_lod = [[0, 20, 40, 60, 80, 100]]
+        self.offset = [[1], [2], [3], [4], [5]]
+        self.length = [[10], [8], [6], [4], [2]]
+
+    def setUp(self):
+        self.op_type = "sequence_slice"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sequence_softmax_op.py
rename to python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_sgd_op.py
rename to python/paddle/v2/fluid/tests/test_sgd_op.py
index 01262bba4d..ca05a381f0 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 from op_test import OpTest
 
 
diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
new file mode 100644
index 0000000000..86db4c64b4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -0,0 +1,46 @@
+import unittest
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.framework import default_main_program
+import numpy
+
+main_program = default_main_program()
+
+
+class TestShrinkRNNMemory(unittest.TestCase):
+    def test_shrink_rnn_memory(self):
+        x = layers.data('x', shape=[100], dtype='float32')
+        x.stop_gradient = False
+        table = layers.lod_rank_table(x=x)
+        i = layers.zeros(dtype='int64', shape=[1])
+        mem1 = layers.shrink_memory(x=x, i=i, table=table)
+        i = layers.increment(x=i)
+        i.stop_gradient = True
+        mem2 = layers.shrink_memory(x=mem1, i=i, table=table)
+        i = layers.increment(x=i)
+        i.stop_gradient = True
+        mem3 = layers.shrink_memory(x=mem2, i=i, table=table)
+
+        cpu = core.CPUPlace()
+        tensor = core.LoDTensor()
+        tensor.set_lod([[0, 2, 5, 6]])
+        tensor_np = numpy.random.random(size=(3, 100)).astype('float32')
+        tensor.set(tensor_np, cpu)
+        exe = Executor(cpu)
+        outs = exe.run(feed={'x': tensor}, fetch_list=[mem1, mem2, mem3])
+        self.assertTrue(numpy.allclose(tensor_np[0:3], outs[0]))
+        self.assertTrue(numpy.allclose(tensor_np[0:2], outs[1]))
+        self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))
+
+        mem3_mean = layers.mean(x=mem3)
+        append_backward_ops(loss=mem3_mean)
+        x_grad = exe.run(
+            feed={'x': tensor},
+            fetch_list=[main_program.global_block().var('x@GRAD')])[0]
+        self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
similarity index 66%
rename from python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
rename to python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
index e53856b38a..c42f578f72 100644
--- a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -2,11 +2,12 @@ import numpy as np
 from op_test import OpTest
 from scipy.special import logit
 from scipy.special import expit
+import unittest
 
 
 class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
-    '''Test sigmoid_cross_entropy_with_logit_op with binary labels
-    '''
+    """Test sigmoid_cross_entropy_with_logit_op with binary label
+    """
 
     def setUp(self):
         self.op_type = "sigmoid_cross_entropy_with_logits"
@@ -16,16 +17,16 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
             'X': logit(
                 np.random.uniform(0, 1, (batch_size, num_classes))
                 .astype("float32")),
-            'Labels': np.random.randint(0, 2, (batch_size, num_classes))
+            'Label': np.random.randint(0, 2, (batch_size, num_classes))
             .astype("float32")
         }
 
         # Fw Pass is implemented as elementwise sigmoid followed by
         # elementwise logistic loss
-        # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X))
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
         sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Labels'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X)
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
         self.outputs = {'Out': -term1 - term2}
 
     def test_check_output(self):
@@ -36,8 +37,8 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
 
 
 class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
-    '''Test sigmoid_cross_entropy_with_logit_op with probabalistic labels
-    '''
+    """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
+    """
 
     def setUp(self):
         self.op_type = "sigmoid_cross_entropy_with_logits"
@@ -47,16 +48,16 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
             'X': logit(
                 np.random.uniform(0, 1, (batch_size, num_classes))
                 .astype("float32")),
-            'Labels': np.random.uniform(0, 1, (batch_size, num_classes))
+            'Label': np.random.uniform(0, 1, (batch_size, num_classes))
             .astype("float32")
         }
 
         # Fw Pass is implemented as elementwise sigmoid followed by
         # elementwise logistic loss
-        # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X))
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
         sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Labels'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X)
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
         self.outputs = {'Out': -term1 - term2}
 
     def test_check_output(self):
@@ -64,3 +65,7 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sign_op.py b/python/paddle/v2/fluid/tests/test_sign_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sign_op.py
rename to python/paddle/v2/fluid/tests/test_sign_op.py
diff --git a/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py b/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
rename to python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/fluid/tests/test_softmax_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_softmax_op.py
rename to python/paddle/v2/fluid/tests/test_softmax_op.py
diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
similarity index 77%
rename from python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
rename to python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
index f93feb2069..c2f07f9096 100644
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
@@ -12,30 +12,30 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
 
     def setUp(self):
         self.op_type = "softmax_with_cross_entropy"
-        batch_size = 3
+        batch_size = 2
         class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
-        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int32")
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
 
         cross_entropy = np.asmatrix(
             [[-np.log(softmax[i][labels[i][0]])]
              for i in range(softmax.shape[0])],
-            dtype="float32")
+            dtype="float64")
 
         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {
-            "Softmax": softmax.astype('float32'),
-            "Loss": cross_entropy.astype('float32')
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
         }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss")
 
 
 class TestSoftmaxWithCrossEntropyOp2(OpTest):
@@ -49,19 +49,19 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
         class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         labels = np.random.uniform(0.1, 1.0,
-                                   [batch_size, class_num]).astype("float32")
+                                   [batch_size, class_num]).astype("float64")
         labels /= np.sum(labels, axis=1, keepdims=True)
 
         cross_entropy = (-labels * np.log(softmax)).sum(
-            axis=1, keepdims=True).astype("float32")
+            axis=1, keepdims=True).astype("float64")
 
         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {
-            "Softmax": softmax.astype('float32'),
-            "Loss": cross_entropy.astype('float32')
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
         }
         self.attrs = {"soft_label": True}
 
@@ -69,9 +69,8 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss")
 
 
 if __name__ == "__main__":
-    exit(0)  # FIXME: xe has bug
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
new file mode 100644
index 0000000000..f5da4e408f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -0,0 +1,186 @@
+import unittest
+import paddle.v2.fluid.core as core
+import numpy as np
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+
+
+class TestCPULoDTensorArrayOps(unittest.TestCase):
+    def place(self):
+        return core.CPUPlace()
+
+    def test_split_and_merge_lod_tensor_no_lod(self):
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
+
+        mask_np = np.array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, self.place())
+
+        expect_true_tensor = np.array([2, 3, 4, 5]).astype('int32')
+        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
+        expect_true = core.LoDTensor()
+        expect_true.set(expect_true_tensor, self.place())
+
+        expect_false_tensor = np.array([0, 1, 6, 7, 8, 9]).astype('int32')
+        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
+
+        expect_false = core.LoDTensor()
+        expect_false.set(expect_false_tensor, self.place())
+
+        self.main(
+            tensor=tensor,
+            mask=mask,
+            expect_true=expect_true,
+            expect_false=expect_false,
+            expect_out=tensor)
+
+    def test_split_and_merge_lod_tensor_level_0(self):
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 10]])
+
+        mask_np = np.array([0, 1, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, self.place())
+
+        expect_true_tensor = np.array([3, 4, 5, 6, 7, 8]).astype('int32')
+        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
+        expect_true = core.LoDTensor()
+        expect_true.set(expect_true_tensor, self.place())
+        expect_true.set_lod([[0, 6]])
+
+        expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32')
+        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
+        expect_false_lod = [[0, 3, 4]]
+
+        expect_false = core.LoDTensor()
+        expect_false.set(expect_false_tensor, self.place())
+        expect_false.set_lod(expect_false_lod)
+
+        self.main(
+            tensor=tensor,
+            mask=mask,
+            expect_true=expect_true,
+            expect_false=expect_false,
+            expect_out=tensor)
+
+    def main(self, tensor, mask, expect_true, expect_false, expect_out,
+             level=0):
+        place = self.place()
+        program = Program()
+        x = layers.data(name='x', shape=[1], main_program=program)
+        x.persistable = True
+
+        y = layers.data(name='y', shape=[1], main_program=program)
+        y.persistable = True
+
+        out_true, out_false = layers.split_lod_tensor(
+            input=x, mask=y, level=level, main_program=program)
+        out_true.persistable = True
+        out_false.persistable = True
+
+        out = layers.merge_lod_tensor(
+            in_true=out_true,
+            in_false=out_false,
+            mask=y,
+            x=x,
+            level=level,
+            main_program=program)
+
+        out.persistable = True
+
+        exe = Executor(place)
+        scope = core.Scope()
+        exe.run(program,
+                feed={'x': tensor,
+                      'y': mask},
+                scope=scope,
+                return_numpy=False)
+
+        var_true = scope.find_var(out_true.name).get_tensor()
+
+        var_false = scope.find_var(out_false.name).get_tensor()
+
+        var_out = scope.find_var(out.name).get_tensor()
+
+        self.check_tensor_same(var_true, expect_true)
+        self.check_tensor_same(var_false, expect_false)
+        self.check_tensor_same(var_out, expect_out)
+
+    def check_tensor_same(self, actual, expect):
+        self.assertTrue(np.allclose(np.array(actual), np.array(expect)))
+        self.assertEqual(actual.lod(), expect.lod())
+
+
+class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
+    def test_grad(self):
+        place = core.CPUPlace()
+        program = Program()
+
+        x = layers.data(
+            name='x',
+            shape=[1],
+            dtype='float32',
+            main_program=program,
+            stop_gradient=False)
+        y = layers.data(
+            name='y',
+            shape=[1],
+            dtype='bool',
+            main_program=program,
+            stop_gradient=False)
+
+        level = 0
+
+        out_true, out_false = layers.split_lod_tensor(
+            input=x, mask=y, level=level, main_program=program)
+        out = layers.merge_lod_tensor(
+            in_true=out_true,
+            in_false=out_false,
+            mask=y,
+            x=x,
+            level=level,
+            main_program=program)
+        mean = layers.mean(x=out, main_program=program)
+
+        append_backward_ops(mean)
+
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
+        tensor.set_lod([[0, 3, 9, 10]])
+
+        mask_np = np.array([0, 1, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, place)
+
+        exe = Executor(place)
+        scope = core.Scope()
+
+        g_vars = program.global_block().var(x.name + "@GRAD")
+        g_out = [
+            item.sum()
+            for item in map(np.array,
+                            exe.run(program,
+                                    feed={'x': tensor,
+                                          'y': mask},
+                                    fetch_list=[g_vars],
+                                    scope=scope,
+                                    return_numpy=False))
+        ]
+
+        g_out_sum = np.array(g_out).sum()
+
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_split_op.py b/python/paddle/v2/fluid/tests/test_split_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_split_op.py
rename to python/paddle/v2/fluid/tests/test_split_op.py
diff --git a/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py b/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_squared_l2_distance_op.py
rename to python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
diff --git a/python/paddle/v2/framework/tests/test_squared_l2_norm_op.py b/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
rename to python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
diff --git a/python/paddle/v2/framework/tests/test_sum_op.py b/python/paddle/v2/fluid/tests/test_sum_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sum_op.py
rename to python/paddle/v2/fluid/tests/test_sum_op.py
diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/fluid/tests/test_tensor.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_tensor.py
rename to python/paddle/v2/fluid/tests/test_tensor.py
index e0cd2fa8aa..9f870d9eb3 100644
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_tensor.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy
 
diff --git a/python/paddle/v2/framework/tests/test_top_k_op.py b/python/paddle/v2/fluid/tests/test_top_k_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_top_k_op.py
rename to python/paddle/v2/fluid/tests/test_top_k_op.py
diff --git a/python/paddle/v2/framework/tests/test_transpose_op.py b/python/paddle/v2/fluid/tests/test_transpose_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_transpose_op.py
rename to python/paddle/v2/fluid/tests/test_transpose_op.py
diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
similarity index 90%
rename from python/paddle/v2/framework/tests/test_uniform_random_op.py
rename to python/paddle/v2/fluid/tests/test_uniform_random_op.py
index ded777105e..f736dfb2e8 100644
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
@@ -1,6 +1,6 @@
 import unittest
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 import numpy
 
 
diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py
new file mode 100644
index 0000000000..e87f283042
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
@@ -0,0 +1,83 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
+    s0, s1, s2, s3 = input.shape
+    out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
+    out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
+    out = np.zeros((s0, s1, out_hsize, out_wsize))
+    for nidx in xrange(s0):
+        for cidx in xrange(s1):
+            for h in xrange(s2):
+                for w in xrange(s3):
+                    index = indices[nidx, cidx, h, w]
+                    hidx = (index - index % out_wsize) / out_wsize
+                    widx = index % out_wsize
+                    out[nidx, cidx, int(hidx), int(widx)] = \
+                            input[nidx, cidx, h, w]
+
+    return out
+
+
+class TestUnpoolOp(OpTest):
+    def setUp(self):
+        self.op_type = "unpool"
+        self.init_test_case()
+        pre_input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = pre_input.shape
+        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \
+                self.strides[0] + 1
+        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \
+                self.strides[1] + 1
+        input = np.zeros((nsize, csize, hsize_out, wsize_out))
+        indices = np.zeros((nsize, csize, hsize_out, wsize_out))
+        for i in xrange(hsize_out):
+            for j in xrange(wsize_out):
+                r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
+                r_end = np.min((i * self.strides[0] + self.ksize[0] - \
+                        self.paddings[0], hsize))
+                c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
+                c_end = np.min((j * self.strides[1] + self.ksize[1] - \
+                        self.paddings[1], wsize))
+                for nidx in xrange(nsize):
+                    for cidx in xrange(csize):
+                        x_masked = pre_input[nidx, cidx, r_start:r_end, \
+                                c_start:c_end]
+                        input[nidx, cidx, i, j] = x_masked.max()
+                        arg = x_masked.argmax()
+                        indices[nidx, cidx, i, j] = \
+                                (r_start + arg / self.ksize[1]) * wsize + \
+                                c_start + arg % self.ksize[1]
+        output = self.unpool2d_forward_naive(input, indices, self.ksize, \
+                self.strides, self.paddings).astype("float32")
+        self.inputs = {
+            'X': input.astype('float32'),
+            'Indices': indices.astype('int32')
+        }
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'unpooling_type': self.unpooling_type,
+        }
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def init_test_case(self):
+        self.unpool2d_forward_naive = unpool2dmax_forward_naive
+        self.unpooling_type = "max"
+        self.shape = [6, 4, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py
similarity index 81%
rename from python/paddle/v2/framework/tests/test_variable.py
rename to python/paddle/v2/fluid/tests/test_variable.py
index c670ca19af..f1e4c0ba21 100644
--- a/python/paddle/v2/framework/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
@@ -1,13 +1,13 @@
 import unittest
-from paddle.v2.framework.framework import Variable, g_program, Program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
+import paddle.v2.fluid.core as core
 import numpy as np
 
 
 class TestVariable(unittest.TestCase):
     def test_np_dtype_convert(self):
         DT = core.DataType
-        convert = Variable._convert_np_dtype_to_dtype_
+        convert = convert_np_dtype_to_dtype_
         self.assertEqual(DT.FP32, convert(np.float32))
         self.assertEqual(DT.FP16, convert("float16"))
         self.assertEqual(DT.FP64, convert("float64"))
@@ -18,17 +18,17 @@ class TestVariable(unittest.TestCase):
         self.assertRaises(ValueError, lambda: convert("int8"))
 
     def test_var(self):
-        b = g_program.current_block()
+        b = default_main_program().current_block()
         w = b.create_var(
             dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
         self.assertNotEqual(str(w), "")
-        self.assertEqual(core.DataType.FP64, w.data_type)
+        self.assertEqual(core.DataType.FP64, w.dtype)
         self.assertEqual((784, 100), w.shape)
         self.assertEqual("fc.w", w.name)
         self.assertEqual(0, w.lod_level)
 
         w = b.create_var(name='fc.w')
-        self.assertEqual(core.DataType.FP64, w.data_type)
+        self.assertEqual(core.DataType.FP64, w.dtype)
         self.assertEqual((784, 100), w.shape)
         self.assertEqual("fc.w", w.name)
         self.assertEqual(0, w.lod_level)
diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
new file mode 100644
index 0000000000..033b03a495
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -0,0 +1,66 @@
+import unittest
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.backward import append_backward_ops
+import numpy
+
+
+class TestWhileOp(unittest.TestCase):
+    def test_simple_forward(self):
+        d0 = layers.data(
+            "d0", shape=[10], append_batch_size=False, dtype='float32')
+        d1 = layers.data(
+            "d1", shape=[10], append_batch_size=False, dtype='float32')
+        d2 = layers.data(
+            "d2", shape=[10], append_batch_size=False, dtype='float32')
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = True
+        init = layers.zeros(shape=[10], dtype='float32')
+        mem_array = layers.array_write(x=init, i=i)
+        data_array = layers.array_write(x=d0, i=i)
+
+        i = layers.increment(i)
+        layers.array_write(d1, i, array=data_array)
+
+        i = layers.increment(i)
+        layers.array_write(d2, i, array=data_array)
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = True
+
+        array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
+        array_len.stop_gradient = True
+        cond = layers.less_than(x=i, y=array_len)
+
+        while_op = layers.While(cond=cond)
+        with while_op.block():
+            d = layers.array_read(array=data_array, i=i)
+            prev = layers.array_read(array=mem_array, i=i)
+            result = layers.sums(input=[d, prev])
+
+            i = layers.increment(x=i, in_place=True)
+            layers.array_write(result, i=i, array=mem_array)
+            layers.less_than(x=i, y=array_len, cond=cond)
+
+        sum_result = layers.array_read(array=mem_array, i=i)
+        loss = layers.mean(x=sum_result)
+
+        append_backward_ops(loss)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        d = []
+
+        for i in xrange(3):
+            d.append(numpy.random.random(size=[10]).astype('float32'))
+
+        outs = exe.run(feed={'d0': d[0],
+                             'd1': d[1],
+                             'd2': d[2]},
+                       fetch_list=[sum_result])
+        self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/__init__.py b/python/paddle/v2/framework/__init__.py
deleted file mode 100644
index 5df612bf35..0000000000
--- a/python/paddle/v2/framework/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import sys
-import core
-__all__ = ['proto']
-argv = []
-if core.is_compile_gpu():
-    argv = list(sys.argv) + [
-        "--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"
-    ]
-else:
-    argv = list(sys.argv) + ["--tryfromenv=use_pinned_memory"]
-core.init_gflags(argv)
diff --git a/python/paddle/v2/framework/evaluator.py b/python/paddle/v2/framework/evaluator.py
deleted file mode 100644
index 254dd5f1a3..0000000000
--- a/python/paddle/v2/framework/evaluator.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import paddle.v2.framework.op as op
-import numpy as np
-import paddle.v2.framework.core as core
-
-
-def avg_accumulate(accumulated_var, per_eval, num_batches, place):
-    t = np.array(accumulated_var.get_tensor())
-    t[0] += per_eval[0]
-    accumulated_var.get_tensor().set([t[0] / float(num_batches)], place)
-
-
-class Evaluator(object):
-    def __init__(self,
-                 scope,
-                 operator='accuracy',
-                 input='Inference',
-                 label='Label',
-                 output='Output',
-                 place=core.CPUPlace()):
-        """
-        create an evaluator for evaluating the inference.
-        NOTE: default run on CPUPlace(), running on GPUPlace doesn't improve performance much.
-
-        :param scope: the scope instance contains the input.
-        :type scope: paddle.v2.framework.core.scope
-        :param operator: operator name for caculating the evaluation for each mini-batch.
-        :type operator: string
-        :param input: output variable name of forward network.
-        :type input: string
-        :param label: variable name of label
-        :type label: string
-        """
-        self.scope = scope
-        self.place = place
-        self.output_name = output
-        self.num_batches = 0
-        # create variable to store accumulated evaluator output
-        eval_name = ''.join([operator, "@Eval"])
-        if scope.find_var(eval_name):
-            raise Exception("evaluator already exist in scope: %s" % eval_name)
-        self.accumulated_var = scope.var(eval_name)
-        t = self.accumulated_var.get_tensor()
-        t.set_dims((1, ))
-        t.set([0.0], place)
-        # self.accumulated_var = block.create_var(block, name=eval_name, shape=(1,))
-        # self.accumulated_var.get_tensor().set([0.0])
-        # create operator of evaluation
-        var_map = dict()  # var name -> variable
-        var_map[input] = [input]
-        var_map[label] = [label]
-        var_map[output] = [output]
-        self.op = op.Operator(operator, **var_map)
-
-    def evaluate(self, ctx, accumulator=avg_accumulate):
-        self.op.run(self.scope, ctx)
-        per_eval = np.array(self.scope.find_var(self.output_name).get_tensor())
-        self.num_batches += 1
-        accumulator(self.accumulated_var, per_eval, self.num_batches,
-                    self.place)
diff --git a/python/paddle/v2/framework/executor.py b/python/paddle/v2/framework/executor.py
deleted file mode 100644
index 8268d0d8f5..0000000000
--- a/python/paddle/v2/framework/executor.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Block, Program
-
-g_scope = core.Scope()
-
-
-class Executor(object):
-    def __init__(self, places):
-        if not isinstance(places, list) and not isinstance(places, tuple):
-            places = [places]
-
-        act_places = []
-        for each in places:
-            p = core.Place()
-            p.set_place(each)
-            act_places.append(p)
-
-        self.executor = core.Executor(act_places)
-
-    def run(self,
-            program,
-            feed=None,
-            fetch_list=None,
-            feed_var_name='feed',
-            fetch_var_name='fetch',
-            scope=None):
-        if feed is None:
-            feed = {}
-        if fetch_list is None:
-            fetch_list = []
-
-        if not isinstance(program, Program):
-            raise TypeError()
-
-        if scope is None:
-            scope = g_scope
-
-        program = program.clone()
-        global_block = program.global_block()
-        feed_var = global_block.create_var(
-            name=feed_var_name,
-            type=core.VarDesc.VarType.FEED_MINIBATCH,
-            persistable=True)
-
-        for i, name in enumerate(feed):
-            out = global_block.var(name)
-            global_block.prepend_op(
-                'feed',
-                inputs={'X': [feed_var]},
-                outputs={'Out': [out]},
-                attrs={'col': i})
-            core.set_feed_variable(scope, feed[name], feed_var.name, i)
-
-        fetch_var = global_block.create_var(
-            name=fetch_var_name,
-            type=core.VarDesc.VarType.FETCH_LIST,
-            persistable=True)
-        for i, var in enumerate(fetch_list):
-            global_block.append_op(
-                type='fetch',
-                inputs={'X': [var]},
-                outputs={'Out': [fetch_var]},
-                attrs={'col': i})
-
-        self.executor.run(program.desc, scope, 0, True)
-        return [
-            core.get_fetch_variable(scope, fetch_var_name, i)
-            for i in xrange(len(fetch_list))
-        ]
diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py
deleted file mode 100644
index 8b7d6fc32b..0000000000
--- a/python/paddle/v2/framework/layers.py
+++ /dev/null
@@ -1,750 +0,0 @@
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, Operator
-from paddle.v2.framework.initializer import ConstantInitializer, NormalInitializer
-from paddle.v2.framework.layer_helper import LayerHelper, unique_name
-import re
-
-__all__ = [
-    'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
-    'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim',
-    'batch_norm', 'accuracy'
-]
-
-
-def fc(input,
-       size,
-       param_attr=None,
-       bias_attr=True,
-       name=None,
-       act=None,
-       num_flatten_dims=1,
-       program=None,
-       init_program=None):
-    # create helper
-    helper = LayerHelper('fc', **locals())
-
-    dtype = helper.input_dtype()
-
-    # mul
-    mul_results = []
-    for input_var, param_attr in helper.iter_inputs_and_params():
-        input_shape = input_var.shape
-        param_shape = [
-            reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
-        ] + [size]
-        w = helper.create_parameter(
-            attr=param_attr, shape=param_shape, dtype=dtype)
-        tmp = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="mul",
-            inputs={
-                "X": input_var,
-                "Y": w,
-            },
-            outputs={"Out": tmp},
-            attrs={'x_num_col_dims': num_flatten_dims,
-                   'y_num_col_dims': 1})
-        mul_results.append(tmp)
-
-    # sum
-    if len(mul_results) == 1:
-        pre_bias = mul_results[0]
-    else:
-        pre_bias = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
-    # add bias
-    pre_activation = helper.append_bias_op(pre_bias)
-    # add activation
-    return helper.append_activation(pre_activation)
-
-
-def embedding(input,
-              size,
-              data_type='float32',
-              is_sparse=False,
-              param_attr=None,
-              program=None,
-              init_program=None):
-    helper = LayerHelper('embedding', **locals())
-    w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=data_type)
-    tmp = helper.create_tmp_variable(data_type)
-    helper.append_op(
-        type='lookup_table',
-        inputs={'Ids': input,
-                'W': w},
-        outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse})
-    return tmp
-
-
-def data(name,
-         shape,
-         data_type='float32',
-         type=core.VarDesc.VarType.LOD_TENSOR,
-         append_batch_size=True,
-         program=None,
-         init_program=None):
-    helper = LayerHelper('data', **locals())
-    shape = list(shape)
-    for i in xrange(len(shape)):
-        if shape[i] is None:
-            shape[i] = -1
-            append_batch_size = False
-        elif shape[i] < 0:
-            append_batch_size = False
-
-    if append_batch_size:
-        shape = [-1] + shape  # append batch size as -1
-
-    return helper.create_global_variable(
-        name=name, shape=shape, dtype=data_type, type=type)
-
-
-def _convert_(name):
-    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
-    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
-
-
-def _create_op_func_(op_type):
-    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
-    not_intermediate_outputs = \
-        filter(lambda output: not output.intermediate, op_proto.outputs)
-    intermediate_outputs = \
-        filter(lambda output: output.intermediate, op_proto.outputs)
-
-    if len(not_intermediate_outputs) != 1:
-        raise ValueError(
-            "Only one not intermediate output operator can be automatically generated"
-        )
-
-    if not_intermediate_outputs[0].duplicable:
-        raise ValueError(
-            "Only not duplicable op can be automatically generated")
-
-    for output in intermediate_outputs:
-        if output.duplicable:
-            raise ValueError(
-                "Only when all intermediate ops are not duplicable, "
-                "this op can be automatically generated")
-
-    o_name = not_intermediate_outputs[0].name
-    intermediate_output_names = [output.name for output in intermediate_outputs]
-
-    def func(**kwargs):
-        helper = LayerHelper(op_type, **kwargs)
-        inputs = dict()
-        dtype = None
-        for ipt in op_proto.inputs:
-            name = _convert_(ipt.name)
-            val = kwargs.pop(name, [])
-            if not isinstance(val, list) and not isinstance(val, tuple):
-                val = [val]
-            for each in val:
-                if not isinstance(each, Variable):
-                    raise ValueError("input of {0} must be variable".format(
-                        op_type))
-
-                if dtype is None:
-                    dtype = each.data_type
-                elif dtype != each.data_type:
-                    raise ValueError(
-                        "operator {0} must input same dtype".format(op_type))
-            inputs[ipt.name] = val
-
-        outputs = dict()
-        out = helper.create_tmp_variable(dtype=dtype)
-        outputs[o_name] = [out]
-        for name in intermediate_output_names:
-            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
-        helper.append_op(
-            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
-        return helper.append_activation(out)
-
-    func.__name__ = op_type
-    globals()[op_type] = func
-    global __all__
-    __all__.append(op_type)
-
-
-_create_op_func_('mean')
-_create_op_func_('mul')
-_create_op_func_('elementwise_add')
-_create_op_func_('dropout')
-_create_op_func_('reshape')
-_create_op_func_('elementwise_add')
-_create_op_func_('sigmoid')
-_create_op_func_('scale')
-
-
-def cast(x, data_type, program=None):
-    helper = LayerHelper('cast', **locals())
-    out = helper.create_tmp_variable(dtype=data_type)
-    helper.append_op(
-        type='cast',
-        inputs={'X': [x]},
-        outputs={'Out': [out]},
-        attrs={'in_data_type': x.data_type,
-               'out_data_type': out.data_type})
-    return out
-
-
-def concat(input, axis, program=None, init_program=None):
-    helper = LayerHelper('concat', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(
-        type='concat',
-        inputs={'X': input},
-        outputs={'Out': [out]},
-        attrs={'axis': axis})
-    return out
-
-
-def sums(input, program=None, init_program=None):
-    helper = LayerHelper('sum', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
-    return out
-
-
-def cos_sim(X, Y, **kwargs):
-    helper = LayerHelper('cos_sim', **kwargs)
-    out = helper.create_tmp_variable(dtype=X.data_type)
-    xnorm = helper.create_tmp_variable(dtype=X.data_type)
-    ynorm = helper.create_tmp_variable(dtype=X.data_type)
-    helper.append_op(
-        type='cos_sim',
-        inputs={'X': [X],
-                'Y': [Y]},
-        outputs={'Out': [out],
-                 'XNorm': [xnorm],
-                 'YNorm': [ynorm]})
-    return out
-
-
-def cross_entropy(input, label, **kwargs):
-    helper = LayerHelper('cross_entropy', **kwargs)
-    out = helper.create_tmp_variable(dtype=input.data_type)
-    helper.append_op(
-        type='cross_entropy',
-        inputs={'X': [input],
-                'Label': [label]},
-        outputs={'Y': [out]},
-        attrs=kwargs)
-    return out
-
-
-def square_error_cost(input, label, **kwargs):
-    helper = LayerHelper('square_error_cost', **kwargs)
-    minus_out = helper.create_tmp_variable(dtype=input.data_type)
-    helper.append_op(
-        type='elementwise_sub',
-        inputs={'X': [input],
-                'Y': [label]},
-        outputs={'Out': [minus_out]})
-
-    square_out = helper.create_tmp_variable(dtype=input.data_type)
-    helper.append_op(
-        type='square', inputs={'X': [minus_out]}, outputs={'Y': [square_out]})
-    return square_out
-
-
-def accuracy(input, label, k=1, **kwargs):
-    helper = LayerHelper("accuracy", **kwargs)
-    topk_out = helper.create_tmp_variable(dtype=input.data_type)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": k})
-    acc_out_dtype = kwargs.get("out_dtype", "float32")
-    acc_out = helper.create_tmp_variable(dtype=acc_out_dtype)
-    helper.append_op(
-        type="accuracy",
-        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
-        },
-        outputs={"Accuracy": [acc_out]})
-    return acc_out
-
-
-def sequence_conv(input,
-                  num_filters,
-                  filter_size=3,
-                  filter_stride=1,
-                  act=None,
-                  padding=None,
-                  bias_attr=None,
-                  param_attr=None,
-                  program=None,
-                  init_program=None):
-    # FIXME(dzh) : want to unify the argument of python layer
-    # function. So we ignore some unecessary attributes.
-    # such as, padding_trainable, context_start.
-
-    helper = LayerHelper('sequence_conv', **locals())
-    dtype = helper.input_dtype()
-
-    filter_shape = [filter_size * input.shape[1], num_filters]
-    filter = helper.create_parameter(
-        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-    pre_bias = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type='sequence_conv',
-        inputs={
-            'X': [input],
-            'Filter': [filter],
-        },
-        outputs={"Out": pre_bias},
-        attrs={
-            'contextStride': filter_stride,
-            'contextStart': -int(filter_size / 2),
-            'contextLength': filter_size
-        })
-    pre_act = helper.append_bias_op(pre_bias)
-    return helper.append_activation(pre_act)
-
-
-def conv2d(input,
-           num_filters,
-           name=None,
-           filter_size=[1, 1],
-           act=None,
-           groups=None,
-           stride=[1, 1],
-           padding=None,
-           bias_attr=None,
-           param_attr=None,
-           program=None,
-           init_program=None):
-    helper = LayerHelper('conv2d', **locals())
-    dtype = helper.input_dtype()
-
-    num_channels = input.shape[1]
-    if groups is None:
-        num_filter_channels = num_channels
-    else:
-        if num_channels % groups is not 0:
-            raise ValueError("num_channels must be divisible by groups.")
-        num_filter_channels = num_channels / groups
-
-    if isinstance(filter_size, int):
-        filter_size = [filter_size, filter_size]
-    if isinstance(stride, int):
-        stride = [stride, stride]
-    if isinstance(padding, int):
-        padding = [padding, padding]
-
-    input_shape = input.shape
-    filter_shape = [num_filters, num_filter_channels] + filter_size
-
-    std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
-    filter = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=filter_shape,
-        dtype=dtype,
-        initializer=NormalInitializer(0.0, std, 0))
-    pre_bias = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type='conv2d',
-        inputs={
-            'Input': input,
-            'Filter': filter,
-        },
-        outputs={"Output": pre_bias},
-        attrs={'strides': stride,
-               'paddings': padding,
-               'groups': groups})
-
-    pre_act = helper.append_bias_op(pre_bias, 1)
-
-    return helper.append_activation(pre_act)
-
-
-def sequence_pool(input, pool_type, **kwargs):
-    helper = LayerHelper('sequence_pool', input=input, **kwargs)
-    dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type="sequence_pool",
-        inputs={"X": [input]},
-        outputs={"Out": [pool_out]},
-        attrs={"pooltype": pool_type.upper()})
-
-    return pool_out
-
-
-def pool2d(input,
-           pool_size,
-           pool_type,
-           pool_stride=[1, 1],
-           pool_padding=[0, 0],
-           global_pooling=False,
-           program=None,
-           init_program=None):
-    if pool_type not in ["max", "avg"]:
-        raise ValueError(
-            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
-            str(pool_type))
-    if isinstance(pool_size, int):
-        pool_size = [pool_size, pool_size]
-    if isinstance(pool_stride, int):
-        pool_stride = [pool_stride, pool_stride]
-    if isinstance(pool_padding, int):
-        pool_padding = [pool_padding, pool_padding]
-
-    helper = LayerHelper('pool2d', **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type="pool2d",
-        inputs={"X": input},
-        outputs={"Out": pool_out},
-        attrs={
-            "poolingType": pool_type,
-            "ksize": pool_size,
-            "globalPooling": global_pooling,
-            "strides": pool_stride,
-            "paddings": pool_padding
-        })
-
-    return pool_out
-
-
-def batch_norm(input,
-               act=None,
-               is_test=False,
-               momentum=0.9,
-               epsilon=1e-05,
-               param_attr=None,
-               bias_attr=None,
-               data_layout='NCHW',
-               program=None,
-               init_program=None):
-    helper = LayerHelper('batch_norm', **locals())
-    dtype = helper.input_dtype()
-
-    input_shape = input.shape
-    if data_layout == 'NCHW':
-        channel_num = input_shape[1]
-    else:
-        if data_layout == 'NHWC':
-            channel_num = input_shape[-1]
-        else:
-            raise ValueError("unsupported data layout:" + data_layout)
-
-    param_shape = [channel_num]
-
-    # create parameter
-    scale = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        initializer=ConstantInitializer(1.0))
-    bias = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        initializer=ConstantInitializer(0.0))
-
-    mean = helper.create_global_variable(
-        dtype=input.data_type, shape=param_shape, persistable=True)
-    helper.set_variable_initializer(
-        var=mean, initializer=ConstantInitializer(0.0))
-
-    variance = helper.create_global_variable(
-        dtype=input.data_type, shape=param_shape, persistable=True)
-    helper.set_variable_initializer(
-        var=variance, initializer=ConstantInitializer(1.0))
-
-    # create output
-    # mean and mean_out share the same memory
-    mean_out = mean
-    # variance and variance out share the same memory
-    variance_out = variance
-    saved_mean = helper.create_tmp_variable(dtype)
-    saved_variance = helper.create_tmp_variable(dtype)
-
-    batch_norm_out = helper.create_tmp_variable(dtype)
-
-    helper.append_op(
-        type="batch_norm",
-        inputs={
-            "X": input,
-            "Scale": scale,
-            "Bias": bias,
-            "Mean": mean,
-            "Variance": variance
-        },
-        outputs={
-            "Y": batch_norm_out,
-            "MeanOut": mean_out,
-            "VarianceOut": variance_out,
-            "SavedMean": saved_mean,
-            "SavedVariance": saved_variance
-        },
-        attrs={"momentum": momentum,
-               "epsilon": epsilon,
-               "is_test": is_test})
-
-    return helper.append_activation(batch_norm_out)
-
-
-class BlockGuard(object):
-    """
-    BlockGuard used to create sub-block in program by using Python `with` 
-    keyword.
-    """
-
-    def __init__(self, program):
-        if not isinstance(program, Program):
-            raise TypeError("BlockGuard takes a program")
-        self.program = program
-
-    def __enter__(self):
-        self.program.create_block()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.program.rollback()
-        if exc_type is not None:
-            return False  # re-raise exception
-        return True
-
-
-class StaticRNNGuard(BlockGuard):
-    def __init__(self, rnn):
-        if not isinstance(rnn, StaticRNN):
-            raise TypeError("StaticRNNGuard takes an StaticRNN")
-        super(StaticRNNGuard, self).__init__(rnn.helper.program)
-        self.rnn = rnn
-
-    def __enter__(self):
-        self.rnn.status = StaticRNN.IN_RNN_BLOCK
-        return super(StaticRNNGuard, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
-        self.rnn.complete_rnn_op()
-        return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb)
-
-
-class StaticRNNMemoryLink(object):
-    """
-    :param init: the initial variable for Memory
-    :type init: Variable
-    :param pre_mem: the memory variable in previous time step
-    :type pre_mem: Variable
-    :param mem: the memory variable in current time step
-    :type mem: Variable
-    """
-
-    def __init__(self, init, pre_mem, mem=None):
-        self.init = init
-        self.pre_mem = pre_mem
-        self.mem = mem
-
-
-class StaticRNN(object):
-    BEFORE_RNN_BLOCK = 0
-    IN_RNN_BLOCK = 1
-    AFTER_RNN_BLOCK = 2
-
-    def __init__(self, name=None, program=None):
-        self.helper = LayerHelper("static_rnn", name=name, program=program)
-        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
-        self.inputs = []  # input variable list in current block
-        self.outputs = []  # output variable list in parent block
-        self.status = StaticRNN.BEFORE_RNN_BLOCK  # status flag.
-        # sequence length, since it is a static RNN, sequence length are fixed.
-        self.seq_len = None
-
-    def step(self):
-        return StaticRNNGuard(self)
-
-    def _assert_in_rnn_block_(self, method):
-        if self.status != StaticRNN.IN_RNN_BLOCK:
-            raise ValueError("You must invoke {0} in rnn block".format(method))
-
-    def memory(self, init=None, shape=None, dtype=None, init_value=0):
-        self._assert_in_rnn_block_('memory')
-        if init is None:
-            if shape is None or dtype is None:
-                raise ValueError(
-                    "if init is None, memory at least need shape and dtype")
-            parent_block = self.parent_block()
-            var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
-            boot_var = parent_block.create_var(
-                name=var_name, shape=shape, dtype=dtype, persistable=False)
-
-            parent_block.append_op(
-                type="fill_constant",
-                inputs={},
-                outputs={'Out': [boot_var]},
-                attrs={
-                    'value': init_value,
-                    'shape': [40] + list(boot_var.shape[1:]),
-                    'data_type': boot_var.data_type
-                })
-
-            return self.memory(init=boot_var)
-        else:
-            pre_mem = self.helper.create_variable(
-                name=unique_name("@".join([self.helper.name, "mem"])),
-                dtype=init.data_type,
-                shape=init.shape)
-            self.memories[pre_mem.name] = StaticRNNMemoryLink(
-                init=init, pre_mem=pre_mem)
-            return pre_mem
-
-    def step_input(self, x):
-        self._assert_in_rnn_block_('step_input')
-        if not isinstance(x, Variable):
-            raise TypeError("step input takes a Variable")
-        if self.seq_len is None:
-            self.seq_len = x.shape[0]
-        elif self.seq_len != x.shape[0]:
-            raise ValueError("Static RNN only take fix seq_len input")
-
-        ipt = self.helper.create_variable(
-            name=x.name,
-            dtype=x.data_type,
-            shape=list(x.shape[1:]),
-            type=x.type)
-        self.inputs.append(ipt)
-        return ipt
-
-    def step_output(self, o):
-        self._assert_in_rnn_block_('step_output')
-        if not isinstance(o, Variable):
-            raise TypeError("step output takes a Variable")
-
-        tmp_o = self.helper.create_tmp_variable(dtype=o.data_type)
-        self.helper.append_op(
-            type='rnn_memory_helper',
-            inputs={'X': [o]},
-            outputs={'Out': tmp_o},
-            attrs={'data_type': o.data_type})
-
-        out_var = self.parent_block().create_var(
-            name=tmp_o.name,
-            shape=[self.seq_len] + list(tmp_o.shape),
-            dtype=tmp_o.data_type)
-
-        self.outputs.append(out_var)
-
-    def output(self, *outputs):
-        for each in outputs:
-            self.step_output(each)
-
-    def update_memory(self, mem, var):
-        if not isinstance(mem, Variable) or not isinstance(var, Variable):
-            raise TypeError("update memory should take variables")
-        self.memories[mem.name].mem = var
-
-    def parent_block(self):
-        prog = self.helper.program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-        return parent_block
-
-    def __call__(self, *args, **kwargs):
-        if self.status != StaticRNN.AFTER_RNN_BLOCK:
-            raise ValueError("RNN output can only be retrieved after rnn block")
-        if len(self.outputs) == 0:
-            raise ValueError("RNN has no output")
-        elif len(self.outputs) == 1:
-            return self.outputs[0]
-        else:
-            return self.outputs
-
-    def complete_rnn_op(self):
-        program = self.helper.program
-        rnn_block = program.current_block()
-        parent_block = self.parent_block()
-
-        local_inputs = set()
-
-        for op in rnn_block.ops:
-            assert isinstance(op, Operator)
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-
-        for var in self.inputs:
-            local_inputs.add(var.name)
-        for m in self.memories:
-            local_inputs.add(m)
-
-        params = list()
-        for op in rnn_block.ops:
-            assert isinstance(op, Operator)
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in local_inputs:
-                        params.append(in_var_name)
-
-        parameters = [parent_block.var(name) for name in params]
-
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        inlinks = [parent_block.var(i.name) for i in self.inputs]
-        outlinks = self.outputs
-
-        boot_memories = []
-        pre_memories = []
-        memories = []
-        for _, mem in self.memories.iteritems():
-            boot_memories.append(mem.init)
-            pre_memories.append(mem.pre_mem.name)
-            mem_var = rnn_block.var(mem.mem.name)
-            assert isinstance(mem_var, Variable)
-            new_mem = self.helper.create_tmp_variable(dtype=mem_var.data_type)
-
-            rnn_block.append_op(
-                type='rnn_memory_helper',
-                inputs={'X': [mem_var]},
-                outputs={'Out': [new_mem]},
-                attrs={'data_type': mem_var.data_type})
-
-            memories.append(new_mem.name)
-
-        parent_block.append_op(
-            type='recurrent',
-            inputs={
-                'inputs': inlinks,
-                'initial_states': boot_memories,
-                'parameters': parameters
-            },
-            outputs={'outputs': outlinks,
-                     'step_scopes': [step_scope]},
-            attrs={
-                'ex_states': pre_memories,
-                'states': memories,
-                'step_block': rnn_block
-            })
-
-
-def lod_rank_table(x, level=0, program=None):
-    helper = LayerHelper("lod_rank_table", **locals())
-    table = helper.create_variable(
-        type=core.VarDesc.VarType.LOD_RANK_TABLE,
-        name=unique_name("lod_rank_table"))
-    helper.append_op(
-        type='lod_rank_table',
-        inputs={'X': x},
-        outputs={'Out': table},
-        attrs={'level': level})
-    return table
diff --git a/python/paddle/v2/framework/tests/test_adagrad_op.py b/python/paddle/v2/framework/tests/test_adagrad_op.py
deleted file mode 100644
index 66bad349e5..0000000000
--- a/python/paddle/v2/framework/tests/test_adagrad_op.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestAdagradOp1(OpTest):
-    ''' Test Adagrad operator with explicit attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "adagrad"
-
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-        lr = 0.01
-        epsilon = 1e-8
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment': moment,
-            'LearningRate': np.array([lr]).astype("float32")
-        }
-
-        self.attrs = {'epsilon': epsilon}
-
-        moment_out = moment + grad * grad
-        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
-
-        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestAdagradOp2(OpTest):
-    ''' Test Adagrad operator with default attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "adagrad"
-
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-        lr = 0.01
-        epsilon = 1e-6
-
-        self.inputs = {
-            'Param': param,
-            'Grad': grad,
-            'Moment': moment,
-            'LearningRate': np.array([lr]).astype("float32")
-        }
-
-        self.attrs = {'epsilon': epsilon}
-
-        moment_out = moment + grad * grad
-        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
-
-        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_conv2d_op.py b/python/paddle/v2/framework/tests/test_conv2d_op.py
deleted file mode 100644
index f58b96463c..0000000000
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ /dev/null
@@ -1,123 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-def conv2d_forward_naive(input, filter, group, conv_param):
-    in_n, in_c, in_h, in_w = input.shape
-    out_c, f_c, f_h, f_w = filter.shape
-    assert f_c * group == in_c
-    assert np.mod(out_c, group) == 0
-    sub_out_c = out_c / group
-
-    stride, pad = conv_param['stride'], conv_param['pad']
-    out_h = 1 + (in_h + 2 * pad[0] - f_h) / stride[0]
-    out_w = 1 + (in_w + 2 * pad[1] - f_w) / stride[1]
-    out = np.zeros((in_n, out_c, out_h, out_w))
-
-    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
-                       mode='constant',
-                       constant_values=0)
-    for i in range(out_h):
-        for j in range(out_w):
-            for g in range(group):
-                input_pad_masked = \
-                    input_pad[:, g * f_c:(g + 1) * f_c,
-                    i * stride[0]:i * stride[0] + f_h,
-                    j * stride[1]:j * stride[1] + f_w]
-
-                f_sub = filter[g * sub_out_c:(g + 1) * sub_out_c, :, :, :]
-                for k in range(sub_out_c):
-                    out[:, g * sub_out_c + k, i, j] = \
-                        np.sum(input_pad_masked * f_sub[k, :, :, :],
-                               axis=(1, 2, 3))
-
-    return out
-
-
-class TestConv2dOp(OpTest):
-    def setUp(self):
-        self.init_op_type()
-        self.init_group()
-        self.init_test_case()
-
-        conv2d_param = {'stride': self.stride, 'pad': self.pad}
-        input = np.random.random(self.input_size).astype("float32")
-        filter = np.random.random(self.filter_size).astype("float32")
-        output = conv2d_forward_naive(input, filter, self.groups,
-                                      conv2d_param).astype('float32')
-
-        self.inputs = {'Input': input, 'Filter': filter}
-        self.attrs = {
-            'strides': self.stride,
-            'paddings': self.pad,
-            'groups': self.groups,
-            'dilations': self.dilations
-        }
-        self.outputs = {'Output': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
-
-    def test_check_grad_no_filter(self):
-        self.check_grad(
-            ['Input'],
-            'Output',
-            max_relative_error=0.05,
-            no_grad_set=set(['Filter']))
-
-    def test_check_grad_no_input(self):
-        self.check_grad(
-            ['Filter'],
-            'Output',
-            max_relative_error=0.05,
-            no_grad_set=set(['Input']))
-
-    def init_test_case(self):
-        # self.groups = 1
-        # self.op_type = "conv2d"
-        self.pad = [0, 0]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] / self.groups
-        self.filter_size = [6, f_c, 3, 3]
-
-    def init_group(self):
-        self.groups = 1
-
-    def init_op_type(self):
-        self.op_type = "conv2d"
-
-
-class TestWithGroup(TestConv2dOp):
-    def init_group(self):
-        self.groups = 3
-
-    def init_op_type(self):
-        self.op_type = "conv2d"
-
-
-class TestCudnn(TestConv2dOp):
-    def init_group(self):
-        self.groups = 1
-
-    def init_op_type(self):
-        self.op_type = "conv_cudnn"
-
-
-class TestCudnnWithGroup(TestConv2dOp):
-    def init_group(self):
-        self.groups = 3
-
-    def init_op_type(self):
-        self.op_type = "conv_cudnn"
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
deleted file mode 100644
index 70af9dbc49..0000000000
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import logging
-import paddle.v2.framework.core as core
-import unittest
-from paddle.v2.framework.op import Operator, DynamicRecurrentOp
-import numpy as np
-
-# for siplicity, just one level LoD
-lod_py = [[0, 4, 7, 9, 10]]
-input_dim = 30
-num_sents = len(lod_py[0]) - 1
-weight_dim = 15
-
-
-def create_tensor(scope, name, shape, np_data):
-    tensor = scope.var(name).get_tensor()
-    tensor.set_dims(shape)
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-
-
-class PyRNNStep(object):
-    def __init__(self):
-
-        self.x = np.random.normal(size=(lod_py[0][-1],
-                                        input_dim)).astype("float32")
-        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
-        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
-        self.h_boot = np.random.normal(size=(num_sents,
-                                             input_dim)).astype("float32")
-
-
-class DynamicRecurrentOpTest(unittest.TestCase):
-    '''
-    Test RNNOp
-
-    equation:
-        h_t = \sigma (W x_t + U h_{t-1})
-    weights:
-        - W
-        - U
-    vars:
-        - x
-    states:
-        - h
-    outputs:
-       - h
-    '''
-
-    py = PyRNNStep()
-
-    def forward(self):
-        self.scope = core.Scope()
-        self.create_global_variables()
-        self.create_rnn_op()
-        self.create_step_net()
-        ctx = core.DeviceContext.create(core.CPUPlace())
-        self.rnnop.run(self.scope, ctx)
-        state = self.rnnop.get_state("h@state")
-        print 'state size: ', state.size()
-
-        step_inputs = self.rnnop.get_step_input("x")
-        print "x size ", step_inputs.size()
-        for i in range(step_inputs.size()):
-            print "x %d" % i, np.array(step_inputs.read(i).get_dims())
-        step_outputs = self.rnnop.get_step_output('h@state')
-        print 'step_outputs.size ', step_outputs.size()
-        output = self.scope.find_var("h@state").get_tensor()
-        print 'output', np.array(output).shape
-
-    def create_global_variables(self):
-        # create inlink
-        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
-                                 self.py.x)
-        x_tensor.set_lod(lod_py)
-        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
-        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
-        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
-                      self.py.h_boot)
-        self.scope.var("step_scopes")
-        self.scope.var("h@state")
-
-    def create_rnn_op(self):
-        # create RNNOp
-        self.rnnop = DynamicRecurrentOp(
-            # inputs
-            inputs=["x"],
-            initial_states=["h_boot"],
-            step_net="step_unit",
-            # outputs
-            outputs=["h@state"],
-            step_scopes="step_scopes",
-            # attributes
-            ex_states=["h@pre"],
-            states=["h@state"])
-
-    def create_step_net(self):
-        step_unit = core.Net.create()
-        x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
-        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
-        sig_op = Operator("sigmoid", X="sum", Y="h@state")
-
-        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
-            step_unit.append_op(op)
-        step_unit.complete_add_op(True)
-        self.rnnop.set_step_unit(step_unit)
-
-    def test_forward(self):
-        print 'test recurrent op forward'
-        pd_output = self.forward()
-        print 'pd_output', pd_output
-
-
-class RecurrentGradientOpTest(unittest.TestCase):
-    py = PyRNNStep()
-
-    def create_forward_op(self):
-        # create RNNOp
-        self.forward_op = DynamicRecurrentOp(
-            # inputs
-            inputs=["x"],
-            initial_states=["h_boot"],
-            step_net="step_unit",
-            # outputs
-            outputs=["h@state"],
-            step_scopes="step_scopes",
-            # attributes
-            ex_states=["h@pre"],
-            states=["h@state"])
-
-    def create_gradient_op(self):
-        a = set()
-        backward_op = core.DynamicRecurrentOp.backward(self.forward_op, a)
-
-    def create_step_net(self):
-        step_unit = core.Net.create()
-        x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
-        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
-        sig_op = Operator("sigmoid", X="sum", Y="h@state")
-
-        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
-            step_unit.append_op(op)
-        step_unit.complete_add_op(True)
-        self.forward_op.set_step_unit(step_unit)
-
-    def create_global_variables(self):
-        # create inlink
-        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
-                                 self.py.x)
-        x_tensor.set_lod(lod_py)
-        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
-        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
-        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
-                      self.py.h_boot)
-        self.scope.var("step_scopes")
-        self.scope.var("h@state")
-
-    def test_grad(self):
-        self.scope = core.Scope()
-        self.create_forward_op()
-        self.create_global_variables()
-        self.create_step_net()
-        self.create_gradient_op()
-
-
-if __name__ == '__main__':
-    exit(
-        0
-    )  # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_evaluator.py b/python/paddle/v2/framework/tests/test_evaluator.py
deleted file mode 100644
index 37dbfbc06b..0000000000
--- a/python/paddle/v2/framework/tests/test_evaluator.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from paddle.v2.framework.evaluator import Evaluator
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
-import unittest
-import op_test
-import numpy as np
-
-
-class TestEvaluator(unittest.TestCase):
-    def setup(self, scope, inputs, outputs):
-        def __create_var__(var_name, arr):
-            np_arr = np.array(arr)
-            scope.var(var_name)
-            # tensor = var.get_tensor()
-            # tensor.set_dims(np_arr.shape)
-
-        for var_name, arr in inputs.iteritems():
-            __create_var__(var_name, arr)
-
-        for var_name, arr in outputs.iteritems():
-            __create_var__(var_name, arr)
-
-    def test_evaluator(self):
-
-        inputs = {
-            'Inference': np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 1]]).T,
-            'Label': np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
-        }
-        outputs = {'Accuracy': np.array([0.9])}
-        out_name = 'Accuracy'
-
-        places = [core.CPUPlace()]
-        if core.is_compile_gpu():
-            places.append(core.GPUPlace(0))
-
-        for place in places:
-            scope = core.Scope()
-            self.setup(scope, inputs, outputs)
-
-            evaluator = Evaluator(
-                scope,
-                operator='accuracy',
-                input='Inference',
-                label='Label',
-                output=out_name,
-                place=place)
-            op_test.set_input(scope, evaluator.op, inputs, place)
-            ctx = core.DeviceContext.create(place)
-
-            for i in range(10):  # simulate 10 mini-batches
-                evaluator.evaluate(ctx)
-
-            actual = np.array(scope.find_var(out_name).get_tensor())
-            print actual
-
-            self.assertTrue(
-                np.allclose(
-                    actual, outputs[out_name], atol=1e-5),
-                "output name: " + out_name + " has diff.")
-
-
-if __name__ == '__main__':
-    exit(0)
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_executor_and_mul.py b/python/paddle/v2/framework/tests/test_executor_and_mul.py
deleted file mode 100644
index 35f7757111..0000000000
--- a/python/paddle/v2/framework/tests/test_executor_and_mul.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import unittest
-from paddle.v2.framework.layers import mul, data
-import paddle.v2.framework.core as core
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_program
-import numpy
-
-
-class TestExecutor(unittest.TestCase):
-    def test_mul(self):
-        a = data(name='a', shape=[784], data_type='float32')
-        b = data(
-            name='b',
-            shape=[784, 100],
-            data_type='float32',
-            append_batch_size=False)
-        out = mul(x=a, y=b)
-        place = core.CPUPlace()
-        a_np = numpy.random.random((100, 784)).astype('float32')
-        tensor_a = core.LoDTensor()
-        tensor_a.set(a_np, place)
-        b_np = numpy.random.random((784, 100)).astype('float32')
-        tensor_b = core.LoDTensor()
-        tensor_b.set(b_np, place)
-        exe = Executor(place)
-        outs = exe.run(g_program,
-                       feed={'a': tensor_a,
-                             'b': tensor_b},
-                       fetch_list=[out])
-        out = numpy.array(outs[0])
-        self.assertEqual((100, 100), out.shape)
-        self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_fit_a_line.py b/python/paddle/v2/framework/tests/test_fit_a_line.py
deleted file mode 100644
index 944240629c..0000000000
--- a/python/paddle/v2/framework/tests/test_fit_a_line.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program, g_program
-from paddle.v2.framework.io import save_persistables, load_persistables
-from paddle.v2.framework.executor import Executor
-
-import numpy as np
-
-init_program = Program()
-program = Program()
-x = layers.data(
-    name='x',
-    shape=[13],
-    data_type='float32',
-    program=program,
-    init_program=init_program)
-
-y_predict = layers.fc(input=x,
-                      size=1,
-                      act=None,
-                      program=program,
-                      init_program=init_program)
-
-y = layers.data(
-    name='y',
-    shape=[1],
-    data_type='float32',
-    program=program,
-    init_program=init_program)
-
-cost = layers.square_error_cost(
-    input=y_predict, label=y, program=program, init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
-
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost, init_program)
-
-BATCH_SIZE = 20
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.uci_housing.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
-
-place = core.CPUPlace()
-exe = Executor(place)
-
-exe.run(init_program, feed={}, fetch_list=[])
-
-PASS_NUM = 100
-for pass_id in range(PASS_NUM):
-    save_persistables(exe, "./fit_a_line.model/", program=program)
-    load_persistables(exe, "./fit_a_line.model/", program=program)
-    for data in train_reader():
-        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("float32")
-
-        tensor_x = core.LoDTensor()
-        tensor_x.set(x_data, place)
-        # print tensor_x.get_dims()
-
-        tensor_y = core.LoDTensor()
-        tensor_y.set(y_data, place)
-        # print tensor_y.get_dims()
-        outs = exe.run(program,
-                       feed={'x': tensor_x,
-                             'y': tensor_y},
-                       fetch_list=[avg_cost])
-        out = np.array(outs[0])
-
-        if out[0] < 10.0:
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
-exit(1)
diff --git a/python/paddle/v2/framework/tests/test_image_classification_layer.py b/python/paddle/v2/framework/tests/test_image_classification_layer.py
deleted file mode 100644
index b4eda13552..0000000000
--- a/python/paddle/v2/framework/tests/test_image_classification_layer.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import unittest
-
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-from paddle.v2.framework.framework import Program
-
-
-def conv_block(input,
-               num_filter,
-               groups,
-               dropouts,
-               program=None,
-               init_program=None):
-    return nets.img_conv_group(
-        input=input,
-        pool_size=2,
-        pool_stride=2,
-        conv_num_filter=[num_filter] * groups,
-        conv_filter_size=3,
-        conv_act='relu',
-        conv_with_batchnorm=True,
-        conv_batchnorm_drop_rate=dropouts,
-        pool_type='max',
-        program=program,
-        init_program=init_program)
-
-
-class TestLayer(unittest.TestCase):
-    def test_batch_norm_layer(self):
-        program = Program()
-        init_program = Program()
-        images = layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            data_type='float32',
-            program=program)
-        layers.batch_norm(
-            input=images, program=program, init_program=init_program)
-
-        # print str(program)
-
-    def test_dropout_layer(self):
-        program = Program()
-        init_program = Program()
-        images = layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            data_type='float32',
-            program=program)
-        layers.dropout(
-            x=images,
-            dropout_prob=0.5,
-            program=program,
-            init_program=init_program)
-
-        # print str(program)
-
-    def test_img_conv_group(self):
-        program = Program()
-        init_program = Program()
-
-        images = layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            data_type='float32',
-            program=program,
-            init_program=init_program)
-        conv1 = conv_block(images, 64, 2, [0.3, 0], program, init_program)
-        conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], program, init_program)
-
-        # print str(program)
-
-    def test_elementwise_add_with_act(self):
-        program = Program()
-        init_program = Program()
-        image1 = layers.data(
-            name='pixel1',
-            shape=[3, 48, 48],
-            data_type='float32',
-            program=program,
-            init_program=init_program)
-        image2 = layers.data(
-            name='pixel2',
-            shape=[3, 48, 48],
-            data_type='float32',
-            program=program,
-            init_program=init_program)
-        out = layers.elementwise_add(
-            x=image1,
-            y=image2,
-            act='relu',
-            program=program,
-            init_program=init_program)
-        # print(program)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_image_classification_train.py b/python/paddle/v2/framework/tests/test_image_classification_train.py
deleted file mode 100644
index 7189adbf8f..0000000000
--- a/python/paddle/v2/framework/tests/test_image_classification_train.py
+++ /dev/null
@@ -1,244 +0,0 @@
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.framework.core as core
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_init_program, g_program
-from paddle.v2.framework.initializer import XavierInitializer
-
-
-def resnet_cifar10(input, depth=32, program=None, init_program=None):
-    def conv_bn_layer(input,
-                      ch_out,
-                      filter_size,
-                      stride,
-                      padding,
-                      act='relu',
-                      program=None,
-                      init_program=None):
-        tmp = layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=False,
-            program=program,
-            init_program=init_program)
-        return layers.batch_norm(
-            input=tmp, act=act, program=program, init_program=init_program)
-
-    def shortcut(input, ch_in, ch_out, stride, program, init_program):
-        if ch_in != ch_out:
-            return conv_bn_layer(input, ch_out, 1, stride, 0, None, program,
-                                 init_program)
-        else:
-            return input
-
-    def basicblock(input,
-                   ch_in,
-                   ch_out,
-                   stride,
-                   program=program,
-                   init_program=init_program):
-        tmp = conv_bn_layer(
-            input,
-            ch_out,
-            3,
-            stride,
-            1,
-            program=program,
-            init_program=init_program)
-        tmp = conv_bn_layer(
-            tmp,
-            ch_out,
-            3,
-            1,
-            1,
-            act=None,
-            program=program,
-            init_program=init_program)
-        short = shortcut(input, ch_in, ch_out, stride, program, init_program)
-        return layers.elementwise_add(
-            x=tmp,
-            y=short,
-            act='relu',
-            program=program,
-            init_program=init_program)
-
-    def layer_warp(block_func, input, ch_in, ch_out, count, stride, program,
-                   init_program):
-        tmp = block_func(input, ch_in, ch_out, stride, program, init_program)
-        for i in range(1, count):
-            tmp = block_func(tmp, ch_out, ch_out, 1, program, init_program)
-        return tmp
-
-    assert (depth - 2) % 6 == 0
-    n = (depth - 2) / 6
-    conv1 = conv_bn_layer(
-        input=input,
-        ch_out=16,
-        filter_size=3,
-        stride=1,
-        padding=1,
-        program=program,
-        init_program=init_program)
-    res1 = layer_warp(
-        basicblock,
-        conv1,
-        16,
-        16,
-        n,
-        1,
-        program=program,
-        init_program=init_program)
-    res2 = layer_warp(
-        basicblock,
-        res1,
-        16,
-        32,
-        n,
-        2,
-        program=program,
-        init_program=init_program)
-    res3 = layer_warp(
-        basicblock,
-        res2,
-        32,
-        64,
-        n,
-        2,
-        program=program,
-        init_program=init_program)
-    pool = layers.pool2d(
-        input=res3,
-        pool_size=8,
-        pool_type='avg',
-        pool_stride=1,
-        program=program,
-        init_program=init_program)
-    return pool
-
-
-def vgg16_bn_drop(input, program=None, init_program=None):
-    def conv_block(input,
-                   num_filter,
-                   groups,
-                   dropouts,
-                   program=None,
-                   init_program=None):
-        return nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max',
-            program=program,
-            init_program=init_program)
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0], program, init_program)
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0], program, init_program)
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0], program, init_program)
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0], program, init_program)
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0], program, init_program)
-
-    drop = layers.dropout(
-        x=conv5, dropout_prob=0.5, program=program, init_program=init_program)
-    fc1 = layers.fc(input=drop,
-                    size=512,
-                    act=None,
-                    param_attr={"initializer": XavierInitializer()},
-                    program=program,
-                    init_program=init_program)
-    reshape1 = layers.reshape(
-        x=fc1,
-        shape=list(fc1.shape + (1, 1)),
-        program=program,
-        init_program=init_program)
-    bn = layers.batch_norm(
-        input=reshape1, act='relu', program=program, init_program=init_program)
-    drop2 = layers.dropout(
-        x=bn, dropout_prob=0.5, program=program, init_program=init_program)
-    fc2 = layers.fc(input=drop2,
-                    size=512,
-                    act=None,
-                    param_attr={"initializer": XavierInitializer()},
-                    program=program,
-                    init_program=init_program)
-    return fc2
-
-
-classdim = 10
-data_shape = [3, 32, 32]
-
-images = layers.data(name='pixel', shape=data_shape, data_type='float32')
-label = layers.data(name='label', shape=[1], data_type='int64')
-
-# Add neural network config
-# option 1. resnet
-# net = resnet_cifar10(images, 32)
-# option 2. vgg
-net = vgg16_bn_drop(images)
-
-# print(program)
-
-predict = layers.fc(input=net, size=classdim, act='softmax')
-cost = layers.cross_entropy(input=predict, label=label)
-avg_cost = layers.mean(x=cost)
-accuracy = layers.accuracy(input=predict, label=label)
-
-# optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-optimizer = optimizer.AdamOptimizer(learning_rate=0.001)
-opts = optimizer.minimize(avg_cost)
-
-BATCH_SIZE = 128
-PASS_NUM = 1
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.cifar.train10(), buf_size=128 * 10),
-    batch_size=BATCH_SIZE)
-
-place = core.CPUPlace()
-exe = Executor(place)
-
-exe.run(g_init_program, feed={}, fetch_list=[])
-
-for pass_id in range(PASS_NUM):
-    batch_id = 0
-    for data in train_reader():
-        img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        batch_size = 1
-        for i in y_data.shape:
-            batch_size = batch_size * i
-        y_data = y_data.reshape([batch_size, 1])
-
-        tensor_img = core.LoDTensor()
-        tensor_y = core.LoDTensor()
-        tensor_img.set(img_data, place)
-        tensor_y.set(y_data, place)
-
-        outs = exe.run(g_program,
-                       feed={"pixel": tensor_img,
-                             "label": tensor_y},
-                       fetch_list=[avg_cost, accuracy])
-
-        loss = np.array(outs[0])
-        acc = np.array(outs[1])
-        print("pass_id:" + str(pass_id) + " batch_id:" + str(batch_id) +
-              " loss:" + str(loss) + " acc:" + str(acc))
-        batch_id = batch_id + 1
-
-        if batch_id > 1:
-            # this model is slow, so if we can train two mini batch, we think it works properly.
-            exit(0)
-exit(1)
diff --git a/python/paddle/v2/framework/tests/test_increment_op.py b/python/paddle/v2/framework/tests/test_increment_op.py
deleted file mode 100644
index e174272b05..0000000000
--- a/python/paddle/v2/framework/tests/test_increment_op.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestIncrementOpPositiveStep(OpTest):
-    """Test increment op with positive step
-    """
-
-    def setUp(self):
-        self.op_type = "increment"
-        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
-        self.attrs = {'step': 14.8}
-        self.outputs = {'Out': self.inputs['X'] + self.attrs['step']}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-class TestIncrementOpNegativeStep(OpTest):
-    """Test increment op with negative step
-    """
-
-    def setUp(self):
-        self.op_type = "increment"
-        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
-        self.attrs = {'step': -3.8}
-        self.outputs = {'Out': self.inputs['X'] + self.attrs['step']}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_layers.py b/python/paddle/v2/framework/tests/test_layers.py
deleted file mode 100644
index 5cbe790e3f..0000000000
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ /dev/null
@@ -1,154 +0,0 @@
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-from paddle.v2.framework.framework import Program, g_program
-import paddle.v2.framework.core as core
-import unittest
-
-
-class TestBook(unittest.TestCase):
-    def test_fit_a_line(self):
-        program = Program()
-        x = layers.data(
-            name='x', shape=[13], data_type='float32', program=program)
-        y_predict = layers.fc(input=x, size=1, act=None, program=program)
-
-        y = layers.data(
-            name='y', shape=[1], data_type='float32', program=program)
-        cost = layers.square_error_cost(
-            input=y_predict, label=y, program=program)
-
-        avg_cost = layers.mean(x=cost, program=program)
-        self.assertIsNotNone(avg_cost)
-        program.append_backward(avg_cost)
-        print str(program)
-
-    def test_recognize_digits_mlp(self):
-        program = Program()
-
-        # Change g_program, so the rest layers use `g_program`
-        images = layers.data(
-            name='pixel', shape=[784], data_type='float32', program=program)
-        label = layers.data(
-            name='label', shape=[1], data_type='int32', program=program)
-        hidden1 = layers.fc(input=images, size=128, act='relu', program=program)
-        hidden2 = layers.fc(input=hidden1, size=64, act='relu', program=program)
-        predict = layers.fc(input=hidden2,
-                            size=10,
-                            act='softmax',
-                            program=program)
-        cost = layers.cross_entropy(input=predict, label=label, program=program)
-        avg_cost = layers.mean(x=cost, program=program)
-        self.assertIsNotNone(avg_cost)
-        print str(program)
-
-    def test_simple_conv2d(self):
-        program = Program()
-        images = layers.data(
-            name='pixel', shape=[3, 48, 48], data_type='int32', program=program)
-        layers.conv2d(
-            input=images, num_filters=3, filter_size=[4, 4], program=program)
-
-        print str(program)
-
-    def test_recognize_digits_conv(self):
-        program = Program()
-
-        images = layers.data(
-            name='pixel',
-            shape=[1, 28, 28],
-            data_type='float32',
-            program=program)
-        label = layers.data(
-            name='label', shape=[1], data_type='int32', program=program)
-        conv_pool_1 = nets.simple_img_conv_pool(
-            input=images,
-            filter_size=5,
-            num_filters=2,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            program=program)
-        conv_pool_2 = nets.simple_img_conv_pool(
-            input=conv_pool_1,
-            filter_size=5,
-            num_filters=4,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            program=program)
-
-        predict = layers.fc(input=conv_pool_2,
-                            size=10,
-                            act="softmax",
-                            program=program)
-        cost = layers.cross_entropy(input=predict, label=label, program=program)
-        avg_cost = layers.mean(x=cost, program=program)
-
-        program.append_backward(avg_cost)
-
-        print str(program)
-
-    def test_word_embedding(self):
-        program = Program()
-        dict_size = 10000
-        embed_size = 32
-        first_word = layers.data(
-            name='firstw', shape=[1], data_type='int64', program=program)
-        second_word = layers.data(
-            name='secondw', shape=[1], data_type='int64', program=program)
-        third_word = layers.data(
-            name='thirdw', shape=[1], data_type='int64', program=program)
-        forth_word = layers.data(
-            name='forthw', shape=[1], data_type='int64', program=program)
-        next_word = layers.data(
-            name='nextw', shape=[1], data_type='int64', program=program)
-
-        embed_first = layers.embedding(
-            input=first_word,
-            size=[dict_size, embed_size],
-            data_type='float32',
-            param_attr={'name': 'shared_w'},
-            program=program)
-        embed_second = layers.embedding(
-            input=second_word,
-            size=[dict_size, embed_size],
-            data_type='float32',
-            param_attr={'name': 'shared_w'},
-            program=program)
-
-        embed_third = layers.embedding(
-            input=third_word,
-            size=[dict_size, embed_size],
-            data_type='float32',
-            param_attr={'name': 'shared_w'},
-            program=program)
-        embed_forth = layers.embedding(
-            input=forth_word,
-            size=[dict_size, embed_size],
-            data_type='float32',
-            param_attr={'name': 'shared_w'},
-            program=program)
-
-        concat_embed = layers.concat(
-            input=[embed_first, embed_second, embed_third, embed_forth],
-            axis=1,
-            program=program)
-
-        hidden1 = layers.fc(input=concat_embed,
-                            size=256,
-                            act='sigmoid',
-                            program=program)
-        predict_word = layers.fc(input=hidden1,
-                                 size=dict_size,
-                                 act='softmax',
-                                 program=program)
-        cost = layers.cross_entropy(
-            input=predict_word, label=next_word, program=program)
-        avg_cost = layers.mean(x=cost, program=program)
-        self.assertIsNotNone(avg_cost)
-
-        print str(program)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_nccl_init_op.py b/python/paddle/v2/framework/tests/test_nccl_init_op.py
deleted file mode 100644
index 054909fdf5..0000000000
--- a/python/paddle/v2/framework/tests/test_nccl_init_op.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import unittest, os
-import numpy as np
-import paddle.v2 as paddle
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
-from op_test import OpTest, create_op, set_input
-
-if not core.is_compile_gpu():
-    exit(0)
-
-gpu_count = core.get_cuda_device_count()
-
-if gpu_count <= 1:
-    exit(0)
-
-g_scope = core.Scope()
-g_ctx = core.DeviceContext.create(core.CPUPlace())
-
-
-class TestNCCLInit(unittest.TestCase):
-    def test_init(self):
-        self.op_type = "ncclInit"
-        self.gpus = range(gpu_count)
-
-        self.inputs = {}
-        self.attrs = {"gpus": self.gpus}
-        g_scope.var("Communicator").get_communicator()
-        self.outputs = {"Communicator": g_scope.find_var("Communicator")}
-        nccl_init = create_op(
-            g_scope,
-            op_type=self.op_type,
-            inputs=self.inputs,
-            outputs=self.outputs,
-            attrs=self.attrs)
-        nccl_init.run(g_scope, g_ctx)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_parameter.py b/python/paddle/v2/framework/tests/test_parameter.py
deleted file mode 100644
index 1ac0cdd99f..0000000000
--- a/python/paddle/v2/framework/tests/test_parameter.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import unittest
-from paddle.v2.framework.framework import g_program
-import paddle.v2.framework.core as core
-
-
-class TestParameter(unittest.TestCase):
-    def test_param(self):
-        b = g_program.create_block()
-        param = b.create_parameter(
-            name='fc.w',
-            shape=[784, 100],
-            dtype='float32',
-            initialize_attr={
-                'type': 'uniform_random',
-                'seed': 13,
-                'min': -5.0,
-                'max': 5.0
-            })
-        self.assertIsNotNone(param)
-        self.assertEqual('fc.w', param.name)
-        self.assertEqual((784, 100), param.shape)
-        self.assertEqual(core.DataType.FP32, param.data_type)
-        self.assertEqual(0, param.block.idx)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
deleted file mode 100644
index 695236f3df..0000000000
--- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program, g_program
-from paddle.v2.framework.executor import Executor
-
-import numpy as np
-
-init_program = Program()
-program = Program()
-
-images = layers.data(
-    name='pixel',
-    shape=[1, 28, 28],
-    data_type='float32',
-    program=program,
-    init_program=init_program)
-label = layers.data(
-    name='label',
-    shape=[1],
-    data_type='int64',
-    program=program,
-    init_program=init_program)
-conv_pool_1 = nets.simple_img_conv_pool(
-    input=images,
-    filter_size=5,
-    num_filters=20,
-    pool_size=2,
-    pool_stride=2,
-    act="relu",
-    program=program,
-    init_program=init_program)
-conv_pool_2 = nets.simple_img_conv_pool(
-    input=conv_pool_1,
-    filter_size=5,
-    num_filters=50,
-    pool_size=2,
-    pool_stride=2,
-    act="relu",
-    program=program,
-    init_program=init_program)
-
-predict = layers.fc(input=conv_pool_2,
-                    size=10,
-                    act="softmax",
-                    program=program,
-                    init_program=init_program)
-cost = layers.cross_entropy(
-    input=predict, label=label, program=program, init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program)
-accuracy = layers.accuracy(
-    input=predict, label=label, program=program, init_program=init_program)
-
-# optimizer = optimizer.MomentumOptimizer(learning_rate=0.1 / 128.0,
-# momentum=0.9)
-optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
-opts = optimizer.minimize(avg_cost, init_program)
-
-BATCH_SIZE = 50
-PASS_NUM = 3
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
-
-place = core.CPUPlace()
-exe = Executor(place)
-
-exe.run(init_program, feed={}, fetch_list=[])
-
-for pass_id in range(PASS_NUM):
-    count = 0
-    for data in train_reader():
-        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
-                                data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        y_data = y_data.reshape([BATCH_SIZE, 1])
-
-        tensor_img = core.LoDTensor()
-        tensor_y = core.LoDTensor()
-        tensor_img.set(img_data, place)
-        tensor_y.set(y_data, place)
-
-        outs = exe.run(program,
-                       feed={"pixel": tensor_img,
-                             "label": tensor_y},
-                       fetch_list=[avg_cost, accuracy])
-        loss = np.array(outs[0])
-        acc = np.array(outs[1])
-
-        if loss < 10.0 and acc > 0.9:
-            # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
-            exit(0)
-exit(1)
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
deleted file mode 100644
index e848db1701..0000000000
--- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.regularizer import L2DecayRegularizer
-from paddle.v2.framework.initializer import UniformInitializer
-
-import numpy as np
-
-BATCH_SIZE = 128
-init_program = Program()
-program = Program()
-image = layers.data(
-    name='x',
-    shape=[784],
-    data_type='float32',
-    program=program,
-    init_program=init_program)
-
-param_attr = {
-    'name': None,
-    'initializer': UniformInitializer(
-        low=-1.0, high=1.0),
-    'regularization': L2DecayRegularizer(0.0005 * BATCH_SIZE)
-}
-
-hidden1 = layers.fc(input=image,
-                    size=128,
-                    act='relu',
-                    program=program,
-                    init_program=init_program,
-                    param_attr=param_attr)
-hidden2 = layers.fc(input=hidden1,
-                    size=64,
-                    act='relu',
-                    program=program,
-                    init_program=init_program,
-                    param_attr=param_attr)
-
-predict = layers.fc(input=hidden2,
-                    size=10,
-                    act='softmax',
-                    program=program,
-                    init_program=init_program,
-                    param_attr=param_attr)
-
-label = layers.data(
-    name='y',
-    shape=[1],
-    data_type='int64',
-    program=program,
-    init_program=init_program)
-
-cost = layers.cross_entropy(
-    input=predict, label=label, program=program, init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
-accuracy = layers.accuracy(
-    input=predict, label=label, program=program, init_program=init_program)
-
-optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-opts = optimizer.minimize(avg_cost, init_program)
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=8192),
-    batch_size=BATCH_SIZE)
-
-place = core.CPUPlace()
-exe = Executor(place)
-
-exe.run(init_program, feed={}, fetch_list=[])
-
-PASS_NUM = 100
-for pass_id in range(PASS_NUM):
-    for data in train_reader():
-        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        y_data = np.expand_dims(y_data, axis=1)
-
-        tensor_x = core.LoDTensor()
-        tensor_x.set(x_data, place)
-
-        tensor_y = core.LoDTensor()
-        tensor_y.set(y_data, place)
-
-        outs = exe.run(program,
-                       feed={'x': tensor_x,
-                             'y': tensor_y},
-                       fetch_list=[avg_cost, accuracy])
-        out = np.array(outs[0])
-        acc = np.array(outs[1])
-        if out[0] < 5.0:
-            exit(0)  # if avg cost less than 5.0, we think our code is good.
-exit(1)
diff --git a/python/paddle/v2/framework/tests/test_recommender_system.py b/python/paddle/v2/framework/tests/test_recommender_system.py
deleted file mode 100644
index 7bc3f84a93..0000000000
--- a/python/paddle/v2/framework/tests/test_recommender_system.py
+++ /dev/null
@@ -1,313 +0,0 @@
-import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program, g_program
-from paddle.v2.framework.executor import Executor
-
-import numpy as np
-
-init_program = Program()
-program = Program()
-is_sparse = True
-use_gpu = False
-BATCH_SIZE = 256
-
-
-def get_usr_combined_features():
-    # FIXME(dzh) : old API integer_value(10) may has range check.
-    # currently we don't have user configurated check.
-
-    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
-
-    uid = layers.data(
-        name='user_id',
-        shape=[1],
-        data_type='int64',
-        program=program,
-        init_program=init_program)
-
-    usr_emb = layers.embedding(
-        input=uid,
-        data_type='float32',
-        size=[USR_DICT_SIZE, 32],
-        param_attr={'name': 'user_table'},
-        is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
-
-    usr_fc = layers.fc(input=usr_emb,
-                       size=32,
-                       program=program,
-                       init_program=init_program)
-
-    USR_GENDER_DICT_SIZE = 2
-
-    usr_gender_id = layers.data(
-        name='gender_id',
-        shape=[1],
-        data_type='int64',
-        program=program,
-        init_program=init_program)
-
-    usr_gender_emb = layers.embedding(
-        input=usr_gender_id,
-        size=[USR_GENDER_DICT_SIZE, 16],
-        param_attr={'name': 'gender_table'},
-        is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
-
-    usr_gender_fc = layers.fc(input=usr_gender_emb,
-                              size=16,
-                              program=program,
-                              init_program=init_program)
-
-    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
-    usr_age_id = layers.data(
-        name='age_id',
-        shape=[1],
-        data_type="int64",
-        program=program,
-        init_program=init_program)
-
-    usr_age_emb = layers.embedding(
-        input=usr_age_id,
-        size=[USR_AGE_DICT_SIZE, 16],
-        is_sparse=is_sparse,
-        param_attr={'name': 'age_table'},
-        program=program,
-        init_program=init_program)
-
-    usr_age_fc = layers.fc(input=usr_age_emb,
-                           size=16,
-                           program=program,
-                           init_program=init_program)
-
-    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
-    usr_job_id = layers.data(
-        name='job_id',
-        shape=[1],
-        data_type="int64",
-        program=program,
-        init_program=init_program)
-
-    usr_job_emb = layers.embedding(
-        input=usr_job_id,
-        size=[USR_JOB_DICT_SIZE, 16],
-        param_attr={'name': 'job_table'},
-        is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
-
-    usr_job_fc = layers.fc(input=usr_job_emb,
-                           size=16,
-                           program=program,
-                           init_program=init_program)
-
-    concat_embed = layers.concat(
-        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc],
-        axis=1,
-        program=program,
-        init_program=init_program)
-
-    usr_combined_features = layers.fc(input=concat_embed,
-                                      size=200,
-                                      act="tanh",
-                                      program=program,
-                                      init_program=init_program)
-
-    return usr_combined_features
-
-
-def get_mov_combined_features():
-
-    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
-
-    mov_id = layers.data(
-        name='movie_id',
-        shape=[1],
-        data_type='int64',
-        program=program,
-        init_program=init_program)
-
-    mov_emb = layers.embedding(
-        input=mov_id,
-        data_type='float32',
-        size=[MOV_DICT_SIZE, 32],
-        param_attr={'name': 'movie_table'},
-        is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
-
-    mov_fc = layers.fc(input=mov_emb,
-                       size=32,
-                       program=program,
-                       init_program=init_program)
-
-    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
-
-    category_id = layers.data(
-        name='category_id',
-        shape=[1],
-        data_type='int64',
-        program=program,
-        init_program=init_program)
-
-    mov_categories_emb = layers.embedding(
-        input=category_id,
-        size=[CATEGORY_DICT_SIZE, 32],
-        is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
-
-    mov_categories_hidden = layers.sequence_pool(
-        input=mov_categories_emb,
-        pool_type="sum",
-        program=program,
-        init_program=init_program)
-
-    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
-
-    mov_title_id = layers.data(
-        name='movie_title',
-        shape=[1],
-        data_type='int64',
-        program=program,
-        init_program=init_program)
-
-    mov_title_emb = layers.embedding(
-        input=mov_title_id,
-        size=[MOV_TITLE_DICT_SIZE, 32],
-        is_sparse=is_sparse,
-        program=program,
-        init_program=init_program)
-
-    mov_title_conv = nets.sequence_conv_pool(
-        input=mov_title_emb,
-        num_filters=32,
-        filter_size=3,
-        act="tanh",
-        pool_type="sum",
-        program=program,
-        init_program=init_program)
-
-    concat_embed = layers.concat(
-        input=[mov_fc, mov_categories_hidden, mov_title_conv],
-        axis=1,
-        program=program,
-        init_program=init_program)
-
-    # FIXME(dzh) : need tanh operator
-    mov_combined_features = layers.fc(input=concat_embed,
-                                      size=200,
-                                      act="tanh",
-                                      program=program,
-                                      init_program=init_program)
-
-    return mov_combined_features
-
-
-def model():
-    usr_combined_features = get_usr_combined_features()
-    mov_combined_features = get_mov_combined_features()
-
-    # need cos sim
-    inference = layers.cos_sim(
-        X=usr_combined_features,
-        Y=mov_combined_features,
-        program=program,
-        init_program=init_program)
-
-    label = layers.data(
-        name='score',
-        shape=[1],
-        data_type='float32',
-        program=program,
-        init_program=init_program)
-
-    square_cost = layers.square_error_cost(
-        input=inference,
-        label=label,
-        program=program,
-        init_program=init_program)
-
-    avg_cost = layers.mean(
-        x=square_cost, program=program, init_program=init_program)
-
-    return avg_cost
-
-
-def main():
-    cost = model()
-    sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
-    opts = sgd_optimizer.minimize(cost, init_program=init_program)
-    block = program.block(0)
-
-    if use_gpu:
-        place = core.GPUPlace(0)
-    else:
-        place = core.CPUPlace()
-
-    exe = Executor(place)
-    exe.run(init_program, feed={}, fetch_list=[])
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.movielens.train(), buf_size=8192),
-        batch_size=BATCH_SIZE)
-
-    feeding = {
-        'user_id': 0,
-        'gender_id': 1,
-        'age_id': 2,
-        'job_id': 3,
-        'movie_id': 4,
-        'category_id': 5,
-        'movie_title': 6,
-        'score': 7
-    }
-
-    def func_feed(feeding, data):
-        feed_tensors = {}
-        for (key, idx) in feeding.iteritems():
-            tensor = core.LoDTensor()
-            if key != "category_id" and key != "movie_title":
-                if key == "score":
-                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
-                        "float32")
-                else:
-                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
-                        "int64")
-            else:
-                numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
-                                 data)
-                lod_info = [len(item) for item in numpy_data]
-                offset = 0
-                lod = [offset]
-                for item in lod_info:
-                    offset += item
-                    lod.append(offset)
-                numpy_data = np.concatenate(numpy_data, axis=0)
-                tensor.set_lod([lod])
-
-            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
-            tensor.set(numpy_data, place)
-            feed_tensors[key] = tensor
-        return feed_tensors
-
-    PASS_NUM = 100
-    for pass_id in range(PASS_NUM):
-        for data in train_reader():
-            outs = exe.run(program,
-                           feed=func_feed(feeding, data),
-                           fetch_list=[cost])
-            out = np.array(outs[0])
-            if out[0] < 6.0:
-                # if avg cost less than 6.0, we think our code is good.
-                exit(0)
-
-
-main()
diff --git a/python/paddle/v2/framework/tests/test_seq_concat_op.py b/python/paddle/v2/framework/tests/test_seq_concat_op.py
deleted file mode 100644
index abd2ebf0b2..0000000000
--- a/python/paddle/v2/framework/tests/test_seq_concat_op.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import unittest
-import numpy as np
-import sys
-from op_test import OpTest
-
-
-class TestConcatOp(OpTest):
-    def set_data(self):
-        # two level, batch size is 3
-        x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        x1 = np.random.random((4, 8, 3)).astype('float32')
-        lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        axis = 1
-        level = 1
-        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
-        self.attrs = {'axis': axis, 'level': level}
-        outs = []
-        for i in range(4):
-            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
-            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
-            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
-
-        self.outputs = {'Out': np.concatenate(outs, axis=0)}
-
-    def setUp(self):
-        self.op_type = "sequence_concat"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['x0'], 'Out')
-
-
-class TestConcatOpDiffLod(TestConcatOp):
-    def set_data(self):
-        # two level, batch size is 3
-        x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        x1 = np.random.random((5, 6, 3)).astype('float32')
-        lod1 = [[0, 3, 5], [0, 1, 2, 3, 5]]
-        axis = 0
-        level = 1
-        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
-        self.attrs = {'axis': axis, 'level': level}
-        outs = []
-        for i in range(4):
-            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
-            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
-            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
-
-        self.outputs = {'Out': np.concatenate(outs, axis=0)}
-
-
-class TestConcatOpLevelZero(TestConcatOp):
-    def set_data(self):
-        # two level, batch size is 3
-        x0 = np.random.random((4, 3, 4)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        x1 = np.random.random((5, 3, 4)).astype('float32')
-        lod1 = [[0, 3, 5], [0, 1, 3, 4, 5]]
-        axis = 0
-        level = 0
-        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
-        self.attrs = {'axis': axis, 'level': level}
-        outs = []
-        for i in range(2):
-            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
-            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
-            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
-
-        self.outputs = {'Out': np.concatenate(outs, axis=0)}
-
-
-if __name__ == '__main__':
-    sys.exit(0)
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_tensor_array.py b/python/paddle/v2/framework/tests/test_tensor_array.py
deleted file mode 100644
index 50b3e09162..0000000000
--- a/python/paddle/v2/framework/tests/test_tensor_array.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import logging
-import paddle.v2.framework.core as core
-import unittest
-import numpy as np
-
-
-class TestTensorArray(unittest.TestCase):
-    def setUp(self):
-        self.ta = core.TensorArray()
-
-        self.batch_size = 10
-        self.dim = 2
-
-        # create a LoDTensor
-        self.scope = core.Scope()
-        var = self.scope.var("test_tensor")
-        self.place = core.CPUPlace()
-        tensor = var.get_tensor()
-        tensor.set_dims([self.batch_size, self.dim])
-        tensor.alloc_float(self.place)
-        tensor_array = np.array(tensor)
-        tensor_array[0, 0] = 0
-        tensor_array[1, 0] = 1
-        tensor_array[2, 0] = 2
-        tensor_array[3, 0] = 3
-        tensor_array[4, 0] = 4
-        tensor_array[5, 0] = 5
-        tensor_array[6, 0] = 6
-        tensor_array[7, 0] = 7
-        tensor_array[8, 0] = 8
-        tensor_array[9, 0] = 9
-
-        lod_py = [[0, 2, 5, 10]]
-        lod_tensor = core.LoDTensor(lod_py)
-        lod_tensor.set(tensor_array, self.place)
-
-        self.py_seq_meta = [[5, 10, 2], [2, 5, 1], [0, 2, 0]]
-
-        self.tensor = lod_tensor
-
-    def test_unstack(self):
-        self.ta.unstack(self.tensor)
-        self.assertEqual(self.tensor.get_dims()[0], self.ta.size())
-
-    def test_read(self):
-        self.ta.unstack(self.tensor)
-        for i in range(self.batch_size):
-            tensor = self.ta.read(i)
-
-    def test_write(self):
-        self.ta.unstack(self.tensor)
-
-        # create a tensor with shape of [1, self.dim]
-        var = self.scope.var("hell")
-        tensor = var.get_tensor()
-        tensor.set_dims([1, self.dim])
-        tensor.alloc_float(self.place)
-        tensor_array = np.array(tensor)
-        for i in range(self.dim):
-            tensor_array[0, i] = i
-        tensor.set(tensor_array, self.place)
-
-        self.ta.write(2, tensor)
-
-        ta_tensor = self.ta.read(2)
-        ta_tensor_array = np.array(ta_tensor)
-        self.assertEqual(ta_tensor.get_dims(), [1, self.dim])
-        self.assertTrue((tensor_array == ta_tensor_array).all())
-
-    def test_write_shared(self):
-        self.ta.unstack(self.tensor)
-
-        # create a tensor with shape of [1, self.dim]
-        var = self.scope.var("hell")
-        tensor = var.get_tensor()
-        tensor.set_dims([1, self.dim])
-        tensor.alloc_float(self.place)
-        tensor_array = np.array(tensor)
-        for i in range(self.dim):
-            tensor_array[0, i] = i
-        tensor.set(tensor_array, self.place)
-
-        self.ta.write_shared(2, tensor)
-
-        ta_tensor = self.ta.read(2)
-        ta_tensor_array = np.array(ta_tensor)
-        self.assertEqual(ta_tensor.get_dims(), [1, self.dim])
-        self.assertTrue((tensor_array == ta_tensor_array).all())
-
-    def test_unpack(self):
-        meta = self.ta.unpack(self.tensor, 0, True)
-        self.assertEqual(self.ta.size(), 5)
-        self.assertEqual(meta, self.py_seq_meta)
-
-    def test_pack(self):
-        meta = self.ta.unpack(self.tensor, 0, True)
-        print "meta", meta
-        tensor = self.ta.pack(0, meta, self.tensor.lod())
-        print np.array(self.tensor)
-        print np.array(tensor)
-        self.assertTrue((np.array(self.tensor) == np.array(tensor)).all())
-        self.assertTrue(tensor.lod(), self.tensor.lod())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py b/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
deleted file mode 100644
index dcbb34ccfc..0000000000
--- a/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program, g_program, g_init_program
-from paddle.v2.framework.executor import Executor
-
-import numpy as np
-
-
-def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
-    data = layers.data(name="words", shape=[1], data_type="int64")
-    label = layers.data(name="label", shape=[1], data_type="int64")
-
-    emb = layers.embedding(input=data, size=[input_dim, emb_dim])
-    conv_3 = nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=3,
-        act="tanh",
-        pool_type="sqrt")
-    conv_4 = nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=4,
-        act="tanh",
-        pool_type="sqrt")
-    prediction = layers.fc(input=[conv_3, conv_4],
-                           size=class_dim,
-                           act="softmax")
-    cost = layers.cross_entropy(input=prediction, label=label)
-    avg_cost = layers.mean(x=cost)
-    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
-    opts = adam_optimizer.minimize(avg_cost)
-    acc = layers.accuracy(input=prediction, label=label)
-    return avg_cost, acc
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    dict_dim = len(word_dict)
-    class_dim = 2
-
-    cost, acc = convolution_net(input_dim=dict_dim, class_dim=class_dim)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
-    place = core.CPUPlace()
-    exe = Executor(place)
-
-    exe.run(g_init_program)
-
-    for pass_id in xrange(PASS_NUM):
-        for data in train_data():
-            tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
-
-            label = np.array(map(lambda x: x[1], data)).astype("int64")
-            label = label.reshape([BATCH_SIZE, 1])
-
-            tensor_label = core.LoDTensor()
-            tensor_label.set(label, place)
-
-            outs = exe.run(g_program,
-                           feed={"words": tensor_words,
-                                 "label": tensor_label},
-                           fetch_list=[cost, acc])
-            cost_val = np.array(outs[0])
-            acc_val = np.array(outs[1])
-
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-            if cost_val < 1.0 and acc_val > 0.7:
-                exit(0)
-    exit(1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/framework/tests/test_word2vec.py
deleted file mode 100644
index 2aaf8d6a2b..0000000000
--- a/python/paddle/v2/framework/tests/test_word2vec.py
+++ /dev/null
@@ -1,160 +0,0 @@
-import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program, g_program
-from paddle.v2.framework.executor import Executor
-
-import numpy as np
-
-init_program = Program()
-program = Program()
-
-embed_size = 32
-hidden_size = 256
-N = 5
-batch_size = 32
-is_sparse = True
-
-word_dict = paddle.dataset.imikolov.build_dict()
-dict_size = len(word_dict)
-
-first_word = layers.data(
-    name='firstw',
-    shape=[1],
-    data_type='int64',
-    program=program,
-    init_program=init_program)
-second_word = layers.data(
-    name='secondw',
-    shape=[1],
-    data_type='int64',
-    program=program,
-    init_program=init_program)
-third_word = layers.data(
-    name='thirdw',
-    shape=[1],
-    data_type='int64',
-    program=program,
-    init_program=init_program)
-forth_word = layers.data(
-    name='forthw',
-    shape=[1],
-    data_type='int64',
-    program=program,
-    init_program=init_program)
-next_word = layers.data(
-    name='nextw',
-    shape=[1],
-    data_type='int64',
-    program=program,
-    init_program=init_program)
-
-embed_first = layers.embedding(
-    input=first_word,
-    size=[dict_size, embed_size],
-    data_type='float32',
-    is_sparse=is_sparse,
-    param_attr={'name': 'shared_w'},
-    program=program,
-    init_program=init_program)
-embed_second = layers.embedding(
-    input=second_word,
-    size=[dict_size, embed_size],
-    data_type='float32',
-    is_sparse=is_sparse,
-    param_attr={'name': 'shared_w'},
-    program=program,
-    init_program=init_program)
-
-embed_third = layers.embedding(
-    input=third_word,
-    size=[dict_size, embed_size],
-    data_type='float32',
-    is_sparse=is_sparse,
-    param_attr={'name': 'shared_w'},
-    program=program,
-    init_program=init_program)
-embed_forth = layers.embedding(
-    input=forth_word,
-    size=[dict_size, embed_size],
-    data_type='float32',
-    is_sparse=is_sparse,
-    param_attr={'name': 'shared_w'},
-    program=program,
-    init_program=init_program)
-
-concat_embed = layers.concat(
-    input=[embed_first, embed_second, embed_third, embed_forth],
-    axis=1,
-    program=program,
-    init_program=init_program)
-
-hidden1 = layers.fc(input=concat_embed,
-                    size=hidden_size,
-                    act='sigmoid',
-                    program=program,
-                    init_program=init_program)
-predict_word = layers.fc(input=hidden1,
-                         size=dict_size,
-                         act='softmax',
-                         program=program,
-                         init_program=init_program)
-cost = layers.cross_entropy(
-    input=predict_word,
-    label=next_word,
-    program=program,
-    init_program=init_program)
-avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
-
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost, init_program)
-
-train_reader = paddle.batch(
-    paddle.dataset.imikolov.train(word_dict, N), batch_size)
-
-place = core.CPUPlace()
-exe = Executor(place)
-
-exe.run(init_program, feed={}, fetch_list=[])
-PASS_NUM = 100
-for pass_id in range(PASS_NUM):
-    for data in train_reader():
-        input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)]
-        input_data = map(lambda x: np.array(x).astype("int64"), input_data)
-        input_data = map(lambda x: np.expand_dims(x, axis=1), input_data)
-
-        first_data = input_data[0]
-        first_tensor = core.LoDTensor()
-        first_tensor.set(first_data, place)
-
-        second_data = input_data[1]
-        second_tensor = core.LoDTensor()
-        second_tensor.set(second_data, place)
-
-        third_data = input_data[2]
-        third_tensor = core.LoDTensor()
-        third_tensor.set(third_data, place)
-
-        forth_data = input_data[3]
-        forth_tensor = core.LoDTensor()
-        forth_tensor.set(forth_data, place)
-
-        next_data = input_data[4]
-        next_tensor = core.LoDTensor()
-        next_tensor.set(next_data, place)
-
-        outs = exe.run(program,
-                       feed={
-                           'firstw': first_tensor,
-                           'secondw': second_tensor,
-                           'thirdw': third_tensor,
-                           'forthw': forth_tensor,
-                           'nextw': next_tensor
-                       },
-                       fetch_list=[avg_cost])
-        out = np.array(outs[0])
-        if out[0] < 10.0:
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
-exit(1)
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index 965d965335..7408ea8ef6 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -1,33 +1,35 @@
-import numpy as np
-try:
-    import cv2
-except ImportError:
-    cv2 = None
-import os
-import tarfile
-import cPickle
-
-__all__ = [
-    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
-    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
-    "batch_images_from_tar"
-]
 """
 This file contains some common interfaces for image preprocess.
 Many users are confused about the image layout. We introduce
 the image layout as follows.
 
 - CHW Layout
+
   - The abbreviations: C=channel, H=Height, W=Width
   - The default layout of image opened by cv2 or PIL is HWC.
     PaddlePaddle only supports the CHW layout. And CHW is simply
     a transpose of HWC. It must transpose the input image.
 
 - Color format: RGB or BGR
+
   OpenCV use BGR color format. PIL use RGB color format. Both
   formats can be used for training. Noted that, the format should
   be keep consistent between the training and inference peroid.
 """
+import numpy as np
+try:
+    import cv2
+except ImportError:
+    cv2 = None
+import os
+import tarfile
+import cPickle
+
+__all__ = [
+    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
+    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+    "batch_images_from_tar"
+]
 
 
 def batch_images_from_tar(data_file,
@@ -36,17 +38,18 @@ def batch_images_from_tar(data_file,
                           num_per_batch=1024):
     """
     Read images from tar file and batch them into batch file.
-    param data_file: path of image tar file
-    type data_file: string
-    param dataset_name: 'train','test' or 'valid'
-    type dataset_name: string
-    param img2label: a dic with image file name as key 
+
+    :param data_file: path of image tar file
+    :type data_file: string
+    :param dataset_name: 'train','test' or 'valid'
+    :type dataset_name: string
+    :param img2label: a dic with image file name as key 
                     and image's label as value
-    type img2label: dic
-    param num_per_batch: image number per batch file
-    type num_per_batch: int
-    return: path of list file containing paths of batch file
-    rtype: string
+    :type img2label: dic
+    :param num_per_batch: image number per batch file
+    :type num_per_batch: int
+    :return: path of list file containing paths of batch file
+    :rtype: string
     """
     batch_dir = data_file + "_batch"
     out_path = "%s/%s" % (batch_dir, dataset_name)
@@ -99,14 +102,16 @@ def load_image_bytes(bytes, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         with open('cat.jpg') as f:
             im = load_image_bytes(f.read())
 
     :param bytes: the input image bytes array.
-    :type file: str
+    :type bytes: str
     :param is_color: If set is_color True, it will load and
                      return a color image. Otherwise, it will
                      load and return a gray image.
+    :type is_color: bool
     """
     flag = 1 if is_color else 0
     file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
@@ -121,6 +126,7 @@ def load_image(file, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = load_image('cat.jpg')
 
     :param file: the input image path.
@@ -128,6 +134,7 @@ def load_image(file, is_color=True):
     :param is_color: If set is_color True, it will load and
                      return a color image. Otherwise, it will
                      load and return a gray image.
+    :type is_color: bool
     """
     # cv2.IMAGE_COLOR for OpenCV3
     # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
@@ -147,6 +154,7 @@ def resize_short(im, size):
     Example usage:
     
     .. code-block:: python
+
         im = load_image('cat.jpg')
         im = resize_short(im, 256)
     
@@ -175,6 +183,7 @@ def to_chw(im, order=(2, 0, 1)):
     Example usage:
     
     .. code-block:: python
+
         im = load_image('cat.jpg')
         im = resize_short(im, 256)
         im = to_chw(im)
@@ -196,6 +205,7 @@ def center_crop(im, size, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = center_crop(im, 224)
     
     :param im: the input image with HWC layout.
@@ -223,6 +233,7 @@ def random_crop(im, size, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = random_crop(im, 224)
     
     :param im: the input image with HWC layout.
@@ -251,6 +262,7 @@ def left_right_flip(im):
     Example usage:
     
     .. code-block:: python
+
         im = left_right_flip(im)
     
     :paam im: input image with HWC layout
@@ -275,6 +287,7 @@ def simple_transform(im,
     Example usage:
     
     .. code-block:: python
+
         im = simple_transform(im, 256, 224, True)
 
     :param im: The input image with HWC layout.
@@ -285,6 +298,11 @@ def simple_transform(im,
     :type crop_size: int
     :param is_train: Whether it is training or not.
     :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
     """
     im = resize_short(im, resize_size)
     if is_train:
@@ -324,6 +342,7 @@ def load_and_transform(filename,
     Example usage:
     
     .. code-block:: python
+
         im = load_and_transform('cat.jpg', 256, 224, True)
 
     :param filename: The file name of input image.
@@ -334,6 +353,11 @@ def load_and_transform(filename,
     :type crop_size: int
     :param is_train: Whether it is training or not.
     :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
     """
     im = load_image(filename)
     im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
diff --git a/python/paddle/v2/model.py b/python/paddle/v2/model.py
deleted file mode 100644
index 4634db55a9..0000000000
--- a/python/paddle/v2/model.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import errno
-import uuid
-
-import paddle.v2.master
-
-__all__ = ["save_model", "load_model"]
-
-trainer_id = str(uuid.uuid4())
-
-
-def mkdir_p(path):
-    try:
-        os.makedirs(path)
-    except OSError as exc:
-        if exc.errno == errno.EEXIST and os.path.isdir(path):
-            pass
-        else:
-            raise
-
-
-def save_model(parameters, path):
-    need_request = "KUBERNETES_SERVICE_HOST" in os.environ.keys()
-
-    if need_request:
-        # TODO(helin): figure out how MPI trains, since MPI only save
-        # model when trainer_id == "0", we can consolidate the logic
-        # here.
-
-        # TODO(helin): change this environment variable name from
-        # MASTER_IP to ETCD_IP
-        etcd_name = "MASTER_IP"
-        if etcd_name not in os.environ.keys():
-            raise Exception('not find ' + etcd_name +
-                            ' in environment variable.')
-
-        etcd_ip = os.environ.get(etcd_name)
-        client = paddle.v2.master.client("http://" + etcd_ip + ":2379", 5, 0)
-        r = client.request_save_model(trainer_id, 5000)
-        if r == 0:
-            # do not need to save
-            return
-        elif r < 0:
-            # error
-            return
-        else:
-            # save model
-            path = os.path.join(path, trainer_id)
-            path = os.path.join(path, "model.tar")
-
-    mkdir_p(path)
-
-    with open(path, 'wb') as f:
-        parameters.to_tar(f)
-
-
-def load_model(parameters, path):
-    with open(path, 'rb') as f:
-        parameters.from_tar(f)
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 29f0945eb4..caef5f484e 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -11,11 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-Optimizers(update equation) for SGD method.
-
-TODO(yuyang18): Complete comments.
-"""
 
 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
 import paddle.trainer_config_helpers.optimizers as v1_optimizers
@@ -101,32 +96,37 @@ class Optimizer(object):
 
 class Momentum(Optimizer):
     """
-    SGD Optimizer.
-
-    SGD is an optimization method, trying to find a neural network that
-    minimize the "cost/error" of it by iteration. In paddle's implementation
-    SGD Optimizer is synchronized, which means all gradients will be wait to
-    calculate and reduced into one gradient, then do optimize operation.
+    Momentum Optimizer.
 
-    The neural network consider the learning problem of minimizing an objective
-    function, that has the form of a sum
+    When sparse=False, the momentum update formula is as follows:
 
     ..  math::
 
-        Q(w) = \\sum_{i}^{n} Q_i(w)
+        v_{t} &= k * v_{t-1} - \\gamma_t (g_{t} + \\lambda w_{t-1}) \\\\
+        w_{t} &= w_{t-1} + v_{t} \\\\
 
-    The value of function Q sometimes is the cost of neural network (Mean
-    Square Error between prediction and label for example). The function Q is
-    parametrised by w, the weight/bias of neural network. And weights is what to
-    be learned. The i is the i-th observation in (trainning) data.
+    where, :math:`k` is momentum, :math:`\\lambda` is decay rate,
+    :math:`\\gamma_t` is learning rate at the t'th iteration.
+    :math:`w_{t}` is the weight as the t'th iteration.
+    And the :math:`v_{t}` is the history momentum variable.
 
-    So, the SGD method will optimize the weight by
+    When sparse=True, the update scheme:
 
     ..  math::
 
-        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
-
-    where :math:`\\eta` is learning rate. And :math:`n` is batch size.
+        \\alpha_t &= \\alpha_{t-1} / k \\\\
+        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
+        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
+        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
+        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
+    
+    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
+    :math:`\\gamma_t` is learning rate at the t'th iteration.
+
+    :param momentum: the momentum factor.
+    :type momentum: float
+    :param sparse: with sparse support or not, False by default.
+    :type sparse: bool
     """
 
     def __init__(self, momentum=None, sparse=False, **kwargs):
@@ -146,7 +146,7 @@ class Adam(Optimizer):
 
         m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
         v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
 
     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index 45a4288751..7e457f987d 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -14,13 +14,16 @@
 
 __all__ = [
     'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers'
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader'
 ]
 
+from threading import Thread
+import subprocess
+
+from Queue import Queue
 import itertools
 import random
-from Queue import Queue
-from threading import Thread
+import zlib
 
 
 def map_readers(func, *readers):
@@ -323,3 +326,101 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
                 yield sample
 
     return xreader
+
+
+def _buf2lines(buf, line_break="\n"):
+    # FIXME: line_break should be automatically configured.
+    lines = buf.split(line_break)
+    return lines[:-1], lines[-1]
+
+
+def pipe_reader(left_cmd,
+                parser,
+                bufsize=8192,
+                file_type="plain",
+                cut_lines=True,
+                line_break="\n"):
+    """
+    pipe_reader read data by stream from a command, take it's 
+    stdout into a pipe buffer and redirect it to the parser to
+    parse, then yield data as your desired format.
+
+    You can using standard linux command or call another program
+    to read data, from HDFS, Ceph, URL, AWS S3 etc:
+
+    cmd = "hadoop fs -cat /path/to/some/file"
+    cmd = "cat sample_file.tar.gz"
+    cmd = "curl http://someurl"
+    cmd = "python print_s3_bucket.py"
+
+    A sample parser:
+    
+    def sample_parser(lines):
+        # parse each line as one sample data,
+        # return a list of samples as batches.
+        ret = []
+        for l in lines:
+            ret.append(l.split(" ")[1:5])
+        return ret
+
+    :param left_cmd: command to excute to get stdout from.
+    :type left_cmd: string
+    :param parser: parser function to parse lines of data.
+                   if cut_lines is True, parser will receive list
+                   of lines.
+                   if cut_lines is False, parser will receive a
+                   raw buffer each time.
+                   parser should return a list of parsed values.
+    :type parser: callable
+    :param bufsize: the buffer size used for the stdout pipe.
+    :type bufsize: int
+    :param file_type: can be plain/gzip, stream buffer data type.
+    :type file_type: string
+    :param cut_lines: whether to pass lines instead of raw buffer
+                      to the parser
+    :type cut_lines: bool
+    :param line_break: line break of the file, like \n or \r
+    :type line_break: string
+
+    :return: the reader generator.
+    :rtype: callable
+    """
+    if not isinstance(left_cmd, str):
+        raise TypeError("left_cmd must be a string")
+    if not callable(parser):
+        raise TypeError("parser must be a callable object")
+
+    process = subprocess.Popen(
+        left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+    # TODO(typhoonzero): add a thread to read stderr
+
+    # Always init a decompress object is better than
+    # create in the loop.
+    dec = zlib.decompressobj(
+        32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+
+    def reader():
+        remained = ""
+        while True:
+            buff = process.stdout.read(bufsize)
+            if buff:
+                if file_type == "gzip":
+                    decomp_buff = dec.decompress(buff)
+                elif file_type == "plain":
+                    decomp_buff = buff
+                else:
+                    raise TypeError("file_type %s is not allowed" % file_type)
+
+                if cut_lines:
+                    lines, remained = _buf2lines(''.join(
+                        [remained, decomp_buff]), line_break)
+                    parsed_list = parser(lines)
+                    for ret in parsed_list:
+                        yield ret
+                else:
+                    for ret in parser(decomp_buff):
+                        yield ret
+            else:
+                break
+
+    return reader
diff --git a/python/setup.py.in b/python/setup.py.in
index 87b3823e52..d59a6a4780 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,8 +1,61 @@
-from setuptools import setup, Distribution
+from setuptools import setup, Distribution, Extension
+import subprocess
 class BinaryDistribution(Distribution):
     def has_ext_modules(foo):
         return True
 
+MAJOR   = 0
+MINOR   = 10
+PATCH   = 0
+RC      = 0
+ISTAGED = False
+
+
+
+def git_commit():
+    try:
+        cmd = ['git', 'rev-parse', 'HEAD']
+        git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
+    except:
+        git_commit = 'Unknown'
+    return git_commit
+
+def write_version_py(filename='paddle/version.py'):
+    cnt = '''
+# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+#
+full_version    = '%(major)d.%(minor)d.%(patch)d'
+major           = '%(major)d'
+minor           = '%(minor)d'
+patch           = '%(patch)d'
+rc              = '%(rc)d'
+istaged         = %(istaged)s
+commit          = '%(commit)s'
+
+def show():
+    if istaged:
+        print 'full_version:', full_version
+        print 'major:', major
+        print 'minor:', minor
+        print 'patch:', patch
+        print 'rc:', rc
+    else:
+        print 'commit:', commit
+'''
+    commit = git_commit()
+    with open(filename, 'w') as f:
+        f.write(cnt % {
+            'major': MAJOR,
+            'minor': MINOR,
+            'patch': PATCH,
+            'rc': RC,
+            'version': '${PADDLE_VERSION}',
+            'commit': commit,
+            'istaged': ISTAGED})
+
+write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
+
+
 packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
@@ -13,15 +66,15 @@ packages=['paddle',
           'paddle.v2.reader',
           'paddle.v2.master',
           'paddle.v2.plot',
-          'paddle.v2.framework',
-          'paddle.v2.framework.proto',
+          'paddle.v2.fluid',
+          'paddle.v2.fluid.proto',
           'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
 
 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
-    setup_requires+=["opencv-python"]
+    setup_requires+=['opencv-python']
 
 # the prefix is sys.prefix which should always be usr
 paddle_bin_dir = 'opt/paddle/bin'
@@ -41,19 +94,19 @@ setup(name='paddlepaddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
+      ext_modules=[Extension('_foo', ['stub.cc'])],
       package_data={
         'paddle.v2.master': ['libpaddle_master.so'],
-        'paddle.v2.framework': ['core.so'],
+        'paddle.v2.fluid': ['core.so'],
         'py_paddle':['*.py','_swig_paddle.so']
       },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
-          # The paddle.v2.framework.proto will be generated while compiling.
+          # The paddle.v2.fluid.proto will be generated while compiling.
           # So that package points to other directory.
-          'paddle.v2.framework.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
+          'paddle.v2.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
           'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
       scripts=paddle_bins,
-      distclass=BinaryDistribution,
       data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )