diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
index 1ac47212b5..4703944c87 100644
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -22,5 +22,5 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 def process(settings, file_list):
     for i in xrange(1024):
         img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        lab = random.randint(0, settings.num_class)
+        lab = random.randint(0, settings.num_class - 1)
         yield img.astype('float32'), int(lab)
diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh
new file mode 100755
index 0000000000..b6cd6fe03b
--- /dev/null
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -0,0 +1,48 @@
+set -e
+
+unset OMP_NUM_THREADS MKL_NUM_THREADS
+export OMP_DYNAMIC="FALSE"
+export KMP_AFFINITY="granularity=fine,compact,0,0"
+
+function train() {
+  topology=$1
+  bs=$2
+  use_mkldnn=$3
+  if [ $3 == "True" ]; then
+    thread=1
+    log="logs/${topology}-mkldnn-${bs}.log"
+  elif [ $3 == "False" ]; then
+    thread=`nproc`
+    log="logs/${topology}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $3, use True or False."
+  fi
+  args="batch_size=${bs}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args=$args \
+    2>&1 | tee ${log} 
+}
+
+if [ ! -d "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+#========== mkldnn ==========#
+train vgg 64 True
+train vgg 128 True
+train vgg 256 True
+
+#========== mklml ===========#
+train vgg 64 False
+train vgg 128 False
+train vgg 256 False
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
new file mode 100644
index 0000000000..b8429975f5
--- /dev/null
+++ b/benchmark/paddle/image/vgg.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg('layer_num', int, 19)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+img = data_layer(name='image', size=height * width * 3)
+
+
+def vgg_network(vgg_num=3):
+    tmp = img_conv_group(
+        input=img,
+        num_channels=3,
+        conv_padding=1,
+        conv_num_filter=[64, 64],
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_size=2,
+        pool_stride=2,
+        pool_type=MaxPooling())
+
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=[128, 128],
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+
+    channels = []
+    for i in range(vgg_num):
+        channels.append(256)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    channels = []
+    for i in range(vgg_num):
+        channels.append(512)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+
+
+if layer_num == 16:
+    vgg = vgg_network(3)
+elif layer_num == 19:
+    vgg = vgg_network(4)
+else:
+    print("Wrong layer number.")
+
+lab = data_layer('label', num_class)
+loss = cross_entropy(input=vgg, label=lab)
+outputs(loss)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 0bbf922931..ff9868fc4e 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -253,7 +253,7 @@ function(nv_library TARGET_NAME)
       foreach(source_file ${nv_library_SRCS})
         string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
         if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
         endif()
       endforeach()
       add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
diff --git a/cmake/util.cmake b/cmake/util.cmake
index ac911052eb..d1aee3e170 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -97,6 +97,10 @@ function(link_paddle_exe TARGET_NAME)
         target_link_libraries(${TARGET_NAME} log)
     endif(ANDROID)
 
+    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
+      target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+    endif()
+
     add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()
 
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md
new file mode 100644
index 0000000000..ac7e98ccf1
--- /dev/null
+++ b/doc/design/refactor/distributed_architecture.md
@@ -0,0 +1,222 @@
+# Design Doc: Distributed Training Architecture
+
+## Abstract
+
+PaddlePaddle v0.10.0 uses the "trainer-parameter server"
+architecture. We run multiple replicated instances of trainers (runs
+the same code written by the user) and parameter servers for
+distributed training. This architecture served us well, but has some
+limitations:
+
+1. Need to write special code to handle tasks which should only be run
+  by a single trainer. E.g., initializing model and saving model.
+
+2. Model parallelism is hard: need to write if-else branches conditioned
+  on the trainer ID to partition model onto each trainer, and manually
+  write the inter-model-shard communication code.
+
+3. The user can not directly specify the parameter update rule: need
+   to modify the parameter server C++ code and compile a new
+   binary. This adds complication for researchers: A lot of extra
+   effort is required. Besides, the training job submission program
+   may not allow running arbitrary binaries.
+
+This design doc discusses PaddlePaddle's new distributed training
+architecture that addresses the above limitations.
+
+## Analysis
+
+We will assume the user writes the trainer program by Python, the same
+analysis holds if the trainer program is written in C++.
+
+### Limitation 1
+
+If we look at the Python code that the user writes, there are two
+kinds of functionalities:
+
+- The training logic such as load / save model and print log.
+- The neural network definition such as the definition of the data
+  layer, the fully connected layer, the cost function and the
+  optimizer.
+
+When we training with PaddlePaddle v0.10.0 distributedly, multiple
+replicated Python instances are running on different nodes: both the
+training logic and the neural network computation is replicated.
+
+The tasks that should only run once all belong to the training logic,
+if we only replicate the neural network computation, but do **not**
+replicate the training logic, the limitation could be solved.
+
+### Limitation 2
+
+Model parallelism means running a single model on multiple nodes by
+partitioning the model onto different nodes and managing the
+inter-model-shard communications.
+
+PaddlePaddle should be able to modify the nerual network computation
+definition to support model parallelism automatically. However, the
+computation is only specified in Python code, and PaddlePaddle can not
+modify Python code.
+
+Just like compiler uses a intermediate representation (IR) so that
+programmer does not need to manually optimize their code in most of
+the cases - the compiler will optimize the IR:
+
+<img src="src/compiler.png"/>
+
+We can have our own IR too: PaddlePaddle can support model parallel by
+converting the IR so the user no longer need to manually do it in
+Python:
+
+<img src="src/paddle-compile.png"/>
+
+The IR for PaddlePaddle after refactor is called `Block`, it specifies
+the computation dependency graph and the variables used in the
+computation.
+
+### Limitation 3
+
+The user can not directly specify the parameter update rule for the
+parameter server because the parameter server does not use the same
+computation definition as the trainer. Instead, the update rule is
+baked in the parameter server. The user can not specify the update
+rule in the same way of specifying the trainer computation.
+
+This could be fixed by making the parameter server run the same
+computation definition as the trainer. For a detailed explanation,
+please
+see
+[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
+
+## Distributed Training Architecture
+
+The new distributed training architecture can address the above
+limitations. Below is the illustration:
+
+<img src="src/distributed_architecture.png"/>
+
+The architecture includes major components: *PaddlePaddle Python*,
+*PaddlePaddle converter* and *PaddlePaddle runtime*:
+
+### PaddlePaddle Python
+
+PaddlePaddle Python is the Python library that user's Python trainer
+invoke to build the neural network topology, start training, etc.
+
+```Python
+paddle.init()
+input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
+img, label = input[0], input[1]
+hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
+prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
+cost = paddle.layer.classification_cost(input=prediction, label=label)
+optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
+session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
+for i in range(1000):
+	_, cost_val = session.eval(targets=[cost, optimizer])
+	print cost_val
+```
+
+The code above is a typical Python trainer code, the neural network
+topology is built using helper functions such as
+`paddle.layer.fc`. The training is done by calling `session.eval`
+iteratively.
+
+#### session.eval
+
+As shown in the graph, `session.eval` sends the IR and the evaluation
+inputs/targets to the PaddlePaddle cluster for evaluation. The
+targets can be any variable in the computation graph. When the target
+is the `optimizer` variable, the neural network will be optimized
+once. When the target is the `cost` variable, `session.eval` returns
+the cost value.
+
+The Python `session` is a wrapper of the C++ `Session` class. For more
+information about `Session`, please
+see [Design Doc: Session](./session.md).
+
+### PaddlePaddle Converter
+
+PaddlePaddle converter automatically converts the IR in the request
+(IR and evaluation inputs/targets) from PaddlePaddle Python to new
+partitioned IRs and dispatch the new IRs and evaluation inputs/targets
+to different PaddlePaddle runtimes. Below are the steps:
+
+1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
+   fetches the eval targets to the IR.
+
+1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
+   the boundary. The runtime does not need to run the OP that is not
+   dependent by the `fetch` OP.
+
+1. Optimizes the computation graph.
+
+1. Place the OPs in the graph onto different devices on different
+   PaddlePaddle runtime according to a placement algorithm and device
+   constraint specified by the user.
+
+1. Partition the graph according to runtime boundaries and add `send` /
+   `recv` OP pair on the runtime boundaries.
+
+1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+
+1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
+   results back to the converter, the convert reports the evaluation
+   results back to the PaddlePaddle Python.
+   
+The output IRs will be cached to optimize the conversion latency.
+
+
+#### Placement Algorithm
+
+Our first implementation will only support "trainer-parameter server"
+placement: the parameters, initializers, and optimizers are placed on
+the PaddlePaddle runtimes with the parameter server role. And
+everything else will be placed on the PaddlePaddle runtimes with the
+trainer role. This has the same functionality of our
+"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
+is more general and flexible.
+
+In the future, we will implement the general placement algorithm,
+which makes placements according to the input IR, and a model of
+device computation time and device communication time. Model
+parallelism requires the general placement algorithm.
+
+
+### PaddlePaddle Runtime
+
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
+runs the IR. The runtime does not need to do OP placement since it's
+already done by the converter.
+
+
+### Local Training Architecture
+
+The local training architecture will be the same as the distributed
+training architecture, the differences are everything runs locally,
+and there is just one PaddlePaddle runtime:
+
+<img src="src/local_architecture.png"/>
+
+
+### Training Data
+
+In PaddlePaddle v0.10.0, training data is typically read
+with [data reader](../reader/README.md) from Python. This approach is
+no longer efficient when training distributedly since the Python
+process no longer runs on the same node with the trainer processes,
+the Python reader will need to read from the distributed filesystem
+(assuming it has the access) and send to the trainers, doubling the
+network traffic.
+
+When doing distributed training, the user can still use Python data
+reader: the training data are sent with `session.eval`. However should
+be used for debugging purpose only. The users are encouraged to use
+the read data OPs.
+
+
+## References:
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
+
+[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
diff --git a/doc/design/ops/dist_train.md b/doc/design/refactor/parameter_server.md
similarity index 100%
rename from doc/design/ops/dist_train.md
rename to doc/design/refactor/parameter_server.md
diff --git a/doc/design/refactor/src/compiler.graffle b/doc/design/refactor/src/compiler.graffle
new file mode 100644
index 0000000000..8cc678fea3
Binary files /dev/null and b/doc/design/refactor/src/compiler.graffle differ
diff --git a/doc/design/refactor/src/compiler.png b/doc/design/refactor/src/compiler.png
new file mode 100644
index 0000000000..65d34f841a
Binary files /dev/null and b/doc/design/refactor/src/compiler.png differ
diff --git a/doc/design/ops/src/dist-graph.graffle b/doc/design/refactor/src/dist-graph.graffle
similarity index 100%
rename from doc/design/ops/src/dist-graph.graffle
rename to doc/design/refactor/src/dist-graph.graffle
diff --git a/doc/design/ops/src/dist-graph.png b/doc/design/refactor/src/dist-graph.png
similarity index 100%
rename from doc/design/ops/src/dist-graph.png
rename to doc/design/refactor/src/dist-graph.png
diff --git a/doc/design/refactor/src/distributed_architecture.graffle b/doc/design/refactor/src/distributed_architecture.graffle
new file mode 100644
index 0000000000..f8496e5732
Binary files /dev/null and b/doc/design/refactor/src/distributed_architecture.graffle differ
diff --git a/doc/design/refactor/src/distributed_architecture.png b/doc/design/refactor/src/distributed_architecture.png
new file mode 100644
index 0000000000..410c4510c6
Binary files /dev/null and b/doc/design/refactor/src/distributed_architecture.png differ
diff --git a/doc/design/ops/src/local-graph.graffle b/doc/design/refactor/src/local-graph.graffle
similarity index 100%
rename from doc/design/ops/src/local-graph.graffle
rename to doc/design/refactor/src/local-graph.graffle
diff --git a/doc/design/ops/src/local-graph.png b/doc/design/refactor/src/local-graph.png
similarity index 100%
rename from doc/design/ops/src/local-graph.png
rename to doc/design/refactor/src/local-graph.png
diff --git a/doc/design/refactor/src/local_architecture.graffle b/doc/design/refactor/src/local_architecture.graffle
new file mode 100644
index 0000000000..cc7783c453
Binary files /dev/null and b/doc/design/refactor/src/local_architecture.graffle differ
diff --git a/doc/design/refactor/src/local_architecture.png b/doc/design/refactor/src/local_architecture.png
new file mode 100644
index 0000000000..4b999538b7
Binary files /dev/null and b/doc/design/refactor/src/local_architecture.png differ
diff --git a/doc/design/refactor/src/paddle-compile.graffle b/doc/design/refactor/src/paddle-compile.graffle
new file mode 100644
index 0000000000..a6348cc3db
Binary files /dev/null and b/doc/design/refactor/src/paddle-compile.graffle differ
diff --git a/doc/design/refactor/src/paddle-compile.png b/doc/design/refactor/src/paddle-compile.png
new file mode 100644
index 0000000000..e0f13d551a
Binary files /dev/null and b/doc/design/refactor/src/paddle-compile.png differ
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index acbf4c87ae..d69d111917 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -247,11 +247,11 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
     CMake Warning at cmake/version.cmake:20 (message):
       Cannot add paddle version from git tag
-          
+
 那么用户需要拉取所有的远程分支到本机，命令为 :code:`git fetch upstream`，然后重新cmake即可。
 
 12. A protocol message was rejected because it was too big
-----------------------------------------------------------
+------------------------------------------------------------
 
 如果在训练NLP相关模型时，出现以下错误：
 
@@ -316,10 +316,42 @@ Paddle二进制在运行时捕获了浮点数异常，只要出现浮点数异
 * 模型一直不收敛，发散到了一个数值特别大的地方。
 * 训练数据有问题，导致参数收敛到了一些奇异的情况。或者输入数据尺度过大，有些特征的取值达到数百万，这时进行矩阵乘法运算就可能导致浮点数溢出。
 
-主要的解决办法是减小学习率或者对数据进行归一化处理。
+这里有两种有效的解决方法：
+
+1. 设置 :code:`gradient_clipping_threshold` 参数，示例代码如下：
+
+..  code-block:: python
+
+optimizer = paddle.optimizer.RMSProp(
+    learning_rate=1e-3,
+    gradient_clipping_threshold=10.0,
+    regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+具体可以参考  `nmt_without_attention  <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_ 示例。
+
+2. 设置 :code:`error_clipping_threshold` 参数，示例代码如下：
+
+..  code-block:: python
+
+decoder_inputs = paddle.layer.fc(
+    act=paddle.activation.Linear(),
+    size=decoder_size * 3,
+    bias_attr=False,
+    input=[context, current_word],
+    layer_attr=paddle.attr.ExtraLayerAttribute(
+        error_clipping_threshold=100.0))
+
+完整代码可以参考示例 `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_ 。
+
+两种方法的区别：
+
+1. 两者都是对梯度的截断，但截断时机不同，前者在 :code:`optimzier` 更新网络参数时应用；后者在激活函数反向计算时被调用；
+2. 截断对象不同：前者截断可学习参数的梯度，后者截断回传给前层的梯度;
+
+除此之外，还可以通过减小学习律或者对数据进行归一化处理来解决这类问题。
 
 15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
-------------------------------------------------------------------------
+------------------------------------------------------------------------------------------
 先查看一下是否曾经安装过paddle v1版本，有的话需要先卸载：
 
 pip uninstall py_paddle paddle
@@ -329,7 +361,7 @@ pip uninstall py_paddle paddle
 pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
 
 16. PaddlePaddle存储的参数格式是什么，如何和明文进行相互转化
----------------------------------------------------------
+---------------------------------------------------------------------
 
 PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中，1~4字节表示PaddlePaddle版本信息，请直接填充0；5~8字节表示每个参数占用的字节数，当保存的网络参数为float类型时为4，double类型时为8；9~16字节表示保存的参数总个数。
 
@@ -381,7 +413,7 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
     parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
 
 18. 集群多节点训练，日志中保存均为网络通信类错误
-------------------------------
+-----------------------------------------------------------
 
 集群多节点训练，日志报错为网络通信类错误，比如 :code:`Connection reset by peer` 等。
 此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出，从而引发其他节点无法连接导致，可以参考下面的步骤排查：
@@ -390,4 +422,171 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
 
 * 如果发现最早的报错就是网络通信的问题，很有可能是非独占方式执行导致的端口冲突，可以联系OP，看当前MPI集群是否支持resource=full参数提交，如果支持增加此参数提交，并更换job 端口。
 
-* 如果当前MPI集群并不支持任务独占模式，可以联系OP是否可以更换集群或升级当前集群。
\ No newline at end of file
+* 如果当前MPI集群并不支持任务独占模式，可以联系OP是否可以更换集群或升级当前集群。
+
+19.  如何调用 infer 接口输出多个layer的预测结果
+-----------------------------------------------------------
+
+* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入，代码如下：
+
+..  code-block:: python
+
+    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
+
+* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例，代码如下：
+
+..  code-block:: python
+
+    out = inferer.infer(input=data_batch, field=["value"])
+
+需要注意的是：
+
+* 如果指定了2个layer作为输出层，实际上需要的输出结果是两个矩阵；
+* 假设第一个layer的输出A是一个 N1 * M1 的矩阵，第二个 Layer 的输出B是一个 N2 * M2 的矩阵；
+* paddle.v2 默认会将A和B 横向拼接，当N1 和 N2 大小不一样时，会报如下的错误：
+
+..      code-block:: python
+
+    ValueError: all the input array dimensions except for the concatenation axis must match exactly
+
+多个层的输出矩阵的高度不一致导致拼接失败，这种情况常常发生在：
+
+* 同时输出序列层和非序列层；
+* 多个输出层处理多个不同长度的序列;
+
+此时可以在调用infer接口时通过设置 :code:`flatten_result=False` , 跳过“拼接”步骤，来解决上面的问题。这时，infer接口的返回值是一个python list:
+
+* list 中元素的个数等于网络中输出层的个数；
+* list 中每个元素是一个layer的输出结果矩阵，类型是numpy的ndarray；
+* 每一个layer输出矩阵的高度，在非序列输入时：等于样本数；序列输入时等于：输入序列中元素的总数；宽度等于配置中layer的size；
+
+20. :code:`paddle.layer.memory` 的参数 :code:`name` 如何使用
+-------------------------------------------------------------
+
+* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出，该layer是通过参数 :code:`name` 指定，即，:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer，并将该layer上一时间步的输出作为自身当前时间步的输出。
+
+* PaddlePaddle的所有layer都有唯一的name，用户通过参数 :code:`name` 设定，当用户没有显式设定时，PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer，其name由参数 :code:`memory_name` 设定，当用户没有显式设定时，PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer，需要用户显式设定。
+
+21. 两种使用 drop_out 的方法有何区别？
+-----------------------------------------------------
+
+* 在PaddlePaddle中使用dropout有两种方式
+
+  * 在相应layer的 :code:`layer_atter` 设置 :code:`drop_rate`，以 :code:`paddle.layer.fc` 为例，代码如下：
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+
+  * 使用 :code:`paddle.layer.dropout`，以 :code:`paddle.layer.fc` 为例，代码如下：
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input)
+      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+
+* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`，并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。
+
+* PaddlePaddle在激活函数里实现dropout，而不是在layer里实现。
+
+* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活，所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout，可采用第二种方式，即使用 :code:`paddle.layer.dropout`。
+
+22. 如何设置学习率退火（learning rate annealing）
+------------------------------------------------
+
+在相应的优化算法里设置learning_rate_schedule及相关参数，以使用Adam算法为例，代码如下：
+
+..  code-block:: python
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3,
+        learning_rate_decay_a=0.5,
+        learning_rate_decay_b=0.75,
+        learning_rate_schedule="poly",)
+
+PaddlePaddle目前支持8种learning_rate_schedule，这8种learning_rate_schedule及其对应学习率计算方式如下：
+
+* "constant"
+
+  lr = learning_rate
+
+* "poly"
+
+  lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+
+  其中，num_samples_processed为已训练样本数，下同。
+
+* "caffe_poly"
+
+  lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+
+* "exp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+
+* "discexp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+
+* "linear"
+
+  lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+
+* "manual"
+
+  这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时，用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数，当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例，代码如下：
+
+  ..  code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+
+  在该示例中，当已训练样本数小于等于1000时，学习率为 :code:`1e-3 * 1.0`；当已训练样本数大于1000小于等于2000时，学习率为 :code:`1e-3 * 0.9`；当已训练样本数大于2000时，学习率为 :code:`1e-3 * 0.8`。
+
+* "pass_manual"
+
+  这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时，用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数，当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例，代码如下：
+
+  ..  code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1:1.0,2:0.9,3:0.8",)
+
+  在该示例中，当已训练pass数小于等于1时，学习率为 :code:`1e-3 * 1.0`；当已训练pass数大于1小于等于2时，学习率为 :code:`1e-3 * 0.9`；当已训练pass数大于2时，学习率为 :code:`1e-3 * 0.8`。
+
+23. 出现 :code:`Duplicated layer name` 错误怎么办
+--------------------------------------------------
+
+出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时，先找出参数 :code:`name` 取值相同的layer，然后将这些layer的参数 :code:`name` 设置为不同的值。
+
+24. PaddlePaddle 中不同的 recurrent layer 的区别
+--------------------------------------------------
+以LSTM为例，在PaddlePaddle中包含以下 recurrent layer：
+
+* :code:`paddle.layer.lstmemory`
+* :code:`paddle.networks.simple_lstm`
+* :code:`paddle.networks.lstmemory_group`
+* :code:`paddle.networks.bidirectional_lstm`
+
+按照具体实现方式可以归纳为2类：
+
+1. 由 recurrent_group 实现的 recurrent layer：
+
+  * 用户在使用这一类recurrent layer时，可以访问由recurrent unit在一个时间步内计算得到的中间值（例如：hidden states, memory cells等）；
+  * 上述的 :code:`paddle.networks.lstmemory_group` 是这一类的 recurrent layer ；
+
+2. 将recurrent layer作为一个整体来实现：
+
+  * 用户在使用这一类recurrent layer，只能访问它们的输出值；
+  * 上述的 :code:`paddle.networks.lstmemory_group` 、 :code:`paddle.networks.simple_lstm` 和 :code:`paddle.networks.bidirectional_lstm` 属于这一类的实现；
+
+将recurrent layer作为一个整体来实现， 能够针对CPU和GPU的计算做更多优化， 所以相比于recurrent group的实现方式， 第二类 recurrent layer 计算效率更高。 在实际应用中，如果用户不需要访问LSTM的中间变量，而只需要获得recurrent layer计算的输出，我们建议使用第二类实现。
+
+此外，关于LSTM, PaddlePaddle中还包含 :code:`paddle.networks.lstmemory_unit` 这一计算单元：
+
+  * 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程，它并不是一个完整的recurrent layer，也不能接收序列数据作为输入；
+  * :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用；
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 84e3317774..30b144d849 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -20,7 +20,7 @@ Docker使用入门
 		  
      docker pull paddlepaddle/paddle:0.10.0
 
-  来下载Docker镜像，paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的，推荐国内用户使用ocker.paddlepaddle.org/paddle下载。
+  来下载Docker镜像，paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的，推荐国内用户使用docker.paddlepaddle.org/paddle下载。
 
 - *容器*： 如果说一个Docker镜像就是一个程序，那容器就是这个程序运行时产生的“进程”。
   实际上，一个容器就是一个操作系统的进程，但是是运行在独立的进程空间，文件系统以及网络之上。
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
new file mode 100644
index 0000000000..60681cdd71
--- /dev/null
+++ b/doc/howto/dev/new_op_en.md
@@ -0,0 +1,356 @@
+# How to write a new operator
+
+ - [Background](#background)
+ - [Implementing C++ Types](#implementing-c++-types)
+   - [Defining ProtoMaker](#defining-protoMaker)
+   - [Defining Operator](#defining-operator)
+   - [Registering Operator](#registering-operator)
+   - [Compilation](#compilation)
+ - [Python Binding](#python-binding)
+ - [Unit Tests](#unit-tests)
+   - [Testing Forward Operators](#testing-forward-operators)
+   - [Testing Backward Operators](#testing-backward-operators)
+   - [Compiling and Running](#compiling-and-running)
+ - [Remarks](#remarks)
+## Background
+
+Here are the base types needed. For details, please refer to the design docs.
+
+- `framework::OperatorBase`: Operator (Op)base class.
+- `framework::OpKernel`: Base class for Op computation.
+- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation.
+- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
+
+An operator can be differentiated by whether in has kernel methods. An operator with kernel inherits from `OperatorWithKernel` while the ones without inherit from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
+
+
+ Information           | Where is it defined
+--------------  | :----------------------
+OpProtoMake definition  | `.cc`files, Backward Op does not need an OpProtoMake interface.
+Op definition           | `.cc` files
+Kernel implementation       | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files.
+Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation.
+
+
+New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
+
+
+Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
+
+
+## Implementing C++ Types
+
+
+### 1. Defining Class ProtoMaker
+
+Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output.
+
+First, define `ProtoMaker` to describe the Operator's input, output, and additional comments:
+
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+    AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor：
+
+   - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
+   - `framework::OpAttrChecker` is used to validate variable attributes.
+
+The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
+
+The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md).
+
+
+An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows:
+
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+```
+
+There are two changes in this example:
+
+- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`.
+
+- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.
+
+
+### 2. Defining Operator
+
+The following code defines the interface for MulOp:
+
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member
+
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as
+
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+
+`InferShape` interface needs to be re-written.`InferShape` is a constant method and cannot modify Op's member variables, its constant member `const framework::InferShapeContext &ctx` can be used to extract input, output, and attributes. It functions to
+
+  - 1). validate and error out early: it checks input data dimensions and types.
+  - 2). configures the tensor shape in the output.
+
+Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
+
+### 3. Defining OpKernel
+
+`MulKernel` inherits `framework::OpKernel`, which includes the following templates:
+
+- `typename  Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+
+- `typename T` denotes data type, such as `float` or `double`.
+
+`MulKernel` types need to rewrite the interface for `Compute`.
+- `Compute` takes one input variable `const framework::ExecutionContext& context`.
+- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables.
+- `Compute` implements the computation logics of an `OpKernel`.
+
+`MulKernel`'s implementation of `Compute` is as follows:
+
+  ```cpp
+  template <typename Place, typename T>
+  class MulKernel : public framework::OpKernel {
+  public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto* device_context =
+        const_cast<platform::DeviceContext*>(context.device_context_);
+    math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+  };
+  ```
+
+Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+
+`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+
+To ease the writing of `OpKernel` compute, and for reusing code cross-device, `Eigen unsupported Tensor` module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
+
+
+This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
+
+The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
+
+### 4. Registering Operator
+
+- In `.cc` files, register forward and backward operator classes and the CPU kernel.
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
+    REGISTER_OP_CPU_KERNEL(mul_grad,
+                  ops::MulGradKernel<paddle::platform::CPUPlace, float>);
+    ```
+
+   In that code block,
+
+    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
+    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
+    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulKernel`.
+
+
+- Registering GPU Kernel in `.cu` files
+    - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
+
+    ```cpp
+    // if use Eigen unsupported module before include head files
+    #define EIGEN_USE_GPU
+
+    namespace ops = paddle::operators;
+    REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
+    REGISTER_OP_GPU_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::GPUPlace, float>);
+    ```
+
+### 5. Compilation
+
+Run the following commands to compile.
+
+```
+make mul_op
+```
+
+## Python Binding
+
+The system will automatically bind to Python and link it to a generated library.
+
+## Unit Tests
+
+Unit tests for an operator include
+
+1. comparing a forward operator's implementations on different devices,
+
+2. comparing a backward operator's implementation on different devices, and
+
+3. a scaling test for the backward operator.
+
+Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
+
+### Testing Forward Operators
+
+A forward operator unit test inherits `unittest.TestCase` and defines metaclass `__metaclass__ = OpTestMeta`. More concrete tests are performed in `OpTestMeta`. Testing a forward operator requires the following:
+
+1. Defining input, output and relevant attributes in `setUp` method.
+
+2. Generating random input data.
+
+3. Implementing the same computation logic in a Python script:
+
+  ```python
+  import unittest
+  import numpy as np
+  from gradient_checker import GradientChecker, create_op
+  from op_test_util import OpTestMeta
+
+  class TestMulOp(unittest.TestCase):
+      __metaclass__ = OpTestMeta
+
+      def setUp(self):
+          self.type = "mul"
+          self.inputs = {
+              'X': np.random.random((32, 84)).astype("float32"),
+              'Y': np.random.random((84, 100)).astype("float32")
+          }
+          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+  ```
+Get its output, and compare it with the forward operator's own output.
+
+The code above first loads required packages. In addition, we have
+
+- `self.type = "mul" ` defines the type that is identical to what the operator's registered type.
+- `self.inputs` defines input, with type `numpy.array` and initializes it.
+- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script.
+
+### Testing Backward Operators
+
+A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**.
+
+```python
+class TestMulGradOp(GradientChecker):
+    def setUp(self):
+        self.op = create_op("mul")
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+
+    def test_cpu_gpu_compare(self):
+        self.compare_grad(self.op, self.inputs)
+
+    def test_normal(self):
+        # mul op will enlarge the relative error
+        self.check_grad(
+            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
+
+    def test_ignore_x(self):
+        self.check_grad(
+            self.op,
+            self.inputs, ["Y"],
+            "Out",
+            max_relative_error=0.5,
+            no_grad_set={"X"})
+
+    def test_ignore_y(self):
+        self.check_grad(
+            self.op,
+            self.inputs, ["X"],
+            "Out",
+            max_relative_error=0.5,
+            no_grad_set={"Y"})
+```
+
+Some key points in the code above include:
+
+- `create_op("mul")` creates the backward operator's corresponding forward operator.
+- `compare_grad` compares results between utilizing the CPU and the GPU.
+- `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
+  - The first variable `self.op` denotes the forward operator.
+  - The second variable `self.inputs` denotes the input dictionary, which has its key value identical to its `ProtoMaker` definitions.
+  - The third variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
+  - The fourth variable `"Out"` points to the network's final output target `Out`.
+- `test_ignore_x` and `test_ignore_y`branches test the cases where there is only one scaling input.
+
+### Compiling and Running
+
+
+Any new unit testing file of the format `test_*.py`  added to the director `python/paddle/v2/framework/tests` is automatically added to the project to compile.
+
+Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`.
+
+After successfully compiling the project, run the following command to run unit tests:
+
+```bash
+make test ARGS="-R test_mul_op -V"
+```
+
+Or,
+
+```bash
+ctest -R test_mul_op
+```
+
+## Remarks
+
+- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
+- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
+- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
+- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index e535f84dba..5b0c18cc6c 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -26,7 +26,7 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 
 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
-cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker)
+cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
 
diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc
index fda89252e3..510dc28c57 100644
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -28,47 +28,6 @@ ProgramDesc& GetProgramDesc() {
   return *g_program_desc;
 }
 
-template <>
-AttrType AttrTypeID<bool>() {
-  return BOOLEAN;
-}
-template <>
-AttrType AttrTypeID<int>() {
-  return INT;
-}
-template <>
-AttrType AttrTypeID<float>() {
-  return FLOAT;
-}
-template <>
-AttrType AttrTypeID<std::string>() {
-  return STRING;
-}
-template <>
-AttrType AttrTypeID<std::vector<bool>>() {
-  return BOOLEANS;
-}
-template <>
-AttrType AttrTypeID<std::vector<int>>() {
-  return INTS;
-}
-template <>
-AttrType AttrTypeID<std::vector<float>>() {
-  return FLOATS;
-}
-template <>
-AttrType AttrTypeID<std::vector<std::string>>() {
-  return STRINGS;
-}
-template <>
-AttrType AttrTypeID<std::vector<std::pair<int, int>>>() {
-  return INT_PAIRS;
-}
-template <>
-AttrType AttrTypeID<BlockDesc>() {
-  return BLOCK;
-}
-
 Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
   switch (attr_desc.type()) {
     case framework::AttrType::BOOLEAN: {
@@ -111,14 +70,6 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
       }
       return val;
     }
-    case framework::AttrType::INT_PAIRS: {
-      std::vector<std::pair<int, int>> val(attr_desc.int_pairs_size());
-      for (int i = 0; i < attr_desc.int_pairs_size(); ++i) {
-        val[i].first = attr_desc.int_pairs(i).first();
-        val[i].second = attr_desc.int_pairs(i).second();
-      }
-      return val;
-    }
     case framework::AttrType::BLOCK: {
       return GetProgramDesc().mutable_blocks(attr_desc.block_idx());
     }
diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index 48b54b5422..c7559cefb6 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -27,10 +27,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-typedef boost::variant<boost::blank, bool, int, float, std::string,
-                       std::vector<bool>, std::vector<int>, std::vector<float>,
-                       std::vector<std::string>,
-                       std::vector<std::pair<int, int>>, BlockDesc*>
+// The order should be as same as framework.proto
+typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
+                       std::vector<float>, std::vector<std::string>, bool,
+                       std::vector<bool>, BlockDesc*>
     Attribute;
 
 typedef std::unordered_map<std::string, Attribute> AttributeMap;
@@ -38,10 +38,28 @@ typedef std::unordered_map<std::string, Attribute> AttributeMap;
 ProgramDesc& GetProgramDesc();
 
 template <typename T>
-AttrType AttrTypeID();
+inline AttrType AttrTypeID() {
+  Attribute tmp = T();
+  return static_cast<AttrType>(tmp.which() - 1);
+}
 
 Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
 
+class AttrReader {
+ public:
+  explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {}
+
+  template <typename T>
+  inline const T& Get(const std::string& name) const {
+    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
+                   name);
+    return boost::get<T>(attrs_.at(name));
+  }
+
+ private:
+  const AttributeMap& attrs_;
+};
+
 // check whether a value(attribute) fit a certain limit
 template <typename T>
 class GreaterThanChecker {
diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md
index 0a6d762bc8..ac60be5724 100644
--- a/paddle/framework/backward.md
+++ b/paddle/framework/backward.md
@@ -2,7 +2,7 @@
 
 ## Motivation
 
-In Neural Network, many model is solved by the the backpropagation algorithm(known as BP) at present. Technically it caculates the gradient of the loss function, then distributed back through the networks. Follows the chain rule, so we need a module chains the gradient operators/expressions together with to construct the backward pass. Every forward network needs a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass. 
+In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. Hence we need a module that chains the gradient operators/expressions together to construct the backward pass. Every forward network needs a backward network to construct the full computation graph. The operator/expression's backward pass will be generated with respect to the forward pass. 
 
 ## Implementation
 
@@ -24,9 +24,9 @@ A backward network is built up with several backward operators. Backward operato
 | **Operator::inputs_**  | Inputs       | Inputs, Outputs, OutputGradients |	
 | **Operator::outputs_** | Outputs          | InputGradients            |
 
- In most cases, there is a one-to-one correspondence between the forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
+ In most cases, there is a one-to-one relation between the forward and backward operators. These relations are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and to make operators pluggable, the registry mechanism is introduced.
 
-For example, we have got a `mul_op`, and we can register its information and corresponding backward operator by the following macro:
+For example, we have `mul_op`, and we can register its information and corresponding backward operator by the following macro:
 
 ```cpp
 REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
@@ -48,7 +48,7 @@ The function `BuildGradOp` will sequentially execute following processes:
 
 1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
 
-2. Build two maps named `inputs` and `outputs` to temporary storage backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these, are not necessary for gradient computing.
+2. Build two maps named `inputs` and `outputs` to temporarily store backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these, are not necessary for gradient computing.
 
 3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`.
 
@@ -56,11 +56,11 @@ The function `BuildGradOp` will sequentially execute following processes:
 
 ### Backward Network Building
 
-A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and append them together one by one. There is some corner case need to process specially.
+A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and appending them together one by one. There are some corner cases that need special processing.
 
 1. Op 
 
-   When the input forward network is an Op, return its gradient Operator Immediately. If all of its outputs are in no gradient set, then return a special `NOP`.
+   When the input forward network is an Op, return its gradient Operator immediately. If all of its outputs are in no gradient set, then return a special `NOP`.
 
 2. NetOp 
 
@@ -68,33 +68,33 @@ A backward network is a series of backward operators. The main idea of building
 
 3. RnnOp
 
-   RnnOp is a nested stepnet operator.  Backward module need to recusively call `Backward` for every stepnet.
+   RnnOp is a nested stepnet operator.  Backward module needs to recusively call `Backward` for every stepnet.
 
 4. Sharing Variables
 
-   **sharing variables**. As illustrated in the pictures, two operator's share the same variable name of W@GRAD, which will overwrite their sharing input variable. 
+   As illustrated in the figure 1 and figure 2, two operators share the same variable name **W@GRAD**, which will overwrite their shared input variable. 
 
 <p align="center">
 <img src="./images/duplicate_op.png" width="50%" ><br/>
 
-​	pic 1. Sharing variables in operators. 
+​	Figure 1. Sharing variables in operators. 
 
 </p>
 
-​	Sharing variable between operators or same input variable used in multiple operators leads to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively and add a generic add operator to replace the overwrite links. 
+​	Sharing variable between operators or same input variable used in multiple operators can lead to duplicate gradient variables. As illustrated in figure 2, we need to rename the gradient names recursively and add a generic add operator to prevent overwriting. 
 
 <p align="center">
 <img src="images/duplicate_op2.png" width="40%" ><br/>
 
-​	pic 2. Replace sharing variable's gradient with `Add` operator.
+​	Figure 2. Replace sharing variable's gradient with `Add` operator.
 
 </p>
 
-​	Because our framework finds variables accord to their names, we need to rename the output links. We add a suffix of number to represent its position in clockwise. 
+​	Because the framework finds variables according to their names, we need to rename the output links. We add an integer suffix to represent its position in the clockwise direction. 
 
-5. Part of Gradient is Zero.
+5. Part of the Gradient is Zero.
 
-   In the whole graph, there is some case of that one operator's gradient is not needed, but its input's gradient is a dependency link of other operator,  we need to fill a same shape gradient matrix in the position. In our implement, we insert a special `fillZeroLike` operator.
+   In the whole graph, there is some case of that one operator's gradient is not needed, but its input's gradient is a dependency link of other operator,  we need to fill a same shape gradient matrix in the position. In our implementation, we insert a special `fillZeroLike` operator.
 
 
 Follow these rules above, then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 6fcfe6de25..951c7afbc1 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -22,17 +22,11 @@ enum AttrType {
   INTS = 3;
   FLOATS = 4;
   STRINGS = 5;
-  INT_PAIRS = 6;
-  BOOLEAN = 7;
-  BOOLEANS = 8;
-  BLOCK = 9;
+  BOOLEAN = 6;
+  BOOLEANS = 7;
+  BLOCK = 8;
 }
 
-message IntPair {
-  required int32 first = 1;
-  required int32 second = 2;
-};
-
 // OpDesc describes an instance of a C++ framework::OperatorBase
 // derived class type.
 message OpDesc {
@@ -46,7 +40,6 @@ message OpDesc {
     repeated int32 ints = 6;
     repeated float floats = 7;
     repeated string strings = 8;
-    repeated IntPair int_pairs = 9;
     optional bool b = 10;
     repeated bool bools = 11;
     optional int32 block_idx = 12;
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index e00c6e8d90..b8fdf69683 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -174,4 +174,4 @@ TEST(OpRegistry, CustomChecker) {
   op->Run(scope, dev_ctx);
   int test_attr = op->Attr<int>("test_attr");
   ASSERT_EQ(test_attr, 4);
-}
\ No newline at end of file
+}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index fcbfc3e437..a3f28339aa 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/framework/operator.h"
 #include <algorithm>
-#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -33,6 +32,24 @@ ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
 }
 #endif
 
+const Tensor* GetTensorFromVar(const Variable* var) {
+  if (var->IsType<LoDTensor>()) {
+    return &var->Get<LoDTensor>();
+  }
+  PADDLE_ENFORCE(var->IsType<Tensor>(),
+                 "The Input must be LoDTensor or Tensor.");
+  return &var->Get<Tensor>();
+}
+
+Tensor* GetTensorFromVar(Variable* var) {
+  if (var->IsType<LoDTensor>()) {
+    return var->GetMutable<LoDTensor>();
+  }
+  PADDLE_ENFORCE(var->IsType<Tensor>(),
+                 "The Input must be LoDTensor or Tensor.");
+  return var->GetMutable<Tensor>();
+}
+
 std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(ins.size(), 1UL,
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 2d6d5510ef..77c7c855c0 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/scope.h"
+#include "paddle/framework/shape_inference.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/place.h"
@@ -56,6 +57,9 @@ class OperatorBase;
 class InferShapeContext;
 class ExecutionContext;
 
+extern const Tensor* GetTensorFromVar(const Variable* var);
+extern Tensor* GetTensorFromVar(Variable* var);
+
 /**
  * OperatorBase has the basic element that Net will call to do computation.
  * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -262,15 +266,6 @@ class InferShapeContext {
     return res;
   }
 
-  const Tensor* GetTensorFromVar(const Variable* var) const {
-    if (var->IsType<LoDTensor>()) {
-      return &var->Get<LoDTensor>();
-    }
-    PADDLE_ENFORCE(var->IsType<Tensor>(),
-                   "The Input(%s) must be LoDTensor or Tensor.");
-    return &var->Get<Tensor>();
-  }
-
   void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
                 size_t j = 0) const {
     PADDLE_ENFORCE_LT(i, InputSize(in));
@@ -340,6 +335,78 @@ class ExecutionContext : public InferShapeContext {
   const platform::DeviceContext& device_context_;
 };
 
+class RuntimeInferShapeContext : public InferShapeContextBase {
+ public:
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
+      : op_(op), scope_(scope) {}
+
+  bool HasInput(const std::string& name) const {
+    auto ipt = op_.Input(name);
+    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+    return var != nullptr;
+  }
+
+  bool HasOutput(const std::string& name) const {
+    auto ipt = op_.Output(name);
+    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+    return var != nullptr;
+  }
+
+  DDim GetInputDim(const std::string& name) const {
+    return GetDim(op_.Input(name));
+  }
+
+  void SetInputDim(const std::string& name, const DDim& dim) {
+    SetDim(op_.Input(name), dim);
+  }
+
+  DDim GetOutputDim(const std::string& name) const {
+    return GetDim(op_.Output(name));
+  }
+
+  void SetOutputDim(const std::string& name, const DDim& dim) {
+    SetDim(op_.Output(name), dim);
+  }
+
+  AttrReader Attrs() const { return AttrReader(op_.Attrs()); }
+
+  const std::vector<std::string>& Inputs(const std::string& name) const {
+    return op_.Inputs(name);
+  }
+
+  const std::vector<std::string>& Outputs(const std::string& name) const {
+    return op_.Outputs(name);
+  }
+
+ private:
+  template <bool Allocate>
+  Tensor* GetTensor(const std::string& name) const {
+    Tensor* t = nullptr;
+    auto* var = scope_.FindVar(name);
+    if (!var->IsType<LoDTensor>() && !var->IsType<Tensor>()) {
+      if (Allocate) {
+        t = var->GetMutable<LoDTensor>();
+      } else {
+        PADDLE_THROW("Variable(%s) should be tensor", name);
+      }
+    } else {
+      t = GetTensorFromVar(scope_.FindVar(name));
+    }
+    return t;
+  }
+
+  DDim GetDim(const std::string& name) const {
+    return GetTensor<false>(name)->dims();
+  }
+
+  void SetDim(const std::string& name, const DDim& dim) {
+    GetTensor<true>(name)->Resize(dim);
+  }
+
+  const OperatorBase& op_;
+  const Scope& scope_;
+};
+
 class OpKernel {
  public:
   /**
@@ -383,8 +450,10 @@ class OperatorWithKernel : public OperatorBase {
                      const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
+  // runtime infershape
   void InferShape(const Scope& scope) const override {
-    InferShape(InferShapeContext(*this, scope));
+    auto c = RuntimeInferShapeContext(*this, scope);
+    InferShape(&c);
   }
 
   void Run(const Scope& scope,
@@ -406,7 +475,7 @@ class OperatorWithKernel : public OperatorBase {
   }
 
  protected:
-  virtual void InferShape(const InferShapeContext& ctx) const = 0;
+  virtual void InferShape(InferShapeContextBase* ctx) const = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 0beab0fac5..8b4bb01a7b 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/framework/operator.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/op_registry.h"
 
 namespace paddle {
@@ -114,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
   using OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {}
+  void InferShape(framework::InferShapeContextBase* ctx) const override {}
 };
 
 template <typename T1, typename T2>
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
new file mode 100644
index 0000000000..b07fc78812
--- /dev/null
+++ b/paddle/framework/shape_inference.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/ddim.h"
+
+namespace paddle {
+namespace framework {
+
+class InferShapeContextBase {
+ public:
+  virtual ~InferShapeContextBase() {}
+  virtual bool HasInput(const std::string &name) const = 0;
+  virtual bool HasOutput(const std::string &name) const = 0;
+  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
+  std::vector<framework::DDim> GetInputsDim(const std::string &name) const {
+    const std::vector<std::string> &names = Inputs(name);
+    return GetDims(names);
+  }
+  virtual void SetInputDim(const std::string &name,
+                           const framework::DDim &dim) = 0;
+  void SetInputsDim(const std::string &name,
+                    const std::vector<framework::DDim> &dims) {
+    auto &names = Inputs(name);
+    SetDims(names, dims);
+  }
+  virtual framework::DDim GetOutputDim(const std::string &name) const = 0;
+  std::vector<framework::DDim> GetOutputsDim(const std::string &name) const {
+    const std::vector<std::string> &names = Outputs(name);
+    return GetDims(names);
+  }
+  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
+  void SetOutputsDim(const std::string &name,
+                     const std::vector<framework::DDim> &dims) {
+    auto &names = Outputs(name);
+    SetDims(names, dims);
+  }
+  virtual AttrReader Attrs() const = 0;
+  virtual const std::vector<std::string> &Inputs(
+      const std::string &name) const = 0;
+  virtual const std::vector<std::string> &Outputs(
+      const std::string &name) const = 0;
+  // TODO(qiao) implement this function
+  void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
+                size_t j = 0) const {}
+
+ protected:
+  virtual framework::DDim GetDim(const std::string &name) const = 0;
+  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
+  std::vector<framework::DDim> GetDims(
+      const std::vector<std::string> &names) const {
+    std::vector<framework::DDim> ret;
+    ret.reserve(names.size());
+    std::transform(
+        names.begin(), names.end(), std::back_inserter(ret),
+        [this](const std::string &name) { return this->GetDim(name); });
+    return ret;
+  }
+  void SetDims(const std::vector<std::string> &names,
+               const std::vector<framework::DDim> &dims) {
+    size_t length = names.size();
+    PADDLE_ENFORCE_EQ(length, dims.size());
+    for (size_t i = 0; i < length; ++i) {
+      SetDim(names[i], dims[i]);
+    }
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/variable.md b/paddle/framework/variable.md
index f44d5ea46e..442ef6b718 100644
--- a/paddle/framework/variable.md
+++ b/paddle/framework/variable.md
@@ -7,7 +7,7 @@ Variable is also known as *blob* in MxNet and Caffe2.  It is the input and outpu
 
 For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN.
 
-To use the minimum amount of memory, we'd like that a variable to allocate memory when it has to, or, lazy memory allocation.  Let's take the following example:
+To use the minimum amount of memory, we would like that a variable allocates memory only when it has to, or, lazy memory allocation.  Let's take the following example:
 
 ```cpp
 Variable vr, v1, v2;
@@ -38,7 +38,7 @@ This syntax for lazy memory allocation when we call `Randomize` and `Mult`, thos
 
 To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time.  In other words, `class Variable` cannot be a template `template <T> class Variable`.
 
-Because we don't know the type `T`, we cannot save a `T*` as `Variable's` data member.  Instead, we save an interface object `Placeholder`, who can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`.
+Because we don't know the type `T`, we cannot save a `T*` as `Variable's` data member.  Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`.
 
 But anyway, Variable needs to know `T` so could it `delete<T>(ptr)` and so could `Variable::Get` checks the expected type and the saved object's type.
 
@@ -49,4 +49,4 @@ Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the
 
 ## Conclusion
 
-The technique type hiding utilizes C++ class templates, interface and derivation, and C++ RTTI (typeid).  This combination saves us from definition something like `caffe2::TypeMata`, which takes hundreds of lines of C++ code.
+The technique type hiding utilizes C++ class templates, interface and derivation, and C++ RTTI (typeid).  This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code.
diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp
index ac50937ef3..18c5638100 100644
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
@@ -27,31 +27,53 @@ static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
 #define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
 
 /**
- * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
+ * @def BEGIN_MKLDNN_ACTIVATION
+ */
+#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS {
+/**
+ * @def END_MKLDNN_ACTIVATION
  */
-#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(ACT_TYPE, ALPHA, BWD_ALPHA)        \
-  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)                              \
-      : public MKLDNNEltwiseActivation {                                    \
-  private:                                                                  \
-    static const std::string name;                                          \
-    static const float alpha;                                               \
-    static const float bwdAlpha;                                            \
-                                                                            \
-  public:                                                                   \
-    const std::string& getName() const { return name; }                     \
-    float getAlpha() const { return alpha; }                                \
-    float getBwdAlpha() const { return bwdAlpha; }                          \
-  };                                                                        \
-  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name =          \
-      "mkldnn_" #ACT_TYPE;                                                  \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA;        \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA; \
-  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {              \
-    gMKLDNNActivationRegistrar                                              \
-        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(             \
-            "mkldnn_" #ACT_TYPE);                                           \
+#define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
+private:                                                           \
+  static const std::string name;                                   \
+                                                                   \
+public:                                                            \
+  const std::string& getName() const { return name; }              \
+  }                                                                \
+  ;                                                                \
+  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \
+      "mkldnn_" #ACT_TYPE;                                         \
+  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {     \
+    gMKLDNNActivationRegistrar                                     \
+        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(    \
+            "mkldnn_" #ACT_TYPE);                                  \
   });
 
+/**
+ * @def DEFINE_MKLDNN_ACTIVATION
+ */
+#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)        \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)
+
+/**
+ * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
+ */
+#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
+    ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
+private:                                                             \
+  static const float alpha;                                          \
+  static const float bwdAlpha;                                       \
+                                                                     \
+public:                                                              \
+  float getAlpha() const { return alpha; }                           \
+  float getBwdAlpha() const { return bwdAlpha; }                     \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA;
+
 /**
  * @brief MKLDNN Relu Activation.
  * Actually mkldnn_relu is Leaky Relu.
@@ -59,19 +81,129 @@ static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
  *  f(x) = negative_slope * x  (x <  0)
  * @note the negative_slope should be -0.f in forward
  */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, -0.f, 0.f)
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f)
 
 /**
  * @brief MKLDNN Tanh Activation.
  */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, 0.f, 0.f)
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f)
 
 /**
  * @brief MKLDNN ELU(Exponential Linear Unit) Activation.
  *  f(x) = x                              (x >= 0)
  *  f(x) = negative_slope * (exp(x) - 1)  (x <  0)
  */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, 0.f, 0.f)
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f)
+
+mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const {
+  const std::map<std::string, mkldnn::algorithm> algoMap = {
+      {"relu", algorithm::eltwise_relu},
+      {"tanh", algorithm::eltwise_tanh},
+      {"elu", algorithm::eltwise_elu}};
+  type.erase(0, 7);  // remove mkldnn_
+  algorithm algo = (algorithm)0;
+  mapGet(type, algoMap, &algo);
+  return algo;
+}
+
+void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);
+  // note: alpha represents the NegativeSlope when used in relu.
+  float alpha = getAlpha();
+  float beta = getBeta();
+  algorithm algo = getAlgo(this->getName());
+  auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
+                                   algo,
+                                   val_->getMemoryDesc(),
+                                   alpha,
+                                   beta);
+  fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_));
+  // use inplace for forward but save input value before submit
+  inVal_ = val_;
+  copyInVal_ = nullptr;
+  if (act.grad && algo == algorithm::eltwise_tanh) {
+    // tanh need save src input for backward
+    inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
+    copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
+    CHECK(copyInVal_) << "should not be emptry";
+    pipelineFwd_.push_back(*copyInVal_);
+  }
+  fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
+  pipelineFwd_.push_back(*fwd_);
+  needResetBwd_ = true;
+}
+
+void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
+  if (!needResetBwd_) {
+    return;
+  }
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+  needResetBwd_ = false;
+  algorithm algo = getAlgo(this->getName());
+  float alpha = getBwdAlpha();
+  float beta = getBeta();
+  grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
+  auto eng = CPUEngine::Instance().getEngine();
+  auto bwdDesc = eltwise_bwd::desc(
+      algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
+  auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
+  CHECK(inVal_);
+  bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
+  pipelineBwd_.clear();
+  pipelineBwd_.push_back(*bwd_);
+}
+
+/**
+ * @brief MKLDNN Softmax Activation
+ */
+DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation)
+
+void MKLDNNSoftmaxActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);
+  int axis = 1;
+  auto fwdDesc = softmax_fwd::desc(
+      mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis);
+  auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_);
+  fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_));
+  pipelineFwd_.push_back(*fwd_);
+}
+
+Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) {
+  resetFwd(act);
+  stream_->submit(pipelineFwd_);
+  real* v = act.value->getData();
+  real threshold = exp(-64);
+#pragma omp parallel for
+  for (size_t i = 0; i < act.value->getElementCnt(); ++i) {
+    v[i] = v[i] < threshold ? threshold : v[i];
+  }
+  return Error();
+}
+
+Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
+  MatrixPtr outputV = act.value;
+  MatrixPtr outputG = act.grad;
+  Matrix::resizeOrCreate(sftMaxDot_,
+                         outputG->getHeight(),
+                         outputG->getWidth(),
+                         /* trans */ false,
+                         /* useGpu */ false);
+  Matrix::resizeOrCreate(sftMaxSum_,
+                         outputG->getHeight(),
+                         1,
+                         /* trans */ false,
+                         /* useGpu */ false);
+  sftMaxDot_->dotMul(*outputG, *outputV);
+  sftMaxSum_->colMerge(*sftMaxDot_);
+  act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
+  return Error();
+}
 
 ActivationFunction* MKLDNNActivation::create(const std::string& type) {
   return gMKLDNNActivationRegistrar.createByType(type);
@@ -84,4 +216,34 @@ std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
   return types;
 }
 
+void MKLDNNActivation::resetFwd(Argument& act) {
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
+  cnt_ = act.value->getElementCnt();
+  pipelineFwd_.clear();
+  stream_.reset(new MKLDNNStream());
+  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+  val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
+  if (val_ == nullptr) {
+    int bs = act.getBatchSize();
+    int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
+    int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
+    int ic = cnt_ / bs / ih / iw;
+    CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
+    val_ = MKLDNNMatrix::create(
+        act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
+    CHECK(val_);
+    val_->downSpatial();
+  }
+}
+
+Error __must_check MKLDNNActivation::forward(Argument& act) {
+  resetFwd(act);
+  stream_->submit(pipelineFwd_);
+  return Error();
+}
+Error __must_check MKLDNNActivation::backward(Argument& act) {
+  resetBwd(act);
+  stream_->submit(pipelineBwd_);
+  return Error();
+}
 }  // namespace paddle
diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h
index 86ffe38736..dd16421fd6 100644
--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ b/paddle/gserver/activations/MKLDNNActivation.h
@@ -36,6 +36,7 @@ protected:
   // mkldnn matrix, primitive, stream and pipeline
   MKLDNNMatrixPtr val_;
   MKLDNNMatrixPtr grad_;
+  std::shared_ptr<mkldnn::engine> engine_;
   std::shared_ptr<MKLDNNStream> stream_;
   std::shared_ptr<mkldnn::primitive> fwd_;
   std::shared_ptr<mkldnn::primitive> bwd_;
@@ -48,8 +49,18 @@ public:
   static ActivationFunction* create(const std::string& type);
   static std::vector<std::string> getAllRegisteredTypes();
   virtual const std::string& getName() const = 0;
-  virtual Error __must_check forward(Argument& act) = 0;
-  virtual Error __must_check backward(Argument& act) = 0;
+  /**
+   * reset the forward primitives
+   */
+  virtual void resetFwd(Argument& act);
+  /**
+   * reset the backward primitives,
+   * can not merge this functions into resetFwd as the grad data
+   * would be changing before backward.
+   */
+  virtual void resetBwd(Argument& act) {}
+  virtual Error __must_check forward(Argument& act);
+  virtual Error __must_check backward(Argument& act);
 };
 
 /**
@@ -59,6 +70,7 @@ public:
 class MKLDNNEltwiseActivation : public MKLDNNActivation {
   typedef mkldnn::eltwise_forward eltwise_fwd;
   typedef mkldnn::eltwise_backward eltwise_bwd;
+  typedef mkldnn::algorithm algorithm;
 
 protected:
   // save the forward primitive desc, which can be used backward
@@ -70,9 +82,7 @@ protected:
 
 public:
   MKLDNNEltwiseActivation() {}
-
   ~MKLDNNEltwiseActivation() {}
-
   virtual const std::string& getName() const = 0;
 
   // in common, the alpha of forward and backward should be equal.
@@ -80,104 +90,30 @@ public:
   virtual float getAlpha() const = 0;
   virtual float getBwdAlpha() const = 0;
   virtual float getBeta() const { return 0.f; }
-  virtual mkldnn::algorithm getAlgo(const std::string& type) const {
-    if (type == "mkldnn_relu") {
-      return mkldnn::algorithm::eltwise_relu;
-    } else if (type == "mkldnn_tanh") {
-      return mkldnn::algorithm::eltwise_tanh;
-    } else if (type == "mkldnn_elu") {
-      return mkldnn::algorithm::eltwise_elu;
-    } else {
-      LOG(FATAL) << "Unkown eltwise activation type: " << type;
-    }
-    return (mkldnn::algorithm)0;
-  }
-
-  /**
-   * reshape and reset the forward primitives
-   */
-  void resetFwd(Argument& act) {
-    if (cnt_ == act.value->getElementCnt()) {
-      return;
-    }
-    cnt_ = act.value->getElementCnt();
-    stream_.reset(new MKLDNNStream());
-    auto eng = CPUEngine::Instance().getEngine();
-
-    // get algo setting
-    mkldnn::algorithm algo = getAlgo(this->getName());
-    // note: alpha represents the NegativeSlope when used in relu.
-    float alpha = getAlpha();
-    float beta = getBeta();
-
-    /// forward
-    pipelineFwd_.clear();
-    val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
-    if (val_ == nullptr) {
-      int bs = act.getBatchSize();
-      int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
-      int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
-      int ic = cnt_ / bs / ih / iw;
-      CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
-      val_ = MKLDNNMatrix::create(
-          act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, eng);
-      CHECK(val_);
-    }
-    auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
-                                     algo,
-                                     val_->getMemoryDesc(),
-                                     alpha,
-                                     beta);
-    fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, eng));
-    // use inplace for forward but save input value before submit
-    inVal_ = val_;
-    copyInVal_ = nullptr;
-    if (act.grad && algo == mkldnn::algorithm::eltwise_tanh) {
-      // tanh need save src input for backward
-      inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
-      copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
-      CHECK(copyInVal_) << "should not be emptry";
-      pipelineFwd_.push_back(*copyInVal_);
-    }
-    fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
-    pipelineFwd_.push_back(*fwd_);
-    needResetBwd_ = true;
-  }
+  virtual algorithm getAlgo(std::string type) const;
+  void resetFwd(Argument& act) override;
+  void resetBwd(Argument& act) override;
+};
 
-  /**
-   * reset the backward primitives, can not merge into resetFwd as the grad data
-   * would be changing before backward.
-   */
-  void resetBwd(Argument& act) {
-    if (!needResetBwd_) {
-      return;
-    }
-    needResetBwd_ = false;
-    mkldnn::algorithm algo = getAlgo(this->getName());
-    float alpha = getBwdAlpha();
-    float beta = getBeta();
-    grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
-    auto eng = CPUEngine::Instance().getEngine();
-    auto bwdDesc = eltwise_bwd::desc(
-        algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
-    auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
-    CHECK(inVal_);
-    bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
-    pipelineBwd_.clear();
-    pipelineBwd_.push_back(*bwd_);
-  }
+/**
+ * @brief Base class of MKLDNN softmax Activation,
+ * only have mkldnn forward, use cpu implement for backward.
+ */
+class MKLDNNSoftmaxActivation : public MKLDNNActivation {
+  typedef mkldnn::softmax_forward softmax_fwd;
 
-  Error __must_check forward(Argument& act) {
-    resetFwd(act);
-    stream_->submit(pipelineFwd_);
-    return Error();
-  }
+private:
+  // for backward
+  MatrixPtr sftMaxSum_;
+  MatrixPtr sftMaxDot_;
 
-  Error __must_check backward(Argument& act) {
-    resetBwd(act);
-    stream_->submit(pipelineBwd_);
-    return Error();
-  }
+public:
+  MKLDNNSoftmaxActivation() {}
+  ~MKLDNNSoftmaxActivation() {}
+  virtual const std::string& getName() const = 0;
+  void resetFwd(Argument& act) override;
+  Error __must_check forward(Argument& act) override;
+  Error __must_check backward(Argument& act) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 88b047c89b..9a0abd291a 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -64,7 +64,7 @@ bool MKLDNNConvLayer::init(const LayerMap& layerMap,
 
   // create biases
   if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
   }
   return true;
 }
@@ -251,22 +251,31 @@ void MKLDNNConvLayer::resetInValue(
   // create buffer and reorder if input value do not match
   cpuInVal_ = nullptr;
   cvtInVal_ = nullptr;
-  if (inputIsOnlyMKLDNN()) {
-    MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-    CHECK(dnnIn) << "Input should be MKLDNNMatrix";
-    if (dnnIn->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      CHECK_EQ(dnnIn->getFormat(), format::nc);
+
+  MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr);
+  if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
+    in = dnnIn;
+    return;
+  }
+  if (dnnIn) {
+    if (dnnIn->getFormat() == format::nc) {
       CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format";
       // create a new one with nchw format and same data
       memory::dims inDims = memory::dims{bs_, ic_, 1, 1};
       dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
-      CHECK(dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc());
     }
-    in = dnnIn;
+    if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
+      in = dnnIn;
+      return;
+    }
+    cpuInVal_ = dnnIn;
+    in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
+    cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
+    CHECK(cvtInVal_) << "should not be emptry";
   } else {
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
     memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-    cpuInVal_ = MKLDNNMatrix::create(cpuIn, inDims, format::nchw, engine_);
+    cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
     if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
       // create new mkldnn matrix
       in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
@@ -535,7 +544,7 @@ void MKLDNNConvLayer::resetWgtValBwdData(
   } else {
     wgtValBwdData_ = wgtVal_;
   }
-  VLOG(MKLDNN_FMTS) << "weight value format for backward data"
+  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
                     << wgtValBwdData_->getFormat();
 }
 
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index afd092666b..8cbfbd0d2b 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -49,7 +49,7 @@ bool MKLDNNFcLayer::init(const LayerMap& layerMap,
 
   // create biases
   if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
   }
   return true;
 }
@@ -161,9 +161,16 @@ void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
 
 void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
                                       MKLDNNMatrixPtr& bias) {
+  format wgtFmt = format::oihw;
+  if (inVal_->getFormat() == format::nChw8c) {
+    wgtFmt = format::oIhw8i;
+  } else if (inVal_->getFormat() == format::nChw16c) {
+    wgtFmt = format::oIhw16i;
+  }
   wgt = MKLDNNMatrix::create(
-      weight_->getW(), {oc_, ic_, ih_, iw_}, format::oihw, engine_);
+      weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
   wgt->downSpatial();
+  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
 
   bias = (biases_ && biases_->getW())
              ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index d8555a8331..c09fd89462 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -115,6 +115,7 @@ public:
       copySeqInfoToOutputs();
       size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
       if (inputElemenCnt_ != elemenCnt) {
+        VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
         // reset when input total sizes changed, not only the batchsize
         inputElemenCnt_ = elemenCnt;
         reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
@@ -142,6 +143,7 @@ public:
 
   void backward(const UpdateCallback& callback) override {
     if (needResetBwd_) {
+      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
       needResetBwd_ = false;
     }
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 1bfbbde424..857d07df3e 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -222,8 +222,8 @@ static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
 }
 
 void testActivation(std::string& actType, const testActDesc& pm) {
-  // TODO(TJ): mkldnn_softmax not implemented, paddle do not have elu activation
-  if (actType == "mkldnn_softmax" || actType == "mkldnn_elu") {
+  // TODO(TJ): remove me when paddle support elu activation
+  if (actType == "mkldnn_elu") {
     return;
   }
   const std::string compareTypes[] = {actType, actType.erase(0, 7)};
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index f8b0bce681..e56895c63a 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -88,10 +88,14 @@ add_subdirectory(math)
 
 set(DEPS_OPS
     recurrent_op
-    cond_op)
+    cond_op
+    cross_entropy_op
+    softmax_with_cross_entropy_op)
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
   DEPS framework_proto tensor net_op)
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
+op_library(cross_entropy_op DEPS cross_entropy_function)
+op_library(softmax_with_cross_entropy_op DEPS cross_entropy_function softmax_function)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index 70e4f9da12..82010bfb53 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -22,25 +22,23 @@ class AccuracyOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.InputVar("Inference"),
-        "Input(Inference) of AccuracyOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input(Label) of AccuracyOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Accuracy"),
-        "Output(Accuracy) of AccuracyOp should not be null.");
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Inference"),
+                   "Input(Inference) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input(Label) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Accuracy"),
+                   "Output(Accuracy) of AccuracyOp should not be null.");
 
-    auto *inference = ctx.Input<framework::Tensor>("Inference");
-    auto *label = ctx.Input<framework::Tensor>("Label");
+    auto inference_dim = ctx->GetInputDim("Inference");
+    auto label_dim = ctx->GetInputDim("Label");
 
-    PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label must be a vector");
-    PADDLE_ENFORCE_EQ(inference->dims()[0], label->dims()[0],
+    PADDLE_ENFORCE_EQ(label_dim.size(), 1, "label must be a vector");
+    PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0],
                       "inference size must be the same as label size");
 
-    ctx.Output<framework::Tensor>("Accuracy")->Resize({1});
-    ctx.ShareLoD("Inference", /*->*/ "Accuracy");
+    ctx->SetOutputDim("Accuracy", {1});
+    ctx->ShareLoD("Inference", /*->*/ "Accuracy");
   }
 };
 
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index 0a6a0fd15c..75e8a98903 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -69,8 +69,12 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
       return;
     }
 
-    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS>>>(
-        num_samples, infer_width, inference_data, label_data, accuracy_data);
+    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
+        1, PADDLE_CUDA_NUM_THREADS, 0,
+        reinterpret_cast<const platform::CUDADeviceContext&>(
+            ctx.device_context())
+            .stream()>>>(num_samples, infer_width, inference_data, label_data,
+                         accuracy_data);
   }
 };
 
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 06654702bc..f77e1c572e 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -22,10 +22,9 @@ class ActivationOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<framework::Tensor>("Y")->Resize(
-        ctx.Input<framework::Tensor>("X")->dims());
-    ctx.ShareLoD("X", /*->*/ "Y");
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Y");
   }
 };
 
@@ -34,9 +33,8 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<framework::Tensor>(framework::GradVarName("X"))
-        ->Resize(ctx.Input<framework::Tensor>("Y")->dims());
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Y"));
   }
 };
 
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
index ed11d09697..3914d13230 100644
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -22,25 +22,23 @@ class AddOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of AddOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                            "Input(Y) of AddOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of AddOp should not be null.");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of AddOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of AddOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of AddOp should not be null.");
 
-    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
-                      ctx.Input<Tensor>("Y")->dims(),
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(x_dims, y_dims,
                       "Two input of Add Op's dimension must be same.");
-    ctx.Output<framework::Tensor>("Out")->Resize(
-        ctx.Input<Tensor>("X")->dims());
+    ctx->SetOutputDim("Out", x_dims);
   }
 };
 
 class AddOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AddOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of add op");
     AddInput("Y", "The second input of add op");
@@ -58,7 +56,7 @@ class AddOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {}
+  void InferShape(framework::InferShapeContextBase* ctx) const override {}
 };
 
 }  // namespace operators
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index e5a54bc4b2..316d28f174 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -22,24 +22,24 @@ class ClipOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of ClipOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of ClipOp should not be null.");
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto max = Attr<float>("max");
-    auto min = Attr<float>("min");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
     PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
-    ctx.Output<Tensor>("Out")->Resize(x_dims);
-    ctx.ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
 template <typename AttrType>
 class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ClipOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ClipOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "(Tensor)The input of clip op."
@@ -61,14 +61,13 @@ class ClipOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    if (x_grad != nullptr) {
-      x_grad->Resize(x_dims);
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
     }
   }
 };
diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
index 07f847079e..01cbfc33ef 100644
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -24,31 +24,30 @@ class ConcatOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of ConcatOp should not be null.");
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ConcatOp should not be null.");
 
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::Tensor>("Out");
-    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
+    auto ins = ctx->GetInputsDim("X");
+    size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
     size_t n = ins.size();
 
     PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
 
-    auto out_dims = ins[0]->dims();
+    auto out_dims = ins[0];
     size_t in_zero_dims_size = out_dims.size();
     for (size_t i = 1; i < n; i++) {
       for (size_t j = 0; j < in_zero_dims_size; j++) {
         if (j == axis) {
-          out_dims[axis] += ins[i]->dims()[j];
+          out_dims[axis] += ins[i][j];
           continue;
         }
-        PADDLE_ENFORCE_EQ(out_dims[j], ins[i]->dims()[j],
+        PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
                           "Input tensors should have the same "
                           "elements except the specify axis.")
       }
     }
-    out->Resize(out_dims);
+    ctx->SetOutputDim("Out", out_dims);
   }
 };
 
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index 8262a7a5c8..1d44782b21 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -215,7 +215,7 @@ class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Sample dependent Cond Operator:
 Given Cond[i] as a 1/0 vector to indicate true/false
-The equation is: 
+The equation is:
 Out[i] = subnet_t[i], if Cond[i] == true
 Out[i] = subnet_t[i], if Cond[i] == false
 )DOC");
diff --git a/paddle/operators/conv2d_op.cc b/paddle/operators/conv2d_op.cc
index c3281db096..5cc82944bb 100644
--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
@@ -27,27 +27,25 @@ class Conv2DOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Input"),
-                            "Input(Input) of Conv2DOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Filter"),
-                            "Input(Filter) of Conv2DOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Output"),
-                            "Output(Output) of Conv2DOp should not be null.");
-
-    auto in = ctx.Input<Tensor>("Input");
-    auto filter = ctx.Input<Tensor>("Filter");
-    auto out = ctx.Output<framework::Tensor>("Output");
-    std::vector<int> strides = Attr<std::vector<int>>("strides");
-    std::vector<int> paddings = Attr<std::vector<int>>("paddings");
-    int groups = Attr<int>("groups");
-    int input_channels = in->dims()[1];
-    int output_channels = filter->dims()[0];
-
-    PADDLE_ENFORCE_EQ(in->dims().size(), 4, "Conv2DOp input should be 4-D.");
-    PADDLE_ENFORCE_EQ(filter->dims().size(), 4,
-                      "Conv2DOp filter should be 4-D.");
-    PADDLE_ENFORCE_EQ(input_channels, filter->dims()[1] * groups,
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of Conv2DOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                   "Input(Filter) of Conv2DOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output(Output) of Conv2DOp should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    auto filter_dims = ctx->GetInputDim("Filter");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    int groups = ctx->Attrs().Get<int>("groups");
+    int input_channels = in_dims[1];
+    int output_channels = filter_dims[0];
+
+    PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
+    PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
+    PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
                       "The number of input channels should be equal to filter "
                       "channels * groups.");
     PADDLE_ENFORCE_EQ(
@@ -55,17 +53,17 @@ class Conv2DOp : public framework::OperatorWithKernel {
         "The number of output channels should be divided by groups.");
 
     auto output_height =
-        outputSize(in->dims()[2], filter->dims()[2], paddings[0], strides[0]);
+        outputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
     auto output_width =
-        outputSize(in->dims()[3], filter->dims()[3], paddings[1], strides[1]);
-    out->Resize(
-        {in->dims()[0], filter->dims()[0], output_height, output_width});
+        outputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
+    ctx->SetOutputDim(
+        "Output", {in_dims[0], filter_dims[0], output_height, output_width});
   }
 };
 
 class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv2DOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  Conv2DOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "Input",
@@ -108,14 +106,15 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto in = ctx.Input<Tensor>("Input");
-    auto filter = ctx.Input<Tensor>("Filter");
-    auto d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
-    auto d_filter =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Filter"));
-    if (d_in) d_in->Resize(in->dims());
-    if (d_filter) d_filter->Resize(filter->dims());
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    auto in_dims = ctx->GetInputDim("Input");
+    auto filter_dims = ctx->GetInputDim("Filter");
+    if (ctx->HasOutput(framework::GradVarName("Input"))) {
+      ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+    }
+    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+      ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+    }
   }
 };
 
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc
index b56ee2047b..040546f1a6 100644
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -24,22 +24,22 @@ class CosSimOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
     // notnull check
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of CosSimOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                            "Input(Y) of CosSimOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of CosSimOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("XNorm"),
-                            "Output(XNorm) of CosSimOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("YNorm"),
-                            "Output(YNorm) of CosSimOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of CosSimOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of CosSimOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of CosSimOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("XNorm"),
+                   "Output(XNorm) of CosSimOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("YNorm"),
+                   "Output(YNorm) of CosSimOp should not be null.");
 
     // shape check
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
 
     PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
                       "Ranks of Input(X) and Input(Y) must be equal.");
@@ -54,16 +54,16 @@ class CosSimOp : public framework::OperatorWithKernel {
                    " just 1 (which will be broadcasted to match Input(X)).");
 
     // resize tensor
-    ctx.Output<framework::Tensor>("Out")->Resize({x_dims[0], 1});
-    ctx.Output<framework::Tensor>("XNorm")->Resize({x_dims[0], 1});
-    ctx.Output<framework::Tensor>("YNorm")->Resize({y_dims[0], 1});
-    ctx.ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
+    ctx->SetOutputDim("XNorm", {x_dims[0], 1});
+    ctx->SetOutputDim("YNorm", {y_dims[0], 1});
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
 class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CosSimOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  CosSimOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The 1st input of cos_sim op.");
     AddInput("Y", "The 2nd input of cos_sim op.");
@@ -98,27 +98,23 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
     // notnull check
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("XNorm"),
-                            "Input(XNorm) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("YNorm"),
-                            "Input(YNorm) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Out"),
-                            "Input(Out) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("XNorm"), "Input(XNorm) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("YNorm"), "Input(YNorm) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) must not be null.");
 
     // shape check
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto y_dims = ctx.Input<Tensor>("Y")->dims();
-    auto xnorm_dims = ctx.Input<Tensor>("XNorm")->dims();
-    auto ynorm_dims = ctx.Input<Tensor>("YNorm")->dims();
-    auto out_dims = ctx.Input<Tensor>("Out")->dims();
-    auto out_grad_dims =
-        ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto xnorm_dims = ctx->GetInputDim("XNorm");
+    auto ynorm_dims = ctx->GetInputDim("YNorm");
+    auto out_dims = ctx->GetInputDim("Out");
+    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
                       "Ranks of Input(X) and Input(Y) must be equal.");
@@ -143,10 +139,14 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
                       "Shape of Input(Out@Grad) must be [X.Dim(0), 1].");
 
     // resize tensor
-    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto *y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    if (x_grad) x_grad->Resize(x_dims);
-    if (y_grad) y_grad->Resize(y_dims);
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
   }
 };
 
diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc
index 52a1123348..9b2305e90e 100644
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -25,16 +25,14 @@ class CropOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of CropOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of CropOp should not be null.");
-    auto x_dim = ctx.Input<Tensor>("X")->dims();
-    auto *y = ctx.Input<Tensor>("Y");
-    auto *out = ctx.Output<Tensor>("Out");
-    if (y == nullptr) {
-      auto shape = Attr<std::vector<int>>("shape");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of CropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of CropOp should not be null.");
+    auto x_dim = ctx->GetInputDim("X");
+    if (!ctx->HasInput("Y")) {
+      auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
       PADDLE_ENFORCE_EQ(
           int64_t(shape.size()), x_dim.size(),
           "Shape size should be equal to dimention size of input tensor.");
@@ -42,19 +40,20 @@ class CropOp : public framework::OperatorWithKernel {
       for (size_t i = 0; i < shape.size(); ++i) {
         tensor_shape[i] = static_cast<int64_t>(shape[i]);
       }
-      out->Resize(framework::make_ddim(tensor_shape));
+      ctx->SetOutputDim("Out", framework::make_ddim(tensor_shape));
     } else {
-      PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(y->dims()),
+      auto y_dim = ctx->GetInputDim("Y");
+      PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(y_dim),
                         "Tensor rank of both CropOp's "
                         "inputs must be same.");
-      out->Resize(y->dims());
+      ctx->SetOutputDim("Out", y_dim);
     }
   }
 };
 
 class CropOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CropOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  CropOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input of pad op. "
@@ -78,12 +77,12 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
 Crop Operator.
 Crop input into output, as specified by offsets and shape.
 
-There are two ways to set shape: 
+There are two ways to set shape:
 1. referenc input: crop input X as shape as reference input.
-                    The dimension of reference input should 
+                    The dimension of reference input should
                     be as same as input X.
 2. shape list: crop input X by shape described by a list<int>.
-               The size of shape list should be as same as 
+               The size of shape list should be as same as
                dimension size of  input X.
 
 The input should be a k-D tensor(k > 0 and k < 7). As an example:
@@ -94,15 +93,15 @@ Given:
          [0, 3, 4, 0, 0]
          [0, 0, 0, 0, 0]]
 
-and 
+and
 
     offsets = [0, 1]
 
 and
- 
+
     shape = [2, 2]
 
-then we get 
+then we get
 
     Out = [[1, 2],
            [3, 4]]
@@ -116,14 +115,14 @@ class CropOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    if (x_grad != nullptr) {
-      x_grad->Resize(x_dims);
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }
 };
diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h
index 2f40c05903..ac3aeaf41e 100644
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
@@ -38,10 +38,10 @@ class CropKernel : public framework::OpKernel {
     auto out_stride = framework::stride(out->dims());
     auto offsets = context.Attr<std::vector<int>>("offsets");
     PADDLE_ENFORCE_EQ(
-        x->dims().size(), offsets.size(),
+        x->dims().size(), static_cast<int64_t>(offsets.size()),
         "Offsets size should be equal to dimension size of input tensor.");
     int64_t offset = 0;
-    for (int i = 0; i < offsets.size(); ++i) {
+    for (size_t i = 0; i < offsets.size(); ++i) {
       offset += (x_stride[i] * offsets[i]);
     }
     StridedMemcpy<T>(context.device_context(), x_data + offset, x_stride,
@@ -57,7 +57,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
     d_x->mutable_data<T>(context.GetPlace());
     auto offsets = context.Attr<std::vector<int>>("offsets");
     Eigen::array<std::pair<int, int>, D> paddings;
-    for (int i = 0; i < D; ++i) {
+    for (size_t i = 0; i < D; ++i) {
       paddings[i].first = offsets[i];
       paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i];
     }
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index b11dc1472d..26fc9b51c4 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -22,32 +22,30 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input(Label) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) must not be null.");
-
-    auto x = ctx.Input<Tensor>("X");
-    auto label = ctx.Input<Tensor>("Label");
-    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(label->dims().size(), 2,
-                      "Input(Label)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
-                      "The 1st dimension of Input(X) and Input(Label) must "
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
-    if (ctx.Attr<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "If Attr(soft_label) == true, The 2nd dimension of "
-                        "Input(X) and Input(Label) must be equal.");
+    if (ctx->Attrs().Get<bool>("softLabel")) {
+      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+                        "If Attr(softLabel) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
     } else {
-      PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "If Attr(soft_label) == false, The 2nd dimension of "
-                        "Input(Label) must be 1.");
+      PADDLE_ENFORCE_EQ(label_dims[1], 1,
+                        "If Attr(softLabel) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
     }
 
-    ctx.Output<Tensor>("Y")->Resize({x->dims()[0], 1});
-    ctx.ShareLoD("X", /*->*/ "Y");
+    ctx->SetOutputDim("Y", {x_dims[0], 1});
+    ctx->ShareLoD("X", /*->*/ "Y");
   }
 };
 
@@ -56,66 +54,79 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input(Label) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
-                            "Input(Y@GRAD) must not be null.");
-
-    auto x = ctx.Input<Tensor>("X");
-    auto label = ctx.Input<Tensor>("Label");
-    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "Input(X)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(dy->dims().size(), 2, "Input(Y@Grad)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(label->dims().size(), 2,
-                      "Input(Label)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
-                      "The 1st dimension of Input(X) and Input(Label) must "
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) shoudl be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
-    PADDLE_ENFORCE_EQ(x->dims()[0], dy->dims()[0],
-                      "The 1st dimension of Input(X) and Input(Y@Grad) must "
+    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
+                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
                       "be equal.");
-    PADDLE_ENFORCE_EQ(dy->dims()[1], 1,
-                      "The 2nd dimension of Input(Y@Grad) must be 1.");
-    if (ctx.Attr<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "If Attr(soft_label) == true, The 2nd dimension of "
-                        "Input(X) and Input(Label) must be equal.");
+    PADDLE_ENFORCE_EQ(dy_dims[1], 1,
+                      "The 2nd dimension of Input(Y@Grad) should be 1.");
+    if (ctx->Attrs().Get<bool>("softLabel")) {
+      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+                        "When Attr(softLabel) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
     } else {
-      PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "If Attr(soft_label) == false, The 2nd dimension of "
-                        "Input(Label) must be 1.");
+      PADDLE_ENFORCE_EQ(label_dims[1], 1,
+                        "When Attr(softLabel) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
     }
-
-    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    dx->Resize(x->dims());
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
 };
 
 class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CrossEntropyOpMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  CrossEntropyOpMaker(framework::OpProto* proto,
+                      framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of CrossEntropyOp");
-    AddInput("Label", "The second input of CrossEntropyOp");
-    AddOutput("Y", "The output of CrossEntropyOp");
-    AddAttr<bool>("soft_label", "Is soft label. Default zero.")
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
+             "where N is the batch size and D is the number of classes. "
+             "This input is a probability computed by the previous operator, "
+             "which is almost always the result of a softmax operator.");
+    AddInput(
+        "Label",
+        "(Tensor, default Tensor<int>), the ground truth which is "
+        "a 2-D tensor. "
+        "When softLabel is set to false, `Label` is a Tensor<int> with shape "
+        "[N x 1]. "
+        "When softLabel is set to true, `Label` is a Tensor<float/double> "
+        "with shape [N x K].");
+    AddOutput("Y",
+              "(Tensor, default Tensor<float>), a 2-D tensor "
+              "with shape [N x 1]. The cross entropy loss.");
+    AddAttr<bool>(
+        "softLabel",
+        "(bool, default false), a flag to indicate whether to interpretate "
+        "the given labels as soft labels.")
         .SetDefault(false);
-
     AddComment(R"DOC(
 CrossEntropy Operator.
 
 It supports both standard cross-entropy and soft-label cross-entropy loss
 computation.
 1) One-hot cross-entropy:
-    soft_label = False, Label[i, 0] indicates the class index for sample i:
+    softLabel = false, Label[i, 0] indicates the class index for sample i:
 
                 Y[i] = -log(X[i, Label[i]])
 
 2) Soft-label cross-entropy:
-    soft_label = True, Label[i, j] indicates the soft label of class j
+    softLabel = true, Label[i, j] indicates the soft label of class j
     for sample i:
 
                 Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index 1d6361a814..1cfeb7a53b 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -12,42 +12,15 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/cross_entropy_op.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
 
+namespace {
+// TODO(qingqing): make zero setting a common function.
 template <typename T>
-__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
-                                   const int N, const int D) {
-  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
-  // CUDA_1D_KERNEL_LOOP(i, N) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
-    Y[i] = -tolerable_value(log(X[i * D + label[i]]));
-  }
-}
-
-template <typename T>
-__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
-                                       const int N, const int D) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    T sum = static_cast<T>(0);
-    for (int j = 0; j < D; j++) {
-      sum += label[i * D + j] * tolerable_value(log(X[i * D + j]));
-    }
-    Y[i] = -sum;
-  }
-}
-
-// TODO(qingqing): make zero setting an common function.
-template <typename T>
-__global__ void zero(T* X, const int N) {
+__global__ void Zero(T* X, const int N) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     X[i] = 0.0;
@@ -71,45 +44,27 @@ template <typename T>
 __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
                                                const T* label, const int N,
                                                const int D) {
-  // TOOD(qingqing): optimize for this kernel
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    for (int j = 0; j < D; ++j) {
-      int idx = i * D + j;
-      dX[idx] = -label[idx] * dY[i] / X[idx];
-    }
+  int ids = blockIdx.x * blockDim.x + threadIdx.x;
+  if (ids < N * D) {
+    int row_ids = ids / D;
+    dX[ids] = -label[ids] * dY[row_ids] / X[ids];
   }
 }
+}  // namespace
 
 template <typename T>
 class CrossEntropyOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
-
-    auto x = ctx.Input<Tensor>("X");
-    auto y = ctx.Output<Tensor>("Y");
-    auto label = ctx.Input<Tensor>("Label");
-
-    auto* x_data = x->data<T>();
+                   "This kernel only runs on GPU device.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
     y->mutable_data<T>(ctx.GetPlace());
-    auto* y_data = y->data<T>();
 
-    int n = x->dims()[0];
-    int d = x->dims()[1];
-    int block = 512;
-    int grid = (n + block - 1) / block;
-    // TODO(qingqing) launch kernel on specified stream
-    // base on ExecutionContext.
-    if (ctx.Attr<bool>("soft_label")) {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      SoftCrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n,
-                                                 d);
-    } else {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
-      CrossEntropyKernel<T><<<grid, block>>>(y_data, x_data, label_data, n, d);
-    }
+    math::CrossEntropyFunctor<platform::GPUPlace, T>()(
+        ctx, y, x, label, ctx.Attr<bool>("softLabel"));
   }
 };
 
@@ -118,33 +73,44 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use GPUPlace.");
+                   "This kernel only runs on GPU device.");
 
-    auto x = ctx.Input<Tensor>("X");
-    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("Label");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(ctx.GetPlace());
 
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* dy_data = dy->data<T>();
-    auto* x_data = x->data<T>();
+    const T* dy_data =
+        ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    const T* x_data = x->data<T>();
+
+    int batch_size = x->dims()[0];
+    int class_num = x->dims()[1];
 
-    int n = x->dims()[0];
-    int d = x->dims()[1];
     int block = 512;
-    int grid = (n * d + block - 1) / block;
-    zero<T><<<grid, block>>>(dx_data, n * d);
-    grid = (n + block - 1) / block;
-    // TODO(qingqing): launch kernel on specified stream
-    // base on ExecutionContext.
-    if (ctx.Attr<bool>("soft_label")) {
+    int grid = (batch_size * class_num + block - 1) / block;
+
+    if (ctx.Attr<bool>("softLabel")) {
       auto* label_data = label->data<T>();
-      SoftCrossEntropyGradientKernel<T><<<grid, block>>>(
-          dx_data, dy_data, x_data, label_data, n, d);
+      SoftCrossEntropyGradientKernel<T><<<
+          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(dx_data, dy_data, x_data, label_data,
+                                           batch_size, class_num);
     } else {
+      Zero<T><<<grid, block, 0,
+                reinterpret_cast<const platform::CUDADeviceContext&>(
+                    ctx.device_context())
+                    .stream()>>>(dx_data, batch_size * class_num);
+
       auto* label_data = label->data<int>();
-      CrossEntropyGradientKernel<T><<<grid, block>>>(dx_data, dy_data, x_data,
-                                                     label_data, n, d);
+      grid = (batch_size + block - 1) / block;
+      CrossEntropyGradientKernel<T><<<
+          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(dx_data, dy_data, x_data, label_data,
+                                           batch_size, class_num);
     }
   }
 };
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index 69caba5ff3..1f67461d3f 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -13,62 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/platform/hostdevice.h"
+#include "paddle/operators/math/cross_entropy.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-
-template <typename T>
-HOSTDEVICE T tolerable_value(const T x) {
-  PADDLE_ASSERT(std::is_floating_point<T>::value);
-  const T kApproInf = 1e20;
-  if (x == INFINITY) {
-    return kApproInf;
-  }
-  if (x == -INFINITY) {
-    return -kApproInf;
-  }
-  return x;
-}
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
 class CrossEntropyOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto x = ctx.Input<Tensor>("X");
-    auto y = ctx.Output<Tensor>("Y");
-
-    auto* x_data = x->data<T>();
+                   "This kernel only runs on CPU.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
     y->mutable_data<T>(ctx.GetPlace());
-    auto* y_data = y->data<T>();
 
-    int batch_size = x->dims()[0];
-    int class_num = x->dims()[1];
-
-    if (ctx.Attr<bool>("soft_label")) {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      int index = 0;
-      for (int i = 0; i < batch_size; ++i) {
-        T sum = static_cast<T>(0);
-        for (int j = 0; j < class_num; ++j) {
-          sum += label_data[index] * tolerable_value(std::log(x_data[index]));
-          y_data[i] = -sum;
-          index++;
-        }
-      }
-    } else {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<int>();
-      for (int i = 0; i < batch_size; ++i) {
-        int index = i * class_num + label_data[i];
-        y_data[i] = -tolerable_value(std::log(x_data[index]));
-      }
-    }
+    math::CrossEntropyFunctor<platform::CPUPlace, T>()(
+        ctx, y, x, labels, ctx.Attr<bool>("softLabel"));
   }
 };
 
@@ -77,33 +46,32 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto x = ctx.Input<Tensor>("X");
-    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("Label");
-
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* dy_data = dy->data<T>();
-    auto* x_data = x->data<T>();
+                   "This kernel only runs on CPU.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
 
-    int batch_size = x->dims()[0];
     int class_num = x->dims()[1];
-
-    // TODO(qingqing): make zero setting an common function.
-    if (ctx.Attr<bool>("soft_label")) {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      int index = 0;
-      for (int i = 0; i < batch_size; ++i) {
-        for (int j = 0; j < class_num; ++j) {
-          dx_data[index] = -label_data[index] * dy_data[i] / x_data[index];
-          index++;
-        }
-      }
+    if (ctx.Attr<bool>("softLabel")) {
+      auto x_mat = EigenMatrix<T>::From(*x);
+      auto dy_mat = EigenMatrix<T>::From(*dy);
+      auto lbl_mat = EigenMatrix<T>::From(*label);
+      auto dx_mat = EigenMatrix<T>::From(*dx);
+
+      dx_mat.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+          -(lbl_mat * dy_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) /
+            x_mat);
     } else {
-      auto* label_data = label->data<int>();
+      int batch_size = x->dims()[0];
+      const T* dy_data = dy->data<T>();
+      const T* x_data = x->data<T>();
+      const int* label_data = label->data<int>();
+
+      // TODO(qingqing): make zero setting a common function.
       memset(dx_data, 0, sizeof(T) * batch_size * class_num);
+
       for (int i = 0; i < batch_size; ++i) {
         PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
         int index = i * class_num + label_data[i];
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index 2130eda6a4..a669b5cf00 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -24,25 +24,25 @@ class DropoutOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_GE(ctx.Attr<float>("dropout_prob"), 0);
-    PADDLE_ENFORCE_LE(ctx.Attr<float>("dropout_prob"), 1);
-
-    auto dims = ctx.Input<Tensor>("X")->dims();
-    ctx.Output<Tensor>("Out")->Resize(dims);
-    if (ctx.Attr<bool>("is_training")) {
-      ctx.Output<Tensor>("Mask")->Resize(dims);
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
+    PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
+
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    if (ctx->Attrs().Get<bool>("is_training") == 1) {
+      ctx->SetOutputDim("Mask", x_dims);
     }
-    ctx.ShareLoD("X", /*->*/ "Out");
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
 template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DropoutOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  DropoutOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddAttr<AttrType>("dropout_prob", "Probability of setting units to zero.")
         .SetDefault(.5f);
@@ -70,27 +70,26 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.Attr<bool>("is_training"),
-                   "GradOp is only callable when is_training is true");
-
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Mask"), "Mask must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) must not be null.");
-
-    PADDLE_ENFORCE_GE(ctx.Attr<AttrType>("dropout_prob"), 0);
-    PADDLE_ENFORCE_LE(ctx.Attr<AttrType>("dropout_prob"), 1);
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_training"), 1,
+                      "GradOp is only callable when is_training is true");
+
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) must not be null.");
+
+    PADDLE_ENFORCE_GE(ctx->Attrs().Get<AttrType>("dropout_prob"), 0);
+    PADDLE_ENFORCE_LE(ctx->Attrs().Get<AttrType>("dropout_prob"), 1);
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
     PADDLE_ENFORCE_EQ(x_dims, out_dims,
                       "Dimensions of Input(X) and Out@Grad must be the same.");
-    auto mask_dims = ctx.Input<Tensor>("Mask")->dims();
+    auto mask_dims = ctx->GetInputDim("Mask");
     PADDLE_ENFORCE_EQ(x_dims, mask_dims,
                       "Dimensions of Input(X) and Mask must be the same.");
 
-    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    x_grad->Resize(x_dims);
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
 };
 
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
index f224722c1b..c4777a00d6 100644
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -202,21 +202,20 @@ class ElementwiseOp : public framework::OperatorWithKernel {
 
  protected:
   using Tensor = framework::Tensor;
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of elementwise op should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                            "Input(Y) of elementwise op should not be null");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Out"),
-        "Output(Out) of elementwise op should not be null.");
-
-    auto x_dim = ctx.Input<Tensor>("X")->dims();
-    auto y_dim = ctx.Input<Tensor>("Y")->dims();
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of elementwise op should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of elementwise op should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of elementwise op should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto y_dim = ctx->GetInputDim("Y");
     PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
                       "Rank of first input must >= rank of second input.")
-    ctx.Output<framework::Tensor>("Out")->Resize(x_dim);
-    ctx.ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", x_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -234,7 +233,7 @@ must be small or equal to X's dimensions.
 )DOC");
     AddAttr<int>("axis",
                  R"DOC(
-When the shape(Y) does not equal the shape(X),Y will be broadcasted 
+When the shape(Y) does not equal the shape(X),Y will be broadcasted
 to match the shape of X and axis should be dimension index Y in X
         )DOC")
         .SetDefault(-1)
@@ -244,7 +243,7 @@ to match the shape of X and axis should be dimension index Y in X
     comment_ = R"DOC(
 Limited elementwise {name} operator.The equation is: Out = {equation}.
 1. The shape of Y should be same with X or
-2. Y's shape is a subset of X. 
+2. Y's shape is a subset of X.
    Y will be broadcasted to match the shape of X and axis should be dimension index Y in X.
 
    example:
@@ -284,27 +283,26 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
   using Tensor = framework::Tensor;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto y_dims = ctx.Input<Tensor>("Y")->dims();
-    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
                       "Rank of first input must >= rank of second input.")
 
-    if (x_grad) {
-      x_grad->Resize(x_dims);
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
     }
-
-    if (y_grad) {
-      y_grad->Resize(y_dims);
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
     }
   }
 };
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
index 761a527a55..e164de6584 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -22,15 +22,13 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of FillZerosLikeOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
-                            "Output(Y) of FillZerosLikeOp should not be null.");
-
-    ctx.Output<framework::Tensor>("Y")->Resize(
-        ctx.Input<framework::Tensor>("X")->dims());
-    ctx.ShareLoD("X", /*->*/ "Y");
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FillZerosLikeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of FillZerosLikeOp should not be null.");
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Y");
   }
 };
 
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index fecd1ce214..0e3cd174ad 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -23,19 +23,19 @@ class GatherOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of GatherOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Index"),
-                            "Input(Index) of GatherOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of GatherOp should not be null.");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of GatherOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Index"),
+                   "Input(Index) of GatherOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of GatherOp should not be null.");
 
-    int batch_size = ctx.Input<Tensor>("Index")->dims()[0];
+    int batch_size = ctx->GetInputDim("Index")[0];
     PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
-    framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
+    framework::DDim output_dims(ctx->GetInputDim("X"));
     output_dims[0] = batch_size;
-    ctx.Output<framework::Tensor>("Out")->Resize(output_dims);
+    ctx->SetOutputDim("Out", output_dims);
   }
 };
 
@@ -44,23 +44,20 @@ class GatherGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto X_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto X = ctx.Input<Tensor>("X");
-
-    X_grad->Resize(X->dims());
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
 
 class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GatherOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  GatherOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The source input of gather op");
     AddInput("Index", "The index input of gather op");
     AddOutput("Out", "The output of add op");
     AddComment(R"DOC(
-Gather Operator by selecting from the first axis, 
+Gather Operator by selecting from the first axis,
 
 Out = X[Index]
 )DOC");
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 5b7cbb5cc7..05120a6e7b 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -43,13 +43,10 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Out"),
-        "Output(Out) of GaussianRandomOp should not be null.");
-
-    auto* tensor = ctx.Output<framework::Tensor>("Out");
-    auto dims = Attr<std::vector<int>>("dims");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of GaussianRandomOp should not be null.");
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dims");
     std::vector<int64_t> temp;
     temp.reserve(dims.size());
     for (auto dim : dims) {
@@ -57,7 +54,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
     }
     PADDLE_ENFORCE(dims.size() > 0UL,
                    "dims can be one int or array. dims must be set.");
-    tensor->Resize(framework::make_ddim(temp));
+    ctx->SetOutputDim("Out", framework::make_ddim(temp));
   }
 };
 
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
index 04ac24662e..9b1314bfba 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -22,27 +22,26 @@ class LookupTableOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("W"),
-                            "Input(W) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Ids"),
-                            "Input(Ids) of LookupTableOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of LookupTableOp should not be null.");
-
-    auto table_t = ctx.Input<Tensor>("W");
-    auto ids_t = ctx.Input<Tensor>("Ids");
-    auto output_t = ctx.Output<framework::Tensor>("Out");
-
-    output_t->Resize({ids_t->dims()[0], table_t->dims()[1]});
-    ctx.ShareLoD("Ids", /*->*/ "Out");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("W"),
+                   "Input(W) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                   "Input(Ids) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LookupTableOp should not be null.");
+
+    auto table_dims = ctx->GetInputDim("W");
+    auto ids_dims = ctx->GetInputDim("Ids");
+
+    ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
+    ctx->ShareLoD("Ids", /*->*/ "Out");
   }
 };
 
 class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LookupTableOpMaker(framework::OpProto *proto,
-                     framework::OpAttrChecker *op_checker)
+  LookupTableOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("W",
              "An input represents embedding tensors,"
@@ -66,11 +65,9 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &context) const override {
-    auto table = context.Input<Tensor>("W");
-    auto d_table =
-        context.Output<framework::Tensor>(framework::GradVarName("W"));
-    d_table->Resize(table->dims());
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    auto table_dims = ctx->GetInputDim("W");
+    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
   }
 };
 
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index 7083440467..62f63b4f3c 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -77,7 +77,10 @@ class LookupTableCUDAKernel : public framework::OpKernel {
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTable<T, 128, 8, 8><<<grids, threads>>>(output, table, ids, N, K, D);
+    LookupTable<T, 128, 8, 8><<<
+        grids, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                               context.device_context())
+                               .stream()>>>(output, table, ids, N, K, D);
   }
 };
 
@@ -102,8 +105,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel {
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTableGrad<T, 128, 8, 8><<<grids, threads>>>(d_table, d_output, ids, N,
-                                                      K, D);
+    LookupTableGrad<T, 128, 8, 8><<<
+        grids, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                               context.device_context())
+                               .stream()>>>(d_table, d_output, ids, N, K, D);
   }
 };
 
diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
new file mode 100644
index 0000000000..bd75b001cb
--- /dev/null
+++ b/paddle/operators/lstm_unit_op.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lstm_unit_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LstmUnitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("C_prev"),
+                   "Input(C_prev) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("C"),
+                   "Output(C) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("H"),
+                   "Output(H) of LSTM should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto c_prev_dims = ctx->GetInputDim("C_prev");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE(x_dims[0] == c_prev_dims[0],
+                   "Batch size of inputs and states must be equal");
+    PADDLE_ENFORCE(x_dims[1] == c_prev_dims[1] * 4,
+                   "Dimension of FC should equal to prev state * 4");
+
+    int b_size = c_prev_dims[0];  // batch size
+    int s_dim = c_prev_dims[1];   // state dim
+    ctx->SetOutputDim("C", {b_size, s_dim});
+    ctx->SetOutputDim("H", {b_size, s_dim});
+  }
+};
+
+template <typename AttrType>
+class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LstmUnitOpMaker(framework::OpProto* proto,
+                  framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "FC input before the non-linear activation.");
+    AddInput(
+        "C_prev",
+        "The cell state tensor of last time-step in the Lstm Unit operator.");
+    AddOutput("C", "The cell tensor of Lstm Unit operator.");
+    AddOutput("H", "The hidden state tensor of Lstm Unit operator.");
+
+    AddComment(R"DOC(Lstm-Unit Operator
+
+Equation:
+  i, f, o, j = split(X)
+  C = C_prev * sigm(f + forget_bias) + sigm(i) * tanh(j)
+  H = C * sigm(o)
+
+)DOC");
+    AddAttr<AttrType>("forget_bias", "The forget bias of Lstm Unit.")
+        .SetDefault(0.0);
+  }
+};
+
+class LstmUnitGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")),
+                   "Input(C@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")),
+                   "Input(H@GRAD) should not be null");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("C_prev"),
+                      ctx->GetInputDim("C_prev"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker<float>,
+            lstm_unit_grad, ops::LstmUnitGradOp);
+REGISTER_OP_CPU_KERNEL(lstm_unit,
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu
new file mode 100644
index 0000000000..6e5e497899
--- /dev/null
+++ b/paddle/operators/lstm_unit_op.cu
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename Dtype>
+__device__ Dtype cuda_sigmoid(const Dtype x) {
+  return Dtype(1) / (Dtype(1) + exp(-x));
+}
+
+template <typename Dtype>
+__device__ Dtype cuda_tanh(const Dtype x) {
+  return Dtype(1 - exp(-2. * x)) / (Dtype(1) + exp(-2. * x));
+}
+
+template <typename T>
+__global__ void LSTMUnitKernel(const int nthreads, const int dim,
+                               const T* C_prev, const T* X, T* C, T* H,
+                               const T forget_bias) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    const int n = index / dim;
+    const int d = index % dim;
+
+    const T* X_offset = X + 4 * dim * n;
+    const T i = cuda_sigmoid(X_offset[d]);
+    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
+    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
+    const T g = cuda_tanh(X_offset[3 * dim + d]);
+    const T c_prev = C_prev[index];
+    const T c = f * c_prev + i * g;
+    C[index] = c;
+    const T tanh_c = cuda_tanh(c);
+    H[index] = o * tanh_c;
+  }
+}
+
+template <typename T>
+__global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
+                                       const T* C_prev, const T* X, const T* C,
+                                       const T* H, const T* C_diff,
+                                       const T* H_diff, T* C_prev_diff,
+                                       T* X_diff, const T forget_bias) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    const int n = index / dim;
+    const int d = index % dim;
+    const T* X_offset = X + 4 * dim * n;
+    T* c_prev_diff = C_prev_diff + index;
+    T* X_diff_offset = X_diff + 4 * dim * n;
+    T* i_diff = X_diff_offset + d;
+    T* f_diff = X_diff_offset + 1 * dim + d;
+    T* o_diff = X_diff_offset + 2 * dim + d;
+    T* g_diff = X_diff_offset + 3 * dim + d;
+
+    const T i = cuda_sigmoid(X_offset[d]);
+    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
+    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
+    const T g = cuda_tanh(X_offset[3 * dim + d]);
+    const T c_prev = C_prev[index];
+    const T c = C[index];
+    const T tanh_c = cuda_tanh(c);
+    const T c_term_diff =
+        C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c);
+    *c_prev_diff = c_term_diff * f;
+    *i_diff = c_term_diff * g * i * (1 - i);
+    *f_diff = c_term_diff * c_prev * f * (1 - f);
+    *o_diff = H_diff[index] * tanh_c * o * (1 - o);
+    *g_diff = c_term_diff * i * (1 - g * g);
+  }
+}
+
+template <typename T, typename AttrType = T>
+class LstmUnitOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    auto* x_tensor = ctx.Input<framework::Tensor>("X");
+    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
+    auto* c_tensor = ctx.Output<framework::Tensor>("C");
+    auto* h_tensor = ctx.Output<framework::Tensor>("H");
+
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+
+    int b_size = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    const T* X = x_tensor->data<T>();
+    const T* C_prev = c_prev_tensor->data<T>();
+
+    T* C = c_tensor->mutable_data<T>(ctx.GetPlace());
+    T* H = h_tensor->mutable_data<T>(ctx.GetPlace());
+
+    int block = 512;
+    int n = b_size * D;
+    int grid = (n + block - 1) / block;
+
+    LSTMUnitKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, forget_bias);
+  }
+};
+
+template <typename T, typename AttrType = T>
+class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    auto x_tensor = ctx.Input<Tensor>("X");
+    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
+    auto c_tensor = ctx.Input<Tensor>("C");
+    auto h_tensor = ctx.Input<Tensor>("H");
+
+    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
+    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
+
+    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto c_prev_diff_tensor =
+        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
+
+    auto* X = x_tensor->data<T>();
+    auto* C_prev = c_prev_tensor->data<T>();
+    auto* C = c_tensor->data<T>();
+    auto* H = h_tensor->data<T>();
+
+    auto* H_diff = hdiff_tensor->data<T>();
+    auto* C_diff = cdiff_tensor->data<T>();
+
+    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
+    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
+
+    int N = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+
+    int block = 512;
+    int n = N * D;
+    int grid = (n + block - 1) / block;
+
+    LSTMUnitGradientKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, C_diff,
+                                               H_diff, C_prev_diff, X_diff,
+                                               forget_bias);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>);
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h
new file mode 100644
index 0000000000..683034fe15
--- /dev/null
+++ b/paddle/operators/lstm_unit_op.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::LoDTensor;
+using framework::Tensor;
+
+template <typename T>
+inline T sigmoid(T x) {
+  return 1. / (1. + exp(-x));
+}
+
+template <typename T>
+inline T tanh(T x) {
+  return 2. * sigmoid(2. * x) - 1.;
+}
+
+template <typename Place, typename T, typename AttrType = T>
+class LstmUnitKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto* x_tensor = ctx.Input<framework::Tensor>("X");
+    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
+    auto* c_tensor = ctx.Output<framework::Tensor>("C");
+    auto* h_tensor = ctx.Output<framework::Tensor>("H");
+
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+
+    int b_size = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    T* C = c_tensor->mutable_data<T>(ctx.GetPlace());
+    T* H = h_tensor->mutable_data<T>(ctx.GetPlace());
+
+    const T* X = x_tensor->data<T>();
+    const T* C_prev = c_prev_tensor->data<T>();
+
+    for (int n = 0; n < b_size; ++n) {
+      for (int d = 0; d < D; ++d) {
+        const T i = sigmoid(X[d]);
+        const T f = sigmoid(X[1 * D + d] + forget_bias);
+        const T o = sigmoid(X[2 * D + d]);
+        const T g = tanh(X[3 * D + d]);
+        const T c_prev = C_prev[d];
+        const T c = f * c_prev + i * g;
+        C[d] = c;
+        const T tanh_c = tanh(c);
+        H[d] = o * tanh_c;
+      }
+      C_prev += D;
+      X += 4 * D;
+      C += D;
+      H += D;
+    }
+  }
+};
+
+template <typename Place, typename T, typename AttrType = T>
+class LstmUnitGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto x_tensor = ctx.Input<Tensor>("X");
+    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
+    auto c_tensor = ctx.Input<Tensor>("C");
+    auto h_tensor = ctx.Input<Tensor>("H");
+
+    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
+    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
+
+    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto c_prev_diff_tensor =
+        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
+
+    auto* X = x_tensor->data<T>();
+    auto* C_prev = c_prev_tensor->data<T>();
+    auto* C = c_tensor->data<T>();
+    auto* H = h_tensor->data<T>();
+
+    auto* H_diff = hdiff_tensor->data<T>();
+    auto* C_diff = cdiff_tensor->data<T>();
+
+    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
+    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
+
+    int N = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+
+    for (int n = 0; n < N; ++n) {
+      for (int d = 0; d < D; ++d) {
+        T* c_prev_diff = C_prev_diff + d;
+        T* i_diff = X_diff + d;
+        T* f_diff = X_diff + 1 * D + d;
+        T* o_diff = X_diff + 2 * D + d;
+        T* g_diff = X_diff + 3 * D + d;
+
+        const T i = sigmoid(X[d]);
+        const T f = sigmoid(X[1 * D + d] + forget_bias);
+        const T o = sigmoid(X[2 * D + d]);
+        const T g = tanh(X[3 * D + d]);
+        const T c_prev = C_prev[d];
+        const T c = C[d];
+        const T tanh_c = tanh(c);
+        const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c);
+        *c_prev_diff = c_term_diff * f;
+        *i_diff = c_term_diff * g * i * (1 - i);
+        *f_diff = c_term_diff * c_prev * f * (1 - f);
+        *o_diff = H_diff[d] * tanh_c * o * (1 - o);
+        *g_diff = c_term_diff * i * (1 - g * g);
+      }
+      C_prev += D;
+      X += 4 * D;
+      C += D;
+      H += D;
+      C_diff += D;
+      H_diff += D;
+      X_diff += 4 * D;
+      C_prev_diff += D;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index f8333f34f7..91ae3d49f1 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,9 +1,15 @@
-
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc 
-    im2col.cu DEPS cblas device_context)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc
+      im2col.cu DEPS cblas device_context operator)
+    nv_library(softmax_function SRCS softmax.cc softmax.cu
+      DEPS operator)
+    nv_library(cross_entropy_function SRCS cross_entropy.cc cross_entropy.cu
+      DEPS operator)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context)
+    cc_library(math_function SRCS math_function.cc im2col.cc
+      DEPS cblas device_context operator)
+    cc_library(softmax_function SRCS softmax.cc DEPS operator)
+    cc_library(cross_entropy_function SRCS cross_entropy.cc DEPS operator)
 endif()
 
 nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc
new file mode 100644
index 0000000000..a5a426bc7b
--- /dev/null
+++ b/paddle/operators/math/cross_entropy.cc
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/math/cross_entropy.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+class CrossEntropyFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const framework::ExecutionContext& ctx,
+                  framework::Tensor* out, const framework::Tensor* prob,
+                  const framework::Tensor* labels, const bool softLabel) {
+    const int batch_size = prob->dims()[0];
+    if (softLabel) {
+      auto in = EigenMatrix<T>::From(*prob);
+      auto lbl = EigenMatrix<T>::From(*labels);
+      auto loss = EigenMatrix<T>::From(*out);
+
+      loss.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+          -((lbl * in.log().unaryExpr(math::TolerableValue<T>()))
+                .sum(Eigen::DSizes<int, 1>(1))
+                .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
+    } else {
+      const int class_num = prob->dims()[1];
+      const T* prob_data = prob->data<T>();
+      T* loss_data = out->data<T>();
+
+      const int* label_data = labels->data<int>();
+      for (int i = 0; i < batch_size; ++i) {
+        int index = i * class_num + label_data[i];
+        loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));
+      }
+    }
+  }
+};
+
+template class CrossEntropyFunctor<platform::CPUPlace, float>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu
new file mode 100644
index 0000000000..d14a75a30c
--- /dev/null
+++ b/paddle/operators/math/cross_entropy.cu
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/math/cross_entropy.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+namespace {
+template <typename T>
+__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
+                                   const int N, const int D) {
+  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
+  // CUDA_1D_KERNEL_LOOP(i, N) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
+    Y[i] = -math::TolerableValue<T>()(log(X[i * D + label[i]]));
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ T sum_single_warp(T val) {
+  val += __shfl_down(val, 16);
+  val += __shfl_down(val, 8);
+  val += __shfl_down(val, 4);
+  val += __shfl_down(val, 2);
+  val += __shfl_down(val, 1);
+  return val;
+}
+
+template <typename T>
+__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
+                                       const int class_num) {
+  int tid = threadIdx.x;
+  extern __shared__ T d_sum[];
+  d_sum[tid] = 0;
+
+  int cur_idx = tid;
+  int next_idx = blockIdx.x * class_num + tid;
+  while (cur_idx < class_num) {
+    d_sum[tid] +=
+        math::TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx];
+    next_idx += blockDim.x;
+    cur_idx += blockDim.x;
+  }
+  __syncthreads();
+
+  for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) {
+    if (tid < stride) d_sum[tid] += d_sum[tid + stride];
+    __syncthreads();
+  }
+
+  T val = d_sum[tid];
+  val = sum_single_warp<T>(val);
+  if (tid == 0) Y[blockIdx.x] = -val;
+}
+}  // namespace
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class CrossEntropyFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const framework::ExecutionContext& ctx,
+                  framework::Tensor* out, const framework::Tensor* prob,
+                  const framework::Tensor* labels, bool softLabel) {
+    const T* prob_data = prob->data<T>();
+    T* loss_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int batch_size = prob->dims()[0];
+    int class_num = prob->dims()[1];
+
+    if (softLabel) {
+      const T* label_data = labels->data<T>();
+      int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num)));
+
+      SoftCrossEntropyKernel<
+          T><<<batch_size, block, block * sizeof(T),
+               reinterpret_cast<const platform::CUDADeviceContext&>(
+                   ctx.device_context())
+                   .stream()>>>(loss_data, prob_data, label_data, class_num);
+    } else {
+      const int* label_data = labels->data<int>();
+      int block = 512;
+      int grid = (batch_size + block - 1) / block;
+      CrossEntropyKernel<T><<<
+          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(loss_data, prob_data, label_data,
+                                           batch_size, class_num);
+    }
+  }
+};
+
+template class CrossEntropyFunctor<platform::GPUPlace, float>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h
new file mode 100644
index 0000000000..18e637cf91
--- /dev/null
+++ b/paddle/operators/math/cross_entropy.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct TolerableValue {
+  HOSTDEVICE T operator()(const T& x) const {
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
+    const T kApproInf = 1e20;
+
+    if (x == INFINITY) return kApproInf;
+    if (x == -INFINITY) return -kApproInf;
+    return x;
+  }
+};
+
+template <typename Place, typename T>
+class CrossEntropyFunctor {
+ public:
+  // (TODO caoying) it is much better to use DeviceContext as the first
+  // parameter.
+  void operator()(const framework::ExecutionContext& context,
+                  framework::Tensor* out, const framework::Tensor* prob,
+                  const framework::Tensor* labels, const bool softLabel);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index def4b01da0..ba653afa2c 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -48,6 +48,32 @@ void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
               beta, C, ldc);
 }
 
+template <>
+void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
+                                     const bool transA, const bool transB,
+                                     const int M, const int N, const int K,
+                                     const float alpha, const float* A,
+                                     const int lda, const float* B,
+                                     const int ldb, const float beta, float* C,
+                                     const int ldc) {
+  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+              lda, B, ldb, beta, C, ldc);
+}
+
+template <>
+void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
+                                      const bool transA, const bool transB,
+                                      const int M, const int N, const int K,
+                                      const double alpha, const double* A,
+                                      const int lda, const double* B,
+                                      const int ldb, const double beta,
+                                      double* C, const int ldc) {
+  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+              lda, B, ldb, beta, C, ldc);
+}
+
 template <>
 void matmul<platform::CPUPlace, float>(
     const platform::DeviceContext& context, const framework::Tensor& matrix_a,
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 71563b77b4..649f1f352c 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -63,6 +63,42 @@ void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
       cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 
+template <>
+void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
+                                     const bool transA, const bool transB,
+                                     const int M, const int N, const int K,
+                                     const float alpha, const float* A,
+                                     const int lda, const float* B,
+                                     const int ldb, const float beta, float* C,
+                                     const int ldc) {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
+      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));
+}
+
+template <>
+void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
+                                      const bool transA, const bool transB,
+                                      const int M, const int N, const int K,
+                                      const double alpha, const double* A,
+                                      const int lda, const double* B,
+                                      const int ldb, const double beta,
+                                      double* C, const int ldc) {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
+      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));
+}
+
 template <>
 void matmul<platform::GPUPlace, float>(
     const platform::DeviceContext& context, const framework::Tensor& matrix_a,
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index d8518e77fa..43306fca73 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -70,6 +70,13 @@ void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
           const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
           const T alpha, const T* A, const T* B, const T beta, T* C);
 
+// gemm wrapper with stride args for matrix uncontinuous in memory
+template <typename Place, typename T>
+void gemm(const platform::DeviceContext& context, const bool transA,
+          const bool transB, const int M, const int N, const int K,
+          const T alpha, const T* A, const int lda, const T* B, const int ldb,
+          const T beta, T* C, const int ldc);
+
 // matrix multiply with continuous memory
 template <typename Place, typename T>
 void matmul(const platform::DeviceContext& context,
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 7e339457f7..f272f7e513 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -72,4 +72,174 @@ TEST(math_function, trans_mul_notrans) {
   EXPECT_EQ(out_ptr[8], 29);
   delete gpu_place;
 }
+
+TEST(math_function, gemm_notrans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
+  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+
+  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+
+  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
+
+  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+
+  // numpy code:
+  // a = np.arange(6).reshape(2, 3)
+  // b = np.arange(12).reshape(3, 4)[:, 1:]
+  // c = np.arange(8).reshape(2, 4)[:, 1:]
+  // out = np.arange(8).reshape(2, 4)
+  // out[:, 1:] = np.dot(a, b) + c
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  delete gpu_place;
+}
+
+TEST(math_function, gemm_trans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
+  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  auto* gpu_place = new paddle::platform::GPUPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+
+  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+
+  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
+
+  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  delete gpu_place;
+}
 #endif
+
+TEST(math_function, gemm_notrans_cblas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
+  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::gemm<paddle::platform::CPUPlace, float>(
+      context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1,
+      input3_ptr + 1, 4);
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+}
+
+TEST(math_function, gemm_trans_clbas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
+  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::gemm<paddle::platform::CPUPlace, float>(
+      context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1,
+      input3_ptr + 1, 4);
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+}
diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc
new file mode 100644
index 0000000000..1224c05810
--- /dev/null
+++ b/paddle/operators/math/softmax.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class SoftmaxFunctor<platform::GPUPlace, float>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu
new file mode 100644
index 0000000000..4c3df0550e
--- /dev/null
+++ b/paddle/operators/math/softmax.cu
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class SoftmaxFunctor<platform::GPUPlace, float>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h
new file mode 100644
index 0000000000..3d2f0d0aec
--- /dev/null
+++ b/paddle/operators/math/softmax.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+struct ValueClip {
+  HOSTDEVICE T operator()(const T& x) const {
+    const T kThreshold = -64.;
+    return x < kThreshold ? kThreshold : x;
+  }
+};
+
+template <typename Place, typename T>
+class SoftmaxFunctor {
+ public:
+  void operator()(const framework::ExecutionContext& context,
+                  const framework::Tensor* X, framework::Tensor* Y) {
+    auto logits = EigenMatrix<T>::From(*X);
+    auto softmax = EigenMatrix<T>::From(*Y);
+
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
+    auto shifted_logits = (logits -
+                           logits.maximum(along_class)
+                               .eval()
+                               .reshape(batch_by_one)
+                               .broadcast(one_by_class))
+                              .unaryExpr(ValueClip<T>());
+
+    softmax.device(context.GetEigenDevice<Place>()) = shifted_logits.exp();
+    softmax.device(context.GetEigenDevice<Place>()) =
+        (softmax *
+         softmax.sum(along_class)
+             .inverse()
+             .eval()
+             .reshape(batch_by_one)
+             .broadcast(one_by_class));
+  }
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index b04384bda8..d799239d4e 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -22,18 +22,18 @@ class MeanOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of MeanOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of MeanOp should not be null.");
-    ctx.Output<framework::Tensor>("Out")->Resize({1});
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MeanOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MeanOp should not be null.");
+    ctx->SetOutputDim("Out", {1});
   }
 };
 
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MeanOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
     AddOutput("Out", "The output of mean op").NotInGradient();
@@ -47,9 +47,8 @@ class MeanGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    ctx.Output<framework::Tensor>(framework::GradVarName("X"))
-        ->Resize(ctx.Input<Tensor>("X")->dims());
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
 
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index 29cb85489b..ce049d4d7b 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -26,22 +26,22 @@ class MinusOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of MinusOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                            "Input(Y) of MinusOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of MinusOp should not be null.");
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MinusOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of MinusOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MinusOp should not be null.");
 
-    auto *left_tensor = ctx.Input<framework::Tensor>("X");
-    auto *right_tensor = ctx.Input<framework::Tensor>("Y");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
 
     PADDLE_ENFORCE_EQ(
-        left_tensor->numel(), right_tensor->numel(),
+        x_dims, y_dims,
         "Minus operator must take two tensor with same num of elements");
-    ctx.Output<framework::Tensor>("Out")->Resize(left_tensor->dims());
-    ctx.ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
index 8606c0d1e1..84212a2b3b 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -22,20 +22,19 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& context) const override {
-    PADDLE_ENFORCE_NOT_NULL(context.InputVar("X"), "X must be initialized.");
-    PADDLE_ENFORCE_NOT_NULL(context.InputVar("Y"), "Y must be initialized.");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
 
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Y");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
 
-    PADDLE_ENFORCE_EQ(x->dims(), y->dims(),
-                      "The shape of X and Y must be the same.");
-    PADDLE_ENFORCE_EQ(x->dims().size(), 2, "The tensor rank of X must be 2.");
-    PADDLE_ENFORCE_EQ(x->dims()[1], 1, "The 2nd dimension of X must be 1.");
+    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The tensor rank of X must be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1.");
 
-    context.Output<framework::Tensor>("IntermediateVal")->Resize(x->dims());
-    context.Output<framework::Tensor>("Out")->Resize({x->dims()[0], 1});
+    ctx->SetOutputDim("IntermediateVal", x_dims);
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
   }
 };
 
@@ -75,27 +74,28 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Input<Tensor>("Y");
-    auto* intermediate_val = context.Input<Tensor>("IntermediateVal");
-    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    PADDLE_ENFORCE_NOT_NULL(x, "X must be initialized.");
-    PADDLE_ENFORCE_NOT_NULL(y, "Y must be initialized.");
-    PADDLE_ENFORCE_NOT_NULL(intermediate_val,
-                            "Intermediate value must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(out_grad, "Input(Out@Grad) must not be null.");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("IntermediateVal"),
+                   "Intermediate value must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@Grad) must not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
+    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     PADDLE_ENFORCE_EQ(
-        intermediate_val->dims(), x->dims(),
+        intermediate_dims, x_dims,
         "The shape of X and intermediate value must be the same.");
-    PADDLE_ENFORCE_EQ(out_grad->dims(), x->dims(),
+    PADDLE_ENFORCE_EQ(out_grad_dims, x_dims,
                       "The shape of Input(Out@Grad) and X must be the same.");
 
-    if (x_grad) x_grad->Resize(x->dims());
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    }
   }
 };
 
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 7047718a3f..9858c4d9c2 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -24,27 +24,23 @@ class MulOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of MulOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
-                            "Input(Y) of MulOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of MulOp should not be null.");
-
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto y_dims = ctx.Input<Tensor>("Y")->dims();
-    int x_num_col_dims = Attr<int>("x_num_col_dims");
-    int y_num_col_dims = Attr<int>("y_num_col_dims");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MulOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
+    int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");
 
     PADDLE_ENFORCE(x_dims.size() > x_num_col_dims,
-                   "The rank of input tensor X(%s) should be larger than "
-                   "`mul_op`'s `x_num_col_dims`.",
-                   ctx.op().Input("X"));
+                   "The rank of input tensor X should be larger than "
+                   "`mul_op`'s `x_num_col_dims`.");
     PADDLE_ENFORCE(y_dims.size() > y_num_col_dims,
-                   "The rank of input tensor Y(%s) should be larger than "
-                   "`mul_op`'s `y_num_col_dims`.",
-                   ctx.op().Input("Y"));
+                   "The rank of input tensor Y should be larger than "
+                   "`mul_op`'s `y_num_col_dims`.");
 
     auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
     auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
@@ -52,24 +48,23 @@ class MulOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         x_mat_dims[1], y_mat_dims[0],
         "First matrix's width must be equal with second matrix's height.");
-    ctx.Output<framework::Tensor>("Out")->Resize(
-        {x_mat_dims[0], y_mat_dims[1]});
-    ctx.ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", {x_mat_dims[0], y_mat_dims[1]});
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of mul op");
     AddInput("Y", "The second input of mul op");
     AddOutput("Out", "The output of mul op");
     AddAttr<int>(
         "x_num_col_dims",
-        R"DOC(mul_op can take tensors with more than two dimensions as input `X`, 
-            in that case, tensors will be reshaped to a matrix. The matrix's first 
-            dimension(column length) will be the product of tensor's last 
+        R"DOC(mul_op can take tensors with more than two dimensions as input `X`,
+            in that case, tensors will be reshaped to a matrix. The matrix's first
+            dimension(column length) will be the product of tensor's last
             `num_col_dims` dimensions, and the matrix's second dimension(row length)
             will be the product of tensor's first `rank - num_col_dims` dimensions.
         )DOC")
@@ -100,16 +95,14 @@ class MulOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto y_dims = ctx.Input<Tensor>("Y")->dims();
-    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto *y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     auto x_mat_dims =
         framework::flatten_to_2d(x_dims, Attr<int>("x_num_col_dims"));
@@ -125,8 +118,15 @@ class MulOpGrad : public framework::OperatorWithKernel {
         "The second dimension of Out@GRAD must equal to the second "
         "dimension of the second operand.");
 
-    if (x_grad) x_grad->Resize(x_dims);
-    if (y_grad) y_grad->Resize(y_dims);
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
   }
 };
 
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 6e77b86b56..9896d269cc 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -18,61 +18,64 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
 
 class MultiplexOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(!ctx.MultiInputVar("X").empty(),
-                   "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) shouldn't be null.");
-    auto ins = ctx.MultiInput<Tensor>("X");
-    auto *out = ctx.Output<LoDTensor>("Out");
-    auto num_ins = ins.size();
-    PADDLE_ENFORCE(num_ins > 2,
-                   "multiplex operator should have more than 2 inputs.");
-    PADDLE_ENFORCE_EQ(ins[0]->dims().size(), 1,
-                      "The first input must be a index vector.");
-    auto in_dim = ins[1]->dims();
-
-    for (size_t i = 2; i < num_ins; i++) {
-      auto dim = ins[i]->dims();
-      PADDLE_ENFORCE(
-          in_dim == dim,
-          "All the input tensors except the first one must have the same size");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) shouldn't be null.");
+    PADDLE_ENFORCE(!ctx->Inputs("X").empty(),
+                   "MultiInput(X) shouldn't be empty.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null.");
+    auto ids_dim = ctx->GetInputDim("Ids");
+    PADDLE_ENFORCE(
+        ids_dim.size() == 2 && ids_dim[1] == 1,
+        "The index tensor must be a vector with size batchSize x 1.");
+
+    auto ins_dims = ctx->GetInputsDim("X");
+    auto num_ins = ins_dims.size();
+    PADDLE_ENFORCE(num_ins > 1,
+                   "multiplex operator should have more than "
+                   "one candidate input tensors.");
+
+    auto in_dim = ins_dims[0];
+    PADDLE_ENFORCE(in_dim.size() >= 2,
+                   "The rank of candidate tensors must be not less than 2.");
+    for (size_t i = 1; i < num_ins; i++) {
+      auto dim = ins_dims[i];
+      PADDLE_ENFORCE(in_dim == dim,
+                     "All the candidate tensors must have the same size.");
     }
-    out->Resize(in_dim);
+    ctx->SetOutputDim("Out", in_dim);
   }
 };
 
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MultiplexOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  MultiplexOpMaker(framework::OpProto* proto,
+                   framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensors of multiplex operator.").AsDuplicable();
+    AddInput("Ids", "The index tensor of multiplex operator.");
+    AddInput("X", "The candidate tensors of multiplex operator.")
+        .AsDuplicable();
     AddOutput("Out", "The output tensor of multiplex operator.");
     AddComment(R"DOC(Multiplex operator
 
-Multiplex multiple tensors according to the index provided by the first
-input tensor.
+Multiplex multiple tensors according to the index provided by the index tensor.
 
-ins[0]: the index tensor.
-ins[1:N]: the candidate output tensors.
+Ids: the index tensor.
+X[0 : N - 1]: the candidate tensors for output (N >= 2).
 For each index i from 0 to batchSize - 1, the output is the i-th row of the
-the (index[i] + 1)-th tensor.
+the (Ids[i])-th tensor.
 
 For i-th row of the output tensor:
 
-y[i][j] = x_{k}[i][j], j = 0,1, ... , (x_{1}.width - 1)
+y[i] = x_{k}[i]
 
 where y is the output tensor. `x_{k}` is the k-th input tensor
-and `k = x{0}[i] + 1`.
-
+and `k = Ids[i]`.
 )DOC");
   }
 };
@@ -82,21 +85,19 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(!ctx.MultiInputVar("X").empty(),
-                   "Input(X) should not be null");
-    PADDLE_ENFORCE(!ctx.MultiOutputVar(framework::GradVarName("X")).empty(),
-                   "Output(X@Grad) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) shouldn't be null.");
-    auto d_ins = ctx.MultiOutput<LoDTensor>(framework::GradVarName("X"));
-    auto ins = ctx.MultiInput<Tensor>("X");
-    // don't compute gradient for index (ins[0])
-    for (size_t i = 1; i < ins.size(); i++) {
-      if (d_ins[i]) {
-        d_ins[i]->Resize(ins[i]->dims());
-      }
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null.");
+    PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(),
+                   "Output(X@Grad) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    std::vector<framework::DDim> d_ins;
+    auto ins = ctx->GetInputsDim("X");
+    // No need to compute gradient for Input(Ids)
+    for (size_t i = 0; i < ins.size(); i++) {
+      d_ins.push_back(ins[i]);
     }
+    ctx->SetOutputsDim(framework::GradVarName("X"), d_ins);
   }
 };
 
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 4736f15bd5..70e46815fc 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -18,27 +18,30 @@
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 template <typename Place, typename T>
 class MultiplexGPUKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto* ids = ctx.Input<Tensor>("Ids");
+    auto* out = ctx.Output<Tensor>("Out");
     out->mutable_data<T>(ctx.GetPlace());
 
-    auto rows = ins[1]->dims()[0];
-    auto cols = ins[1]->dims()[1];
+    auto rows = ins[0]->dims()[0];
+    auto cols = ins[0]->numel() / rows;
     // copy index to cpu
-    framework::Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
-    auto* index = index_t_cpu.data<T>();
+    Tensor index_t_cpu;
+    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace());
+    auto* index = index_t_cpu.data<int32_t>();
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                       ctx.device_context())
                       .stream();
     Place place = boost::get<Place>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
+      int32_t k = index[i];
+      PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
       PADDLE_ENFORCE_LT(k, ins.size(),
                         "index exceeds the number of candidate tensors.");
       memory::Copy(place, out->data<T>() + i * cols, place,
@@ -51,11 +54,11 @@ template <typename Place, typename T>
 class MultiplexGradGPUKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto d_ins =
-        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
-    for (size_t i = 1; i < d_ins.size(); i++) {
+    auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto* ids = ctx.Input<Tensor>("Ids");
+    auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
+    for (size_t i = 0; i < d_ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->mutable_data<T>(ctx.GetPlace());
         auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
@@ -63,19 +66,19 @@ class MultiplexGradGPUKernel : public framework::OpKernel {
       }
     }
 
-    auto rows = ins[1]->dims()[0];
-    auto cols = ins[1]->dims()[1];
+    auto rows = ins[0]->dims()[0];
+    auto cols = ins[0]->numel() / rows;
     // copy index to cpu
-    framework::Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
-    auto* index = index_t_cpu.data<T>();
+    Tensor index_t_cpu;
+    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace());
+    auto* index = index_t_cpu.data<int32_t>();
 
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                       ctx.device_context())
                       .stream();
     Place place = boost::get<Place>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
+      size_t k = static_cast<size_t>(index[i]);
       if (d_ins[k]) {
         memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
                      d_out->data<T>() + i * cols, cols * sizeof(T), stream);
diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h
index 44e8e0c199..637c63a34a 100644
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
@@ -27,17 +27,19 @@ class MultiplexCPUKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    auto ids = ctx.Input<framework::Tensor>("Ids");
+    auto* out = ctx.Output<framework::Tensor>("Out");
 
     out->mutable_data<T>(ctx.GetPlace());
 
-    auto rows = ins[1]->dims()[0];
-    auto cols = ins[1]->dims()[1];
-    auto* index = ins[0]->data<T>();
+    auto rows = ins[0]->dims()[0];
+    auto cols = ins[0]->numel() / rows;
+    auto index = ids->data<int32_t>();
     Place place = boost::get<Place>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
-      PADDLE_ENFORCE_LT(k, ins.size(),
+      int32_t k = index[i];
+      PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
+      PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
                         "index exceeds the number of candidate tensors.");
       memory::Copy(place, out->data<T>() + i * cols, place,
                    ins[k]->data<T>() + i * cols, cols * sizeof(T));
@@ -50,10 +52,11 @@ class MultiplexGradCPUKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* ids = ctx.Input<framework::Tensor>("Ids");
     auto ins = ctx.MultiInput<framework::Tensor>("X");
     auto d_ins =
         ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
-    for (size_t i = 1; i < d_ins.size(); i++) {
+    for (size_t i = 0; i < d_ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->mutable_data<T>(ctx.GetPlace());
         auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
@@ -61,12 +64,12 @@ class MultiplexGradCPUKernel : public framework::OpKernel {
       }
     }
 
-    auto rows = ins[1]->dims()[0];
-    auto cols = ins[1]->dims()[1];
-    auto* index = ins[0]->data<T>();
+    auto rows = ins[0]->dims()[0];
+    auto cols = ins[0]->numel() / rows;
+    auto* index = ids->data<int32_t>();
     Place place = boost::get<Place>(ctx.GetPlace());
     for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
+      size_t k = static_cast<size_t>(index[i]);
       if (d_ins[k]) {
         memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
                      d_out->data<T>() + i * cols, cols * sizeof(T));
@@ -74,5 +77,5 @@ class MultiplexGradCPUKernel : public framework::OpKernel {
     }
   }
 };
-}
-}
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index 375d8a35ac..04ebb14f6e 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -24,14 +24,13 @@ class PadOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of PadOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of PadOp should not be null.");
-
-    auto x_dim = ctx.Input<Tensor>("X")->dims();
-    auto paddings = Attr<std::vector<int>>("paddings");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of PadOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of PadOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
     PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
                       "Size of paddings should be equal to 2 * dimension size "
                       "of input tensor.");
@@ -39,19 +38,18 @@ class PadOp : public framework::OperatorWithKernel {
     for (int i = 0; i < x_dim.size(); ++i) {
       out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
     }
-    ctx.Output<framework::Tensor>("Out")->Resize(
-        framework::make_ddim(out_dims));
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
     if (out_dims[0] == x_dim[0]) {
       // Only pass LoD when the first dimension is equal between
       // output and input.
-      ctx.ShareLoD("X", /*->*/ "Out");
+      ctx->ShareLoD("X", /*->*/ "Out");
     }
   }
 };
 
 class PadOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PadOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  PadOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input of pad op. "
@@ -68,15 +66,15 @@ Given:
 X = [[1, 2],
    [3, 4]]
 
-and 
+and
 
 paddings = [0, 1, 1, 2]
 
 and
- 
-pad_value = 0 
 
-then we get 
+pad_value = 0
+
+then we get
 
 Out = [[0, 1, 2, 0, 0]
        [0, 3, 4, 0, 0]
@@ -101,14 +99,14 @@ class PadOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_g = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    if (x_g != nullptr) {
-      x_g->Resize(x_dims);
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }
 };
diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc
index 912196c190..1692464f28 100644
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -26,19 +26,14 @@ class PReluOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    auto *in = ctx.Input<framework::Tensor>("X");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Alpha"),
-                            "Input(Alpha) should not be null");
-    auto *alpha = ctx.Input<framework::Tensor>("Alpha");
-    PADDLE_ENFORCE(alpha->numel() == 1, "Size of weight Alpha must be one.");
-
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) should not be null");
-    auto *out = ctx.Output<framework::Tensor>("Out");
-    out->Resize(in->dims());
-    ctx.ShareLoD("X", /*->*/ "Out");
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
+    PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
+                   "Size of weight Alpha must be one.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -68,19 +63,13 @@ class PReluGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-    auto *dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto *x = ctx.Input<framework::Tensor>("X");
-
-    auto *dalpha =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Alpha"));
-    auto *alpha = ctx.Input<framework::Tensor>("Alpha");
-
-    dx->Resize(x->dims());
-    dalpha->Resize(alpha->dims());
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("Alpha"),
+                      ctx->GetInputDim("Alpha"));
   }
 };
 
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc
index 39af08c875..1ba22006f2 100644
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -25,22 +25,21 @@ class RankLossOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
     // input check
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input(Label) shouldn't be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Left"),
-                            "Input(Left) shouldn't be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Right"),
-                            "Input(Right) shouldn't be null");
-    auto label_dims = ctx.Input<framework::Tensor>("Label")->dims();
-    auto left_dims = ctx.Input<framework::Tensor>("Left")->dims();
-    auto right_dims = ctx.Input<framework::Tensor>("Right")->dims();
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null");
+    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null");
+    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null");
+
+    auto label_dims = ctx->GetInputDim("Label");
+    auto left_dims = ctx->GetInputDim("Left");
+    auto right_dims = ctx->GetInputDim("Right");
+
     PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims),
                    "All inputs must have the same size");
     PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1),
                    "All inputs must be row vector with size batch_size x 1.");
-    ctx.Output<framework::Tensor>("Out")->Resize(label_dims);
+    ctx->SetOutputDim("Out", label_dims);
   }
 };
 
@@ -91,25 +90,22 @@ class RankLossGradOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input(Label) shouldn't be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Left"),
-                            "Input(Left) shouldn't be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Right"),
-                            "Input(Right) shouldn't be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) shouldn't be null.");
-    auto dims = ctx.Input<framework::Tensor>("Left")->dims();
-    auto *left_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Left"));
-    auto *right_grad =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Right"));
-    if (left_grad) {
-      left_grad->Resize(dims);
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto dims = ctx->GetInputDim("Left");
+    auto left_grad_name = framework::GradVarName("Left");
+    auto right_grad_name = framework::GradVarName("Right");
+
+    if (ctx->HasOutput(left_grad_name)) {
+      ctx->SetOutputDim(left_grad_name, dims);
     }
-    if (right_grad) {
-      right_grad->Resize(dims);
+
+    if (ctx->HasOutput(right_grad_name)) {
+      ctx->SetOutputDim(right_grad_name, dims);
     }
   }
 };
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index 8874f25871..a3c3fa2716 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -26,14 +26,14 @@ class ReshapeOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
     // input check
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of ReshapeOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReshapeOp should not be null.");
 
-    auto shape = ctx.Attr<std::vector<int>>("shape");
+    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
     PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
     for (auto dim : shape) {
       PADDLE_ENFORCE(dim > 0, "Each dimension of shape must be positive.");
@@ -41,19 +41,20 @@ class ReshapeOp : public framework::OperatorWithKernel {
     // capacity check
     int64_t capacity =
         std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
-    auto *in = ctx.Input<framework::Tensor>("X");
-    PADDLE_ENFORCE_EQ(capacity, in->numel(),
+    auto x_dims = ctx->GetInputDim("X");
+    int64_t in_size = framework::product(x_dims);
+    PADDLE_ENFORCE_EQ(capacity, in_size,
                       "The size of Input(X) mismatches with Attr(shape).");
     // resize output
     std::vector<int64_t> shape_int64(shape.size(), 0);
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                    [](int a) { return static_cast<int64_t>(a); });
     auto out_dims = framework::make_ddim(shape_int64);
-    ctx.Output<framework::Tensor>("Out")->Resize(out_dims);
-    if (shape[0] == in->dims()[0]) {
+    ctx->SetOutputDim("Out", out_dims);
+    if (shape[0] == x_dims[0]) {
       // Only pass LoD when the first dimension is equal between
       // output and input.
-      ctx.ShareLoD("X", /*->*/ "Out");
+      ctx->ShareLoD("X", /*->*/ "Out");
     }
   }
 };
@@ -75,7 +76,7 @@ Given a 2-D tensor X with 2 rows and 2 columns
 
     [[1, 2], [3, 4]]
 
-with target shape = [1, 4], the reshape operator will transform 
+with target shape = [1, 4], the reshape operator will transform
 the tensor X into a 1-D tensor:
 
     [1, 2, 3, 4]
@@ -93,13 +94,11 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) shouldn't be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) shouldn't be null.");
-    auto dims = ctx.Input<framework::Tensor>("X")->dims();
-    auto *d_in = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    d_in->Resize(dims);
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
 
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index fc3ad721f2..1fcf0959df 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -24,16 +24,16 @@ class RowwiseAddOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of RowwiseAddOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"),
-                            "Input(b) of RowwiseAddOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of RowwiseAddOp should not be null.");
-
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto b_dims = ctx.Input<Tensor>("b")->dims();
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of RowwiseAddOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("b"),
+                   "Input(b) of RowwiseAddOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of RowwiseAddOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto b_dims = ctx->GetInputDim("b");
     PADDLE_ENFORCE_GT(
         x_dims.size(), b_dims.size(),
         "The rank of input `X` must be larger than the one of input `b`.");
@@ -43,16 +43,17 @@ class RowwiseAddOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
         "The width of two operands must be same");
-    PADDLE_ENFORCE_EQ(ctx.OutputSize("Out"), 1, "The output size must be 1");
-    ctx.Output<framework::Tensor>("Out")->Resize(x_dims);
-    ctx.ShareLoD("X", /*->*/ "Out");
+    PADDLE_ENFORCE_EQ(ctx->Outputs("Out").size(), 1,
+                      "The output size must be 1");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
 class RowwiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RowwiseAddOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  RowwiseAddOpMaker(framework::OpProto* proto,
+                    framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left input of row-wise add op, must be matrix");
     AddInput("b", "The right input of row-wise add op, must be vector");
@@ -69,25 +70,29 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "X should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("b"), "b should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto b_dims = ctx.Input<Tensor>("b")->dims();
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("b"), "b should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto b_dims = ctx->GetInputDim("b");
     PADDLE_ENFORCE_GT(
         x_dims.size(), b_dims.size(),
         "The rank of input `X` must be larger than the one of input `b`.");
 
-    int num_col_dims = x_dims.size() - b_dims.size();
+    int64_t num_col_dims = x_dims.size() - b_dims.size();
     PADDLE_ENFORCE_EQ(
         framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
         "The width of two operands must be same");
-    auto *dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto *db = ctx.Output<framework::Tensor>(framework::GradVarName("b"));
-    if (dx) dx->Resize(x_dims);
-    if (db) db->Resize(b_dims);
+    auto x_grad_name = framework::GradVarName("X");
+    auto b_grad_name = framework::GradVarName("b");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(b_grad_name)) {
+      ctx->SetOutputDim(b_grad_name, b_dims);
+    }
   }
 };
 
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index 1ae77a9722..e92501e128 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -26,16 +26,13 @@ class ScaleOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of ScaleOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of ScaleOp should not be null.");
-
-    auto *in = ctx.Input<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::Tensor>("Out");
-    out->Resize(in->dims());
-    ctx.ShareLoD("X", /*->*/ "Out");
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ScaleOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ScaleOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index 3f02081a06..3fc4a39ebc 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -23,29 +23,30 @@ class ScatterOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Ref"),
-                            "Input(Ref) of ScatterOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Index"),
-                            "Input(Index) of ScatterOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Updates"),
-                            "Input(Updates) of ScatterOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of ScatterOp should not be null.");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ref"),
+                   "Input(Ref) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Index"),
+                   "Input(Index) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Updates"),
+                   "Input(Updates) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ScatterOp should not be null.");
 
-    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Index")->dims().size(), 1,
+    auto updates_dims = ctx->GetInputDim("Updates");
+    auto ref_dims = ctx->GetInputDim("Ref");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1,
                       "Update Index should be 1-D.");
-    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Ref")->dims().size(),
-                      ctx.Input<Tensor>("Updates")->dims().size(),
+    PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
                       "Reference and Updates should have the same shape size");
-    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Updates")->dims()[0],
-                      ctx.Input<Tensor>("Index")->dims()[0],
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
+                      ctx->GetInputDim("Index")[0],
                       "Updates and Index should have same batch-size.");
-    framework::DDim data_dim(ctx.Input<Tensor>("Updates")->dims());
-    for (int i = 1; i < data_dim.size(); ++i)
-      PADDLE_ENFORCE_EQ(data_dim[i], ctx.Input<Tensor>("Updates")->dims()[i]);
-    ctx.Output<framework::Tensor>("Out")->Resize(
-        ctx.Input<Tensor>("Ref")->dims());
+    framework::DDim data_dim(updates_dims);
+    for (int i = 1; i < data_dim.size(); ++i) {
+      PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
+    }
+    ctx->SetOutputDim("Out", ref_dims);
   }
 };
 
@@ -54,22 +55,17 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto *dUpdates =
-        ctx.Output<framework::Tensor>(framework::GradVarName("Updates"));
-    auto *Updates = ctx.Input<Tensor>("Updates");
-    auto *dRef = ctx.Output<framework::Tensor>(framework::GradVarName("Ref"));
-    auto *Ref = ctx.Input<Tensor>("Ref");
-
-    dRef->Resize(Ref->dims());
-    dUpdates->Resize(Updates->dims());
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("Updates"),
+                      ctx->GetInputDim("Updates"));
+    ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
   }
 };
 
 class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ScatterOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  ScatterOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Ref", "The source input of scatter op");
     AddInput("Index",
@@ -77,13 +73,14 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Updates", "The updated value of updates op");
     AddOutput("Out", "The output of add op");
     AddComment(R"DOC(
-Scatter Operator by selecting from the first axis, 
+Scatter Operator by selecting from the first axis,
 
 Out = Ref
 Out[Index] = Ref[Index] + Updates
 )DOC");
   }
 };
+
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index 6c1a2bc3fd..bc4af2f704 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -22,23 +22,12 @@ class SequencePoolOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Out"),
-        "Output(Out) of SequencePoolOp should not be null.");
-
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto dims = x->dims();
-    auto lod = x->lod();
-    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
-    PADDLE_ENFORCE_GE(
-        dims[0],
-        /*batch size = */ static_cast<int64_t>(lod[0].size() - 1),
-        "The first dimension of Input(X) must be larger than batch size.");
-    dims[0] = lod[0].size() - 1;
-    ctx.Output<framework::LoDTensor>("Out")->Resize({dims});
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequencePoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequencePoolOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
   }
 };
 
@@ -61,17 +50,17 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
     SequencePoolOp pools features of all time-steps of each instance.
 
     For a mini-batch of 3 variable lengths sentences, containing 2, 3, and 2 time-steps:
-    
+
     Assume X is a [7,M,N] float LoDTensor, and X->lod()[0] = [0, 2, 5, 7].
-    Besides, for the sake of simplicity, we assume M=1 and N=1, 
+    Besides, for the sake of simplicity, we assume M=1 and N=1,
     and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
 
     Thus, Out is a [3,1,1] float LoDTensor, but Out->lod() is nullptr.
-    And for different strategy, the value of Out is as follows: 
+    And for different strategy, the value of Out is as follows:
 
     - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
     - SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
-    - SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), 
+    - SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
            6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
     - MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
     - LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
@@ -85,22 +74,18 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Gradient of Out should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "The input X should not be null.");
-    auto og_dims =
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->dims();
-    auto x_dims = ctx.Input<framework::LoDTensor>("X")->dims();
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null.");
+    auto og_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_dims = ctx->GetInputDim("X");
     PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(),
                       "The rank of output grad must equal to Input(X).");
     for (int64_t i = 1; i < og_dims.size(); ++i) {
       PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
     }
-    auto* x_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    x_grad->Resize(x_dims);
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
 };
 
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index 231614b4c1..cb80586e88 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -46,16 +46,27 @@ class SequencePoolKernel : public framework::OpKernel {
     int strategy = context.Attr<int>("strategy");
 
     auto dims = in->dims();
-    auto lod = in->lod()[0];
+    auto lod = in->lod();
     int64_t w = in->numel() / dims[0];
 
+    // InferShape by lod
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_GE(
+        dims[0],
+        /*batch size = */ static_cast<int64_t>(lod[0].size() - 1),
+        "The first dimension of Input(X) must be large than batch size.");
+    dims[0] = lod[0].size() - 1;
+    out->Resize({dims});
+
+    auto lod_level_0 = lod[0];
+
     out->mutable_data<T>(context.GetPlace());
     auto place = context.GetEigenDevice<Place>();
-    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-      Tensor in_t =
-          in->Slice<T>(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
+    for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+      Tensor in_t = in->Slice<T>(static_cast<int>(lod_level_0[i]),
+                                 static_cast<int>(lod_level_0[i + 1]));
       Tensor out_t = out->Slice<T>(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      int64_t h = static_cast<int64_t>(lod_level_0[i + 1] - lod_level_0[i]);
       auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
       auto out_e = EigenVector<T>::Flatten(out_t);
 
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index b063e24272..3bce95535c 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -22,19 +22,18 @@ class SGDOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("param"),
-                            "Input(param) of SGDOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("grad"),
-                            "Input(grad) of SGDOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("param_out"),
-                            "Output(param_out) of SGDOp should not be null.");
-
-    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("param")->dims(),
-                      ctx.Input<Tensor>("grad")->dims(),
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("param"),
+                   "Input(param) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("grad"),
+                   "Input(grad) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("param_out"),
+                   "Output(param_out) of SGDOp should not be null.");
+
+    auto param_dim = ctx->GetInputDim("param");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("grad"),
                       "Two input of SGD Op's dimension must be same.");
-    ctx.Output<framework::Tensor>("param_out")
-        ->Resize(ctx.Input<Tensor>("param")->dims());
+    ctx->SetOutputDim("param_out", param_dim);
   }
 };
 
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc
index ae6d1c80b3..2d197e3b1b 100644
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -22,33 +22,28 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "X must be initialized.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Y must be initialized.");
-
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* y = ctx.Input<framework::Tensor>("Y");
-    PADDLE_ENFORCE_EQ(x->dims(), y->dims(),
-                      "The shape of X and Y must be the same.");
-    PADDLE_ENFORCE_GE(x->dims().size(), 2,
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
                       "The tensor rank of X must be at least 2.");
-    auto* inside_weight = ctx.Input<framework::Tensor>("InsideWeight");
-    if (inside_weight) {
-      auto* outside_weight = ctx.Input<framework::Tensor>("OutsideWeight");
-      PADDLE_ENFORCE_NOT_NULL(outside_weight,
-                              "If weights are provided, must specify both "
-                              "inside and outside weights.");
-      PADDLE_ENFORCE_EQ(inside_weight->dims(), x->dims(),
+    if (ctx->HasInput("InsideWeight")) {
+      PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
+                     "If weights are provided, must specify both "
+                     "inside and outside weights.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims,
                         "The shape of InsideWeight must be same as X.");
-      PADDLE_ENFORCE_EQ(outside_weight->dims(), x->dims(),
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims,
                         "The shape of OutsideWeight must be same as X.");
     }
 
-    auto* diff = ctx.Output<framework::Tensor>("Diff");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    diff->Resize(x->dims());
+    ctx->SetOutputDim("Diff", x_dims);
     // loss is a two-rank tensor
-    out->Resize({x->dims()[0], 1});
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
   }
 };
 
@@ -99,12 +94,9 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    auto in_dims = ctx.Input<framework::Tensor>("X")->dims();
-    auto out_dims =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->dims();
-    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    auto in_dims = ctx->GetInputDim("X");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     PADDLE_ENFORCE_GE(out_dims.size(), 2,
                       "The tensor rank of Input(Out@Grad) should be 2.");
@@ -114,8 +106,14 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(out_dims[1], 1,
                       "The 2nd dimension of Input(Out@Grad) must be 1.");
 
-    if (x_grad) x_grad->Resize(in_dims);
-    if (y_grad) y_grad->Resize(in_dims);
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, in_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, in_dims);
+    }
   }
 };
 
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index e15cfe4850..e353afee3e 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -22,22 +22,23 @@ class SoftmaxOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of SoftmaxOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"),
-                            "Output(Y) of SoftmaxOp should not be null.");
-
-    PADDLE_ENFORCE(ctx.Input<Tensor>("X")->dims().size() == 2UL,
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of SoftmaxOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(x_dims.size() == 2UL,
                    "The input of softmax op must be a matrix.");
-    ctx.Output<framework::Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
+    ctx->SetOutputDim("Y", x_dims);
   }
 };
 
 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftmaxOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  SoftmaxOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
              "The input tensor of softmax. "
@@ -68,16 +69,15 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) should be not null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
-                            "Input(Y@GRAD) should be not null.");
-    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Y")->dims(),
-                      ctx.Input<Tensor>(framework::GradVarName("Y"))->dims(),
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Y"),
+                      ctx->GetInputDim(framework::GradVarName("Y")),
                       "Input(Y) and its gradients should have a same shape.");
 
-    ctx.Output<framework::Tensor>(framework::GradVarName("X"))
-        ->Resize(ctx.Input<Tensor>("X")->dims());
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
 
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 8a3a5ab927..7220f486be 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/softmax.h"
 
 namespace paddle {
 namespace operators {
@@ -30,36 +31,11 @@ class SoftmaxKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& context) const override {
     auto X = context.Input<Tensor>("X");
     auto Y = context.Output<Tensor>("Y");
-    Y->mutable_data<T>(context.GetPlace());
-
-    auto logits = EigenMatrix<T>::From(*X);
-    auto softmax = EigenMatrix<T>::From(*Y);
-
-    const int kBatchDim = 0;
-    const int kClassDim = 1;
 
-    const int batch_size = logits.dimension(kBatchDim);
-    const int num_classes = logits.dimension(kClassDim);
-
-    Eigen::DSizes<int, 1> along_class(kClassDim);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
-
-    auto shifted_logits = (logits -
-                           logits.maximum(along_class)
-                               .eval()
-                               .reshape(batch_by_one)
-                               .broadcast(one_by_class));
-
-    softmax.device(context.GetEigenDevice<Place>()) = shifted_logits.exp();
+    // allocate memory on device.
+    Y->mutable_data<T>(context.GetPlace());
 
-    softmax.device(context.GetEigenDevice<Place>()) =
-        (softmax *
-         softmax.sum(along_class)
-             .inverse()
-             .eval()
-             .reshape(batch_by_one)
-             .broadcast(one_by_class));
+    math::SoftmaxFunctor<Place, T>()(context, X, Y);
   }
 };
 
@@ -67,8 +43,6 @@ template <typename Place, typename T>
 class SoftmaxGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    std::shared_ptr<Tensor> scale_ = std::make_shared<Tensor>();
-
     auto Y = context.Input<Tensor>("Y");
     auto dY = context.Input<Tensor>(framework::GradVarName("Y"));
     auto dX = context.Output<Tensor>(framework::GradVarName("X"));
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
new file mode 100644
index 0000000000..b6f33ad9e0
--- /dev/null
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -0,0 +1,169 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/softmax_with_cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SoftmaxWithCrossEntropyOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftmaxWithCrossEntropyOpMaker(framework::OpProto* proto,
+                                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Logits",
+             "(Tensor, default: Tensor<float>), The unscaled log probabilities "
+             "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
+             "and K is the class number.")
+        .NotInGradient();
+    AddInput(
+        "Label",
+        "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
+        "tensor. "
+        "If softLable is set to 0, Label is a Tensor<int> with shape [N x 1]. "
+        "If softLable is set to 1, Label is a Tensor<float/double> "
+        "with shape [N x K].");
+    AddOutput(
+        "Softmax",
+        "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
+        "The outputs value of softmax activation by given the input batch, "
+        "which will be used in backward calculation.")
+        .AsIntermediate();
+    AddOutput("Loss",
+              "(Tensor, default: Tensor<float>), A 2-D tensor. The cross "
+              "entropy loss with shape [N x 1].");
+    AddAttr<bool>(
+        "softLabel",
+        "(bool, default: false), A flag to indicate whether to interpretate "
+        "the given labels as soft labels.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+Cross entropy loss with softmax are used as the output layer extensively. This
+operator computes the softmax normalized values for each row of the input
+tensor, after which cross-entropy loss is then computed. This provides a more
+numerically stable gradient.
+
+Because this operators performs a softmax on logits internally, it expects
+unscaled logits. Please do not call this op with the output of softmax operator,
+which will produce incorrect results.
+
+This operators expects mutually exclusive hard labels, each sample in a batch
+is in exactly one class with probabilities 1. Each sample in the batch with one
+and only one label.
+
+Equation:
+
+1) hard label (one-hot label)
+
+Loss_j = -\text{Logit}_{Label_j} + \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), j = 1, ..., K
+
+2) soft label (a distribution over all classes)
+
+Loss_j = -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i-\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), j = 1,...,K
+
+)DOC");
+  }
+};
+
+class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Logits"),
+                            "Input(Logits) should be not null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
+                            "Input(Label) should be not null.");
+
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Softmax"),
+                            "Output(Softmax) should be not null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Loss"),
+                            "Output(Loss) should be not null.");
+
+    const Tensor* logits = ctx.Input<Tensor>("Logits");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    PADDLE_ENFORCE_EQ(
+        logits->dims().size(), 2UL,
+        "The input of softmax_with_cross_entropy should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Label")->dims().size(), 2UL,
+                      "The labels should be a 2-D tensor.");
+
+    if (ctx.Attr<bool>("softLabel")) {
+      PADDLE_ENFORCE_EQ(logits->dims()[1], labels->dims()[1],
+                        "If Attr(softLabel) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(labels->dims()[1], 1UL,
+                        "If Attr(softLabel) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
+
+    ctx.Output<framework::Tensor>("Softmax")->Resize(logits->dims());
+    ctx.Output<framework::Tensor>("Loss")->Resize({logits->dims()[0], 1});
+
+    ctx.ShareLoD("Logits", /*->*/ "Softmax");
+    ctx.ShareLoD("Logits", /*->*/ "Loss");
+  }
+};
+
+class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Loss")),
+                            "Input(Loss@Grad) should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Softmax"),
+                            "Input(Softmax) should be not null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
+                            "Input(Label) should be not null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar(framework::GradVarName("Logits")),
+                            "Output(Logits@Grad) should be not null.");
+
+    const Tensor* softmax = ctx.Input<Tensor>("Softmax");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Label")->dims().size(), 2UL,
+                      "The labels should be a 2-D tensor.");
+
+    if (ctx.Attr<bool>("softLabel")) {
+      PADDLE_ENFORCE_EQ(softmax->dims()[1], labels->dims()[1],
+                        "When Attr(softLabel) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(labels->dims()[1], 1UL,
+                        "When Attr(softLabel) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
+
+    ctx.Output<framework::LoDTensor>(framework::GradVarName("Logits"))
+        ->Resize(ctx.Input<Tensor>("Softmax")->dims());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
+            ops::SoftmaxWithCrossEntropyOpMaker,
+            softmax_with_cross_entropy_grad,
+            ops::SoftmaxWithCrossEntropyOpGrad);
+REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
+                       ops::SoftmaxWithCrossEntropyKernel<float>);
+REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
+                       ops::SoftmaxWithCrossEntropyGradKernel<float>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
new file mode 100644
index 0000000000..1cf4296dcc
--- /dev/null
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/softmax_with_cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+namespace {
+template <typename T>
+__global__ void CrossEntropyGrad(T* out_grad, const T* in_grad,
+                                 const int* labels, const int batch_size,
+                                 const int class_num) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int sample_idx = tid / class_num;
+
+  if (tid < batch_size * class_num) out_grad[tid] *= in_grad[sample_idx];
+  __syncthreads();
+
+  if (tid < batch_size) {
+    PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num);
+    out_grad[tid * class_num + labels[tid]] -= 1.;
+  }
+}
+
+template <typename T>
+__global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
+                                               const T* loss_grad,
+                                               const T* labels,
+                                               const int batch_size,
+                                               const int class_num) {
+  int ids = blockIdx.x * blockDim.x + threadIdx.x;
+  if (ids < batch_size * class_num) {
+    int row_ids = ids / class_num;
+    logit_grad[ids] = logit_grad[ids] * loss_grad[row_ids] - labels[ids];
+  }
+}
+}  // namespace
+
+template <typename T>
+class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    const Tensor* logits = context.Input<Tensor>("Logits");
+    const Tensor* labels = context.Input<Tensor>("Label");
+    Tensor* softmax = context.Output<Tensor>("Softmax");
+
+    Tensor* loss = context.Output<Tensor>("Loss");
+    softmax->mutable_data<T>(context.GetPlace());
+    loss->mutable_data<T>(context.GetPlace());
+
+    math::SoftmaxFunctor<platform::GPUPlace, T>()(context, logits, softmax);
+    math::CrossEntropyFunctor<platform::GPUPlace, T>()(
+        context, loss, softmax, labels, context.Attr<bool>("softLabel"));
+  }
+};
+
+template <typename T>
+class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    const Tensor* labels = context.Input<Tensor>("Label");
+    const T* loss_grad_data =
+        context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
+    Tensor* logit_grad =
+        context.Output<Tensor>(framework::GradVarName("Logits"));
+    logit_grad->ShareDataWith<T>(*context.Input<Tensor>("Softmax"));
+    T* logit_grad_data = logit_grad->data<T>();
+
+    const int batch_size = logit_grad->dims()[0];
+    const int class_num = logit_grad->dims()[1];
+    int block = 512;
+    int grid = (batch_size * class_num + block - 1) / block;
+
+    if (context.Attr<bool>("softLabel")) {
+      const T* label_data = labels->data<T>();
+      SoftCrossEntropyGradientKernel<T><<<
+          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              context.device_context())
+                              .stream()>>>(logit_grad_data, loss_grad_data,
+                                           label_data, batch_size, class_num);
+    } else {
+      const int* label_data = labels->data<int>();
+      CrossEntropyGrad<T><<<
+          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              context.device_context())
+                              .stream()>>>(logit_grad_data, loss_grad_data,
+                                           label_data, batch_size, class_num);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy,
+                       ops::SoftmaxWithCrossEntropyCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad,
+                       ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
new file mode 100644
index 0000000000..bf792c1f59
--- /dev/null
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/cross_entropy.h"
+#include "paddle/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+class SoftmaxWithCrossEntropyKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
+                   "This kernel only runs on CPU.");
+    const Tensor* logits = context.Input<Tensor>("Logits");
+    const Tensor* labels = context.Input<Tensor>("Label");
+    Tensor* softmax = context.Output<Tensor>("Softmax");
+    Tensor* loss = context.Output<Tensor>("Loss");
+
+    softmax->mutable_data<T>(context.GetPlace());
+    loss->mutable_data<T>(context.GetPlace());
+
+    math::SoftmaxFunctor<platform::CPUPlace, T>()(context, logits, softmax);
+    math::CrossEntropyFunctor<platform::CPUPlace, T>()(
+        context, loss, softmax, labels, context.Attr<bool>("softLabel"));
+  }
+};
+
+template <typename T>
+class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Loss"));
+    const Tensor* labels = context.Input<Tensor>("Label");
+    Tensor* logit_grad =
+        context.Output<Tensor>(framework::GradVarName("Logits"));
+    logit_grad->ShareDataWith<T>(*context.Input<Tensor>("Softmax"));
+
+    const int class_num = logit_grad->dims()[1];
+    if (context.Attr<bool>("softLabel")) {
+      auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
+      auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
+      auto lbl_mat = EigenMatrix<T>::From(*labels);
+
+      logit_grad_mat.device(context.GetEigenDevice<platform::CPUPlace>()) =
+          logit_grad_mat *
+              out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) -
+          lbl_mat;
+    } else {
+      const int batch_size = logit_grad->dims()[0];
+      const int* label_data = labels->data<int>();
+      const T* out_grad_data = out_grad->data<T>();
+      T* logit_grad_data = logit_grad->data<T>();
+
+      for (int i = 0; i < batch_size; ++i) {
+        int index = i * class_num + label_data[i];
+        logit_grad_data[index] =
+            (out_grad_data[i] * logit_grad_data[index] - 1.);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc
index a9d35b4fb7..8640d1010e 100644
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
@@ -24,40 +24,42 @@ class SplitOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    // infershape
-    auto *in = ctx.Input<framework::Tensor>("X");
-    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
-    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
-    size_t num = static_cast<size_t>(ctx.Attr<int>("num"));
-    std::vector<int> sections =
-        static_cast<std::vector<int>>(ctx.Attr<std::vector<int>>("sections"));
-    const size_t n = outs.size();
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    auto in_dims = ctx->GetInputDim("X");
+    auto outs_names = ctx->Outputs("Out");
+    size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
+    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
+    std::vector<int> sections = static_cast<std::vector<int>>(
+        ctx->Attrs().Get<std::vector<int>>("sections"));
+    const size_t outs_number = outs_names.size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.reserve(outs_number);
 
     if (num > 0) {
-      int64_t in_axis_dim = in->dims()[axis];
+      int64_t in_axis_dim = in_dims[axis];
       PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
                         "tensor split does not result"
                         " in an equal division");
       size_t out_axis_dim = in_axis_dim / num;
-      for (size_t i = 0; i < n; ++i) {
-        auto dim = in->dims();
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
         dim[axis] = out_axis_dim;
-        outs[i]->Resize(dim);
+        outs_dims.push_back(dim);
       }
     } else if (sections.size() > 0) {
-      PADDLE_ENFORCE_EQ(sections.size(), n,
+      PADDLE_ENFORCE_EQ(sections.size(), outs_number,
                         "tensor split sections size"
                         "should be equal to output size.");
-      for (size_t i = 0; i < n; ++i) {
-        auto dim = in->dims();
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
         dim[axis] = sections[i];
-        outs[i]->Resize(dim);
+        outs_dims.push_back(dim);
       }
     } else {
       PADDLE_ENFORCE_NOT_NULL(nullptr, "split operator should",
                               " specify indices or sections.");
     }
+    ctx->SetOutputsDim("Out", outs_dims);
   }
 };
 
diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc
index 33a564b05b..5a0cb59600 100644
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -22,24 +22,19 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.InputVar("X"),
-        "Input(X) of SquaredL2DistanceOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.InputVar("Y"),
-        "Input(Y) of SquaredL2DistanceOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("sub_result"),
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("sub_result"),
         "Output(sub_result) of SquaredL2DistanceOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Out"),
-        "Output(Out) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SquaredL2DistanceOp should not be null.");
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto x_dims = x->dims();
-    auto* y = ctx.Input<Tensor>("Y");
-    auto y_dims = y->dims();
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
 
     PADDLE_ENFORCE_EQ(framework::arity(x_dims), framework::arity(y_dims),
                       "Tensor rank of both SquaredL2DistanceOp's "
@@ -47,17 +42,16 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
 
     int rank = framework::arity(x_dims);
     PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2.");
-    PADDLE_ENFORCE_EQ(x->numel() / x_dims[0], y->numel() / y_dims[0],
+    PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0], product(y_dims) / y_dims[0],
                       "Product of dimensions expcet the first dimension of "
                       "input and target must be equal.");
     PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0],
                    "First dimension of target must be equal to input "
                    "or to 1.");
 
-    ctx.Output<framework::Tensor>("sub_result")
-        ->Resize({x_dims[0], x->numel() / x_dims[0]});
-    ctx.Output<framework::Tensor>("Out")->Resize({x_dims[0], 1});
-    ctx.ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]});
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
@@ -92,22 +86,22 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Gradient of Out should not be null");
-    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Gradient of Out should not be null");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
     PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0],
                       "First dimension of output gradient and "
                       "input value must be equal.");
     PADDLE_ENFORCE_EQ(out_dims[1], 1,
                       "Second dimension of output gradient "
                       "must be 1.");
-    auto* x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* y_grad = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    if (x_grad) x_grad->Resize(x_dims);
-    if (y_grad) y_grad->Resize(y_dims);
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims);
+    if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims);
   }
 };
 
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 437fc262f3..8f62a9f4db 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -21,31 +21,27 @@ class SumOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(!ctx.MultiInputVar("X").empty(),
-                   "Input(X) of SumOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of SumOp should not be null.");
-
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto *out = ctx.Output<framework::Tensor>("Out");
-    int N = ins.size();
-
-    auto in_dim = ins[0]->dims();
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    auto x_dims = ctx->GetInputsDim("X");
+    PADDLE_ENFORCE(!x_dims.empty(), "Input(X) of SumOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SumOp should not be null.");
 
+    auto in_dim = x_dims[0];
+    size_t N = x_dims.size();
     PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
-    for (int i = 1; i < N; i++) {
-      auto dim = ins[i]->dims();
+    for (size_t i = 1; i < N; i++) {
+      auto dim = x_dims[i];
       PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape");
     }
-    out->Resize(in_dim);
-    ctx.ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", in_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SumOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "the input tensors of sum operator.").AsDuplicable();
     AddOutput("Out", "the output tensor of sum operator.");
@@ -63,13 +59,16 @@ class SumGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    auto outputs =
-        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
-    auto dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    for (auto output : outputs) {
-      output->Resize(dims);
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_grad_names = ctx->Outputs(framework::GradVarName("X"));
+    size_t x_length = x_grad_names.size();
+    std::vector<framework::DDim> x_grad_dims;
+    x_grad_dims.reserve(x_length);
+    for (size_t i = 0; i < x_length; ++i) {
+      x_grad_dims.push_back(out_grad_dims);
     }
+    ctx->SetOutputsDim(framework::GradVarName("X"), x_grad_dims);
   }
 };
 
diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc
index a6e43964e9..5f22bf1df8 100644
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
@@ -22,26 +22,26 @@ class TopkOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
-                            "Input(X) of TopkOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) of TopkOp should not be null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Indices"),
-                            "Output(Indices) of TopkOp should not be null.");
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of TopkOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of TopkOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Indices"),
+                   "Output(Indices) of TopkOp should not be null.");
 
-    auto *input = ctx.Input<framework::Tensor>("X");
-    const int k = static_cast<int>(ctx.Attr<int>("k"));
+    auto input_dims = ctx->GetInputDim("X");
+    const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
 
     PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
-    PADDLE_ENFORCE_GE(input->dims().size(), 1, "input must have >= 1d shape");
-    PADDLE_ENFORCE_GE(input->dims()[input->dims().size() - 1], k,
+    PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape");
+    PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
                       "input must have >= k columns");
 
-    framework::DDim dims = input->dims();
+    framework::DDim dims = input_dims;
     dims[dims.size() - 1] = k;
-    ctx.Output<framework::Tensor>("Out")->Resize(dims);
-    ctx.Output<framework::Tensor>("Indices")->Resize(dims);
+    ctx->SetOutputDim("Out", dims);
+    ctx->SetOutputDim("Indices", dims);
   }
 };
 
diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu
index afe4d149c5..53fe505b77 100644
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
@@ -301,14 +301,16 @@ class TopkOpCUDAKernel : public framework::OpKernel {
 
     // NOTE: pass lds and dim same to input width.
     // NOTE: old matrix implementation of stride is different to eigen.
-    // TODO(typhoonzero): launch kernel on specified stream.
     // TODO(typhoonzero): refine this kernel.
     dim3 threads(256, 1);
     dim3 grid(input_height, 1);
 
-    KeMatrixTopK<T, 5, 256><<<grid, threads>>>(
-        output_data, output->dims()[1], indices_data, input_data, input_width,
-        input_width, int(k));
+    KeMatrixTopK<T, 5, 256><<<
+        grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(output_data, output->dims()[1],
+                                           indices_data, input_data,
+                                           input_width, input_width, int(k));
   }
 };
 
diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc
index 017a05326e..0672f9342d 100644
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -24,12 +24,11 @@ class TransposeOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
-                            "Output(Out) should not be null");
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> axis = ctx->Attrs().Get<std::vector<int>>("axis");
     size_t x_rank = x_dims.size();
     size_t axis_size = axis.size();
 
@@ -51,14 +50,14 @@ class TransposeOp : public framework::OperatorWithKernel {
     for (size_t i = 0; i < axis_size; i++) {
       out_dims[i] = x_dims[axis[i]];
     }
-    ctx.Output<framework::Tensor>("Out")->Resize(out_dims);
+    ctx->SetOutputDim("Out", out_dims);
   }
 };
 
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TransposeOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  TransposeOpMaker(framework::OpProto* proto,
+                   framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
@@ -79,7 +78,7 @@ For example:
         [3, 4, 5]])
  >> axis = [1, 0]
  >> output = input.transpose(axis)
- >> output 
+ >> output
  array([[0, 3],
         [1, 4],
 		[2, 5]])
@@ -94,14 +93,15 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx.Input<Tensor>("X")->dims();
-    auto *x_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-
-    if (x_grad) x_grad->Resize(x_dims);
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    }
   }
 };
 
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index 17ea48361b..2771df5608 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -23,18 +23,18 @@ namespace operators {
 template <typename T>
 class CPUUniformRandomKernel : public framework::OpKernel {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* tensor = ctx.Output<framework::Tensor>("Out");
+    T* data = tensor->mutable_data<T>(ctx.GetPlace());
+    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
     std::minstd_rand engine;
     if (seed == 0) {
       seed = std::random_device()();
     }
     engine.seed(seed);
     std::uniform_real_distribution<T> dist(
-        static_cast<T>(context.Attr<float>("min")),
-        static_cast<T>(context.Attr<float>("max")));
+        static_cast<T>(ctx.Attr<float>("min")),
+        static_cast<T>(ctx.Attr<float>("max")));
     int64_t size = tensor->numel();
     for (int64_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
@@ -47,21 +47,20 @@ class UniformRandomOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(
-        ctx.OutputVar("Out"),
-        "Output(Out) of UniformRandomOp should not be null.");
+  void InferShape(framework::InferShapeContextBase* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UniformRandomOp should not be null.");
 
-    PADDLE_ENFORCE(Attr<float>("min") < Attr<float>("max"),
-                   "uniform_random's min must less then max");
-    auto* tensor = ctx.Output<framework::Tensor>("Out");
+    PADDLE_ENFORCE(
+        ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
+        "uniform_random's min must less then max");
     auto dims = Attr<std::vector<int>>("dims");
     std::vector<int64_t> temp;
     temp.reserve(dims.size());
     for (auto dim : dims) {
       temp.push_back(static_cast<int64_t>(dim));
     }
-    tensor->Resize(framework::make_ddim(temp));
+    ctx->SetOutputDim("Out", framework::make_ddim(temp));
   }
 };
 
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index caa78acd98..895e8d6a63 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "ParameterOptimizer.h"
+#include "ParameterUpdateFunctions.h"
 #include "Regularizer.h"
 
 namespace paddle {
@@ -37,6 +38,15 @@ public:
     real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
                                   ? 1.0 - paraConfig.momentum()
                                   : 1.0;
+#ifdef PADDLE_USE_MKLDNN
+    sgdUpdate(learningRate_ * paraConfig.learning_rate() *
+                  (firstTime_ ? 1.0 : torch_learningRate),
+              paraConfig.momentum(),
+              applyDecay_ ? paraConfig.decay_rate() : 0,
+              vecs[PARAMETER_VALUE].get(),
+              vecs[PARAMETER_GRADIENT].get(),
+              vecs[PARAMETER_MOMENTUM].get());
+#else
     vecs[PARAMETER_VALUE]->sgdUpdate(
         *vecs[PARAMETER_GRADIENT],
         *vecs[PARAMETER_MOMENTUM],
@@ -44,6 +54,7 @@ public:
             (firstTime_ ? 1.0 : torch_learningRate),
         paraConfig.momentum(),
         applyDecay_ ? paraConfig.decay_rate() : 0);
+#endif
   }
   virtual void finishBatch() { firstTime_ = false; }
 };
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index c8af7105c7..8b3be062b6 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,6 +30,9 @@ void sgdUpdateCpu(real learningRate,
                   const real* grad,
                   real* momentumVec) {
   decayRate *= learningRate;
+#ifdef PADDLE_USE_MKLDNN
+#pragma omp parallel for
+#endif
   for (size_t i = 0; i < size; ++i) {
     momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
                      decayRate * value[i];
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index a106592e45..f6a39a8e26 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -34,13 +34,14 @@ class DeviceContext {
 
   template <typename DeviceType>
   DeviceType* get_eigen_device() const;
+
+  virtual void Wait() const {}
 };
 
 class CPUDeviceContext : public DeviceContext {
  public:
   CPUDeviceContext();
   explicit CPUDeviceContext(CPUPlace place);
-  virtual ~CPUDeviceContext() {}
 
   Eigen::DefaultDevice* eigen_device() const;
 
@@ -59,7 +60,7 @@ class CUDADeviceContext : public DeviceContext {
   virtual ~CUDADeviceContext();
 
   /*! \brief  Wait for all operations completion in the stream. */
-  void Wait() const;
+  void Wait() const override;
 
   /*! \brief  Return place in the device context. */
   Place GetPlace() const override;
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index ed2420b874..f0c825bd9b 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -36,7 +36,7 @@ int GetCurrentDeviceId();
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
-//！Get the memory usage of current GPU device.
+//! Get the memory usage of current GPU device.
 void GpuMemoryUsage(size_t &available, size_t &total);
 
 //! Get the maximum allocation size of current GPU device.
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index fbe074188e..25e290ffbb 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -237,7 +237,13 @@ All parameter, weight, gradient are variables in Paddle.
              return Backward(forwardOp, no_grad_vars).release();
            })
       .def("infer_shape", &OperatorBase::InferShape)
-      .def("run", &OperatorBase::Run)
+      .def("run",
+           [](OperatorBase &self,
+              const Scope &scope,
+              const platform::DeviceContext &dev_ctx) {
+             self.Run(scope, dev_ctx);
+             dev_ctx.Wait();
+           })
       .def("type",
            [](const OperatorBase &op) -> std::string { return op.Type(); })
       .def("outputs",
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index f01ad4142d..066837ca95 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -37,6 +37,19 @@ add_test(NAME test_CompareTwoNets
             --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
+################ test_CompareMKLDNNandCPU ######################
+if(WITH_MKLDNN)
+  add_unittest_without_exec(test_CompareMKLDNNandCPU
+      test_CompareTwoNets.cpp)
+  add_test(NAME test_CompareMKLDNNandCPU
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+          ${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU
+              --config_file_a=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_a=True
+              --config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False
+              --use_gpu=False
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endif()
+
 ############### test_CompareTwoOpts ###################
 add_unittest_without_exec(test_CompareTwoOpts
     test_CompareTwoOpts.cpp)
diff --git a/paddle/trainer/tests/sample_trainer_config_simple_net.conf b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
new file mode 100644
index 0000000000..77f7816153
--- /dev/null
+++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
@@ -0,0 +1,63 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+################################### Data Configuration ###################################
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+################################### Algorithm Configuration ###################################
+settings(batch_size = 1000,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+################################### Network Configuration ###################################
+data = data_layer(name ="input", size=784)
+
+tmp = img_conv_layer(input=data,
+            num_channels=1,
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=AvgPooling())
+            
+tmp = img_conv_layer(input=tmp,
+            filter_size=3,
+            num_filters=64,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+            
+tmp = fc_layer(input=tmp, size=64,
+               bias_attr=True,
+               act=ReluActivation())
+
+output = fc_layer(input=tmp, size=10,
+                  bias_attr=True,
+                  act=SoftmaxActivation())
+
+lbl = data_layer(name ="label", size=10)
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp
index 94f65e545d..307645d2c3 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
@@ -26,12 +26,15 @@ DECLARE_int32(gpu_id);
 
 DECLARE_bool(local);
 DECLARE_bool(use_gpu);
+DECLARE_bool(use_mkldnn);
 
 DECLARE_string(config);
 DECLARE_string(nics);
 
 DEFINE_string(config_file_a, "", "config of one network to compare");
 DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_bool(use_mkldnn_a, false, "whether to use mkldnn to run config_file_a");
+DEFINE_bool(use_mkldnn_b, false, "whether to use mkldnn to run config_file_b");
 DEFINE_bool(need_high_accuracy,
             false,
             "whether need to run in double accuracy");
@@ -128,6 +131,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
                 matA.getWidth());
   }
 
+  if (FLAGS_use_mkldnn_a || FLAGS_use_mkldnn_b) {
+    // some format of mkldnn parameter is different with cpu
+    // test_MKLDNN will check the parameters
+    return;
+  }
+
   vector<ParameterPtr>& parametersA = comDataA.parameters;
   vector<ParameterPtr>& parametersB = comDataB.parameters;
 
@@ -167,10 +176,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
 
 TEST(Trainer, create) {
   ComData dataA;
+  FLAGS_use_mkldnn = FLAGS_use_mkldnn_a;
   calcGradient(dataA, FLAGS_config_file_a);
   LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
 
   ComData dataB;
+  FLAGS_use_mkldnn = FLAGS_use_mkldnn_b;
   calcGradient(dataB, FLAGS_config_file_b);
   LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
 
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 0f57b81966..098a51ab87 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1566,7 +1566,7 @@ class LayerBase(object):
         self.config = g_config.model_config.layers.add()
         assert isinstance(self.config, LayerConfig)
         use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        mkldnn_acts = ['relu', 'tanh']
+        mkldnn_acts = ['relu', 'tanh', 'softmax']
         if use_mkldnn and active_type in mkldnn_acts:
             active_type = "mkldnn_" + active_type
         self.config.name = name
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index c97e6c0a36..74025d2a7b 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -921,7 +921,7 @@ def data_layer(name, size, depth=None, height=None, width=None,
 
         data = data_layer(name="input", size=1000)
 
-    :param name: The name of this layer. It is optional.
+    :param name: The name of this layer.
     :type name: basestring
     :param size: Size of this data layer.
     :type size: int
@@ -3668,6 +3668,7 @@ def gru_step_naive_layer(input,
     :param param_attr:
     :param layer_attr:
     :return:
+    :rtype: LayerOutput
     """
     if input.size % 3 != 0:
         raise ValueError("GruStep input size must be divided by 3")
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 0a5673868c..579ad7b407 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -177,7 +177,7 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place,
 
 
 class OpTest(unittest.TestCase):
-    def check_output_with_place(self, place):
+    def check_output_with_place(self, place, atol):
         self.scope = core.Scope()
         op_inputs = self.inputs if hasattr(self, "inputs") else dict()
         op_outputs = self.outputs if hasattr(self, "outputs") else dict()
@@ -206,22 +206,23 @@ class OpTest(unittest.TestCase):
                         self.scope.find_var(sub_out_name).get_tensor())
                     self.assertTrue(
                         np.allclose(
-                            actual, expect, atol=1e-05),
-                        "output name: " + out_name + " has diff")
+                            actual, expect, atol=atol),
+                        "output name: " + out_name + " has diff.")
             else:
                 actual = np.array(self.scope.find_var(out_name).get_tensor())
                 expect = self.outputs[out_name]
+
                 self.assertTrue(
                     np.allclose(
-                        actual, expect, atol=1e-05),
-                    "output name: " + out_name + " has diff")
+                        actual, expect, atol=atol),
+                    "output name: " + out_name + " has diff.")
 
-    def check_output(self):
+    def check_output(self, atol=1e-5):
         places = [core.CPUPlace()]
         if core.is_compile_gpu():
             places.append(core.GPUPlace(0))
         for place in places:
-            self.check_output_with_place(place)
+            self.check_output_with_place(place, atol)
 
     def __assert_is_close(self, numeric_grads, analytic_grads, names,
                           max_relative_error, msg_prefix):
@@ -235,9 +236,10 @@ class OpTest(unittest.TestCase):
 
             def err_msg():
                 offset = np.argmax(diff_mat > max_relative_error)
-                return "%s Variable %s max gradient diff %f over limit %f, the first " \
-                  "error element is %d" % (
-                   msg_prefix, name, max_diff, max_relative_error, offset)
+                return ("%s Variable %s max gradient diff %f over limit %f, "
+                        "the first error element is %d") % (
+                            msg_prefix, name, max_diff, max_relative_error,
+                            offset)
 
             self.assertLessEqual(max_diff, max_relative_error, err_msg())
 
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index f10db78322..1de514dff4 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -4,22 +4,24 @@ from op_test import OpTest
 
 
 class TestCrossEntropyOp1(OpTest):
-    """Test standard cross-entropy, with index representation of labels.
+    """Test cross-entropy with discrete one-hot labels.
     """
 
     def setUp(self):
         self.op_type = "cross_entropy"
         batch_size = 30
         class_num = 10
+
         X = np.random.uniform(0.1, 1.0,
                               [batch_size, class_num]).astype("float32")
         label = np.random.randint(0, class_num, (batch_size, 1), dtype="int32")
         cross_entropy = np.asmatrix(
             [[-np.log(X[i][label[i][0]])] for i in range(X.shape[0])],
             dtype="float32")
+
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {'soft_label': False}
+        self.attrs = {"softLabel": False}
 
     def test_check_output(self):
         self.check_output()
@@ -29,13 +31,14 @@ class TestCrossEntropyOp1(OpTest):
 
 
 class TestCrossEntropyOp2(OpTest):
-    """Test soft-label cross-entropy, with vecterized soft labels.
+    """Test cross-entropy with vectorized soft labels.
     """
 
     def setUp(self):
         self.op_type = "cross_entropy"
-        batch_size = 10
-        class_num = 5
+        batch_size = 5
+        class_num = 37
+
         X = np.random.uniform(0.1, 1.0,
                               [batch_size, class_num]).astype("float32")
         label = np.random.uniform(0.1, 1.0,
@@ -43,46 +46,49 @@ class TestCrossEntropyOp2(OpTest):
         label /= label.sum(axis=1, keepdims=True)
         cross_entropy = (-label * np.log(X)).sum(
             axis=1, keepdims=True).astype("float32")
-        self.inputs = {'X': X, 'Label': label}
-        self.outputs = {'Y': cross_entropy}
-        self.attrs = {'soft_label': True}
+
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"softLabel": True}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
+        self.check_grad(["X"], "Y", max_relative_error=0.05)
 
 
 class TestCrossEntropyOp3(OpTest):
-    """Test one-hot cross-entropy, with vecterized one-hot representation of
-    labels.
+    """Test cross-entropy with vectorized one-hot representation of labels.
     """
 
     def setUp(self):
         self.op_type = "cross_entropy"
-        batch_size = 30
-        class_num = 10
+        batch_size = 5
+        class_num = 17
+
         X = np.random.uniform(0.1, 1.0,
                               [batch_size, class_num]).astype("float32")
         label_index = np.random.randint(
             0, class_num, (batch_size), dtype="int32")
         label = np.zeros(X.shape)
         label[np.arange(batch_size), label_index] = 1
+
         cross_entropy = np.asmatrix(
             [[-np.log(X[i][label_index[i]])] for i in range(X.shape[0])],
             dtype="float32")
         cross_entropy2 = (-label * np.log(X)).sum(
             axis=1, keepdims=True).astype("float32")
-        self.inputs = {'X': X, 'Label': label}
-        self.outputs = {'Y': cross_entropy}
-        self.attrs = {'soft_label': True}
+
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"softLabel": True}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
+        self.check_grad(["X"], "Y", max_relative_error=0.05)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/framework/tests/test_lstm_unit_op.py b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
new file mode 100644
index 0000000000..8ce65bfc31
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
@@ -0,0 +1,38 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def sigmoid_np(x):
+    return 1. / (1. + np.exp(-x))
+
+
+def tanh_np(x):
+    return 2 * sigmoid_np(2. * x) - 1.
+
+
+class LstmUnitTest(OpTest):
+    def setUp(self):
+        self.op_type = "lstm_unit"
+        x_np = np.random.normal(size=(5, 16)).astype("float32")
+        c_np = np.random.normal(size=(5, 4)).astype("float32")
+        i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1)
+        forget_bias_np = 0.
+        self.attrs = {'forget_bias': 0.}
+
+        new_c = c_np * sigmoid_np(f_np + forget_bias_np) + sigmoid_np(
+            i_np) * tanh_np(j_np)
+        new_h = tanh_np(new_c) * sigmoid_np(o_np)
+
+        self.inputs = {'X': x_np, 'C_prev': c_np}
+        self.outputs = {'C': new_c, 'H': new_h}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'C_prev'], ['C', 'H'], max_relative_error=0.01)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_multiplex_op.py b/python/paddle/v2/framework/tests/test_multiplex_op.py
index f2b3881cde..5937eb5aa4 100644
--- a/python/paddle/v2/framework/tests/test_multiplex_op.py
+++ b/python/paddle/v2/framework/tests/test_multiplex_op.py
@@ -6,20 +6,22 @@ from op_test import OpTest
 class TestMultiplexOp(OpTest):
     def setUp(self):
         self.op_type = "multiplex"
-        rows = 3
-        index = np.array([3, 1, 0])
+        rows = 4
+        index = np.arange(0, rows).astype('int32')
+        np.random.shuffle(index)
+        index = np.reshape(index, (rows, 1))
         ins1 = np.random.random((rows, 10)).astype("float32")
         ins2 = np.random.random((rows, 10)).astype("float32")
         ins3 = np.random.random((rows, 10)).astype("float32")
         ins4 = np.random.random((rows, 10)).astype("float32")
         self.inputs = {
-            'X': [('index', index), ('x1', ins1), ('x2', ins2), ('x3', ins3),
-                  ('x4', ins4)]
+            'Ids': index,
+            'X': [('x1', ins1), ('x2', ins2), ('x3', ins3), ('x4', ins4)]
         }
         # multiplex output
         output = np.zeros_like(ins1)
         for i in range(0, rows):
-            k = index[i] + 1
+            k = index[i][0]
             output[i] = self.inputs['X'][k][1][i]
         self.outputs = {'Out': output}
 
diff --git a/python/paddle/v2/framework/tests/test_prelu_op.py b/python/paddle/v2/framework/tests/test_prelu_op.py
index 2b6b7db368..676fd9f7c5 100644
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
@@ -7,6 +7,14 @@ class PReluTest(OpTest):
     def setUp(self):
         self.op_type = "prelu"
         x_np = np.random.normal(size=(10, 10)).astype("float32")
+
+        for pos, val in np.ndenumerate(x_np):
+            # Since zero point in prelu is not differentiable, avoid randomize
+            # zero.
+            while abs(val) < 1e-3:
+                x_np[pos] = np.random.normal()
+                val = x_np[pos]
+
         x_np_sign = np.sign(x_np)
         x_np = x_np_sign * np.maximum(x_np, .005)
         alpha_np = np.array([.1])
diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py
index 1b948f252f..b41c810d9a 100644
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
@@ -5,7 +5,7 @@ from op_test import OpTest
 
 def stable_softmax(x):
     """Compute the softmax of vector x in a numerically stable way."""
-    shiftx = x - np.max(x)
+    shiftx = x - np.max(x).clip(-64.)
     exps = np.exp(shiftx)
     return exps / np.sum(exps)
 
diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
new file mode 100644
index 0000000000..428395b76c
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
@@ -0,0 +1,70 @@
+import unittest
+import numpy as np
+
+from op_test import OpTest
+from test_softmax_op import stable_softmax
+
+
+class TestSoftmaxWithCrossEntropyOp(OpTest):
+    """
+    Test softmax with cross entropy operator with discreate one-hot labels.
+    """
+
+    def setUp(self):
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = 3
+        class_num = 37
+
+        logits = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype("float32")
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int32")
+
+        cross_entropy = np.asmatrix(
+            [[-np.log(softmax[i][labels[i][0]])]
+             for i in range(softmax.shape[0])],
+            dtype="float32")
+
+        self.inputs = {"Logits": logits, "Label": labels}
+        self.outputs = {"Softmax": softmax, "Loss": cross_entropy}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+
+
+class TestSoftmaxWithCrossEntropyOp2(OpTest):
+    """
+    Test softmax with cross entropy operator with soft labels.
+    """
+
+    def setUp(self):
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = 2
+        class_num = 17
+
+        logits = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype("float32")
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        labels = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype("float32")
+        labels /= np.sum(labels, axis=1, keepdims=True)
+
+        cross_entropy = (-labels * np.log(softmax)).sum(
+            axis=1, keepdims=True).astype("float32")
+
+        self.inputs = {"Logits": logits, "Label": labels}
+        self.outputs = {"Softmax": softmax, "Loss": cross_entropy}
+        self.attrs = {"softLabel": True}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+
+
+if __name__ == "__main__":
+    unittest.main()