diff --git a/adversarial/README.md b/adversarial/README.md
new file mode 100644
index 0000000000..51da21918a
--- /dev/null
+++ b/adversarial/README.md
@@ -0,0 +1,9 @@
+# Advbox
+
+Advbox is a Python toolbox for creating adversarial examples that fool neural networks. It requires Python and PaddlePaddle.
+
+## How to use
+
+1. Train a model and save its parameters (see fluid_mnist.py).
+2. Load the parameters trained in step 1 and reconstruct the model (see mnist_tutorial_fgsm.py).
+3. Use Advbox to generate the adversarial samples.
diff --git a/adversarial/advbox/__init__.py b/adversarial/advbox/__init__.py
new file mode 100644
index 0000000000..f56f14f18d
--- /dev/null
+++ b/adversarial/advbox/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A set of tools for generating adversarial examples on the PaddlePaddle platform.
+"""
diff --git a/adversarial/advbox/attacks/base.py b/adversarial/advbox/attacks/base.py
new file mode 100644
index 0000000000..98a65f2fdd
--- /dev/null
+++ b/adversarial/advbox/attacks/base.py
@@ -0,0 +1,39 @@
+"""
+The base class for all attack methods.
+"""
+from abc import ABCMeta, abstractmethod
+
+
+class Attack(object):
+    """
+    Abstract base class for adversarial attacks. `Attack` represents an adversarial
+    attack that searches for an adversarial example. Subclasses should implement the
+    _apply() method.
+
+    Args:
+        model(Model): an instance of the class advbox.base.Model.
+    """
+    __metaclass__ = ABCMeta
+
+    def __init__(self, model):
+        self.model = model
+
+    def __call__(self, image_label):
+        """
+        Generate the adversarial sample.
+
+        Args:
+            image_label(list): The image and label tuple list with one element.
+        """
+        adv_img = self._apply(image_label)
+        return adv_img
+
+    @abstractmethod
+    def _apply(self, image_label):
+        """
+        Search for an adversarial example.
+
+        Args:
+            image_label(list): The image and label tuple list with one element.
+        """
+        raise NotImplementedError
diff --git a/adversarial/advbox/attacks/gradientsign.py b/adversarial/advbox/attacks/gradientsign.py
new file mode 100644
index 0000000000..15b1d176cb
--- /dev/null
+++ b/adversarial/advbox/attacks/gradientsign.py
@@ -0,0 +1,38 @@
+"""
+This module provides the implementation of the FGSM attack method.
+"""
+from __future__ import division
+import numpy as np
+from collections import Iterable
+from .base import Attack
+
+
+class GradientSignAttack(Attack):
+    """
+    This attack was originally implemented by Goodfellow et al. (2015) with the
+    infinity norm, and is therefore known as the "Fast Gradient Sign Method" (FGSM).
+ Paper link: https://arxiv.org/abs/1412.6572 + """ + + def _apply(self, image_label, epsilons=1000): + assert len(image_label) == 1 + pre_label = np.argmax(self.model.predict(image_label)) + + min_, max_ = self.model.bounds() + gradient = self.model.gradient(image_label) + gradient_sign = np.sign(gradient) * (max_ - min_) + + if not isinstance(epsilons, Iterable): + epsilons = np.linspace(0, 1, num=epsilons + 1) + + for epsilon in epsilons: + adv_img = image_label[0][0].reshape( + gradient_sign.shape) + epsilon * gradient_sign + adv_img = np.clip(adv_img, min_, max_) + adv_label = np.argmax(self.model.predict([(adv_img, 0)])) + if pre_label != adv_label: + return adv_img + + +FGSM = GradientSignAttack diff --git a/adversarial/advbox/models/__init__.py b/adversarial/advbox/models/__init__.py new file mode 100644 index 0000000000..eee0f6efd4 --- /dev/null +++ b/adversarial/advbox/models/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Paddle model for target of attack +""" diff --git a/adversarial/advbox/models/base.py b/adversarial/advbox/models/base.py new file mode 100644 index 0000000000..74e1045def --- /dev/null +++ b/adversarial/advbox/models/base.py @@ -0,0 +1,90 @@ +""" +The base model of the model. +""" +from abc import ABCMeta +import abc + +abstractmethod = abc.abstractmethod + + +class Model(object): + """ + Base class of model to provide attack. + + + Args: + bounds(tuple): The lower and upper bound for the image pixel. + channel_axis(int): The index of the axis that represents the color channel. + preprocess(tuple): Two element tuple used to preprocess the input. First + substract the first element, then divide the second element. + """ + __metaclass__ = ABCMeta + + def __init__(self, bounds, channel_axis, preprocess=None): + assert len(bounds) == 2 + assert channel_axis in [0, 1, 2, 3] + + if preprocess is None: + preprocess = (0, 1) + self._bounds = bounds + self._channel_axis = channel_axis + self._preprocess = preprocess + + def bounds(self): + """ + Return the upper and lower bounds of the model. + """ + return self._bounds + + def channel_axis(self): + """ + Return the channel axis of the model. + """ + return self._channel_axis + + def _process_input(self, input_): + res = input_ + sub, div = self._preprocess + if sub != 0: + res = input_ - sub + assert div != 0 + if div != 1: + res /= div + return res + + @abstractmethod + def predict(self, image_batch): + """ + Calculate the prediction of the image batch. + + Args: + image_batch(numpy.ndarray): image batch of shape (batch_size, height, width, channels). + + Return: + numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes). 
+ """ + raise NotImplementedError + + @abstractmethod + def num_classes(self): + """ + Determine the number of the classes + + Return: + int: the number of the classes + """ + raise NotImplementedError + + @abstractmethod + def gradient(self, image_batch): + """ + Calculate the gradient of the cross-entropy loss w.r.t the image. + + Args: + image_batch(list): The image and label tuple list. + + Return: + numpy.ndarray: gradient of the cross-entropy loss w.r.t the image with + the shape (height, width, channel). + """ + raise NotImplementedError diff --git a/adversarial/advbox/models/paddle.py b/adversarial/advbox/models/paddle.py new file mode 100644 index 0000000000..33b2a3d5c6 --- /dev/null +++ b/adversarial/advbox/models/paddle.py @@ -0,0 +1,101 @@ +from __future__ import absolute_import + +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +from paddle.v2.fluid.framework import program_guard + +from .base import Model + + +class PaddleModel(Model): + """ + Create a PaddleModel instance. + When you need to generate a adversarial sample, you should construct an instance of PaddleModel. + + Args: + program(paddle.v2.fluid.framework.Program): The program of the model which generate the adversarial sample. + input_name(string): The name of the input. + logits_name(string): The name of the logits. + predict_name(string): The name of the predict. + cost_name(string): The name of the loss in the program. + """ + + def __init__(self, + program, + input_name, + logits_name, + predict_name, + cost_name, + bounds, + channel_axis=3, + preprocess=None): + super(PaddleModel, self).__init__( + bounds=bounds, channel_axis=channel_axis, preprocess=preprocess) + + if preprocess is None: + preprocess = (0, 1) + + self._program = program + self._place = fluid.CPUPlace() + self._exe = fluid.Executor(self._place) + + self._input_name = input_name + self._logits_name = logits_name + self._predict_name = predict_name + self._cost_name = cost_name + + # gradient + loss = self._program.block(0).var(self._cost_name) + param_grads = fluid.backward.append_backward( + loss, parameter_list=[self._input_name]) + self._gradient = dict(param_grads)[self._input_name] + + def predict(self, image_batch): + """ + Predict the label of the image_batch. + + Args: + image_batch(list): The image and label tuple list. + Return: + numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes). + """ + feeder = fluid.DataFeeder( + feed_list=[self._input_name, self._logits_name], + place=self._place, + program=self._program) + predict_var = self._program.block(0).var(self._predict_name) + predict = self._exe.run(self._program, + feed=feeder.feed(image_batch), + fetch_list=[predict_var]) + return predict + + def num_classes(self): + """ + Calculate the number of classes of the output label. + + Return: + int: the number of classes + """ + predict_var = self._program.block(0).var(self._predict_name) + assert len(predict_var.shape) == 2 + return predict_var.shape[1] + + def gradient(self, image_batch): + """ + Calculate the gradient of the loss w.r.t the input. + + Args: + image_batch(list): The image and label tuple list. + Return: + list: The list of the gradient of the image. 
+ """ + feeder = fluid.DataFeeder( + feed_list=[self._input_name, self._logits_name], + place=self._place, + program=self._program) + + grad, = self._exe.run(self._program, + feed=feeder.feed(image_batch), + fetch_list=[self._gradient]) + return grad diff --git a/adversarial/fluid_mnist.py b/adversarial/fluid_mnist.py new file mode 100644 index 0000000000..db4d4b5186 --- /dev/null +++ b/adversarial/fluid_mnist.py @@ -0,0 +1,86 @@ +""" +CNN on mnist data using fluid api of paddlepaddle +""" +import paddle.v2 as paddle +import paddle.v2.fluid as fluid + + +def mnist_cnn_model(img): + """ + Mnist cnn model + + Args: + img(Varaible): the input image to be recognized + + Returns: + Variable: the label prediction + """ + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + num_filters=20, + filter_size=5, + pool_size=2, + pool_stride=2, + act='relu') + + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + num_filters=50, + filter_size=5, + pool_size=2, + pool_stride=2, + act='relu') + + logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return logits + + +def main(): + """ + Train the cnn model on mnist datasets + """ + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + logits = mnist_cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + optimizer = fluid.optimizer.Adam(learning_rate=0.01) + optimizer.minimize(avg_cost) + + accuracy = fluid.evaluator.Accuracy(input=logits, label=label) + + BATCH_SIZE = 50 + PASS_NUM = 3 + ACC_THRESHOLD = 0.98 + LOSS_THRESHOLD = 10.0 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + accuracy.reset(exe) + for data in train_reader(): + loss, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) + pass_acc = accuracy.eval(exe) + print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" + + str(pass_acc)) + if loss < LOSS_THRESHOLD and pass_acc > ACC_THRESHOLD: + break + + pass_acc = accuracy.eval(exe) + print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) + fluid.io.save_params( + exe, dirname='./mnist', main_program=fluid.default_main_program()) + print('train mnist done') + + +if __name__ == '__main__': + main() diff --git a/adversarial/mnist_tutorial_fgsm.py b/adversarial/mnist_tutorial_fgsm.py new file mode 100644 index 0000000000..8b29346b8c --- /dev/null +++ b/adversarial/mnist_tutorial_fgsm.py @@ -0,0 +1,87 @@ +""" +FGSM demos on mnist using advbox tool. 
+""" +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import matplotlib.pyplot as plt +import numpy as np + +from advbox.models.paddle import PaddleModel +from advbox.attacks.gradientsign import GradientSignAttack + + +def cnn_model(img): + """ + Mnist cnn model + Args: + img(Varaible): the input image to be recognized + Returns: + Variable: the label prediction + """ + #conv1 = fluid.nets.conv2d() + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + num_filters=20, + filter_size=5, + pool_size=2, + pool_stride=2, + act='relu') + + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + num_filters=50, + filter_size=5, + pool_size=2, + pool_stride=2, + act='relu') + + logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return logits + + +def main(): + """ + Advbox demo which demonstrate how to use advbox. + """ + IMG_NAME = 'img' + LABEL_NAME = 'label' + + img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32') + # gradient should flow + img.stop_gradient = False + label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64') + logits = cnn_model(img) + cost = fluid.layers.cross_entropy(input=logits, label=label) + avg_cost = fluid.layers.mean(x=cost) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + BATCH_SIZE = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder( + feed_list=[IMG_NAME, LABEL_NAME], + place=place, + program=fluid.default_main_program()) + + fluid.io.load_params( + exe, "./mnist/", main_program=fluid.default_main_program()) + + # advbox demo + m = PaddleModel(fluid.default_main_program(), IMG_NAME, LABEL_NAME, + logits.name, avg_cost.name, (-1, 1)) + att = GradientSignAttack(m) + for data in train_reader(): + # fgsm attack + adv_img = att(data) + plt.imshow(n[0][0], cmap='Greys_r') + plt.show() + #np.save('adv_img', adv_img) + break + + +if __name__ == '__main__': + main() diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 084d3237d9..8b7dc5b7db 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -7,11 +7,11 @@ Machine: System: CentOS release 6.3 (Final), Docker 1.12.1. -PaddlePaddle: (TODO: will rerun after 0.11.0) -- paddlepaddle/paddle:latest (for MKLML and MKL-DNN) +PaddlePaddle: +- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN) - MKL-DNN tag v0.11 - MKLML 2018.0.1.20171007 -- paddlepaddle/paddle:latest-openblas (for OpenBLAS) +- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS) - OpenBLAS v0.2.20 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively. 
@@ -56,15 +56,15 @@ Input image size - 3 * 224 * 224, Time: images/second
 
-- Alexnet
+- AlexNet
 
 | BatchSize    | 64     | 128    | 256    |
 |--------------|--------| ------ | -------|
-| OpenBLAS     | 2.13   | 2.45   | 2.68   |
+| OpenBLAS     | 45.62  | 72.79  | 107.22 |
 | MKLML        | 66.37  | 105.60 | 144.04 |
 | MKL-DNN      | 399.00 | 498.94 | 626.53 |
 
-chart TBD
+
 
 #### Inference
 Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
@@ -72,36 +72,41 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
 
 | BatchSize | 1     | 2     | 4     | 8     | 16    |
 |-----------|-------|-------|-------|-------|-------|
-| OpenBLAS  | 1.07  | 1.08  | 1.06  | 0.88  | 0.65  |
+| OpenBLAS  | 1.10  | 1.96  | 3.62  | 3.63  | 2.25  |
 | MKLML     | 5.58  | 9.80  | 15.15 | 21.21 | 28.67 |
 | MKL-DNN   | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
+
+
 
 - ResNet-50
 
 | BatchSize | 1     | 2      | 4      | 8      | 16     |
 |-----------|-------|--------|--------|--------|--------|
-| OpenBLAS  | 3.35  | 3.19   | 3.09   | 2.55   | 1.96   |
+| OpenBLAS  | 3.31  | 6.72   | 11.59  | 13.17  | 9.27   |
 | MKLML     | 6.33  | 12.02  | 22.88  | 40.53  | 63.09  |
 | MKL-DNN   | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
+
 
 - GoogLeNet
 
 | BatchSize | 1      | 2      | 4      | 8      | 16     |
 |-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  | 12.04  | 11.31  | 10.00  | 9.07   | 4.34   |
+| OpenBLAS  | 12.06  | 23.56  | 34.48  | 36.45  | 23.12  |
 | MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
 | MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
 
-- Alexnet
+
+
+- AlexNet
 
 | BatchSize | 1      | 2      | 4      | 8      | 16     |
 |-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  |        |        |        |        |        |
+| OpenBLAS  | 3.53   | 6.23   | 15.04  | 26.06  | 31.62  |
 | MKLML     | 21.32  | 36.55  | 73.06  | 131.15 | 192.77 |
 | MKL-DNN   | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
 
-chart TBD
+
 
 ### Laptop
 TBD
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
new file mode 100644
index 0000000000..b619613ea7
--- /dev/null
+++ b/benchmark/cluster/README.md
@@ -0,0 +1,78 @@
+# Cluster Training Benchmark
+
+## Setup
+
+- Platform
+  - Kubernetes: v1.6.2
+  - Linux Kernel: v3.10.0
+
+- Resource
+  - CPU: 10 Cores per Pod
+  - Memory: 5GB per Pod
+
+- Docker Image
+
+  We use different base Docker images to run the benchmark on Kubernetes:
+  - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
+  - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
+  - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
+
+- Model
+  vgg16 is used in this benchmark.
+
+## Cases
+
+- Variable
+  - Batch size of the training data.
+  - PServer count of the training job.
+  - The number of trainers.
+
+- Invariant
+  - The resource of the trainer/pserver Pod.
+
+### Measure the Performance for Different Batch Size
+
+- PServer Count: 40
+- Trainer Count: 100
+- Metrics: mini-batch / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Measure the Performance for Different PServer Count
+
+- Trainer Count: 100
+- Batch Size: 64
+- Metrics: mini-batch / sec
+
+| PServer Count | 10 | 20 | 40 | 60 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Measure Parallel Efficiency By Increasing Trainer Count
+
+- PServer Count: 20
+- Batch Size: 64
+- Metrics:
+
+$S = T_1 / T_N$
+
+where $S$ is the speedup of using $N$ trainers: the ratio of $T_1$ to $T_N$, the training times with 1 trainer and with $N$ trainers respectively.
+The parallel efficiency is:
+
+$E = S / N$
+
+| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
+| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
+| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |
+
+## Reproduce the benchmark
+
+TODO
diff --git a/benchmark/figs/alexnet-cpu-infer.png b/benchmark/figs/alexnet-cpu-infer.png
new file mode 100644
index 0000000000..6215ae4e42
Binary files /dev/null and b/benchmark/figs/alexnet-cpu-infer.png differ
diff --git a/benchmark/figs/alexnet-cpu-train.png b/benchmark/figs/alexnet-cpu-train.png
new file mode 100644
index 0000000000..b3200bbc04
Binary files /dev/null and b/benchmark/figs/alexnet-cpu-train.png differ
diff --git a/benchmark/figs/googlenet-cpu-infer.png b/benchmark/figs/googlenet-cpu-infer.png
new file mode 100644
index 0000000000..19478d433b
Binary files /dev/null and b/benchmark/figs/googlenet-cpu-infer.png differ
diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png
index c3f67faf09..4e86e058d0 100644
Binary files a/benchmark/figs/googlenet-cpu-train.png and b/benchmark/figs/googlenet-cpu-train.png differ
diff --git a/benchmark/figs/resnet-cpu-infer.png b/benchmark/figs/resnet-cpu-infer.png
new file mode 100644
index 0000000000..bc43d4b8d2
Binary files /dev/null and b/benchmark/figs/resnet-cpu-infer.png differ
diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png
index b96ecd5ff9..96746b1759 100644
Binary files a/benchmark/figs/resnet-cpu-train.png and b/benchmark/figs/resnet-cpu-train.png differ
diff --git a/benchmark/figs/vgg-cpu-infer.png b/benchmark/figs/vgg-cpu-infer.png
new file mode 100644
index 0000000000..3a51ec6c47
Binary files /dev/null and b/benchmark/figs/vgg-cpu-infer.png differ
diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png
index f830ca6a87..6d548cfd59 100644
Binary files a/benchmark/figs/vgg-cpu-train.png and b/benchmark/figs/vgg-cpu-train.png differ
diff --git a/benchmark/paddle/image/plotlog.py b/benchmark/paddle/image/plotlog.py
new file mode 100644
index 0000000000..8679d4f272
--- /dev/null
+++ b/benchmark/paddle/image/plotlog.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import sys +import argparse +import matplotlib.pyplot as plt + + +def parse_args(): + parser = argparse.ArgumentParser('Parse Log') + parser.add_argument( + '--file_path', '-f', type=str, help='the path of the log file') + parser.add_argument( + '--sample_rate', + '-s', + type=float, + default=1.0, + help='the rate to take samples from log') + parser.add_argument( + '--log_period', '-p', type=int, default=1, help='the period of log') + + args = parser.parse_args() + return args + + +def parse_file(file_name): + loss = [] + error = [] + with open(file_name) as f: + for i, line in enumerate(f): + line = line.strip() + if not line.startswith('pass'): + continue + line_split = line.split(' ') + if len(line_split) != 5: + continue + + loss_str = line_split[2][:-1] + cur_loss = float(loss_str.split('=')[-1]) + loss.append(cur_loss) + + err_str = line_split[3][:-1] + cur_err = float(err_str.split('=')[-1]) + error.append(cur_err) + + accuracy = [1.0 - err for err in error] + + return loss, accuracy + + +def sample(metric, sample_rate): + interval = int(1.0 / sample_rate) + if interval > len(metric): + return metric[:1] + + num = len(metric) / interval + idx = [interval * i for i in range(num)] + metric_sample = [metric[id] for id in idx] + return metric_sample + + +def plot_metric(metric, + batch_id, + graph_title, + line_style='b-', + line_label='y', + line_num=1): + plt.figure() + plt.title(graph_title) + if line_num == 1: + plt.plot(batch_id, metric, line_style, label=line_label) + else: + for i in range(line_num): + plt.plot(batch_id, metric[i], line_style[i], label=line_label[i]) + plt.xlabel('batch') + plt.ylabel(graph_title) + plt.legend() + plt.savefig(graph_title + '.jpg') + plt.close() + + +def main(): + args = parse_args() + assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should in the range (0, 1]." 
+ + loss, accuracy = parse_file(args.file_path) + batch = [args.log_period * i for i in range(len(loss))] + + batch_sample = sample(batch, args.sample_rate) + loss_sample = sample(loss, args.sample_rate) + accuracy_sample = sample(accuracy, args.sample_rate) + + plot_metric(loss_sample, batch_sample, 'loss', line_label='loss') + plot_metric( + accuracy_sample, + batch_sample, + 'accuracy', + line_style='g-', + line_label='accuracy') + + +if __name__ == '__main__': + main() diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh index 71a49231a5..a9a7b8a667 100755 --- a/benchmark/paddle/image/run_openblas_infer.sh +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -8,6 +8,7 @@ function clock_to_seconds() { } function infer() { + export OPENBLAS_MAIN_FREE=1 topology=$1 layer_num=$2 bs=$3 diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 5d24caebdc..89fc34796a 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -63,9 +63,30 @@ ExternalProject_Add( -DMKLROOT:PATH=${MKLML_ROOT} ) -ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) -ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) +ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) +ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") add_definitions(-DPADDLE_WITH_MKLDNN) -LIST(APPEND external_project_dependencies mkldnn) +LIST(APPEND external_project_dependencies shared_mkldnn) + +# generate a static dummy target to track mkldnn dependencies +# for cc_library(xxx SRCS xxx.c DEPS mkldnn) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c) +FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +ADD_LIBRARY(mkldnn STATIC ${dummyfile}) +TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) + +# copy the real so.0 lib to install dir +# it can be directly contained in wheel or capi +SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) +ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} + COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + DEPENDS mkldnn) +ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) + +IF(WITH_C_API) + INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) +ENDIF() + diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 20dbc32a73..15a07ea3da 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -66,3 +66,7 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) LIST(APPEND external_project_dependencies mklml) + +IF(WITH_C_API) + INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib) +ENDIF() diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index a8e1aca49c..7cb4efa7bf 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -63,7 +63,7 @@ ExternalProject_Add( MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) -ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL) +ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) ADD_DEPENDENCIES(warpctc extern_warpctc) diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst index 43fc19dc49..5f15cad2b5 100644 --- 
a/doc/api/v2/fluid.rst +++ b/doc/api/v2/fluid.rst @@ -15,4 +15,4 @@ Fluid fluid/param_attr.rst fluid/profiler.rst fluid/regularizer.rst - + fluid/io.rst diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst new file mode 100644 index 0000000000..67f68c4e9e --- /dev/null +++ b/doc/api/v2/fluid/io.rst @@ -0,0 +1,10 @@ +=========== +IO +=========== + + + +is_parameter +----------- +.. autofunction:: paddle.v2.fluid.io.is_parameter + :noindex: diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index a7c8670f66..696a8012aa 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -38,6 +38,16 @@ elementwise_add .. autofunction:: paddle.v2.fluid.layers.elementwise_add :noindex: +elementwise_sub +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_sub + :noindex: + +elementwise_mul +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_mul + :noindex: + elementwise_div --------------- .. autofunction:: paddle.v2.fluid.layers.elementwise_div diff --git a/doc/design/block.md b/doc/design/block.md index fab7f2dc48..907a2def55 100644 --- a/doc/design/block.md +++ b/doc/design/block.md @@ -202,8 +202,8 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator). -VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope. -Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example: +VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope. +Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example: ```python a = pd.Variable(shape=[20, 20]) diff --git a/doc/design/ci_build_whl.png b/doc/design/ci_build_whl.png new file mode 100644 index 0000000000..232762b82a Binary files /dev/null and b/doc/design/ci_build_whl.png differ diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/dist_refactor/distributed_architecture.md similarity index 52% rename from doc/design/refactor/distributed_architecture.md rename to doc/design/dist_refactor/distributed_architecture.md index d9fe7d6bbb..3a741f9586 100644 --- a/doc/design/refactor/distributed_architecture.md +++ b/doc/design/dist_refactor/distributed_architecture.md @@ -52,8 +52,9 @@ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly. -This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). 
For a detailed explanation, refer to this document - -[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md) +This could be fixed by making the parameter server also run an IR, which can be different to the trainer side +For a detailed explanation, refer to this document - +[Design Doc: Parameter Server](./parameter_server.md) ## Distributed Training Architecture @@ -61,68 +62,111 @@ The revamped distributed training architecture can address the above discussed l -The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*. +The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*. -### PaddlePaddle Python +### Python API -PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc. +Python API is the Python library that user's Python code invokes, to read the data, build the neural network topology, and start training, etc. ```Python -paddle.init() -input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster -img, label = input[0], input[1] -hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh()) -prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax()) -cost = paddle.layer.classification_cost(input=prediction, label=label) -optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01) -session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1) -for i in range(1000): - _, cost_val = session.eval(targets=[cost, optimizer]) - print cost_val +images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') +... +predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) +optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimizer.minimize(avg_cost) + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) + +for pass_id in range(10): + for data in train_reader(): + loss, acc = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost]) ``` -The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively. - -#### session.eval - -As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation. -The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken. - -The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md). - -### PaddlePaddle Converter - -The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed : - -1. 
Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR. - -2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP. - -3. Optimize the computation graph. - -4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user. - -5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries. +The code above is a typical local training program, the "Training Program" is built using helper functions such as +`fluid.layer.fc`. The training is done by calling `Executor.run` +iteratively. + +For more details, the implementation of IR is [Program](../program.md), and `ProgramDesc` is the protobuf type. + +[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use +`Executor` to run the program locally. For any kind of distributed training, you can use +`RemoteExecutor` to specify desired distributed training method with some optional arguments. + +### Distributed Transpiler + +The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then +the Remote Executor dispatches the new IRs to Remote Executors across the cluster. +Below are the steps that are followed : + +1. User only need to change `Executor` to `RemoteExecutor` to change local program to distributed program. +1. `RemoteExecutor` calls `Distributed Transpiler` to "transpile" user's program to several IRs representing a + distributed training program: + 1. Parse configurations from `RemoteExecutor`. + 1. Determine the type of distributed program, can be DataParallelism, ModelParallelism or Streaming. + 1. Partition the `ProgramDesc` according to type and add `send` / `recv` OP pair on the boundaries. Take + DataParallelism type for example, it removes the optimization operators and add a `send` OP to the + "trainer" role, then add the optimization operators to the parameter server role within the `recv` OP. +1. Dispatch the partitioned graph to different `RemoteExecutor` in the cluster. +1. `RemoteExecutor` on each node run the received `ProgramDesc` utill the end. + + +### RemoteExecutor + +As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for Execution. +You can also use parameter `fetch_list` to interactively fetch variable back to local for +log printing. + +The Python `RemoteExecutor` is derived from `Executor` class. + +```python +exe = RemoteExecutor( + feed=feeder.feed(data), + fetch_list=[avg_cost], + job_desc=JobDesc( + jobname, + num_trainer, + num_pserver, + cpu_per_trainer, + gpu_per_trainer, + mem_per_trainer, + cpu_per_pserver, + mem_per_pserver + )) +for data in train_reader(): + loss, acc = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost]) +``` -6. Dispatch the partitioned graph to different PaddlePaddle runtimes. +`JobDesc` object describe the distributed job resource specification to run on +Cluster environment. -7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python. + -The output IRs will be cached to optimize the conversion latency. 
+`RemoteExecutor.run` sends the `ProgramDesc` and +[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource) +to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible +to start the final Kubernetes Jobs to run the different role of `ProgramDesc`. -#### Placement Algorithm +### Placement Algorithm Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible. In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm. -### PaddlePaddle Runtime - -The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter. - - ### Local Training Architecture The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime: @@ -132,9 +176,18 @@ The local training architecture will be the same as the distributed training arc ### Training Data -In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic. - -When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs. +In PaddlePaddle v0.10.0, training data is typically read +with [data reader](../reader/README.md) from Python. This approach is +no longer efficient when training distributedly since the Python +process no longer runs on the same node with the trainer processes, +the Python reader will need to read from the distributed filesystem +(assuming it has the access) and send to the trainers, doubling the +network traffic. + +When doing distributed training, the user can still use Python data +reader: the training data are sent with `Executor.run`. However, should +be used for debugging purpose only. The users are encouraged to use +the read data OPs. 
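
To recap the two snippets above, the condensed sketch below contrasts local and distributed training as this design proposes it. It only reuses constructs already shown in this document; `RemoteExecutor` and `JobDesc` are the proposed (not yet available) APIs, and `train_reader`, `feeder` and `avg_cost` are assumed to be built exactly as in the earlier examples.

```python
import paddle.v2.fluid as fluid

# Local training: the ordinary Executor runs the ProgramDesc on one node.
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
for data in train_reader():
    loss, = exe.run(fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost])

# Distributed training (proposed): only the executor changes; the Distributed
# Transpiler partitions the program and the cluster scheduling happens behind
# RemoteExecutor, as described above.
exe = RemoteExecutor(
    feed=feeder.feed(data),
    fetch_list=[avg_cost],
    job_desc=JobDesc(jobname, num_trainer, num_pserver, cpu_per_trainer,
                     gpu_per_trainer, mem_per_trainer, cpu_per_pserver,
                     mem_per_pserver))
for data in train_reader():
    loss, = exe.run(fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost])
```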
## References: diff --git a/doc/design/refactor/multi_cpu.md b/doc/design/dist_refactor/multi_cpu.md similarity index 100% rename from doc/design/refactor/multi_cpu.md rename to doc/design/dist_refactor/multi_cpu.md diff --git a/doc/design/refactor/parameter_server.md b/doc/design/dist_refactor/parameter_server.md similarity index 76% rename from doc/design/refactor/parameter_server.md rename to doc/design/dist_refactor/parameter_server.md index fa3c5d7990..1094f06d46 100644 --- a/doc/design/refactor/parameter_server.md +++ b/doc/design/dist_refactor/parameter_server.md @@ -1,4 +1,4 @@ -# Design Doc: Operation Graph Based Parameter Server +# Design Doc: Parameter Server ## Abstract @@ -10,7 +10,7 @@ different purposes. ## Background The previous implementations of the parameter server does not run a -subgraph. parameter initialization, optimizer computation, network +fluid sub-program. Parameter initialization, optimizer computation, network communication and checkpointing are implemented twice on both the trainer and the parameter server. @@ -23,10 +23,10 @@ server becomes a natural extension. ## Design -### Graph Converter +### Distributed Transpiler -The *graph converter* converts the user-defined operation (OP) graph -into subgraphs to be scheduled on different nodes with the following +The *Distributed Transpiler* converts the user-defined fluid program +into sub-programs to be scheduled on different nodes with the following steps: 1. OP placement: the OPs will be placed on different nodes according @@ -34,7 +34,6 @@ steps: time. Currently we will use a simple heuristic that puts parameter varable on parameter server workers and everything else on trainer workers. - 1. Add communication OPs to enable the communication between nodes. We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*. @@ -48,8 +47,8 @@ After converting: -1. The parameter variable W and it's optimizer subgraph are placed on the parameter server. -1. Operators are added to the subgraphs. +1. The parameter variable W and it's optimizer program are placed on the parameter server. +1. Operators are added to the program. - *Send* sends data to the connected *Recv* operator. The scheduler on the receive node will only schedule *Recv* operator to run when the *Send* operator has ran (the *Send* OP will mark @@ -64,39 +63,30 @@ After converting: ### Benefits - Model parallelism become easier to implement: it's an extension to - the trainer - parameter server approach. we already have the - communication OPs, but need to extend the graph converter's - placement functionality. - + the trainer - parameter server approach. We can have several "Transpilers" + to achieve different goals. - User-defined optimizer is easier to add - user can now express it as - a subgraph. - + a sub-program. - No more duplication logic inside the trainer and the parameter server mentioned in the background section. ### Challenges -- It might be hard for the graph converter to cut a general graph - (without any hint for which subgraph is the optimizer). We may need - to label which subgraph inside the OP graph is the optimizer. - - It's important to balance the parameter shards of on multiple parameter server. If a single parameter is very big (some word-embedding, fully connected, softmax layer), we need to automatically partition the single parameter onto different parameter servers when possible (only element-wise optimizer depends on the parameter variable). 
+- In the "Aync SGD" figure, the "W" variable on the parameter server + could be read and wrote concurrently. See + [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more + details about concurrent program in fluid. ### Discussion -- In the "Aync SGD" figure, the "W" variable on the parameter server - could be read and wrote concurrently, what is our locking strategy? - E.g., each variable have a lock cpp method to be invoked by every - OP, or, have a lock OP. - - Can the Enqueue OP be implemented under our current tensor design (puts the input tensor into the queue tensor)? - - *Dequeue* OP will have variable numbers of output (depends on the `min_count` attribute), does our current design support it? (similar question for the *Add* OP) diff --git a/doc/design/refactor/src/compiler.graffle b/doc/design/dist_refactor/src/compiler.graffle similarity index 100% rename from doc/design/refactor/src/compiler.graffle rename to doc/design/dist_refactor/src/compiler.graffle diff --git a/doc/design/refactor/src/compiler.png b/doc/design/dist_refactor/src/compiler.png similarity index 100% rename from doc/design/refactor/src/compiler.png rename to doc/design/dist_refactor/src/compiler.png diff --git a/doc/design/refactor/src/dist-graph.graffle b/doc/design/dist_refactor/src/dist-graph.graffle similarity index 100% rename from doc/design/refactor/src/dist-graph.graffle rename to doc/design/dist_refactor/src/dist-graph.graffle diff --git a/doc/design/refactor/src/dist-graph.png b/doc/design/dist_refactor/src/dist-graph.png similarity index 100% rename from doc/design/refactor/src/dist-graph.png rename to doc/design/dist_refactor/src/dist-graph.png diff --git a/doc/design/dist_refactor/src/distributed_architecture.graffle b/doc/design/dist_refactor/src/distributed_architecture.graffle new file mode 100644 index 0000000000..d1b6014134 Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.graffle differ diff --git a/doc/design/dist_refactor/src/distributed_architecture.png b/doc/design/dist_refactor/src/distributed_architecture.png new file mode 100644 index 0000000000..29c7b0c078 Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.png differ diff --git a/doc/design/refactor/src/local-graph.graffle b/doc/design/dist_refactor/src/local-graph.graffle similarity index 100% rename from doc/design/refactor/src/local-graph.graffle rename to doc/design/dist_refactor/src/local-graph.graffle diff --git a/doc/design/refactor/src/local-graph.png b/doc/design/dist_refactor/src/local-graph.png similarity index 100% rename from doc/design/refactor/src/local-graph.png rename to doc/design/dist_refactor/src/local-graph.png diff --git a/doc/design/dist_refactor/src/local_architecture.graffle b/doc/design/dist_refactor/src/local_architecture.graffle new file mode 100644 index 0000000000..49fcc663eb Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.graffle differ diff --git a/doc/design/dist_refactor/src/local_architecture.png b/doc/design/dist_refactor/src/local_architecture.png new file mode 100644 index 0000000000..14adc9fd72 Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.png differ diff --git a/doc/design/refactor/src/multi-threads.graffle b/doc/design/dist_refactor/src/multi-threads.graffle similarity index 100% rename from doc/design/refactor/src/multi-threads.graffle rename to doc/design/dist_refactor/src/multi-threads.graffle diff --git 
a/doc/design/refactor/src/multi-threads/multi-threads@3x.png b/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png similarity index 100% rename from doc/design/refactor/src/multi-threads/multi-threads@3x.png rename to doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png diff --git a/doc/design/refactor/src/multi-threads/single-thread@3x.png b/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png similarity index 100% rename from doc/design/refactor/src/multi-threads/single-thread@3x.png rename to doc/design/dist_refactor/src/multi-threads/single-thread@3x.png diff --git a/doc/design/refactor/src/paddle-compile.graffle b/doc/design/dist_refactor/src/paddle-compile.graffle similarity index 100% rename from doc/design/refactor/src/paddle-compile.graffle rename to doc/design/dist_refactor/src/paddle-compile.graffle diff --git a/doc/design/refactor/src/paddle-compile.png b/doc/design/dist_refactor/src/paddle-compile.png similarity index 100% rename from doc/design/refactor/src/paddle-compile.png rename to doc/design/dist_refactor/src/paddle-compile.png diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/design/dist_refactor/src/remote_executor.graffle new file mode 100644 index 0000000000..ce2c18fee5 Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.graffle differ diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/design/dist_refactor/src/remote_executor.png new file mode 100644 index 0000000000..6be4b1841b Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.png differ diff --git a/doc/design/error_clip.md b/doc/design/error_clip.md new file mode 100644 index 0000000000..8e845462cc --- /dev/null +++ b/doc/design/error_clip.md @@ -0,0 +1,87 @@ +# Error Clip + +## Overview + +Error clip is widely used in model training to prevent gradient exploding. It takes some specific rules to adjust variables' gradients and prevent them from being too large. With it, values of a gradient will be checked before they are taken by the next `grad_op` and be shrunk if necessary. +## Usage + +Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor: + +```python +var = framework.Variable(..., error_clip=myErrorClip, ...) +``` + +The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should take an object of `BaseErrorClipAttr`'s derived class. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is: + +```python +ErrorClipByValue(max, min=None) +``` + +`max` and `min` represent the maximal and minimal clip threshold respectively. In backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. When the `min` is None, the minimal threshold will be assigned with `-max` automatically. + +So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by: + +```python +var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...) +``` + +## Implementation + +The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*. 
+ +```python +class BaseErrorClipAttr(object): + def append_clip_op(self, block, grad_name): + raise NotImplementedError() + + +class ErrorClipByValue(BaseErrorClipAttr): + def __init__(self, max, min=None): + max = float(max) + if min is None: + min = -max + else: + min = float(min) + self.max = max + self.min = min + + def append_clip_op(self, block, grad_name): + block.append_op( + type="clip", + inputs={"X": grad_name}, + outputs={"Out": grad_name}, + attrs={"min": self.min, + "max": self.max}) +``` + +The `BaseErrorClipAttr` have one main member functions: `append_clip_op(self, block, grad_name)`. + +This function is used to create a `clip_op` and append it to the end of given `block`. For different error clip algorithm require different `clip_op`, the function is defined as virtual in the base class. All derived classes must implement their own versions of this function. + +These `clip_op`s should be inserted after `grad_op`s whose output gradients need to be clipped. It is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added. + +```python +for op_desc in grad_op_descs: + new_op_desc = target_block.desc.append_op() + new_op_desc.copy_from(op_desc) + callback(block=target_block, context=grad_to_var) +``` + +Here we employ a callback function to complete this kind of jobs. In `_append_backward_ops_` function, each time after a `grad_op` is added to the `target_block`, a callback function is invoked. The logic of `clip_op` appending can be implemented inside the callback function. + +The callback function for `clip_op` appending is defined in *clip.py*: + +```python +def error_clip_callback(block, context): + # the context is a grad_to_var map + grad_to_var = context + op_desc = block.desc.op(block.desc.op_size() - 1) + for grad_n in filter(lambda n: grad_to_var.has_key(n), + op_desc.output_arg_names()): + fwd_var = block.var_recursive(grad_to_var[grad_n]) + error_clip = getattr(fwd_var, "error_clip", None) + if error_clip is not None: + error_clip.append_clip_op(block, grad_n) +``` + +This function takes a `block` and a `context`(which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op` and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an attribute of `error_clip`, `error_clip_callback` will call the `error_clip`'s `append_clip_op` function to append the required `clip_op` into the `block`. diff --git a/doc/design/memory_optimization.md b/doc/design/memory_optimization.md index 00f514711a..1f68cef4cc 100644 --- a/doc/design/memory_optimization.md +++ b/doc/design/memory_optimization.md @@ -5,28 +5,28 @@ In a lecture from Andrew Ng, he attributes the recent sucess of AI due to a combination of these: -- availability of Big Data -- supercomputing power to process this Big Data over very large neural networks -- modern algorithms +- Availability of Big Data +- Supercomputing power to process this Big Data over very large neural networks +- Modern algorithms Following graph shows the details: ![](images/deep_learning.png) -Larger model usually brings better performance. However, GPU memory is certain limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large model, we have to take care of memory using. Besides, memory optimization is also necessary in both online/mobile inference. 
+Larger models usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary in both online/mobile inference.
 
 ## Solution
 
 ### Basic Strategy
 
-There are some basic strategies to make memory optimization, including in-place operation and memory sharing.
+There are some basic strategies to improve memory usage, including in-place operations and memory sharing.
 
 #### In-place Operation
 In a relu activation operator:
 
 $y = \max(x, 0)$
 
-If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x are the same. In-place operation will save 50% memory occupancy immediately.
+If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately.
 
 #### Memory Sharing
 
@@ -40,18 +40,18 @@ d = op2(a)
 e = op3(d, f)
 ```
 
-In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finished, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool.
+In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a into a memory pool. Then, variable e can share the memory of variable a from the pool.
 
 ### Live Variable Analysis
 
-It's not enough to only have some basic strategies. The prerequisite of memory optimization is to know if a variable is still "live" after an operation.
+It's not enough to only have some basic strategies. The prerequisite of memory optimization is to know if a variable is still "live" after an operation.
 
 In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
 
-In compilers, the front end of the compilers translates programs into an intermediate language with an unbounded number of temporaries. This program must run on a machine with a bounded number of registers. Two temporaries a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporaries can fit in few registers; if they don't all fit, the excess temporaries can be kept in memory.
+In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess temporary variables can be kept in memory.
 
-Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporaries are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
+Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time.
We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis. We can leran these techniques from compilers. There are mainly two stages to make live variable analysis: @@ -60,7 +60,7 @@ We can leran these techniques from compilers. There are mainly two stages to mak #### Control Flow Graph -To preform analyses on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statemment x can be followed by statement y, there is an egde from x to y. +To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statemment x can be followed by statement y, there is an egde from x to y. Following is the flow graph for a simple loop. @@ -68,18 +68,18 @@ Following is the flow graph for a simple loop. #### Dataflow Analysis -liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program. +Liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program. A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes. - Flow Graph Terminology -A flow graph node has out-edges that lead to sucessor nodes, and in-edges that come from presucessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of sucessors. +A flow graph node has out-edges that lead to sucessor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of sucessors. In former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}. - Uses and Defs -An assignmemt to a variable or temporary defines that variable. An occurence of a variable on the right-hand side of an assginment(or in other expressions) uses the variable. We can speak the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and the similarly for the *use* of a variable or graph node. In former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}. +An assignmemt to a variable or temporary defines that variable. 
An occurence of a variable on the right-hand side of an assginment(or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and the similarly for the *use* of a variable or graph node. In former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}. - Liveness @@ -168,9 +168,9 @@ class ControlFlowGraph(object): return self._program ``` -#### make dataflow analysis +#### Make dataflow analysis -We follow guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing. +We follow the guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing. For example: diff --git a/doc/design/operator_kernel_type.md b/doc/design/operator_kernel_type.md index aa82e96bf7..f86e6b7a56 100644 --- a/doc/design/operator_kernel_type.md +++ b/doc/design/operator_kernel_type.md @@ -1,6 +1,6 @@ # Design Doc: The Keys of Operator Kernel Type ## Problem -An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique Kernel. Before an operator runs, an certain kernel must be chosen by a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows: +An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows: ```cpp struct OpKernelType { @@ -10,13 +10,13 @@ struct OpKernelType { ``` For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github. -It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`. +It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`. -We often implement a kernel of an operator with some computing library in certain device(place). Please remind that computing library and device are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. +We often implement a kernel of an operator with some computing library on certain device(place). Please note that computing library and device do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support different devices. -For example, Eigen library can support Nvidia GPU/AMD GPU/CPU. And MKLDNN library can support Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`. +For example, Eigen library supports Nvidia GPU/AMD GPU/CPU and MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`. 
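+
+As a concrete illustration (the enums and the key struct below are hypothetical sketches, not the real Fluid definitions), the following snippet shows two kernels of one operator that share the same place and data type but are backed by different libraries; a key made only of `Place` and `DataType` cannot tell them apart:
+
+```cpp
+// Hypothetical sketch: two kernels of the same operator that run on the same
+// place with the same data type, but are backed by different libraries.
+#include <cstdio>
+
+enum Place { kCPUPlace, kCUDAPlace };
+enum DataType { kFP32, kFP64 };
+enum Library { kPlain, kMKLDNN, kCUDNN };
+
+struct KernelKey {
+  Place place;
+  DataType data_type;
+  Library library;  // the extra key this design proposes
+};
+
+int main() {
+  // Two kernels of one operator: both on CUDA, both computing in fp32.
+  KernelKey plain_cuda = {kCUDAPlace, kFP32, kPlain};
+  KernelKey cudnn_cuda = {kCUDAPlace, kFP32, kCUDNN};
+
+  // With only (Place, DataType) the two keys are identical, so a kernel map
+  // keyed that way could register just one of the two implementations.
+  bool ambiguous = plain_cuda.place == cudnn_cuda.place &&
+                   plain_cuda.data_type == cudnn_cuda.data_type;
+  std::printf("indistinguishable by (Place, DataType): %d\n", ambiguous);
+  std::printf("distinguished by Library: %d\n",
+              plain_cuda.library != cudnn_cuda.library);
+  return 0;
+}
+```
+
+The same reasoning applies to the data layout, which is why the solution below also treats `Layout` as part of the kernel key.
+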
-It's obvious that different DataTypes, like fp64/fp32/int8 will have different kernels. But the data layout of a Tensor will also lead to different implementation. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration.
+Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layouts of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
 
 ## Solution
 
@@ -31,17 +31,17 @@ struct OpKernelType {
 };
 ```
 
-Following is the details:
+The details are as follows:
 
 ### Place
 
-`Place` is defined as follows:
+`Place` is defined as:
 
 ```cpp
 typedef boost::variant Place;
 ```
 
-`Place` is to represent the device memory where data is locating.
+`Place` represents the device memory where data is located.
 
 ### Library
 
 One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable:
 
 ```cpp
 enum Library { Plain, MKLDNN, CUDNN };
 ```
 
-We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
-A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now have two default DeviceContexts in CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains a Eigen library handle and `CDUADeviceContext` contains a Eigen library handle and cuBLAS handle.
+We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take the `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
 
-If we want to support new Library, a new enumerator need to be added to `Library` and a new corresponding `LibraryDeviceContext` will be created.
+If we want to support a new library, a new enumerator needs to be added to `Library` and a corresponding new `LibraryDeviceContext` needs to be created.
 
 ### DataType
 
 
@@ -67,15 +67,15 @@ If we want to support new Library, a new enumerator need to be added to `Library
 
 Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
 
-Different layout leads to different implementation of operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework.
+Different layouts lead to different implementations of the operator kernel. There are mainly 4 principles we have to follow to support layout in our Fluid framework.
 
-- We take layout as a data member of Tensor. 
Layout is actually a enum variable. If Fluid is built with MKLDNN, then the memory format in MKLDNN will also be added into this enum variable. -- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout of generating data. Of course, we can have some default layout, like NCHW. +- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout for generating data. Of course, we can have some default layout, like NCHW. -- The inference of Layout is at run-time, not compile-time. +- The inference of Layout is at run-time, not at compile-time. -- Every operator have to implement different kernels for different layouts. Let's take MKLDNN as an example, if we want to implement a MKLDNN convolution operator, we have to realize all the kernels for different layout, list at [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to do registering kernels for MKLDNN operators. +- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators. `Layout` is also defined as a enum variable: diff --git a/doc/design/python_api.md b/doc/design/python_api.md index cb5fdc765b..73f6d7b90c 100644 --- a/doc/design/python_api.md +++ b/doc/design/python_api.md @@ -279,6 +279,26 @@ class LayerHelper(object): return tmp ``` +### Return value of layer functions + +The layer will return a Variable, which is also the output of an operator. However, outputs of a layer function have more attributes than an operator. There are parameter variables, and their gradient variables need to return. To return them is useful. For example, + +1. Users can debug the network by printing parameter gradients. +2. Users can append attributes to a parameter, such as, `param.stop_gradient=True` will make a parameter stop generate the gradient. We can fix the parameter value during training by using this attribute. + +However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field for layer function since the Python is dynamic typing. + +The sample usage is + +```python +data = fluid.layers.data(...) +hidden = fluid.layers.fc(data, ...) +... + +executor.run(fetch_list=[hidden.param, hidden.param.grad], ...) +``` + + ## Optimizer [Optimizer Design Doc](./optimizer.md) diff --git a/doc/design/refactor/session.md b/doc/design/refactor/session.md deleted file mode 100644 index 1d9a26683c..0000000000 --- a/doc/design/refactor/session.md +++ /dev/null @@ -1,180 +0,0 @@ -# Design Doc: Session - -## Abstract - -The *session* object encapsulates the environment in which the -computation graph is executed. - -We will have the *local* session and *remote* session, they offer the -same [interface](#interface). The local session encapsulates the local -runtime environment and the remote session encapsulates the cluster -runtime environment. - -The local runtime environment contains: - -1. computation devices (i.e., CPU, GPU) handles, and -1. the [scope](../scope.md) which holds all variables. - -The remote runtime environment contains: - -1. 
computation devices (i.e., CPU and GPU on node 0, 1) in a cluster, - and -1. the distributed [scope](../scope.md) in a cluster which holds all - variables. - -The user can create a remote session on Paddle Cloud and evaluate the -computation graph with it. In this way, the user can control the -remote computation resource in a cluster from his local computer. - - -## Background - -The current design has an implicit global session in which -`paddle.eval()` is executed. The pain point is: - -Since the user is not able to explicitly switch between runtime -environments, the user cannot run a topology in two independent -environments. - -For example, in reinforcement learning, the user may want to have a -stale model for inference and a fresh model for training, and only -replace the stale model with the fresh model periodically. - -Furthermore, we have no concept that encapsulates a remote environment -that executes a computation graph. - -We need the session object to address above issues. - - -## Session - -A session is an object that owns the runtime environment. All -computations are executed through `session.eval()`. - - -### Interface - -```python -eval( - targets, - feed_dict=None, -) -``` - -Evaluates the target Operations or Variables in `targets`. - -- *targets*: the evaluation targets. Can be a single Operation or - Variable, or a list with the Operations or Variables as - elements. The value returned by `eval()` has the same shape as the - `target` argument. - - The PaddlePaddle program is represented by - the [ProgramDesc](../design/program.md), `eval()` will infer the - ProgramDesc from the given targets and run the PaddlePaddle - program. Please - see - [this graph](./distributed_architecture.md#local-training-architecture) for - the detailed illustration for the local session - and - [this graph](./distributed_architecture.md#distributed-training-architecture) for - the detailed illustration for the remote session. - -- *feed_dict*: a dictionary that contains the tensors which override - the edges of the computation graph. - - feed_dict not only can provide the input data, it can override any - OP's input as well: - - ```python - a = pd.constant(2.0, name="a") - b = pd.variable(name="b") - c = pd.mul(a,b) - sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0 - ``` - -```python -close() -``` - -Closes the session and releases the scope that the session owns. - - -### Create a Local Session - -```python -session( - devices=None -) -``` - -Creates a new session. One session owns one global scope, so creating -multiple sessions will create different scopes. - -- *devices*: a single `string` or a list of `string` of device names, - the corresponding devices will be the computation devices for - `eval()`. If not specified, all available devices (e.g., all GPUs) - will be used. The user doesn't need to specify the CPU device since - it will be always used. Multiple sessions can use the same device. - - -#### Example - -```Python -a = paddle.constant(1.0) -b = paddle.constant(2.0) -c = a + b -sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"]) -sess.eval(c) -sess.close() -``` - -### Create a Remote Session - -```python -create_cloud_job( - name, - num_trainer, - mem_per_trainer, - gpu_per_trainer, - cpu_per_trainer, - num_ps, - mem_per_ps, - cpu_per_ps, -) -``` - -Creates a Paddle Cloud job. Fails if the job name exists. - -```python -get_cloud_job( - name -) -``` - -Gets a Paddle Cloud job. - -```python -remote_session( - job -) -``` - -- *job*: the Paddle Cloud job. 
- -#### Example - -```Python -reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud -image = reader.column(0) -label = reader.column(1) -fc1 = paddle.op.fc(image, size=256, act="sigmoid") -fc2 = paddle.op.fc(fc1, size=10, act="softmax") -cost = paddle.op.cross_entropy(fc2, label) -opt = paddle.optimizer.sgd(cost) - -job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1) -sess = paddle.remote_ession(job) -for i in range(1000): - sess.eval(opt) -sess.close() -``` diff --git a/doc/design/refactor/src/distributed_architecture.graffle b/doc/design/refactor/src/distributed_architecture.graffle deleted file mode 100644 index f8496e5732..0000000000 Binary files a/doc/design/refactor/src/distributed_architecture.graffle and /dev/null differ diff --git a/doc/design/refactor/src/distributed_architecture.png b/doc/design/refactor/src/distributed_architecture.png deleted file mode 100644 index 410c4510c6..0000000000 Binary files a/doc/design/refactor/src/distributed_architecture.png and /dev/null differ diff --git a/doc/design/refactor/src/local_architecture.graffle b/doc/design/refactor/src/local_architecture.graffle deleted file mode 100644 index cc7783c453..0000000000 Binary files a/doc/design/refactor/src/local_architecture.graffle and /dev/null differ diff --git a/doc/design/refactor/src/local_architecture.png b/doc/design/refactor/src/local_architecture.png deleted file mode 100644 index 4b999538b7..0000000000 Binary files a/doc/design/refactor/src/local_architecture.png and /dev/null differ diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md index 14c081ea84..b978726109 100644 --- a/doc/design/releasing_process.md +++ b/doc/design/releasing_process.md @@ -7,11 +7,9 @@ PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` 1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 1. 对这个版本的提交,做如下几个操作: + * 使用Regression Test List作为检查列表,测试本次release的正确性。 + * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 - * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 - * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 - * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 - * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步 * 编译这个版本的python wheel包,并发布到pypi。 * 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 * pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。 @@ -21,8 +19,8 @@ PaddlePaddle每次发新的版本,遵循以下流程: pip install twine twine upload dist/[package to upload] ``` + * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 -1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 1. 
协同完成Release Note的书写 @@ -31,6 +29,30 @@ PaddlePaddle每次发新的版本,遵循以下流程: * `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 * 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 +## 发布wheel包到pypi + +使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) +完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以 +弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后 +可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法 +使用`twine`工具上传即可。 + + + +* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux + 发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。 +* pypi不支持覆盖上传,所以一个版本号的wheel包发布之后,不可以更改。下一个wheel包需要更新版本号才可以上传。 + +## 发布Docker镜像 + +上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上 +版本号对应的tag即可: + +1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。 +1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。 +1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]` +1. 执行 `docker push paddlepaddle/paddle:[version]` + ## PaddlePaddle 分支规范 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md index 0b2958c1b1..89fa95326c 100644 --- a/doc/design/var_desc.md +++ b/doc/design/var_desc.md @@ -1,12 +1,12 @@ ## Background -PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime. +PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations. -PaddlePaddle use proto message to describe compile time graph because +PaddlePaddle use proto message to describe compile time program because -1. Computation graph should be able to be saved to a file. -1. In distributed training, the graph will be serialized and send to multiple workers. +1. The computation program description must be serializable and saved in a file. +1. During distributed training, the sreialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on different workers. -The computation graph is constructed by Data Node and Operation Node. The concept to represent them is in the table below. +The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`) and `Operations`. The concept to represent them is in the table below. 
| |compile time|runtime| |---|---|---| diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index ccd9097702..e0c69f7a6a 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -9,6 +9,7 @@ usage/cmd_parameter/index_cn.rst usage/cluster/cluster_train_cn.md + usage/capi/index_cn.rst 开发标准 -------- diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md index e4211abb3b..31987920f3 100644 --- a/doc/howto/read_source.md +++ b/doc/howto/read_source.md @@ -26,16 +26,16 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) ``` -- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93) -- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py) +- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#) +- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/layers) - Every Layer has one or more operators and variables/parameters - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files: - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h) - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h) - Optimizer: `fluid.optimizer.SGD`. It does the following - - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)] - - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)] + - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py)] + - Add optimizer operators. 
[[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py)] # Run Time diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/usage/capi/compile_paddle_lib_cn.md new file mode 100644 index 0000000000..ac5ecffe2e --- /dev/null +++ b/doc/howto/usage/capi/compile_paddle_lib_cn.md @@ -0,0 +1,122 @@ +## 编译 PaddlePaddle 预测库 + +### 概述 + +使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库,只需在编译时需配制下面这些编译选项: + +必须配置选项: +- `WITH_C_API`,必须配置为`ON`。 + +推荐配置选项: +- `WITH_PYTHON`,推荐配置为`OFF` +- `WITH_SWIG_PY`,推荐配置为`OFF` +- `WITH_GOLANG`,推荐设置为`OFF` + +可选配置选项: +- `WITH_GPU`,可配置为`ON/OFF` +- `WITH_MKL`,可配置为`ON/OFF` + +对推荐配置中的选项建议按照设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 + +下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): + +```shell +PADDLE_ROOT=/path/of/capi +git clone https://github.com/PaddlePaddle/Paddle.git +cd Paddle +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_GOLANG=OFF \ + -DWITH_PYTHON=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF \ + .. +``` + +执行上述代码生成Makefile文件后,执行:`make && make install`。成功编译后,使用C-API所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件)均会存放于`PADDLE_ROOT`目录中。 + +编译成功后在 `PADDLE_ROOT` 下会看到如下目录结构(包括了编译出的PaddlePaddle头文件和链接库,以及第三方依赖链接库和头文件(如果需要,由链接方式决定)): + +```text +├── include +│   └── paddle +│   ├── arguments.h +│   ├── capi.h +│   ├── capi_private.h +│   ├── config.h +│   ├── error.h +│   ├── gradient_machine.h +│   ├── main.h +│   ├── matrix.h +│   ├── paddle_capi.map +│   └── vector.h +├── lib +│   ├── libpaddle_capi_engine.a +│   ├── libpaddle_capi_layers.a +│   ├── libpaddle_capi_shared.so +│   └── libpaddle_capi_whole.a +└── third_party + ├── gflags + │   ├── include + │   │   └── gflags + │   │   ├── gflags_completions.h + │   │   ├── gflags_declare.h + │   │   ... + │   └── lib + │   └── libgflags.a + ├── glog + │   ├── include + │   │   └── glog + │   │   ├── config.h + │   │   ... + │   └── lib + │   └── libglog.a + ├── openblas + │   ├── include + │   │   ├── cblas.h + │   │   ... + │   └── lib + │   ... + ├── protobuf + │   ├── include + │   │   └── google + │   │   └── protobuf + │   │   ... + │   └── lib + │   └── libprotobuf-lite.a + └── zlib + ├── include + │   ... + └── lib + ... + +``` + +### 链接说明 + +目前提供三种链接方式: + +1. 链接`libpaddle_capi_shared.so` 动态库 + - 使用 PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_shared.so`时,需注意: + 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 + 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 + 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 + - 这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**。 + +2. 链接静态库 `libpaddle_capi_whole.a` + - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: + 1. 需要指定`-Wl,--whole-archive`链接选项。 + 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 + 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 + 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 + +3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a` + - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: + 1. 这种链接方式主要用于移动端预测。 + 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 + 1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 + 1. 
第三方依赖库需要按照与方式2同样方法显示地进行链接。 diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/usage/capi/images/csr.png new file mode 100644 index 0000000000..3dc10b8de4 Binary files /dev/null and b/doc/howto/usage/capi/images/csr.png differ diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/usage/capi/images/sequence_data.png new file mode 100644 index 0000000000..6e47a46b89 Binary files /dev/null and b/doc/howto/usage/capi/images/sequence_data.png differ diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/usage/capi/images/workflow_of_CAPI.png new file mode 100644 index 0000000000..a4399ade04 Binary files /dev/null and b/doc/howto/usage/capi/images/workflow_of_CAPI.png differ diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/usage/capi/index_cn.rst new file mode 100644 index 0000000000..fd774fbc74 --- /dev/null +++ b/doc/howto/usage/capi/index_cn.rst @@ -0,0 +1,9 @@ +PaddlePaddle C-API +================== + +.. toctree:: + :maxdepth: 1 + + compile_paddle_lib_cn.md + organization_of_the_inputs_cn.md + workflow_of_capi_cn.md diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/usage/capi/organization_of_the_inputs_cn.md new file mode 100644 index 0000000000..563ec5ca21 --- /dev/null +++ b/doc/howto/usage/capi/organization_of_the_inputs_cn.md @@ -0,0 +1,285 @@ +## 输入/输出数据组织 + +这篇文档介绍在使用 PaddlePaddle C-API 时如何组织输入数据,以及如何解析神经网络前向计算的输出结果。 + +### 输入/输出数据类型 +在C-API中,按照基本数据类型在PaddlePaddle内部的定义和实现,输入数据可分为: +1. 一维整型数组 +1. 二维浮点型矩阵 + - 稠密矩阵 + - 稀疏矩阵 + +说明: +1. 一维数组**仅支持整型值**; + - 常用于自然语言处理任务,例如:表示词语在词典中的序号; + - 分类任务中类别标签; +1. 逻辑上高于二维的数据(例如含有多个通道的图片,视频等)在程序实现中都会转化为二维矩阵,转化方法在相应的领域都有通用解决方案,需要使用者自己了解并完成转化; +1. 二维矩阵可以表示行向量和列向量,任何时候如果需要浮点型数组(向量),都应使用C-API中的矩阵来表示,而不是C-API中的一维数组。 +1. 不论是一维整型数组还是二维浮点数矩阵,**为它们附加上序列信息将变成序列输入。PaddlePaddle 会通过判数据是否附带有序列信息来判断一个向量/矩阵是否是一个序列**。当非序列输入时,无需关心和处理序列信息。关于什么是“序列信息”,下文会详细进行介绍。 + +### 基本使用概念 + +- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输入,每一个输入/输入都会对应有自己的`Argument`。 +- `Argument` 并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。 +- 在`Argument`内部由`IVector`(对应着上文提到的一维整型数组)和`Matrix`(对应着上文提到的二维浮点型矩阵)来实际存储数据;由 `Sequence Start Positions` (下文详细解释) 来描述输入/输出的序列信息。 + +- **注**: + 1. 这篇文档之后部分将会统一使用`argument`来特指PaddlePaddle中神经网络计算层一个输入/输出数据。 + 1. 使用`paddle_ivector`来特指PaddlePaddle中的一维整型数组。 + 1. 使用`paddle_matrix`来特指PaddlePaddle中的二维浮点型矩阵。 + +### 组织输入数据 +- 一维整型数组 + + 概念上可以将`paddle_ivector`理解为一个一维的整型数组,通常用于表示离散的类别标签,或是在自然语言处理任务中表示词语在字典中的序号。下面的代码片段创建了含有三个元素`1`、`2`、`3`的`paddle_ivector`。 + ```c + int ids[] = {1, 2, 3}; + paddle_ivector ids_array = + paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false); + CHECK(paddle_arguments_set_ids(in_args, 0, ids_array)); + ``` + +- **稠密矩阵** + - 一个`m×n`的稠密矩阵是一个由`m`行`n`列元素排列成的矩形阵列,矩阵里的元素是浮点数。对神经网络来说,矩阵的高度`m`是一次预测接受的样本数目,宽度$n$是神经网络定义时,`paddle.layer.data`的`size`。 + - 下面的代码片段创建了一个高度为1,宽度为`layer_size`的稠密矩阵,矩阵中每个元素的值随机生成。 + + ```c + paddle_matrix mat = paddle_matrix_create( + /* height = batch size */ 1, + /* width = dimensionality of the data layer */ layer_size, + /* whether to use GPU */ false); + + paddle_real* array; + // Get the pointer pointing to the start address of the first row of the + // created matrix. + CHECK(paddle_matrix_get_row(mat, 0, &array)); + + // Fill the matrix with a randomly generated test sample. + srand(time(0)); + for (int i = 0; i < layer_size; ++i) { + array[i] = rand() / ((float)RAND_MAX); + } + + // Assign the matrix to the argument. 
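+    // The second parameter (0) is the index of the input argument; this demo
+    // network has only one data layer, so index 0 is used.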
+ CHECK(paddle_arguments_set_value(in_args, 0, mat)); + ``` + +- **稀疏矩阵** + + PaddlePaddle C-API 中 稀疏矩阵使用[CSR(Compressed Sparse Row Format)](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format))格式存储。下图是CSR存储稀疏矩阵的示意图。 +

+
图1. 稀疏矩阵存储示意图 +

+ + CSR存储格式通过:(1)非零元素的值(上图中的`values`);(2)行偏移(上图中的`row offsets`):每一行元素在`values`中的起始偏移,`row offsets`中元素个数总是等于行数 + 1;(3)非零元素的列号(上图中的`column indices`)来确定稀疏矩阵的内容。 + + 在PaddlePaddle C-API中,通过调用以下接口创建稀疏矩阵: + + ```c + PD_API paddle_matrix paddle_matrix_create_sparse( + uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu); + ``` + + 1. 创建稀疏矩阵时需要显示地指定矩阵的(1)高度(`height`,在神经网络中等于一次预测处理的样本数)(2)宽度(`width`,`paddle.layer.data`的`size`)以及(3)非零元个数(`nnz`)。 + 1. 当上述接口第4个参数`isBinary`指定为`true`时,**只需要设置行偏移(`row_offset`)和列号(`colum indices`),不需要提供元素值(`values`)**,这时行偏移和列号指定的元素默认其值为1。 + + 下面的代码片段创建了一个CPU上的二值稀疏矩阵: + + ```c + paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, true, false); + int colIndices[] = {9, 93, 109}; // layer_size here is greater than 109. + int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)}; + + CHECK(paddle_matrix_sparse_copy_from(mat, + rowOffset, + sizeof(rowOffset) / sizeof(int), + colIndices, + (colIndices) / sizeof(int), + NULL /*values array is NULL.*/, + 0 /*size of the value arrary is 0.*/)); + CHECK(paddle_arguments_set_value(in_args, 0, mat)); + ``` + 下面的代码片段在创建了一个CPU上的带元素值的稀疏矩阵: + ```c + paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, false, false); + int colIndices[] = {9, 93, 109}; // layer_size here is greater than 109. + int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)}; + float values[] = {0.5, 0.5, 0.5}; + + CHECK(paddle_matrix_sparse_copy_from(mat, + rowOffset, + sizeof(rowOffset) / sizeof(int), + colIndices, + sizeof(colIndices) / sizeof(int), + values, + sizeof(values) / sizeof(float))); + ``` + 注意事项: + 1. 移动端预测**不支持**稀疏矩阵及相关的接口。 + +### 组织序列信息 + +多个排成一列的元素(可以是整型、浮点数、浮点数向量等)构成一个序列,元素之间的顺序是序列所携带的重要信息。不同序列可能会含有不同数目个元素。在 PaddlePaddle 中,序列输入/输出数据是在上文介绍的**数据输入(一维整型数组,二维浮点数矩阵)基础上,附加上序列信息**。下面详细解释什么是“序列信息”。 + +我们将神经网络一次计算接受的所有输入样本称之为一个`batch`(可以含有一条或多条样本),每一个序列在整个`batch`中的偏移,就是PaddlePaddle中所指的**序列信息**,称之为“sequence start positions”。PaddlePaddle 支持两种序列类型: + +1. 单层序列 + - 序列中的每一个元素是非序列,是进行计算的基本单位,不可再进行拆分。 + - 例如:自然语言中的句子是一个序列,序列中的元素是词语; +1. 双层序列 + - 序列中的每一个元素又是一个序列。 + - 例如:自然语言中的段落是一个双层序列;段落是由句子构成的序列;句子是由词语构成的序列。 + - 双层序列在处理长序列的任务或是构建层级模型时会发挥作用。 + +这篇文档之后部分会统一使用`sequence_start_positions`来特指:PaddlePaddle中神经网络计算层输入/输出所携带的序列信息。 + +对双层序列来讲,不仅要提供每一个外层序列在整个`batch`中的偏移,每一个外层序列又含有若干个内层序列,需要同时提供每一个内层序列在整个`batch`中的偏移。也就是说:**双层序列需要设置分别为外层序列和内层序列分别设置`sequence_start_positions`信息**。 + +**注:** +1. 不论序列中的元素在内存中占用多少实际存储空间,`sequence_start_positions`表示的偏移是以“序列中的一个元素”作为统计的基本单位,而不是相对`batch`起始存储地址以数据的存储大小为单位的偏移。 +1. 非序列输入不携带`sequence_start_positions`,非序列输入无需构造`sequence_start_positions`。 +1. **不论是单层序列还是双层序列的序列信息,都使用`paddle_ivector`(也就是PaddlePaddle中的一维整型数组)来存储。** + +图2 是PaddlePaddle中单层序列和双层序列存储示意图。 +

+
图2. 序列输入示意图 +

+ +- 单层序列 + + 图2 (a) 展示了一个含有4个序列的`batch`输入: + 1. 4个序列的长度分别为:5、3、2、4; + 1. 这时的`sequence_start_positions`为:`[0, 5, 8, 10, 14]`; + 1. 本地训练. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,都可以通过调用下面的接口为原有的数据输入附加上序列信息,使之变为一个单层序列输入,代码片段如下: + + ```c + int seq_pos_array[] = {0, 5, 8, 10, 14}; + paddle_ivector seq_pos = paddle_ivector_create( + seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false); + // Suppose the network only has one input data layer. + CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos)); + ``` + +- 双层序列 + + 图2 (b) 展示了一个含有4个序列的`batch`输入; + 1. 4个序列的长度分别为:5、3、2、4;这四个序列又分别含有3、2、1、2个子序列; + 1. 这时的需要同时提供: + - 外层序列在`batch`中的起始偏移`:[0, 5, 8, 10, 14]`; + - 内层序列在`batch`中的起始偏移:`[0, 2, 3, 5, 7, 8, 10, 13, 14]`; + 1. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,这时需要调用创建序列信息和为`argument`设置序列信息的接口**两次**,分别为数据输入添加外层序列和内层序列的序列信息,使之变为一个双层序列输入,代码片段如下: + ```c + // set the sequence start positions for the outter sequences. + int outter_seq_pos_array[] = {0, 5, 8, 10, 14}; + paddle_ivector seq_pos = + paddle_ivector_create(outter_seq_pos_array, + sizeof(outter_pos_array) / sizeof(int), + false, + false); + // The third parameter of this API indicates the sequence level. + // 0 for the outter sequence. 1 for the inner sequence. + // If the input is a sequence not the nested sequence, the third parameter is + // fixed to be 0. + CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos)); + + // set the sequence start positions for the outter sequences. + int inner_seq_pos_array[] = {0, 2, 3, 5, 7, 8, 10, 13, 14}; + paddle_ivector seq_pos = paddle_ivector_create( + inner_pos_array, sizeof(inner_pos_array) / sizeof(int), false, false); + // The third parameter of this API indicates the sequence level. + // 0 for the outter sequence. 1 for the inner sequence. + CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 1, seq_pos)); + ``` + +注意事项: +1. 当一个`batch`中含有多个序列,**不支持序列长度为`0`的序列(也就是空输入)** 作为输入。不同计算层对空输入的处理策略有可能不同,潜在会引起未定义行为,或者引起行时错误,请在输入时进行合法性检查。 + +### Python 端数据类型说明 + +下表列出了Python端训练接口暴露的数据类型(`paddle.layer.data`函数`type`字段的取值)对应于调用C-API需要创建的数据类型: + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Python 端数据类型C-API 输入数据类型
paddle.data_type.integer_value整型数组,无需附加序列信息
paddle.data_type.dense_vector浮点型稠密矩阵,无需附加序列信息
paddle.data_type.sparse_binary_vector浮点型稀疏矩阵,无需提供非零元的值,默认为1,无需附加序列信息
paddle.data_type.sparse_vector浮点型稀疏矩阵,需提供非零元的值,无需附加序列信息
paddle.data_type.integer_value_sequence整型数组,需附加序列信息
paddle.data_type.dense_vector_sequence浮点型稠密矩阵,需附加序列信息
paddle.data_type.sparse_binary_vector_sequence浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加序列信息
paddle.data_type.sparse_vector_sequence浮点型稀疏矩阵,需提供非零元的值,需附加序列信息
paddle.data_type.integer_value_sub_sequence整型数组,需附加双层序列信息
paddle.data_type.dense_vector_sub_sequence浮点型稠密矩阵,需附加双层序列信息
paddle.data_type.sparse_binary_vector_sub_sequence浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加双层序列信息
paddle.data_type.sparse_vector_sub_sequence浮点型稀疏矩阵,需提供非零元的值,需附加双层序列信息
+ +
+ + +### 输出数据 + +PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织方式完全相同。一个输出数据同样被组织为一个`argument`,`argument`通过`paddle_matrix`或`paddle_ivector`存数数据,如果输出是一个序列,那么会携带有`sequence_start_positions`信息。调用C-API相关接口,读取需要的结果即可。 + +### 总结 + +- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为`argument`。 +- `argument`并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。 +- 在`argument`内部由`paddle_ivector`(一维整型数组)和`paddle_matrix`(二维浮点型矩阵)来实际存储数据。 +如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。 + +于是,在组织神经网络输入时,需要思考完成以下工作: +1. 为每一个输入/输出创建`argument`。 + - C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。 +1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。 + - C-API 中操作`paddle_ivector`的接口请查看 [vector.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/vector.h)。 + - C-API 中操作`paddle_matrix`的接口请查看[matrix.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/matrix.h)。 +1. 如果输入是序列数据,需要创建并填写`sequence_start_positions`信息。 + - 通过调用 [`paddle_arguments_set_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L137) 来为一个`argument`添加序列信息。 + - 通过调用 [`paddle_arguments_get_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L150) 来读取一个`argument`添加序列信息。 + - 接口说明请查看 [argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h) 文件。 diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/usage/capi/workflow_of_capi_cn.md new file mode 100644 index 0000000000..e0a42fff12 --- /dev/null +++ b/doc/howto/usage/capi/workflow_of_capi_cn.md @@ -0,0 +1,119 @@ +## C-API 使用流程 + +这篇文档介绍 PaddlePaddle C-API 整体使用流程。 + +### 使用流程 + +使用 C-API 的工作流程如图1所示,分为(1)准备预测模型和(2)预测程序开发两大部分。 + +

+
图1. C-API使用流程示意图 +

+ +- 准备预测模型 + 1. 只将神经网络结构进行序列化。 + - 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。 + 1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。 + - 神经网络模型结构和训练好的模型将被序列化合并入一个文件。 + - 预测时只需加载一个文件便于发布。 + - **注意**:以上两种方式只需选择其一即可。 +- 调用 C-API 开发预测序 + 1. 初始化PaddlePaddle运行环境。 + 1. 加载预测模型。 + 1. 创建神经网络输入,组织输入数据。 + 1. 进行前向计算,获得计算结果。 + 1. 清理和结束。 + +### 准备预测模型 + +准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) 中的相关脚本。 + +调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 + +下面,我们将训练结束后存储下来的模型转换成预测模型。 + +1. 序列化神经网络模型配置 + + PaddlePaddle 使用 protobuf 来传输网络配置文件中定义的网络结构和相关参数,使用 C-API 进行预测时,需要将网络结构使用 protobuf 进行序列化,写入文件中。 + + 调用[`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py)中的`dump_v2_config`函数能够将使用 PaddlePaddle V2 API 定义的神经网络结构 dump 到指定文件中,示例代码如下: + + ```python + from paddle.utils.dump_v2_config import dump_v2_config + from mnist_v2 import network + + predict = network(is_infer=True) + dump_v2_config(predict, "trainer_config.bin", True) + ``` + + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 + + 使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。 + +2. 合并模型文件(可选) + + 一些情况为了便于发布,希望能够将序列化后的神经网络结构和训练好的模型参数打包进一个文件。对于这样的需求,可以使用`paddle.utils.merge_model`中的`merge_v2_model`接口对神经网络结构和训练好的参数进行序列化,将序列化结果写入一个文件内。 + + 代码示例如下: + + ```python + from paddle.utils.merge_model import merge_v2_modelss + from mnist_v2 import network + + net = network(is_infer=True) + param_file = "models/params_pass_4.tar" + output_file = "output.paddle.model" + merge_v2_model(net, param_file, output_file) + ``` + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 + +#### 注意事项 +1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。 +1. **预测使用的网络结构往往不同于训练**,通常需要去掉网络中的:(1)类别标签层;(2)损失函数层;(3)`evaluator`等,只留下核心计算层,请注意是否需要修改网络结构。 +1. 预测时,可以获取网络中定义的任意多个(大于等于一个)层前向计算的结果,需要哪些层的计算结果作为输出,就将这些层加入一个Python list中,作为调用`dump_v2_config`的第一个参数。 + +### 编写预测代码 + +预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 + +#### step 1. 初始化PaddlePaddle运行环境 +第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 + +#### step2. 
加载模型 + +这里介绍C-API使用中的一个重要概念:Gradient Machine。 + +概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的,两种常用的模型加载方式: + +1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; +1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。 + +- 注意事项 + 1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。 + 1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。 + 1. 通过灵活使用以上两个接口,加载模型可其它多种方式,例如也可在程序运行过程中再加载另外一个模型。 + +#### step 3. 创建神经网络输入,组织输入数据 + +基本使用概念: +- 在PaddlePaddle内部,神经网络中一个计算层的输入输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。 +- `Argument` 并不真正“存储”数据,而是将输入/输出数据有机地组织在一起。 +- 在`Argument`内部由:1. `Matrix`(二维矩阵,存储浮点类型输入/输出);2. `IVector`(一维数组,**仅用于存储整型值**,多用于自然语言处理任务)来实际存储数据。 + +C-API支持的所有输入数据类型和他们的组织方式,请参考“输入/输出数据组织”一节。 + +这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。 + +在组织神经网络输入,获取输出时,需要思考完成以下工作: +1. 为每一个输入/输出创建`argument`; +1. 为每一个`argument`创建`paddle_matrix`来存储数据; + +与输入不同的是,不需在使用C-API时为输出`argument`的`paddle_matrix`对象分配空间。前向计算之后PaddlePaddle内部已经分配/管理了每个计算层输出的存储空间。 + +#### step 4. 前向计算 + +完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。 + +#### step 5. 
清理 + +结束预测之后,对使用的中间变量和资源进行清理和释放。 diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md index 659bae9c0c..c2fc86687d 100644 --- a/doc/howto/usage/cluster/cluster_train_cn.md +++ b/doc/howto/usage/cluster/cluster_train_cn.md @@ -51,7 +51,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num - port:**必选,默认7164**,pserver监听的起始端口,根据ports_num决定总端口个数,从起始端口监听多个端口用于通信 - ports_num:**必选,默认1**,监听的端口个数 -- ports_num_for_sparse:**必选,默认1**,用于稀疏类型参数通信的端口个数 +- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数 - num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 ### 启动计算节点 @@ -60,7 +60,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num $ python train.py ``` -trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过环境变量(https://zh.wikipedia.org/wiki/环境变量 )或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 +trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 使用环境变量: @@ -95,7 +95,7 @@ paddle.init( - trainer_count:**必选,默认1**,当前训练任务trainer总个数 - port:**必选,默认7164**,连接到pserver的端口 - ports_num:**必选,默认1**,连接到pserver的端口个数 -- ports_num_for_sparse:**必选,默认1**,和pserver之间用于稀疏类型参数通信的端口个数 +- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 - num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 - trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 - pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md index 915405ca5b..28cd1fa790 100644 --- a/doc/howto/usage/cluster/cluster_train_en.md +++ b/doc/howto/usage/cluster/cluster_train_en.md @@ -52,7 +52,7 @@ Parameter Description - port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput. - ports_num: **required, default 1**, total number of ports will listen on. -- ports_num_for_sparse: **required, default 1**, number of ports which serves sparse parameter update. +- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update. - num_gradient_servers: **required, default 1**, total number of gradient servers. ### Starting trainer @@ -98,7 +98,7 @@ Parameter Description - trainer_count: **required, default 1**, total count of trainers in the training job. - port: **required, default 7164**, port to connect to parameter server. - ports_num: **required, default 1**, number of ports for communication. -- ports_num_for_sparse: **required, default 1**, number of ports for sparse type caculation. +- ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation. - num_gradient_servers: **required, default 1**, total number of gradient server. - trainer_id: **required, default 0**, ID for every trainer, start from 0. - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c index 5eeaf7e31f..376cd46fb0 100644 --- a/paddle/capi/examples/model_inference/dense/main.c +++ b/paddle/capi/examples/model_inference/dense/main.c @@ -3,59 +3,82 @@ #include "../common/common.h" +// Modify this path as needed. 
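+// For this dense MNIST demo, the file can be generated, for example, by
+// running `python mnist_v2.py --task dump_config`, which serializes the
+// network structure into trainer_config.bin in the current directory.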
#define CONFIG_BIN "./trainer_config.bin" +// Modify this path as needed. +// This demo assumes that merged model is not used, then this path is the +// directory storing all the trained parameters. +// If the model is trained by PaddlePaddle V2 API, the model is saved as +// a compressed file. You need to uncompress the compressed file first. +#define MODEL_PATH "models/pass_4" int main() { - // Initalize Paddle + // Initalize the PaddlePaddle runtime environment. char* argv[] = {"--use_gpu=False"}; CHECK(paddle_init(1, (char**)argv)); - // Reading config binary file. It is generated by `convert_protobin.sh` + // Read the binary configuration file generated by `convert_protobin.sh` long size; void* buf = read_config(CONFIG_BIN, &size); - // Create a gradient machine for inference. + // Create the gradient machine for inference. paddle_gradient_machine machine; CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size)); - CHECK(paddle_gradient_machine_randomize_param(machine)); - // Loading parameter. Uncomment the following line and change the directory. - // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, - // "./some_where_to_params")); + // Load the trained model. Modify the parameter MODEL_PATH to set the correct + // path of the trained model. + CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, MODEL_PATH)); + + // Inputs and outputs of the network are organized as paddle_arguments object + // in C-API. In the comments below, "argument" specifically means one input of + // the neural network in PaddlePaddle C-API. paddle_arguments in_args = paddle_arguments_create_none(); - // There is only one input of this network. + // There is only one data layer in this demo MNIST network, invoke this + // function to create one argument. CHECK(paddle_arguments_resize(in_args, 1)); - // Create input matrix. - paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1, - /* size */ 784, - /* useGPU */ false); - srand(time(0)); + // Each argument needs one matrix or one ivector (integer vector, for sparse + // index input, usually used in NLP task) to holds the real input data. + // In the comments below, "matrix" specifically means the object needed by + // argument to hold the data. Here we create the matrix for the above created + // agument to store the testing samples. + paddle_matrix mat = + paddle_matrix_create(/* height = batch size */ 1, + /* width = dimensionality of the data layer */ 784, + /* whether to use GPU */ false); paddle_real* array; - - // Get First row. + // Get the pointer pointing to the start address of the first row of the + // created matrix. CHECK(paddle_matrix_get_row(mat, 0, &array)); + // Fill the matrix with a randomly generated test sample. + srand(time(0)); for (int i = 0; i < 784; ++i) { array[i] = rand() / ((float)RAND_MAX); } + // Assign the matrix to the argument. CHECK(paddle_arguments_set_value(in_args, 0, mat)); + // Create the output argument. paddle_arguments out_args = paddle_arguments_create_none(); + + // Invoke the forward computation. CHECK(paddle_gradient_machine_forward(machine, in_args, out_args, - /* isTrain */ false)); - paddle_matrix prob = paddle_matrix_create_none(); + /* is train taks or not */ false)); + // Create the matrix to hold the forward result of the neural network. + paddle_matrix prob = paddle_matrix_create_none(); + // Access the matrix of the output argument, the predicted result is stored in + // which. 
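+  // For this MNIST demo the matrix has shape 1 x 10: one row holding the
+  // predicted probability of each digit class for the single input sample.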
CHECK(paddle_arguments_get_value(out_args, 0, prob)); uint64_t height; uint64_t width; - CHECK(paddle_matrix_get_shape(prob, &height, &width)); CHECK(paddle_matrix_get_row(prob, 0, &array)); @@ -68,6 +91,7 @@ int main() { } printf("\n"); + // The cleaning up. CHECK(paddle_matrix_destroy(prob)); CHECK(paddle_arguments_destroy(out_args)); CHECK(paddle_matrix_destroy(mat)); diff --git a/paddle/capi/examples/model_inference/dense/merge_v2_model.py b/paddle/capi/examples/model_inference/dense/merge_v2_model.py new file mode 100644 index 0000000000..c030d572cb --- /dev/null +++ b/paddle/capi/examples/model_inference/dense/merge_v2_model.py @@ -0,0 +1,8 @@ +from paddle.utils.merge_model import merge_v2_model + +from mnist_v2 import network + +net = network(is_infer=True) +param_file = "models/params_pass_4.tar" +output_file = "output.paddle.model" +merge_v2_model(net, param_file, output_file) diff --git a/paddle/capi/examples/model_inference/dense/mnist_v2.py b/paddle/capi/examples/model_inference/dense/mnist_v2.py new file mode 100644 index 0000000000..ee28111153 --- /dev/null +++ b/paddle/capi/examples/model_inference/dense/mnist_v2.py @@ -0,0 +1,117 @@ +import os +import sys +import gzip +import logging +import argparse +from PIL import Image +import numpy as np + +import paddle.v2 as paddle +from paddle.utils.dump_v2_config import dump_v2_config + +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + + +def multilayer_perceptron(img, layer_size, lbl_dim): + for idx, size in enumerate(layer_size): + hidden = paddle.layer.fc(input=(img if not idx else hidden), + size=size, + act=paddle.activation.Relu()) + return paddle.layer.fc(input=hidden, + size=lbl_dim, + act=paddle.activation.Softmax()) + + +def network(input_dim=784, lbl_dim=10, is_infer=False): + images = paddle.layer.data( + name='pixel', type=paddle.data_type.dense_vector(input_dim)) + + predict = multilayer_perceptron( + images, layer_size=[128, 64], lbl_dim=lbl_dim) + + if is_infer: + return predict + else: + label = paddle.layer.data( + name='label', type=paddle.data_type.integer_value(lbl_dim)) + return paddle.layer.classification_cost(input=predict, label=label) + + +def main(task="train", use_gpu=False, trainer_count=1, save_dir="models"): + if task == "train": + if not os.path.exists(save_dir): + os.mkdir(save_dir) + + paddle.init(use_gpu=use_gpu, trainer_count=trainer_count) + cost = network() + parameters = paddle.parameters.create(cost) + optimizer = paddle.optimizer.Momentum( + learning_rate=0.1 / 128.0, + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + logger.info("Pass %d, Batch %d, Cost %f, %s" % + (event.pass_id, event.batch_id, event.cost, + event.metrics)) + if isinstance(event, paddle.event.EndPass): + with gzip.open( + os.path.join(save_dir, "params_pass_%d.tar" % + event.pass_id), "w") as f: + trainer.save_parameter_to_tar(f) + + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=128), + event_handler=event_handler, + num_passes=5) + elif task == "dump_config": + predict = network(is_infer=True) + dump_v2_config(predict, "trainer_config.bin", True) + else: + raise RuntimeError(("Error value for parameter task. 
" + "Available options are: train and dump_config.")) + + +def parse_cmd(): + parser = argparse.ArgumentParser( + description="PaddlePaddle MNIST demo for CAPI.") + parser.add_argument( + "--task", + type=str, + required=False, + help=("A string indicating the taks type. " + "Available options are: \"train\", \"dump_config\"."), + default="train") + parser.add_argument( + "--use_gpu", + type=bool, + help=("A bool flag indicating whether to use GPU device or not."), + default=False) + parser.add_argument( + "--trainer_count", + type=int, + help=("This parameter is only used in training task. It indicates " + "how many computing threads are created in training."), + default=1) + parser.add_argument( + "--save_dir", + type=str, + help=("This parameter is only used in training task. It indicates " + "path of the directory to save the trained models."), + default="models") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_cmd() + main(args.task, args.use_gpu, args.trainer_count, args.save_dir) diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/capi/examples/model_inference/sparse_binary/main.c index 8ba67aee56..029b94ee63 100644 --- a/paddle/capi/examples/model_inference/sparse_binary/main.c +++ b/paddle/capi/examples/model_inference/sparse_binary/main.c @@ -1,5 +1,6 @@ #include #include + #include "../common/common.h" #define CONFIG_BIN "./trainer_config.bin" @@ -9,16 +10,18 @@ int main() { char* argv[] = {"--use_gpu=False"}; CHECK(paddle_init(1, (char**)argv)); - // Reading config binary file. It is generated by `convert_protobin.sh` + // Read the binary configuration file which is generated by + // `convert_protobin.sh` long size; void* buf = read_config(CONFIG_BIN, &size); - // Create a gradient machine for inference. + // Create the gradient machine for inference. paddle_gradient_machine machine; CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size)); CHECK(paddle_gradient_machine_randomize_param(machine)); - // Loading parameter. Uncomment the following line and change the directory. + // Load the trained parameters. Uncomment the following line and change the + // directory as needed. // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, // "./some_where_to_params")); paddle_arguments in_args = paddle_arguments_create_none(); @@ -26,7 +29,7 @@ int main() { // There is only one input of this network. CHECK(paddle_arguments_resize(in_args, 1)); - // Create input matrix. + // Create the input matrix. 
paddle_matrix mat = paddle_matrix_create_sparse(1, 784, 3, true, false); srand(time(0)); paddle_real* array; diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp index 482b51e8a8..1f0e033c5b 100644 --- a/paddle/capi/gradient_machine.cpp +++ b/paddle/capi/gradient_machine.cpp @@ -168,3 +168,13 @@ paddle_error paddle_gradient_machine_get_layer_output( out->args.push_back(layerOutput); return kPD_NO_ERROR; } + +paddle_error paddle_gradient_machine_release_layer_output( + paddle_gradient_machine machine) { + auto m = cast(machine); + if (m == nullptr || m->machine == nullptr) { + return kPD_NULLPTR; + } + m->machine->releaseOutput(); + return kPD_NO_ERROR; +} diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h index 28eeb23e3b..7e37dea00b 100644 --- a/paddle/capi/gradient_machine.h +++ b/paddle/capi/gradient_machine.h @@ -113,6 +113,14 @@ paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine, const char* layerName, paddle_arguments args); +/** + * @brief Release the middle layer's output memory of the gradient machine. + * @param [in] gradient machine that have run a inference + * @return paddle_error + */ +PD_API paddle_error +paddle_gradient_machine_release_layer_output(paddle_gradient_machine machine); + #ifdef __cplusplus } #endif diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index fb8c9ab96d..597ea959f2 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -32,8 +32,12 @@ cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_library(scope SRCS scope.cc DEPS glog threadpool) cc_test(scope_test SRCS scope_test.cc DEPS scope) -cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto) -cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context) +cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) +cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) +cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function) + +cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor + framework_proto selected_rows data_device_transform data_type_transform data_layout_transform) cc_library(attribute SRCS attribute.cc DEPS framework_proto) cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc @@ -41,9 +45,9 @@ device_context) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) -cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute) +cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform) + shape_inference data_transform lod_tensor) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) @@ -73,8 +77,10 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) - -cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece) +cc_library(init SRCS 
init.cc DEPS gflags device_context place stringpiece operator) cc_test(init_test SRCS init_test.cc DEPS init) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) +cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) +nv_test(data_device_transform_test SRCS data_device_transform_test.cu + DEPS operator op_registry init math_function) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index eaf13ddcef..85e693434a 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -427,7 +427,8 @@ std::vector> MakeBlockBackward( VLOG(5) << "Making backward " << (*it)->Type() << " op"; std::vector> op_grads; - if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") { + if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" || + (*it)->Type() == "parallel_do") { int step_block_idx = (*it)->GetBlockAttr("sub_block"); BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, grad_to_var, step_block_idx); diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 0668b08ff7..54498e175d 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -53,12 +53,12 @@ VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const { return it->second.get(); } -VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { +VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { VarDesc *res = FindVarRecursive(name_bytes); if (res == nullptr) { res = Var(name_bytes); } - return res; + return *res; } bool BlockDesc::HasVarRecursive(const std::string &name) const { diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h index 6c8c81b332..4b609e4bcb 100644 --- a/paddle/framework/block_desc.h +++ b/paddle/framework/block_desc.h @@ -57,7 +57,7 @@ class BlockDesc { VarDesc *FindVarRecursive(const std::string &name_bytes) const; - VarDesc *FindRecursiveOrCreateVar(const std::string &name_bytes); + VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes); bool HasVarRecursive(const std::string &var_name) const; diff --git a/paddle/framework/data_device_transform.cc b/paddle/framework/data_device_transform.cc new file mode 100644 index 0000000000..b3fd48ae12 --- /dev/null +++ b/paddle/framework/data_device_transform.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
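The new `DeviceTransform` helper introduced by the file below copies a tensor onto the place that the selected kernel expects. A minimal usage sketch, assuming a CUDA build; the function name `DeviceTransformExample` and the shape are illustrative only, not part of this patch:

```cpp
#include "paddle/framework/data_device_transform.h"
#include "paddle/framework/init.h"

void DeviceTransformExample() {
  namespace f = paddle::framework;
  namespace p = paddle::platform;

  f::InitDevices();  // populate the DeviceContextPool with CPU and CUDA contexts

  f::Tensor cpu_tensor;
  float* ptr = cpu_tensor.mutable_data<float>(f::make_ddim({2, 3}), p::CPUPlace());
  for (int i = 0; i < 2 * 3; ++i) ptr[i] = static_cast<float>(i);

  // Copy the data onto GPU 0; the helper picks a suitable device context and
  // waits for the copy to finish. The caller owns the returned tensor.
  f::Tensor* gpu_tensor = f::DeviceTransform(cpu_tensor, p::CUDAPlace(0));
  delete gpu_tensor;
}
```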
*/ + +#include "paddle/framework/data_device_transform.h" + +namespace paddle { +namespace framework { + +static const platform::DeviceContext* GetDeviceContext( + const platform::Place& src_place, const platform::Place& dst_place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) { + return pool.Get(src_place); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + return pool.Get(dst_place); + } else { + PADDLE_THROW( + "Currently, model parallelism is only supported between CPU and CUDA"); + } +} + +Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place) { + VLOG(3) << "DeviceTransform in, src_place " << in.place() + << " dst_place: " << dst_place; + Tensor* out = new Tensor(); + auto* dev_ctx = GetDeviceContext(in.place(), dst_place); + dev_ctx->Wait(); + Copy(in, dst_place, *dev_ctx, out); + dev_ctx->Wait(); + return out; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_device_transform.h b/paddle/framework/data_device_transform.h new file mode 100644 index 0000000000..bebf0d1b32 --- /dev/null +++ b/paddle/framework/data_device_transform.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_device_transform_test.cu b/paddle/framework/data_device_transform_test.cu new file mode 100644 index 0000000000..5d89f5546f --- /dev/null +++ b/paddle/framework/data_device_transform_test.cu @@ -0,0 +1,167 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "gtest/gtest.h" + +#include "paddle/framework/init.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_info.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/elementwise_op_function.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input1 of test op"); + AddOutput("output", "output of test op"); + AddAttr("use_gpu", "force to use gpu kernel").SetDefault(false); + AddComment("This is test op"); + } +}; + +class TestOpWithKernel : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + if (Attr("use_gpu")) { + VLOG(3) << "force use gpu kernel"; + return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0)); + } else { + VLOG(3) << "use default kernel"; + return OpKernelType(proto::DataType::FP32, + ctx.Input("input")->place()); + } + } +}; + +template +class TestKernel : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + + const Tensor* input = ctx.Input("input"); + + std::cout << "input place:" << input->place() << std::endl; + auto* output = ctx.Output("output"); + output->Resize(input->dims()); + output->mutable_data(ctx.GetPlace()); + + operators::TransformFunctor, T, DeviceContext> functor( + input, input, output, ctx.template device_context(), + AddFunctor()); + functor.Run(); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + test_op, paddle::framework::TestOpWithKernel, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL( + test_op, + paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL( + test_op, + paddle::framework::TestKernel); + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +TEST(Operator, CPUtoGPU) { + using namespace paddle::framework; + using namespace paddle::platform; + InitDevices(); + + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + + // create an op to run on CPU + paddle::framework::proto::OpDesc cpu_op_desc; + cpu_op_desc.set_type("test_op"); + BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs()); + + auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc); + // prepare input + auto* in_t = scope.Var("IN1")->GetMutable(); + auto* src_ptr = in_t->mutable_data({2, 3}, CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + + // get output + auto* output = scope.Var("OUT1"); + cpu_op->Run(scope, cpu_place); + + auto* output_ptr = output->Get().data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output_ptr[i], static_cast(i) * 2); + } + + // create an op to run on GPU + 
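  // The second op below is the same test_op but with use_gpu=true, so
  // GetExpectedKernelType asks for a CUDA kernel. Its input is OUT1, which the
  // CPU run above left in host memory, so executing it exercises the new
  // CPU -> CUDA device transform before the kernel doubles the values again
  // (hence the expected i * 4 at the end of the test).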
paddle::framework::proto::OpDesc gpu_op_desc; + gpu_op_desc.set_type("test_op"); + BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs()); + BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs()); + + auto attr = gpu_op_desc.mutable_attrs()->Add(); + attr->set_name("use_gpu"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc); + + paddle::platform::CUDAPlace cuda_place(0); + // get output + auto* output2 = scope.Var("OUT2"); + gpu_op->Run(scope, cuda_place); + + // auto* output2_ptr = output2->Get().data(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(cuda_place); + + paddle::framework::Tensor output_tensor; + Copy(output2->Get(), paddle::platform::CPUPlace(), *dev_ctx, + &output_tensor); + + dev_ctx->Wait(); + float* output2_ptr = output_tensor.data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output2_ptr[i], static_cast(i) * 4); + } +} diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h index 4a8669c3a4..3ab976ecac 100644 --- a/paddle/framework/data_layout.h +++ b/paddle/framework/data_layout.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/platform/enforce.h" #include #include "paddle/platform/enforce.h" diff --git a/paddle/framework/data_layout_transform.cc b/paddle/framework/data_layout_transform.cc new file mode 100644 index 0000000000..96794cae97 --- /dev/null +++ b/paddle/framework/data_layout_transform.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/data_layout_transform.h" + +#include "paddle/framework/tensor.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace framework { + +struct CastDataLayout { + CastDataLayout(const platform::DeviceContext* ctx, + const std::vector& axis, const framework::Tensor& in, + framework::Tensor* out) + : in_(in), out_(out), ctx_(ctx), axis_(axis) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + const std::vector axis_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + + if (platform::is_cpu_place(place)) { + operators::math::Transpose trans4; + auto* context = static_cast(ctx_); + trans4(*context, in_, out_, axis_); + } else { + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataLayout(const std::vector& axis, + const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataLayout only support DataLayout transform on same place!"); + PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_, + "TransDataLayout only support Datatype are same!"); + + auto src = in.Get(); + auto* dst = out->GetMutable(); + PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!"); + + auto src_dim = src.dims(); + std::vector dst_dim; + + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = src_dim[axis[i]]; + } + + dst->Resize(make_ddim(dst_dim)); + auto place = kernel_pair.second.place_; + dst->mutable_data(place, src.type()); + + auto src_type = kernel_pair.first.data_type_; + framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst)); + + dst->set_layout(kernel_pair.second.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_layout_transform.h b/paddle/framework/data_layout_transform.h new file mode 100644 index 0000000000..befae1f636 --- /dev/null +++ b/paddle/framework/data_layout_transform.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
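The relocated `TransDataLayout` can still be driven directly with the NHWC-to-NCHW axis order that the removed registration code used. A CPU-only sketch assuming float data, modeled on the deleted `DataTransform.DataLayout` unit test; the helper name `ToNCHW` is illustrative:

```cpp
#include <utility>
#include <vector>

#include "paddle/framework/data_layout_transform.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"

void ToNCHW() {
  namespace f = paddle::framework;
  namespace p = paddle::platform;

  f::Variable in, out;
  f::Tensor* src = in.GetMutable<f::Tensor>();
  src->mutable_data<float>(f::make_ddim({2, 3, 1, 2}), p::CPUPlace());  // N, H, W, C
  src->set_layout(f::DataLayout::kNHWC);

  auto nhwc = f::OpKernelType(f::proto::DataType::FP32, p::CPUPlace(),
                              f::DataLayout::kNHWC, f::LibraryType::kPlain);
  auto nchw = f::OpKernelType(f::proto::DataType::FP32, p::CPUPlace(),
                              f::DataLayout::kNCHW, f::LibraryType::kPlain);

  std::vector<int> axis = {0, 3, 1, 2};  // NHWC -> NCHW permutation
  p::CPUDeviceContext ctx;
  f::TransDataLayout(axis, &ctx, std::make_pair(nhwc, nchw), in, &out);
  // `out` now holds a {2, 2, 3, 1} tensor tagged with DataLayout::kNCHW.
}
```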
*/ + +#pragma once + +#include "paddle/framework/op_kernel_type.h" +#include "paddle/framework/variable.h" + +namespace paddle { +namespace framework { + +using KernelTypePair = std::pair; + +void TransDataLayout(const std::vector& axis, + const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc index ac6e40a3ae..e56edb9539 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -11,125 +11,44 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include "paddle/framework/data_transform.h" -#include "paddle/framework/lod_tensor.h" -#include "paddle/platform/device_context.h" + +#include "paddle/framework/data_device_transform.h" namespace paddle { namespace framework { -DataTransformFnMap& DataTransformFnMap::Instance() { - static DataTransformFnMap data_transform_map; - return data_transform_map; -} - -auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(), - DataLayout::kNHWC, LibraryType::kPlain); - -auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), - DataLayout::kNHWC, LibraryType::kPlain); - -auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), - DataLayout::kNHWC, LibraryType::kPlain); - -auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), - DataLayout::kNCHW, LibraryType::kPlain); - -void TransDataType(const platform::DeviceContext* ctx, - const KernelTypePair& kernel_pair, const Variable& in, - Variable* out) { - PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); - PADDLE_ENFORCE( - platform::places_are_same_class(kernel_pair.first.place_, - kernel_pair.second.place_), - "TransDataType Only Support DataType transform on same place!"); - - auto src = in.Get(); - auto* dst = out->GetMutable(); - - auto dims = src.dims(); - dst->Resize(dims); - auto dst_type = kernel_pair.second.data_type_; - auto src_type = kernel_pair.first.data_type_; - - switch (src_type) { - case proto::DataType::FP32: - framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); - break; - case proto::DataType::FP64: - framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); - break; - case proto::DataType::INT32: - framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); - break; - case proto::DataType::INT64: - framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); - break; - case proto::DataType::BOOL: - framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); - break; - default: - PADDLE_THROW("Not support type %d", src_type); +Tensor* DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor) { + Tensor* out = nullptr; + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_type.place_)) { + out = DeviceTransform(input_tensor, expected_kernel_type.place_); } + PADDLE_ENFORCE_NOT_NULL(out, "out should not be null"); + return out; } -void TransDataLayout(const std::vector& axis, - const platform::DeviceContext* ctx, - const KernelTypePair& kernel_pair, const Variable& in, - Variable* out) { - PADDLE_ENFORCE(in.IsType(), "Only support Tensor transform!."); - 
PADDLE_ENFORCE( - platform::places_are_same_class(kernel_pair.first.place_, - kernel_pair.second.place_), - "TransDataLayout only support DataLayout transform on same place!"); - PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_, - "TransDataLayout only support Datatype are same!"); - - auto src = in.Get(); - auto* dst = out->GetMutable(); - PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!"); - - auto place = kernel_pair.second.place_; - CopyFrom(src, place, *ctx, dst); - - auto src_dim = src.dims(); - std::vector dst_dim; - - dst_dim.resize(axis.size()); - for (size_t i = 0; i < axis.size(); i++) { - dst_dim[i] = src_dim[axis[i]]; +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var) { + if (in_var.IsType()) { + auto& in_lod_tensor = in_var.Get(); + auto* tran_lod_tensor = out_var.GetMutable(); + tran_lod_tensor->set_lod(in_lod_tensor.lod()); + tran_lod_tensor->set_layout(in_lod_tensor.layout()); + tran_lod_tensor->ShareDataWith(tensor); + } else if (in_var.IsType()) { + auto& in_selected_rows = in_var.Get(); + auto* trans_selected_rows = out_var.GetMutable(); + trans_selected_rows->set_height(in_selected_rows.height()); + trans_selected_rows->set_rows(in_selected_rows.rows()); + trans_selected_rows->mutable_value()->ShareDataWith(tensor); + } else { + PADDLE_THROW("unknown var type"); } - - dst->Resize(make_ddim(dst_dim)); - - auto src_type = kernel_pair.first.data_type_; - framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst)); - - dst->set_layout(kernel_pair.second.data_layout_); } } // namespace framework } // namespace paddle - -namespace f = paddle::framework; - -namespace { -std::vector NHWC2NCHW = {0, 3, 1, 2}; -std::vector NCHW2NHWC = {0, 2, 3, 1}; -} - -REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType); -REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, - std::bind(f::TransDataLayout, NHWC2NCHW, - std::placeholders::_1, - std::placeholders::_2, - std::placeholders::_3, - std::placeholders::_4)); -REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC, - std::bind(f::TransDataLayout, NCHW2NHWC, - std::placeholders::_1, - std::placeholders::_2, - std::placeholders::_3, - std::placeholders::_4)); diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h index 56ebc80f43..ee95c7e856 100644 --- a/paddle/framework/data_transform.h +++ b/paddle/framework/data_transform.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/framework/op_kernel_type.h" +#include "paddle/framework/selected_rows.h" #include "paddle/framework/tensor.h" #include "paddle/framework/variable.h" #include "paddle/operators/math/math_function.h" @@ -29,145 +30,12 @@ limitations under the License. 
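A sketch of how the new `DataTransform` and `CopyVariableWithTensor` helpers might be combined when a variable's tensor lives on a different place than the chosen kernel expects. The kernel types, the shape, and the helper name `TransformInputForKernel` are illustrative assumptions, and the snippet presumes a CUDA build:

```cpp
#include <memory>

#include "paddle/framework/data_transform.h"
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h"

void TransformInputForKernel() {
  namespace f = paddle::framework;
  namespace p = paddle::platform;

  f::InitDevices();

  f::Variable in_var, out_var;
  auto* in_tensor = in_var.GetMutable<f::LoDTensor>();
  in_tensor->mutable_data<float>(f::make_ddim({4, 4}), p::CPUPlace());

  // The variable lives on the CPU, but the chosen kernel expects CUDA data.
  f::OpKernelType kernel_for_var(f::proto::DataType::FP32, in_tensor->place());
  f::OpKernelType expected_kernel(f::proto::DataType::FP32, p::CUDAPlace(0));

  std::unique_ptr<f::Tensor> transformed(
      f::DataTransform(expected_kernel, kernel_for_var, *in_tensor));

  // Wrap the transformed tensor back into a variable, preserving the LoD and
  // layout metadata of the original LoDTensor.
  f::CopyVariableWithTensor(in_var, *transformed, out_var);
}
```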
*/ namespace paddle { namespace framework { -using KernelTypePair = std::pair; +Tensor* DataTransform(const OpKernelType& expected_kernel_type, + const OpKernelType& kernel_type_for_var, + const Tensor& input_tensor); -using DataTransformFn = - std::function; - -struct KernelTypePairHash { - static void HashCombine(const OpKernelType& t, std::size_t* seed) { - OpKernelType::Hash kernel_type_hasher; - (*seed) ^= kernel_type_hasher(t) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2); - } - - size_t operator()(const KernelTypePair& kernel_pair) const { - std::size_t seed = 0; - HashCombine(kernel_pair.first, &seed); - HashCombine(kernel_pair.second, &seed); - return seed; - } -}; - -template -struct CastDataTypeFunctor { - HOSTDEVICE inline OutType operator()(InType in) const { - return static_cast(in); - } -}; - -template -struct CastDataType { - CastDataType(const framework::Tensor& in, framework::Tensor* out, - const platform::DeviceContext* ctx) - : in_(in), out_(out), ctx_(ctx) {} - const framework::Tensor in_; - framework::Tensor* out_; - const platform::DeviceContext* ctx_; - - template - void operator()() { - auto place = ctx_->GetPlace(); - - auto* in_begin = in_.data(); - auto numel = in_.numel(); - auto* in_end = in_begin + numel; - auto* out_begin = out_->mutable_data(place); - - if (platform::is_cpu_place(place)) { - platform::Transform trans; - auto* context = static_cast(ctx_); - trans(*context, in_begin, in_end, out_begin, - CastDataTypeFunctor()); - } else { - // TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type? - PADDLE_THROW("Unsupport CPU <-> GPU!"); - } - } -}; - -struct CastDataLayout { - CastDataLayout(const platform::DeviceContext* ctx, - const std::vector& axis, const framework::Tensor& in, - framework::Tensor* out) - : in_(in), out_(out), ctx_(ctx), axis_(axis) {} - const framework::Tensor in_; - framework::Tensor* out_; - const platform::DeviceContext* ctx_; - const std::vector axis_; - - template - void operator()() { - auto place = ctx_->GetPlace(); - - if (platform::is_cpu_place(place)) { - operators::math::Transpose trans4; - auto* context = static_cast(ctx_); - trans4(*context, in_, out_, axis_); - } else { - PADDLE_THROW("Unsupport CPU <-> GPU!"); - } - } -}; - -using DataTransformMap = - std::unordered_map; - -class DataTransformFnMap { - public: - static DataTransformFnMap& Instance(); - - bool Has(const KernelTypePair& key_pair) const { - return map_.find(key_pair) != map_.end(); - } - - void Insert(const OpKernelType& left, const OpKernelType& right, - const DataTransformFn& data_tranform_fn) { - Insert(std::make_pair(left, right), data_tranform_fn); - } - - void Insert(const KernelTypePair& kernel_type_pair, - const DataTransformFn& data_tranform_fn) { - PADDLE_ENFORCE(!Has(kernel_type_pair), - "KernelTypePair %s has been registered", ""); - map_.insert({kernel_type_pair, data_tranform_fn}); - } - - const DataTransformFn& Get(const KernelTypePair& key_pair) const { - auto data_transformer = GetNullable(key_pair); - PADDLE_ENFORCE_NOT_NULL(data_transformer, - "DataTransformFn should not be NULL"); - return *data_transformer; - } - - const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const { - auto it = map_.find(key_pair); - if (it == map_.end()) { - return nullptr; - } else { - return &(it->second); - } - } - - const DataTransformMap& Map() const { return map_; } - - private: - DataTransformFnMap() = default; - DataTransformMap map_; - DISABLE_COPY_AND_ASSIGN(DataTransformFnMap); -}; - -// generate unique name with __LINE__ 
-// refs https://stackoverflow.com/questions/1597007 -#define TOKENPASTE(x, y) x##y -#define TOKENPASTE2(x, y) TOKENPASTE(x, y) -#define REGISTER_DATA_TRANSFORM_FN(from, to, fn) \ - static int TOKENPASTE2(fn_, __LINE__)() { \ - ::paddle::framework::DataTransformFnMap::Instance().Insert(from, to, fn); \ - return 0; \ - } \ - static int TOKENPASTE2(var_, __LINE__) __attribute__((unused)) = \ - TOKENPASTE2(fn_, __LINE__)() +void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, + Variable& out_var); } // namespace framework } // namespace paddle diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc deleted file mode 100644 index edd305fd17..0000000000 --- a/paddle/framework/data_transform_test.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include - -#include - -#include "paddle/framework/data_transform.h" -#include "paddle/platform/device_context.h" - -namespace paddle { -namespace framework { -using namespace platform; - -/** - * @brief cross validation of different kernel type transform - * We use four bit map represent different combination. - * If the field has multiple possible value, only choose two of them. - * For DataType, only test the FP32(float), FP64(double). - * e.g. 
0000 -> FP32, CPUPlace, kNHWC, kPlain - * 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN - */ - -std::array kDataType = { - {proto::DataType::FP32, proto::DataType::FP64}}; - -std::array kPlace = {{CPUPlace(), CUDAPlace(0)}}; - -std::array kDataLayout = {{ - DataLayout::kNHWC, DataLayout::kNCHW, -}}; - -std::array kLibraryType = {{ - LibraryType::kPlain, LibraryType::kMKLDNN, -}}; - -OpKernelType GenFromBit(const std::vector bits) { - return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]], - kLibraryType[bits[3]]); -} - -int test_value = 0; - -auto kernel0 = GenFromBit({0, 0, 0, 0}); -auto kernel1 = GenFromBit({0, 0, 0, 1}); -auto kernel2 = GenFromBit({0, 0, 1, 0}); -auto kernel3 = GenFromBit({0, 0, 1, 1}); - -void TransDataType_t(const platform::DeviceContext* ctx, - const KernelTypePair& p, const Variable& in, - Variable* out) { - test_value++; -} - -void TransDataLayout_t(const platform::DeviceContext* ctx, - const KernelTypePair& p, const Variable& in, - Variable* out) { - test_value--; -} - -void TransLibraryType_t(const platform::DeviceContext* ctx, - const KernelTypePair& p, const Variable& in, - Variable* out) { - test_value += 2; -} - -} // namespace framework -} // namespace paddle - -namespace frw = paddle::framework; - -REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t); -REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t); -REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t); - -TEST(DataTransform, Register) { - using namespace paddle::framework; - using namespace paddle::platform; - - auto& instance = DataTransformFnMap::Instance(); - paddle::framework::Variable in; - paddle::framework::Variable out; - - DeviceContext* ctx = new CPUDeviceContext(); - auto pair0 = std::make_pair(frw::kernel0, frw::kernel1); - instance.Get(pair0)(ctx, pair0, in, &out); - ASSERT_EQ(test_value, 1); - - auto pair1 = std::make_pair(frw::kernel1, frw::kernel2); - instance.Get(pair1)(ctx, pair1, in, &out); - ASSERT_EQ(test_value, 0); - - auto pair3 = std::make_pair(frw::kernel0, frw::kernel2); - instance.Get(pair3)(ctx, pair3, in, &out); - ASSERT_EQ(test_value, 2); -} - -TEST(DataTransform, DataLayout) { - using namespace paddle::framework; - using namespace paddle::platform; - - auto& instance = DataTransformFnMap::Instance(); - Variable in; - Variable out; - Tensor* src = in.GetMutable(); - src->mutable_data(make_ddim({2, 3, 1, 2}), CPUPlace()); - src->set_layout(DataLayout::kNHWC); - - DeviceContext* ctx = new CPUDeviceContext(); - - { - auto kernel1 = GenFromBit({1, 0, 0, 0}); - auto kernel2 = GenFromBit({1, 0, 1, 0}); - auto pair0 = std::make_pair(kernel1, kernel2); - instance.Get(pair0)(ctx, pair0, in, &out); - } - - Tensor dst = out.Get(); - - EXPECT_TRUE(dst.layout() == DataLayout::kNCHW); - EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1})); - - { - auto kernel1 = GenFromBit({1, 0, 1, 0}); - auto kernel2 = GenFromBit({1, 0, 0, 0}); - auto pair0 = std::make_pair(kernel1, kernel2); - instance.Get(pair0)(ctx, pair0, out, &in); - } - - EXPECT_TRUE(src->layout() == DataLayout::kNHWC); - EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2})); -} - -TEST(DataTransform, DataType) { - using namespace paddle::framework; - using namespace paddle::platform; - - auto& instance = DataTransformFnMap::Instance(); - DeviceContext* ctx = new CPUDeviceContext(); - - Variable in; - Variable out; - Tensor* src = in.GetMutable(); - float* ptr = src->mutable_data(make_ddim({2, 3}), CPUPlace()); - for (int i = 0; i < 6; 
++i) { - ptr[i] = i / 3; - } - - { - auto kernel1 = GenFromBit({0, 0, 0, 0}); - auto kernel2 = GenFromBit({1, 0, 0, 0}); - auto pair0 = std::make_pair(kernel1, kernel2); - instance.Get(pair0)(ctx, pair0, in, &out); - } - Tensor dst = out.Get(); - EXPECT_TRUE(dst.data() != nullptr); -} diff --git a/paddle/framework/data_type_transform.cc b/paddle/framework/data_type_transform.cc new file mode 100644 index 0000000000..63373232e9 --- /dev/null +++ b/paddle/framework/data_type_transform.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/data_type_transform.h" + +#include "paddle/framework/selected_rows.h" +#include "paddle/platform/transform.h" + +namespace paddle { +namespace framework { + +template +struct CastDataTypeFunctor { + HOSTDEVICE inline OutType operator()(InType in) const { + return static_cast(in); + } +}; + +template +struct CastDataType { + CastDataType(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx) + : in_(in), out_(out), ctx_(ctx) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + + auto* in_begin = in_.data(); + auto numel = in_.numel(); + auto* in_end = in_begin + numel; + auto* out_begin = out_->mutable_data(place); + + if (platform::is_cpu_place(place)) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + } else { + // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type? 
+ PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataType(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataType Only Support DataType transform on same place!"); + + auto src = in.Get(); + auto* dst = out->GetMutable(); + + auto dims = src.dims(); + dst->Resize(dims); + auto dst_type = kernel_pair.second.data_type_; + auto src_type = kernel_pair.first.data_type_; + + switch (src_type) { + case proto::DataType::FP32: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::FP64: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::INT32: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::INT64: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::BOOL: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + default: + PADDLE_THROW("Not support type %d", src_type); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_type_transform.h b/paddle/framework/data_type_transform.h new file mode 100644 index 0000000000..8ec9074225 --- /dev/null +++ b/paddle/framework/data_type_transform.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/op_kernel_type.h" +#include "paddle/framework/variable.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +using KernelTypePair = std::pair; + +void TransDataType(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/details/cow_ptr.h b/paddle/framework/details/cow_ptr.h new file mode 100644 index 0000000000..7e308ffb5a --- /dev/null +++ b/paddle/framework/details/cow_ptr.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace framework { +namespace details { + +// Change it to thread safe flags if needed. 
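The cast logic moved into `data_type_transform.cc` above remains reachable through the standalone `TransDataType` entry point. A CPU-only usage sketch that mirrors the removed `DataTransform.DataType` unit test; the helper name `CastToDouble` is illustrative and not part of this patch:

```cpp
#include <utility>

#include "paddle/framework/data_type_transform.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"

void CastToDouble() {
  namespace f = paddle::framework;
  namespace p = paddle::platform;

  f::Variable in, out;
  f::Tensor* src = in.GetMutable<f::Tensor>();
  float* ptr = src->mutable_data<float>(f::make_ddim({2, 3}), p::CPUPlace());
  for (int i = 0; i < 6; ++i) ptr[i] = static_cast<float>(i);

  auto fp32 = f::OpKernelType(f::proto::DataType::FP32, p::CPUPlace());
  auto fp64 = f::OpKernelType(f::proto::DataType::FP64, p::CPUPlace());

  p::CPUDeviceContext ctx;
  f::TransDataType(&ctx, std::make_pair(fp32, fp64), in, &out);
  // `out` now holds a tensor of doubles with the same shape and values.
}
```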
+class ThreadUnsafeOwnershipFlags { + public: + ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} + + ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete; + ThreadUnsafeOwnershipFlags& operator=( + const ThreadUnsafeOwnershipFlags& other) = delete; + ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default; + + void SetOwnership(bool flag) { flag_ = flag; } + + // Invoke the callback if it is not owned. + template + void AcquireOwnershipOnce(Callback acquire) { + if (!flag_) { + acquire(); + flag_ = true; + } + } + + private: + bool flag_; +}; + +// Copy-On-Write pointer. +// It will hold a T* pointer, and only copy once when `MutableData` is invoked. +// +// The template parameter OwnershipFlags should have: +// * a constructor takes a bool. True if own. +// * SetOwnership(bool flag). +// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not +// owned. +// +// https://en.wikipedia.org/wiki/Copy-on-write +template +class COWPtr { + public: + // Ctor from raw pointer. + explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} + + // Move methods. Steal ownership from origin + COWPtr(COWPtr&& other) + : payload_(other.payload_), ownership_{std::move(other.ownership_)} {} + COWPtr& operator=(COWPtr&& origin) = default; + + // Copy methods. Not own payload + COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {} + COWPtr& operator=(const COWPtr& other) { + payload_ = other.payload_; + ownership_.SetOwnership(false); + return *this; + } + + // Access read only data. + const T& Data() const { return *payload_; } + + // Access mutable data. If the data is not owned, the data will be copied + // before. + T* MutableData() { + ownership_.AcquireOwnershipOnce( + [this] { payload_.reset(new T(*payload_)); }); + return payload_.get(); + } + + private: + // Actual data pointer. + std::shared_ptr payload_; + + // Ownership flag. + OwnershipFlags ownership_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/details/cow_ptr_test.cc b/paddle/framework/details/cow_ptr_test.cc new file mode 100644 index 0000000000..936954a233 --- /dev/null +++ b/paddle/framework/details/cow_ptr_test.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
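`cow_ptr.h` above notes that the ownership flags can be changed to a thread-safe variant if needed. One possible mutex-based sketch (an illustration, not part of this patch) that satisfies the interface the header's comment describes:

```cpp
#include <mutex>

class ThreadSafeOwnershipFlags {
 public:
  explicit ThreadSafeOwnershipFlags(bool flag) : flag_(flag) {}

  ThreadSafeOwnershipFlags(const ThreadSafeOwnershipFlags& other) = delete;
  ThreadSafeOwnershipFlags& operator=(const ThreadSafeOwnershipFlags& other) =
      delete;

  // COWPtr steals ownership on move; the flag value is carried over while the
  // mutex is freshly constructed in the destination object.
  ThreadSafeOwnershipFlags(ThreadSafeOwnershipFlags&& other)
      : flag_(other.flag_) {}

  void SetOwnership(bool flag) {
    std::lock_guard<std::mutex> guard(mu_);
    flag_ = flag;
  }

  // Invoke the callback exactly once if the payload is not yet owned.
  template <typename Callback>
  void AcquireOwnershipOnce(Callback acquire) {
    std::lock_guard<std::mutex> guard(mu_);
    if (!flag_) {
      acquire();
      flag_ = true;
    }
  }

 private:
  bool flag_;
  std::mutex mu_;
};
```

Assuming the second template parameter of `COWPtr` is the flags type, as the header's comment suggests, it could then be instantiated as `COWPtr<T, ThreadSafeOwnershipFlags>` when the pointer is shared across threads.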
*/ + +#include "paddle/framework/details/cow_ptr.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(COWPtr, all) { + COWPtr ptr(new int{0}); + ASSERT_EQ(ptr.Data(), 0); + COWPtr ptr2 = ptr; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(&ptr2.Data(), &ptr.Data()); + *ptr2.MutableData() = 10; + ASSERT_EQ(ptr.Data(), 0); + ASSERT_EQ(ptr2.Data(), 10); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index bf1f0471cc..c0418c9266 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/place.h" DEFINE_bool(check_nan_inf, false, "Checking whether operator produce NAN/INF or not. It will be " @@ -49,10 +50,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { var->GetMutable(); } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) { var->GetMutable(); + } else if (var_type == proto::VarDesc::PLACE_LIST) { + var->GetMutable(); } else { PADDLE_THROW( "Variable type %d is not in " - "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE]", + "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE," + " PLACE_LIST]", var_type); } } @@ -111,7 +115,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - VLOG(3) << op->DebugString(); + VLOG(3) << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); if (FLAGS_check_nan_inf) { for (auto& vname : op->OutputVars(true)) { diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index 4f2746e4b8..ea69b87e2a 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -123,6 +123,7 @@ message VarDesc { STEP_SCOPES = 5; LOD_RANK_TABLE = 6; LOD_TENSOR_ARRAY = 7; + PLACE_LIST = 8; } required string name = 1; required VarType type = 2; diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h index 2de5242831..2082f8bb76 100644 --- a/paddle/framework/grad_op_desc_maker.h +++ b/paddle/framework/grad_op_desc_maker.h @@ -87,7 +87,11 @@ class GradOpDescMakerBase { auto onames = this->Output(name); ret_val.reserve(onames.size()); std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val), - GradVarName); + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + }); return ret_val; } diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index 3bea8f3d0a..4ef82a541e 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -11,10 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include // for strdup #include #include #include "paddle/framework/init.h" +#include "paddle/framework/operator.h" #include "paddle/platform/device_context.h" #include "paddle/platform/place.h" #include "paddle/string/piece.h" @@ -24,7 +26,6 @@ namespace framework { std::once_flag gflags_init_flag; -// TODO(qijun) move init gflags to init.cc void InitGflags(std::vector &argv) { std::call_once(gflags_init_flag, [&]() { int argc = argv.size(); @@ -40,43 +41,29 @@ void InitGflags(std::vector &argv) { }); } -bool InitDevices(const std::vector &devices) { - // device format - // CPU - // GPU:1 - // TODO(dzhwinter) : add device format annotation for users. +void InitDevices() { + /*Init all avaiable devices by default */ + std::vector places; - for (auto &device : devices) { - auto p = string::Piece(device); - if (string::HasPrefix(p, "CPU")) { - places.emplace_back(platform::CPUPlace()); - } else if (string::HasPrefix(p, "GPU")) { + places.emplace_back(platform::CPUPlace()); + #ifdef PADDLE_WITH_CUDA - auto pos = string::RFind(p, ':', string::Piece::npos); - auto number = device.substr(pos + 1); - places.emplace_back(platform::CUDAPlace(std::stoi(number))); + int count = platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(platform::CUDAPlace(i)); + } #else - LOG(WARNING) - << "'GPU' is not supported, Please re-compile with WITH_GPU option"; + LOG(WARNING) + << "'GPU' is not supported, Please re-compile with WITH_GPU option"; #endif - } else { - return false; - } - } - if (std::find_if(places.begin(), places.end(), - [&](const platform::Place &place) { - return platform::is_cpu_place(place); - }) == places.end()) { - places.emplace_back(platform::CPUPlace()); - LOG(WARNING) << "Not specified CPU device, create CPU by Default."; - } platform::DeviceContextPool::Init(places); - return true; } void InitGLOG(const std::string &prog_name) { - google::InitGoogleLogging(prog_name.c_str()); + // glog will not hold the ARGV[0] inside. + // Use strdup to alloc a new string. + google::InitGoogleLogging(strdup(prog_name.c_str())); google::InstallFailureSignalHandler(); } diff --git a/paddle/framework/init.h b/paddle/framework/init.h index 9c84a03ded..c8fd964d00 100644 --- a/paddle/framework/init.h +++ b/paddle/framework/init.h @@ -24,7 +24,7 @@ void InitGflags(std::vector &argv); void InitGLOG(const std::string &prog_name); -bool InitDevices(const std::vector &devices); +void InitDevices(); } // namespace framework } // namespace paddle diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc index f0788051d4..f837a965d3 100644 --- a/paddle/framework/init_test.cc +++ b/paddle/framework/init_test.cc @@ -14,18 +14,13 @@ limitations under the License. 
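With this change a host program bootstraps the framework without passing a device list; `InitDevices()` always registers the CPU place and, in CUDA builds, every visible GPU. A minimal sketch; treating the `InitGflags` argument as a vector of `std::string` is an assumption based on the declaration above:

```cpp
#include <string>
#include <vector>

#include "paddle/framework/init.h"
#include "paddle/platform/device_context.h"

int main(int argc, char* argv[]) {
  std::vector<std::string> args(argv, argv + argc);
  paddle::framework::InitGflags(args);
  paddle::framework::InitGLOG(argv[0]);
  paddle::framework::InitDevices();  // CPU plus every visible CUDA device

  auto& pool = paddle::platform::DeviceContextPool::Instance();
  return pool.size() >= 1 ? 0 : 1;  // at least the CPU context is available
}
```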
*/ #include "gtest/gtest.h" #include "paddle/framework/init.h" +#include "paddle/platform/device_context.h" -TEST(Init, InitDevices) { +TEST(InitDevices, CPU) { using paddle::framework::InitDevices; - std::vector ds1 = {"CPU"}; - ASSERT_EQ(InitDevices(ds1), true); + using paddle::platform::DeviceContextPool; -#ifdef PADDLE_WITH_CUDA - std::vector ds2 = {"CPU", "GPU:0", "GPU:1"}; - ASSERT_EQ(InitDevices(ds2), true); - - // test re-init - std::vector ds3 = {"GPU:0", "GPU:1"}; - ASSERT_EQ(InitDevices(ds3), true); -#endif + InitDevices(); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_GE(pool.size(), 1U); } diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 8f6944c241..87a57d0951 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -43,16 +43,36 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { return os; } -LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) { - LoD new_lod; - new_lod.reserve(level_end - level_begin); - for (size_t i = level_begin; i < level_end; i++) { - new_lod.emplace_back(in.at(i)); - } - // transform the lowest level to absolute offset. - LoD abs_offset_lod = ToAbsOffset(in); - new_lod.back() = abs_offset_lod[level_end - 1]; - return new_lod; +std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { + PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code()); + + if (!platform::is_cpu_place(t.place())) { + LoDTensor tt; + framework::Copy(t, platform::CPUPlace(), &tt); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(t.place()); + dev_ctx.Wait(); + + os << tt; + return os; + } + + os << "dim: " << t.dims() << "\n"; + os << "lod: " << t.lod() << "\n"; + + // only print first ten elements + int64_t size = t.numel() < 10 ? 
t.numel() : 10; + for (int64_t i = 0; i < size; ++i) { + os << t.data()[i] << " "; + } + + return os; +} + +std::string LoDToString(const LoD &lod) { + std::ostringstream stream; + stream << lod; + return stream.str(); } LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, @@ -115,43 +135,6 @@ bool operator==(const LoD &a, const LoD &b) { return true; } -size_t LoDTensor::NumElements(size_t level, size_t idx) const { - PADDLE_ENFORCE_LT(level, NumLevels()); - PADDLE_ENFORCE_LT(idx, NumElements(level)); - return lod_[level][idx + 1] - lod_[level][idx]; -} - -size_t LoDTensor::NumInstancesInElement(size_t level, size_t idx) const { - PADDLE_ENFORCE_LT(level, NumLevels()); - PADDLE_ENFORCE_LT(idx, NumElements(level)); - auto abs_lod = ToAbsOffset(lod()); - size_t begin = abs_lod[level][idx]; - size_t end = abs_lod[level][idx + 1]; - return end - begin; -} - -void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) { - auto new_lod = framework::SliceLevels(lod_, level_begin, level_end); - lod_ = new_lod; -} - -void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, - size_t elem_end) { - PADDLE_ENFORCE_LT(level, NumLevels()); - PADDLE_ENFORCE_LT(elem_begin, NumElements(level)); - PADDLE_ENFORCE_LT(elem_end, NumElements(level) + 1); - - auto abs_lod = framework::ToAbsOffset(lod()); - auto new_lod = framework::SliceInLevel(lod_, level, elem_begin, elem_end); - lod_ = new_lod; - - // slice the underlying tensor - size_t begin = abs_lod[level][elem_begin]; - size_t end = abs_lod[level][elem_end]; - PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty."); - ShareDataWith(Slice(begin, end)); -} - using LoDAndOffset = std::pair>; LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, size_t end_idx, size_t start_level) { @@ -244,5 +227,56 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, DeserializeFromStream(is, static_cast(tensor), dev_ctx); } +// TODO(tonyyang-svail): make this function support LoD +std::vector LoDTensor::SplitLoDTensor( + const std::vector places) const { + check_memory_size(); + PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now"); + PADDLE_ENFORCE(dims()[0] % places.size() == 0, + "Batch size should be divided by places size"); + + std::vector lods; + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + int begin = place_idx * dims()[0] / places.size(); + int end = (place_idx + 1) * dims()[0] / places.size(); + + auto src = Slice(begin, end); + auto &dst_place = places[place_idx]; + LoDTensor dst; + framework::Copy(src, dst_place, &dst); + + lods.emplace_back(dst); + } + + return lods; +} + +// TODO(tonyyang-svail): make this function support LoD +void LoDTensor::MergeLoDTensor( + const std::vector &lod_tensors, + platform::Place dst_place) { + PADDLE_ENFORCE(!lod_tensors.empty()); + framework::DDim new_dim = lod_tensors[0]->dims(); + std::type_index new_type = lod_tensors[0]->type(); + auto new_layout = lod_tensors[0]->layout(); + for (auto *lod : lod_tensors) { + PADDLE_ENFORCE(new_dim == lod->dims()); + PADDLE_ENFORCE(new_type == lod->type()); + PADDLE_ENFORCE(new_layout == lod->layout()); + } + new_dim[0] *= lod_tensors.size(); + Resize(new_dim); + set_layout(new_layout); + + mutable_data(dst_place, new_type); + int begin = 0; + for (auto *src : lod_tensors) { + int end = begin + src->dims()[0]; + auto dst = Slice(begin, end); + framework::Copy(*src, dst_place, &dst); + begin = end; + } +} + } // namespace framework } // namespace paddle diff --git 
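The new `SplitLoDTensor` and `MergeLoDTensor` members split a batch across places and stitch the per-place results back together. A CPU-only sketch, assuming an LoD-free tensor whose batch dimension divides evenly by the number of places; the function name `SplitMergeExample` is illustrative:

```cpp
#include <vector>

#include "paddle/framework/lod_tensor.h"

void SplitMergeExample() {
  namespace f = paddle::framework;
  namespace p = paddle::platform;

  f::LoDTensor batch;
  float* data = batch.mutable_data<float>(f::make_ddim({4, 2}), p::CPUPlace());
  for (int i = 0; i < 8; ++i) data[i] = static_cast<float>(i);

  // The batch dimension (4) must be divisible by the number of places.
  std::vector<p::Place> places = {p::CPUPlace(), p::CPUPlace()};
  std::vector<f::LoDTensor> parts = batch.SplitLoDTensor(places);
  // parts[0] holds rows 0..1, parts[1] holds rows 2..3.

  f::LoDTensor merged;
  std::vector<const f::LoDTensor*> part_ptrs = {&parts[0], &parts[1]};
  merged.MergeLoDTensor(part_ptrs, p::CPUPlace());
  // `merged` has shape {4, 2} again, with the rows back in their original order.
}
```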
a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index d0b6befffe..88ea78f268 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -58,14 +58,9 @@ using Vector = thrust::host_vector< using LoD = std::vector>; std::ostream& operator<<(std::ostream& os, const LoD& lod); +std::ostream& operator<<(std::ostream& os, const LoDTensor& t); -/* - * Slice levels from a LoD. - * NOTE the lowest level should always be the absolute offsets of the underlying - * tensor instances. So if higher layers are sliced without the lowest level, - * the lower level of the sliced LoD will be transformed to the absolute offset. - */ -LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end); +std::string LoDToString(const LoD& lod); LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, size_t elem_end); @@ -115,34 +110,11 @@ class LoDTensor : public Tensor { return (lod_)[level].size() - 1; } - /* - * Number of lower-level elements. - * For example, a 2-level lod-tensor - * - * 0-th level | | - * 1-th level || ||| - * - * NumElements(0, 0) get 2 - * NumElements(0, 1) get 3 - */ - size_t NumElements(size_t level, size_t idx) const; - - /* - * Get the number of instances in the underlying tensor in the `idx`-th - * element. - */ - size_t NumInstancesInElement(size_t level, size_t idx) const; - - /* - * Shrink levels[level_begin:level_end] - */ - void ShrinkLevels(size_t level_begin, size_t level_end); + std::vector SplitLoDTensor( + const std::vector places) const; - /* - * Shrink elements of a level, [elem_begin: elem_end] - * @note: low performance in slice lod_. - */ - void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end); + void MergeLoDTensor(const std::vector& lod_tensors, + platform::Place place); private: LoD lod_; @@ -177,8 +149,8 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, for (size_t ins = 0; ins < num_instances; ins++) { for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { auto slice = tensor.Slice(elem, elem + 1); - CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(), - platform::CPUDeviceContext(), &slice); + Copy(source.Slice(ins, ins + 1), platform::CPUPlace(), + platform::CPUDeviceContext(), &slice); } } return tensor; diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc index 0868c1f6e6..baad9c6f98 100644 --- a/paddle/framework/lod_tensor_test.cc +++ b/paddle/framework/lod_tensor_test.cc @@ -54,92 +54,6 @@ class LoDTensorTester : public ::testing::Test { LoDTensor lod_tensor_; }; -TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); } - -TEST_F(LoDTensorTester, NumElements) { - ASSERT_EQ(lod_tensor_.NumElements(0), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(1), 3UL); - ASSERT_EQ(lod_tensor_.NumElements(2), 8UL); -} - -TEST_F(LoDTensorTester, NumElements2) { - ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL); - ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL); -} - -TEST_F(LoDTensorTester, ShrinkLevels) { - // slice 1 level - for (size_t level = 0; level < 3UL; ++level) { - LoDTensor new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkLevels(level, level + 1); - ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL); - ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); - } - // shrink 2 level - for (size_t level = 0; level < 2UL; ++level) { - LoDTensor new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkLevels(level, level + 2); - // the lowest level's last element 
should be the tensor's batch_size. - ASSERT_EQ(new_lod_tensor.lod().back().back(), - lod_tensor_.lod().back().back()); - ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); - ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); - } -} - -TEST_F(LoDTensorTester, ShrinkInLevel) { - size_t level = 0; - LoDTensor new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkInLevel(level, 0, 1); - ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL); - ASSERT_EQ(new_lod_tensor.NumElements(1), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(2), 5UL); - ASSERT_EQ(new_lod_tensor.dims()[0], 12); - for (int i = 0; i < 12 * 128; i++) { - ASSERT_EQ(new_lod_tensor.data()[i], i); - } - - level = 1; - new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkInLevel(level, 1, 2); - ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL); - ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL); - ASSERT_EQ(new_lod_tensor.dims()[0], 7); - for (int i = 5 * 128; i < 12 * 128; i++) { - ASSERT_EQ(new_lod_tensor.data()[i - 5 * 128], i); - } - - LoDTensor t1; - t1.set_lod(lod_tensor_.lod()); - t1.ShareDataWith(lod_tensor_); - - LoDTensor t2; - t2.set_lod(lod_tensor_.lod()); - t2.ShareDataWith(lod_tensor_); - - t1.ShrinkInLevel(0, 1, 2); - t2.ShrinkInLevel(0, 0, 1); - EXPECT_NE(t1.data(), t2.data()); - EXPECT_NE(t1.data(), lod_tensor_.data()); -} - -TEST_F(LoDTensorTester, SerializeAndDeserialize) { - LoDTensor dst_tensor; - platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); - std::ostringstream oss; - SerializeToStream(oss, lod_tensor_, cpu_ctx); - std::istringstream iss(oss.str()); - DeserializeFromStream(iss, &dst_tensor, cpu_ctx); - float* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); - for (int i = 0; i < kLodTensorSize; ++i) { - EXPECT_EQ(dst_ptr[i], i); - } - EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod()); -} - TEST(LodExpand, test) { LoD lod{{0, 2}}; LoDTensor tensor; @@ -201,5 +115,21 @@ TEST(LoD, AppendLoD) { EXPECT_EQ(origin, expected); } +TEST(LoD, ToAbsOffset) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod); + + LoD expected; + expected.push_back(std::vector({0, 5})); + expected.push_back(std::vector({0, 2, 5})); + expected.push_back(std::vector({0, 2, 4, 5})); + + EXPECT_EQ(abs_lod, expected); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index e02e572af2..1c0372bb16 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -66,6 +66,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { out); out_var->SetLoDLevel(in_var->GetLoDLevel()); } + bool IsRuntime() const override; protected: @@ -383,7 +384,7 @@ void OpDesc::InferVarType(BlockDesc *block) const { for (auto &out_pair : this->outputs_) { for (auto &out_var_name : out_pair.second) { block->FindRecursiveOrCreateVar(out_var_name) - ->SetType(proto::VarDesc::LOD_TENSOR); + .SetType(proto::VarDesc::LOD_TENSOR); } } } diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index 4cf784a0d0..a5ffb16292 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -129,7 +129,7 @@ class OpDesc { } proto::OpDesc desc_; - // input arg name => output variable names + // input arg name => input variable names VariableNameMap inputs_; // output arg name => output variable 
names VariableNameMap outputs_; diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index cef530c6e6..66f07b6757 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -12,13 +12,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/framework/op_registry.h" +#include #include +#include "paddle/framework/op_registry.h" + namespace pd = paddle::framework; namespace paddle { namespace framework { + class CosineOp : public OperatorBase { public: using OperatorBase::OperatorBase; @@ -215,7 +218,7 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(InferShapeContext* ctx) const override {} - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(proto::DataType::FP32, ctx.device_context()); } @@ -252,7 +255,6 @@ TEST(OperatorRegistrar, CPU) { op->Run(scope, cpu_place); } -#ifdef PADDLE_WITH_CUDA TEST(OperatorRegistrar, CUDA) { paddle::framework::proto::OpDesc op_desc; paddle::platform::CUDAPlace cuda_place(0); @@ -263,4 +265,127 @@ TEST(OperatorRegistrar, CUDA) { op->Run(scope, cuda_place); } -#endif + +static int op_test_value = 0; + +using paddle::platform::DeviceContext; +using paddle::platform::CPUDeviceContext; +using paddle::platform::CUDADeviceContext; + +namespace paddle { +namespace framework { + +class OpWithMultiKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + proto::DataType::FP32, platform::CUDAPlace(0), DataLayout::kAnyLayout, + framework::LibraryType::kCUDNN); + } +}; + +template +class OpMultiKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + ++op_test_value; + } +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + --op_test_value; + } +}; + +template +class OpMultiKernelTest2 : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value += 10; + } +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value -= 10; + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, + paddle::framework::OpWithMultiKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CPU, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDA, 
paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest2); + +TEST(OperatorRegistrar, OpWithMultiKernel) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_multi_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + // TODO(qiao) add priority back + // use all available kernels + paddle::framework::UseALL(); + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -10); + + // remove cuda kernels + paddle::framework::UseCPU(); + op->Run(scope, cpu_place); + + EXPECT_EQ(op_test_value, -9); + + // add cuda kernels + paddle::framework::UseCUDA(); + op->Run(scope, cuda_place); + + EXPECT_EQ(op_test_value, -10); + + // use cudnn kernel + paddle::framework::UseCUDNN(); + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -20); +} diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index fc7091f1c8..be1373dc2a 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -11,20 +11,101 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include -#include #include "paddle/framework/data_transform.h" #include "paddle/framework/executor.h" -#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/operator.h" #include "paddle/framework/shape_inference.h" #include "paddle/framework/var_type.h" +DEFINE_bool(op_sync, false, + "Default cuda is asynchronous device, set to True will" + "force op run in synchronous mode."); + namespace paddle { namespace framework { +std::vector> kKernelPriority; + +void UseCPU() { + kKernelPriority.clear(); + /*Plain CPU*/ + auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kPlain); + kKernelPriority.insert(kKernelPriority.begin(), pair0); +} + +void UseMKLDNN() { + UseCPU(); +#if PADDLE_WITH_MKLML + { + /*MKLDNN Kernel*/ + auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN); + kKernelPriority.insert(kKernelPriority.begin(), pair0); + } +#endif +} + +void UseCUDA() { + UseMKLDNN(); +#if PADDLE_WITH_CUDA + /*Plain GPU*/ + auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain); + kKernelPriority.insert(kKernelPriority.begin(), pair0); +#endif +} + +void UseCUDNN() { + UseCUDA(); +#if PADDLE_WITH_CUDA + if (platform::dynload::HasCUDNN()) { + /*CUDNN Kernel*/ + auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN); + kKernelPriority.insert(kKernelPriority.begin(), pair0); + } +#endif +} + +void UseALL() { + UseCPU(); + UseMKLDNN(); + UseCUDA(); + UseCUDNN(); +} + +static DDim GetDims(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return DDim({-1}); + } + + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + return DDim({-1}); + } +} + +static LoD GetLoD(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + auto default_lod = LoD({{}}); + + if (var == nullptr) { + return default_lod; + } + + if (var->IsType()) { + return var->Get().lod(); + } else { + return 
default_lod; + } +} + std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE(ins.size(), 1UL, @@ -57,7 +138,7 @@ const std::vector& OperatorBase::Outputs( return it->second; } -std::string OperatorBase::DebugString() const { +std::string OperatorBase::DebugStringEx(const Scope* scope) const { std::stringstream ss; ss << "Op(" << type_ << "), inputs:{"; for (auto it = inputs_.begin(); it != inputs_.end();) { @@ -65,6 +146,10 @@ std::string OperatorBase::DebugString() const { ss << input.first << "["; for (size_t i = 0; i < input.second.size(); ++i) { ss << input.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, input.second[i]) << "]"; + ss << "(" << GetLoD(*scope, input.second[i]) << ")"; + } if (i != input.second.size() - 1) { ss << ", "; } @@ -81,6 +166,10 @@ std::string OperatorBase::DebugString() const { ss << output.first << "["; for (size_t i = 0; i < output.second.size(); ++i) { ss << output.second[i]; + if (scope) { + ss << "[" << GetDims(*scope, output.second[i]) << "]"; + ss << "(" << GetLoD(*scope, output.second[i]) << ")"; + } if (i != output.second.size() - 1) { ss << ", "; } @@ -178,6 +267,10 @@ void OperatorBase::GenerateTemporaryNames() { } } +static bool VarIsTensor(const Variable* var) { + return var->IsType() || var->IsType(); +} + static const Tensor* GetTensorFromVar(const Variable* var) { const Tensor* t = nullptr; if (var->IsType()) { @@ -185,7 +278,8 @@ static const Tensor* GetTensorFromVar(const Variable* var) { } else if (var->IsType()) { t = &(var->Get().value()); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); } return t; } @@ -197,7 +291,8 @@ static Tensor* GetMutableTensorFromVar(Variable* var) { } else if (var->IsType()) { t = var->GetMutable()->mutable_value(); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + var->Type().name()); } return t; } @@ -347,6 +442,25 @@ class RuntimeInferShapeContext : public InferShapeContext { auto in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); + + // TODO(dzhwinter) : reuse ShareLoD in most operators. + // Need to call ShareLayout explicitly in sequence related ops. + // Shall we have a better method to shared info between in/out Tensor? 
+ out_tensor->set_layout(in_tensor.layout()); + } + + void ShareLayout(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + Variable* in_var = scope_.FindVar(Inputs(in)[i]); + Variable* out_var = scope_.FindVar(Outputs(out)[j]); + if (!in_var->IsType()) return; + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_layout(in_tensor.layout()); } bool IsRuntime() const override { return true; } @@ -359,7 +473,8 @@ class RuntimeInferShapeContext : public InferShapeContext { } else if (var->IsType()) { return var->Get().GetCompleteDims(); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); } } @@ -370,7 +485,8 @@ class RuntimeInferShapeContext : public InferShapeContext { } else if (var->IsType()) { var->GetMutable()->set_height(dim[0]); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", + name, var->Type().name()); } } @@ -384,24 +500,6 @@ class RuntimeInferShapeContext : public InferShapeContext { const Scope& scope_; }; -const platform::DeviceContext* GetDeviceContext( - framework::KernelTypePair& kernel_pair) { - auto& actual_kernel_key = kernel_pair.first; - auto& expected_kernel_key = kernel_pair.second; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - - if (platform::is_gpu_place(actual_kernel_key.place_) && - platform::is_cpu_place(expected_kernel_key.place_)) { - return pool.Get(actual_kernel_key.place_); - } else if (platform::is_cpu_place(actual_kernel_key.place_) && - platform::is_gpu_place(expected_kernel_key.place_)) { - return pool.Get(expected_kernel_key.place_); - } else { - PADDLE_THROW( - "Currently, model parallelism is only supported between CPU and CUDA"); - } -} - void OperatorWithKernel::Run(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); @@ -417,71 +515,65 @@ void OperatorWithKernel::Run(const Scope& scope, "There are no kernels which are registered in the %s operator.", type_); } - // check if op[type] have kernel for kernel_key - OpKernelMap& kernels = kernels_iter->second; - ExecutionContext ctx(*this, scope, *dev_ctx); - auto actual_kernel_key = GetActualKernelType(ctx); - auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key); - auto kernel_iter = kernels.find(expected_kernel_key); + auto expected_kernel_key = this->GetExpectedKernelType(ctx); - if (kernel_iter == kernels.end()) { - PADDLE_THROW("The operator %s does not support %s", type_, - expected_kernel_key); - } - - if (actual_kernel_key == expected_kernel_key) { - PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_, - "Currently, model parallelism is only supported between " - "CPU and other devices. 
For example, multi-GPU model " - "parallelism will failed."); - } else { - auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key); - const DataTransformFn* trans_fun = - DataTransformFnMap::Instance().GetNullable(kernel_pair); - if (trans_fun) { - auto input_vars = this->InputVars(); - // TODO(qijun) filter the input vars that do not need to be transformed - - // filter vars that has been transformed - std::vector need_trans; - for (auto var_name : input_vars) { - auto var_name_trans = - var_name + framework::KernelTypeToString(expected_kernel_key); - if (!scope.FindVar(var_name_trans)) { - const_cast(scope).Var(var_name_trans); - need_trans.push_back(var_name); - } - } + OpKernelMap& kernels = kernels_iter->second; - if (!need_trans.empty()) { - auto trans_dev_ctx = GetDeviceContext(kernel_pair); + for (auto& candidate : kKernelPriority) { + auto candidate_key = + OpKernelType(expected_kernel_key.data_type_, std::get<0>(candidate), + expected_kernel_key.data_layout_, std::get<1>(candidate)); - // Wait for transform starting - dev_ctx->Wait(); + if ((candidate_key == expected_kernel_key) || + (kernels.count(candidate_key))) { + expected_kernel_key = candidate_key; + break; + } + } - for (auto var_name : need_trans) { - (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)), - scope.FindVar(var_name + framework::KernelTypeToString( - expected_kernel_key))); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + Scope& new_scope = scope.NewScope(); + + for (auto& var_name_item : this->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + if (var && VarIsTensor(var)) { + auto* tensor_in = GetTensorFromVar(var); + if (tensor_in->IsInitialized()) { + auto kernel_type_for_var = this->GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + if (kernel_type_for_var != expected_kernel_key) { + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), + var_name) != out_var_names.end()) { + PADDLE_THROW( + "var %s is both input and output, " + "does not support transform", + var_name); + } + VLOG(3) << "need to do transform for var " << var_name; + auto* trans_var = new_scope.Var(var_name); + auto* out = DataTransform(expected_kernel_key, kernel_type_for_var, + *tensor_in); + CopyVariableWithTensor(*var, *out, *trans_var); + } } - // Wait for data transform finishing - trans_dev_ctx->Wait(); } } } - kernel_iter->second->Compute(ctx); -} + auto kernel_iter = kernels.find(expected_kernel_key); -OpKernelType OperatorWithKernel::GetActualKernelType( - const ExecutionContext& ctx) const { - return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); -} + auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); + kernel_iter->second->Compute( + ExecutionContext(*this, new_scope, *new_dev_ctx)); -OpKernelType OperatorWithKernel::GetExpectedKernelType( - const OpKernelType& actual_kernel_type) const { - return actual_kernel_type; + /*For profiling/benchmark only*/ + if (FLAGS_op_sync) { + new_dev_ctx->Wait(); + } } proto::DataType OperatorWithKernel::IndicateDataType( @@ -513,5 +605,16 @@ proto::DataType OperatorWithKernel::IndicateDataType( return static_cast(data_type); } +OpKernelType OperatorWithKernel::GetExpectedKernelType( + const ExecutionContext& ctx) const { + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); +} + +OpKernelType OperatorWithKernel::GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const 
OpKernelType& expected_kernel_type) const { + return OpKernelType(expected_kernel_type.data_type_, tensor.place()); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index d0a9b643d5..d5feb59864 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include @@ -52,10 +53,33 @@ constexpr char kGradVarSuffix[] = "@GRAD"; /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; -// define some kernel hint -const std::string kUseCPU = "use_cpu"; -const std::string kUseCUDNN = "use_cudnn"; -const std::string kUseMKLDNN = "use_mkldnn"; +// define some kernel priority +extern std::vector> kKernelPriority; + +/** + * @brief Use the CPU kernel only + */ +void UseCPU(); + +/** + * @brief Prefer the MKLDNN kernel over the plain CPU kernel + */ +void UseMKLDNN(); + +/** + * @brief Prefer the CUDA kernel over the plain CPU kernel + */ +void UseCUDA(); + +/** + * @brief Prefer the cuDNN kernel over the plain CUDA kernel + */ +void UseCUDNN(); + +/** + * @brief Use all available kernels + */ +void UseALL(); inline std::string GradVarName(const std::string& var_name) { return var_name + kGradVarSuffix; @@ -84,7 +108,10 @@ class OperatorBase { return boost::get(attrs_.at(name)); } - virtual std::string DebugString() const; + /// if scope is not null, also show dimensions of arguments + virtual std::string DebugStringEx(const Scope* scope) const; + + std::string DebugString() const { return DebugStringEx(nullptr); } /// Net will call this function to Run an op. virtual void Run(const Scope& scope, const platform::Place& place) const = 0; @@ -381,9 +408,10 @@ class OperatorWithKernel : public OperatorBase { } protected: - virtual OpKernelType GetActualKernelType(const ExecutionContext& ctx) const; - virtual OpKernelType GetExpectedKernelType( - const OpKernelType& actual_kernel_type) const; + virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + virtual OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const; private: // indicate kernel DataType by input data.
Defaultly all input data must be diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 4d38a7ada9..b69d7c7a74 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -69,7 +69,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator, paddle::framework::OpWithoutKernelCheckerMaker); TEST(OperatorBase, all) { - paddle::framework::InitDevices({"CPU"}); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("test_operator"); BuildVar("input", {"IN1"}, op_desc.add_inputs()); @@ -114,7 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override {} - OpKernelType GetActualKernelType(const ExecutionContext& ctx) const override { + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); } }; @@ -194,7 +195,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, // test with single input TEST(OpKernel, all) { - paddle::framework::InitDevices({"CPU"}); + paddle::framework::InitDevices(); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_with_kernel"); BuildVar("x", {"IN1"}, op_desc.add_inputs()); @@ -224,7 +225,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, TEST(OpKernel, multi_inputs) { using namespace paddle::framework; - paddle::framework::InitDevices({"CPU"}); + paddle::framework::InitDevices(); proto::OpDesc op_desc; op_desc.set_type("op_multi_inputs_with_kernel"); @@ -263,7 +264,7 @@ class OperatorClone : public paddle::framework::OperatorBase { }; TEST(Operator, Clone) { - paddle::framework::InitDevices({"CPU"}); + paddle::framework::InitDevices(); OperatorClone a("ABC", paddle::framework::VariableNameMap{}, paddle::framework::VariableNameMap{}, paddle::framework::AttributeMap{}); diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 4e80e3d974..2bd0ac8f5a 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -109,6 +109,7 @@ std::string Scope::Rename(const std::string& origin_name) const { Rename(origin_name, var_name); return var_name; } + Variable* Scope::FindVarLocally(const std::string& name) const { auto it = vars_.find(name); if (it != vars_.end()) return it->second; diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index 10143326df..a1da81cc79 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -75,9 +75,9 @@ class Scope { // Rename variable to a new name and return the new name std::string Rename(const std::string& origin_name) const; - private: Variable* FindVarLocally(const std::string& name) const; + private: // Call Scope::NewScope for a sub-scope. explicit Scope(Scope const* parent) : parent_(parent) {} diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 341a6949be..4aaa29d794 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -55,6 +55,10 @@ class Tensor { template inline const T* data() const; + inline bool IsInitialized() const; + + inline void switch_place(platform::Place new_place); + /** * @brief Return a pointer to mutable memory block. * @note If not exist, then allocation. @@ -200,6 +204,15 @@ class Tensor { size_t offset_; }; +inline void Tensor::switch_place(platform::Place new_place) { + if (holder_->place() == new_place) { + return; + } + + // TODO(tonyyang-svail): do memcpy here. 
+ PADDLE_THROW("Not Implemented"); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 6c6f298edc..1340c5e485 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -84,6 +84,8 @@ inline const T* Tensor::data() const { reinterpret_cast(holder_->ptr()) + offset_); } +inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } + template inline T* Tensor::data() { check_memory_size(); diff --git a/paddle/framework/tensor_util.cc b/paddle/framework/tensor_util.cc index 7efc649d0b..a5b83eaa07 100644 --- a/paddle/framework/tensor_util.cc +++ b/paddle/framework/tensor_util.cc @@ -69,7 +69,7 @@ struct AnyVisitor : public boost::static_visitor { tmp.mutable_data(cpu); auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); gpuctx->Wait(); - CopyFrom(out, cpu, *gpuctx, &tmp); + Copy(out, cpu, *gpuctx, &tmp); gpuctx->Wait(); return GetResult(tmp, cpu); } diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index 5ac13cba4d..091b63bf0f 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -29,11 +29,12 @@ namespace framework { * @param[in] dst_place The dst place. * @param[in] ctx The device context contains device resources. * - * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. + * @note Copy supports CPU <-> GPU, GPU <-> GPU. */ - -inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, - const platform::DeviceContext& ctx, Tensor* dst) { +inline void Copy(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst) { + VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to " + << dst_place; src.check_memory_size(); dst->Resize(src.dims()); @@ -88,26 +89,25 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, } /** - * @brief CopyFrom support CPU <-> CPU + * @brief Wrapper on + * Copy(const Tensor& src, const platform::Place& dst_place, + * const platform::DeviceContext& ctx, Tensor* dst); + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * + * @note Copy supports CPU <-> GPU, GPU <-> GPU. */ -inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, - Tensor* dst) { - src.check_memory_size(); - dst->Resize(src.dims()); - dst->set_layout(src.layout()); - - auto src_place = src.place(); - auto src_ptr = src.data(); - - auto dst_ptr = dst->mutable_data(dst_place, src.type()); - - auto size = src.numel() * SizeOfType(src.type()); - - PADDLE_ENFORCE(platform::is_cpu_place(src_place) && - platform::is_cpu_place(dst_place)); - - memory::Copy(boost::get(dst_place), dst_ptr, - boost::get(src_place), src_ptr, size); +inline void Copy(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + if (platform::is_gpu_place(src.place())) { + dev_ctx = pool.Get(src.place()); + } else { + dev_ctx = pool.Get(dst_place); + } + Copy(src, dst_place, *dev_ctx, dst); } /** @@ -116,8 +116,8 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, * @param[in] src The external tensor. * @param[in] ctx The device context contains device resources. * - * * @note CopyFromVector assumes that the tensor has been resized - * before invoking. + * * @note CopyFromVector will resize dst to an 1D tensor with the same + * size as src. 
*/ template inline void CopyFromVector(const std::vector& src, @@ -316,7 +316,7 @@ inline void DeserializeFromStream(std::istream& is, Tensor* tensor, DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); is.read(static_cast(buf), cpu_tensor.memory_size()); auto cpu_place = new platform::CPUPlace(); - framework::CopyFrom(cpu_tensor, *cpu_place, dev_ctx, tensor); + framework::Copy(cpu_tensor, *cpu_place, dev_ctx, tensor); delete cpu_place; #else PADDLE_THROW("Unexpected branch"); diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index 15cd2bd09c..3636125f20 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -19,7 +19,7 @@ namespace paddle { namespace framework { -TEST(CopyFrom, Tensor) { +TEST(Copy, Tensor) { Tensor src_tensor; Tensor dst_tensor; platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); @@ -32,7 +32,7 @@ TEST(CopyFrom, Tensor) { src_tensor.set_layout(DataLayout::kAnyLayout); auto cpu_place = new platform::CPUPlace(); - CopyFrom(src_tensor, *cpu_place, &dst_tensor); + Copy(src_tensor, *cpu_place, &dst_tensor); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); @@ -43,7 +43,7 @@ TEST(CopyFrom, Tensor) { EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); Tensor slice_tensor = src_tensor.Slice(1, 2); - CopyFrom(slice_tensor, *cpu_place, &dst_tensor); + Copy(slice_tensor, *cpu_place, &dst_tensor); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); @@ -67,11 +67,11 @@ TEST(CopyFrom, Tensor) { // CPU Tensor to GPU Tensor auto gpu_place = new platform::CUDAPlace(0); platform::CUDADeviceContext gpu_ctx(*gpu_place); - CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); // GPU Tensor to CPU Tensor auto cpu_place = new platform::CPUPlace(); - CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -84,10 +84,10 @@ TEST(CopyFrom, Tensor) { Tensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor - CopyFrom(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor); // GPU Tensor to CPU Tensor - CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); // Sync before Compare Slice Tensors gpu_ctx.Wait(); @@ -155,7 +155,7 @@ TEST(CopyFromVector, Tensor) { CUDADeviceContext gpu_ctx(*gpu_place); CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); // Copy from GPU to CPU tensor for comparison - CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -175,7 +175,7 @@ TEST(CopyFromVector, Tensor) { CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); gpu_tensor.Resize(make_ddim({2, 2})); CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); - CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); // Sync before Compare Tensors gpu_ctx.Wait(); @@ -287,7 +287,7 @@ TEST(Tensor, SerializeAndDeserialize) { auto gpu_place = new platform::CUDAPlace(); platform::CUDADeviceContext gpu_ctx(*gpu_place); - CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); std::ostringstream oss; SerializeToStream(oss, gpu_tensor, gpu_ctx); diff --git a/paddle/framework/var_desc.cc 
b/paddle/framework/var_desc.cc index aeab18d721..62ab6593ef 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const { case proto::VarDesc::LOD_TENSOR_ARRAY: return desc_.tensor_array().tensor(); default: - PADDLE_THROW("The type of var '", this->Name(), "' is unsupported."); + PADDLE_THROW("The type of var %s is unsupported.", this->Name()); } } diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h index e5a94759f9..36b76fb196 100644 --- a/paddle/framework/variable.h +++ b/paddle/framework/variable.h @@ -32,6 +32,8 @@ class Variable { return *static_cast(holder_->Ptr()); } + bool IsInitialized() const { return holder_ != nullptr; } + template T* GetMutable() { if (!IsType()) { diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h index ebfe0573cf..4ab54a5022 100644 --- a/paddle/gserver/gradientmachines/GradientMachine.h +++ b/paddle/gserver/gradientmachines/GradientMachine.h @@ -233,6 +233,13 @@ public: (void)numProcessed; } + /** + * @brief Release the middle layer's output memory. + * + * @note This function is used for memory optimization in inference. + */ + virtual void releaseOutput() {} + protected: virtual void onLoadParameter() {} diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 68bf37d59d..1f2aa61b6f 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -187,6 +187,31 @@ void NeuralNetwork::init(const ModelConfig& config, CHECK(it != layerMap_.end()); outputLayers_.push_back(it->second); } + + for (const auto& layer : layers_) { + const auto& name = layer->getName(); + bool isMiddleLayer = true; + + // if data layer + for (const auto& dataLayer : dataLayers_) { + if (name == dataLayer->getName()) { + isMiddleLayer = false; + break; + } + } + + // if output layer + for (const auto& dataLayer : outputLayers_) { + if (name == dataLayer->getName()) { + isMiddleLayer = false; + break; + } + } + + if (isMiddleLayer) { + middleLayers_.push_back(layer); + } + } } void NeuralNetwork::connect(LayerPtr agentLayer, @@ -327,6 +352,13 @@ void NeuralNetwork::onPassEnd() { } } +void NeuralNetwork::releaseOutput() { + for (auto& layer : middleLayers_) { + Argument& arg = layer->getOutput(); + arg.value.reset(); + } +} + #ifndef PADDLE_MOBILE_INFERENCE class CombinedEvaluator : public Evaluator { diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h index 6888380290..968e198cf6 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.h +++ b/paddle/gserver/gradientmachines/NeuralNetwork.h @@ -137,6 +137,13 @@ public: /// some finish work, like convert the weight format of MKLDNNLayers void finish(); + /** + * @brief Release the middle layer's output memory. + * + * @note This function is used for memory optimization in inference. + */ + void releaseOutput(); + protected: /** * The constructor of NeuralNetwork. 
@@ -158,6 +165,7 @@ protected: std::vector dataLayers_; std::vector outputLayers_; + std::vector middleLayers_; static std::map dllInitMap; diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp index 6fbf3c7fde..2d0fff608c 100644 --- a/paddle/gserver/layers/MKLDNNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -132,6 +132,8 @@ void MKLDNNLayer::reshapeInput(int& batchsize, if (w != 0) { width = w; } + height = height != 0 ? height : 1; + width = width != 0 ? width : 1; } void MKLDNNLayer::reshapeOutput(size_t height, size_t width) { diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index e48b9b5a91..3ba39f18b6 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -98,6 +98,8 @@ protected: public: explicit MKLDNNLayer(const LayerConfig& config) : Layer(config), + ih_(0), + iw_(0), condition_(0), needResetBwd_(true), outputOnlyMKLDNN_(false), diff --git a/paddle/inference/inference.cc b/paddle/inference/inference.cc index 48a51efcd2..37b8b20ddf 100644 --- a/paddle/inference/inference.cc +++ b/paddle/inference/inference.cc @@ -38,23 +38,16 @@ void InferenceEngine::LoadInferenceModel( LOG(INFO) << "program_desc_str's size: " << program_desc_str.size(); // PicklingTools cannot parse the vector of strings correctly. #else - // program_desc_str - // the inference.model is stored by following python codes: - // inference_program = fluid.io.get_inference_program(predict) - // model_filename = "recognize_digits_mlp.inference.model/inference.model" - // with open(model_filename, "w") as f: - // program_str = inference_program.desc.serialize_to_string() - // f.write(struct.pack('q', len(program_str))) - // f.write(program_str) - std::string model_filename = dirname + "/inference.model"; + std::string model_filename = dirname + "/__model__.dat"; LOG(INFO) << "loading model from " << model_filename; - std::ifstream fs(model_filename, std::ios_base::binary); - int64_t size = 0; - fs.read(reinterpret_cast(&size), sizeof(int64_t)); - LOG(INFO) << "program_desc_str's size: " << size; + std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary); std::string program_desc_str; - program_desc_str.resize(size); - fs.read(&program_desc_str[0], size); + inputfs.seekg(0, std::ios::end); + program_desc_str.resize(inputfs.tellg()); + inputfs.seekg(0, std::ios::beg); + LOG(INFO) << "program_desc_str's size: " << program_desc_str.size(); + inputfs.read(&program_desc_str[0], program_desc_str.size()); + inputfs.close(); #endif program_ = new framework::ProgramDesc(program_desc_str); GenerateLoadProgram(dirname); @@ -176,7 +169,7 @@ void InferenceEngine::Execute(const std::vector& feeds, } auto* place = new platform::CPUPlace(); - framework::InitDevices({"CPU"}); + framework::InitDevices(); framework::Executor* executor = new framework::Executor(*place); framework::Scope* scope = new framework::Scope(); diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index c4bb6baee7..1a73a94567 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -114,5 +114,21 @@ void Free(platform::CUDAPlace place, void* p) { #endif +size_t Usage::operator()(const platform::CPUPlace& cpu) const { + return Used(cpu); +} + +size_t Usage::operator()(const platform::CUDAPlace& gpu) const { +#ifdef PADDLE_WITH_CUDA + return Used(gpu); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif +} + +size_t memory_usage(const platform::Place& p) { + return 
boost::apply_visitor(Usage(), p); +} + } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 11bbb88187..7012b6d331 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -54,6 +54,13 @@ void Free(Place place, void* ptr); template size_t Used(Place place); +struct Usage : public boost::static_visitor { + size_t operator()(const platform::CPUPlace& cpu) const; + size_t operator()(const platform::CUDAPlace& gpu) const; +}; + +size_t memory_usage(const platform::Place& p); + /** * \brief Free memory block in one place. * diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index f476bf7126..b3f699f9b7 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -44,6 +44,9 @@ TEST(BuddyAllocator, CPUAllocation) { EXPECT_NE(p, nullptr); + paddle::platform::Place place = cpu; + EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place)); + paddle::memory::Free(cpu, p); } @@ -99,6 +102,9 @@ TEST(BuddyAllocator, GPUAllocation) { EXPECT_NE(p, nullptr); + paddle::platform::Place place = gpu; + EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place)); + paddle::memory::Free(gpu, p); } diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 77b52eb176..e1b695e8cd 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -135,6 +135,7 @@ op_library(detection_output_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) +op_library(print_op DEPS lod_tensor) op_library(adagrad_op DEPS selected_rows_functor) op_library(conv_op DEPS vol2col) op_library(pool_op DEPS pooling) @@ -151,7 +152,9 @@ op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op DEPS executor) +op_library(warpctc_op DEPS dynload_warpctc sequence_padding math_function) op_library(cos_sim_op DEPS cos_sim_functor) +op_library(parallel_do_op DEPS executor) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index d7baa6e905..8e8a3c7dd3 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -53,7 +53,7 @@ class AccuracyOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h index 0d77dbcbac..66f5b0f449 100644 --- a/paddle/operators/adagrad_op.h +++ b/paddle/operators/adagrad_op.h @@ -47,8 +47,7 @@ class AdagradOpKernel : public framework::OpKernel { *ctx.Input("Grad")); auto moment = framework::EigenVector::Flatten( *ctx.Input("Moment")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); + auto* learning_rate = ctx.Input("LearningRate"); auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); @@ -56,8 +55,16 @@ class AdagradOpKernel : public framework::OpKernel { moment_out.device(*place) = moment + grad * grad; Eigen::DSizes 
m_dsize(moment_out_tensor->numel()); - param_out.device(*place) = - param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + if (platform::is_cpu_place(ctx.GetPlace())) { + auto* lr = learning_rate->data(); + param_out.device(*place) = + param - lr[0] * grad / (moment_out.sqrt() + epsilon); + } else { + auto lr = framework::EigenVector::Flatten(*learning_rate); + param_out.device(*place) = + param - + lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } } else if (grad_var->IsType()) { auto* param_tensor = ctx.Input("Param"); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index e0eef5d9f9..3fdad5ad9b 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -42,7 +42,7 @@ class ArrayOp : public framework::OperatorBase { if (platform::is_gpu_place(i_tensor.place())) { // FIXME: Avoid copy from GPU to CPU framework::Tensor t; - framework::CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx, &t); + framework::Copy(i_tensor, platform::CPUPlace(), dev_ctx, &t); dev_ctx.Wait(); offset = static_cast(*t.data()); } else { diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index 49366fee8d..ba5c6bd3c6 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -110,8 +110,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, - dev_ctx, &slice); + framework::Copy(x[x_idx].Slice(start_offset, end_offset), place, + dev_ctx, &slice); out_offset += len; } } diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index 7d77be3be1..e04aa2d28c 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -45,7 +45,7 @@ class AssignFunctor { out_rows.set_height(rows.height()); auto &t = rows.value(); auto *m = out_rows.mutable_value(); - framework::CopyFrom(t, t.place(), dev_ctx_, m); + framework::Copy(t, t.place(), dev_ctx_, m); } template @@ -57,7 +57,7 @@ class AssignFunctor { void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { auto &out_tensor = *out; - CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); + Copy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } diff --git a/paddle/operators/assign_value_op.cc b/paddle/operators/assign_value_op.cc new file mode 100644 index 0000000000..d5671c1183 --- /dev/null +++ b/paddle/operators/assign_value_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/assign_value_op.h" + +namespace paddle { +namespace operators { + +class AssignValueOp : public framework::OperatorWithKernel { + public: + AssignValueOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of AssignValueOp should not be null."); + auto shape = ctx->Attrs().Get>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::proto::DataType(ctx.Attr("dtype")), ctx.GetPlace()); + } +}; + +class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AssignValueOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "(Tensor) Output tensor of assign_value operator."); + AddAttr>("shape", + "(vector) " + "Shape of values."); + AddAttr("dtype", "data type of values") + .InEnum({framework::proto::DataType::INT32, + framework::proto::DataType::FP32}); + AddAttr>("fp32_values", "store the float values") + .SetDefault({}); + AddAttr>("int32_values", "store the int values") + .SetDefault({}); + AddComment(R"DOC( +AssignValue operator + +$$Out = values$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker); +REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/operators/assign_value_op.cu.cc b/paddle/operators/assign_value_op.cu.cc new file mode 100644 index 0000000000..b17e201500 --- /dev/null +++ b/paddle/operators/assign_value_op.cu.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/assign_value_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/operators/assign_value_op.h b/paddle/operators/assign_value_op.h new file mode 100644 index 0000000000..db2e430779 --- /dev/null +++ b/paddle/operators/assign_value_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class AssignValueKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto shape = ctx.Attr>("shape"); + auto* out = ctx.Output("Out"); + int dtype = ctx.Attr("dtype"); + const char* value_name = nullptr; + switch (dtype) { + case framework::proto::DataType::INT32: + value_name = "int32_values"; + break; + case framework::proto::DataType::FP32: + value_name = "fp32_values"; + break; + default: + PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype); + break; + } + auto values = ctx.Attr>(value_name); + framework::CopyFromVector(values, ctx.device_context(), out); + out->Resize(framework::make_ddim(shape)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc index c16bc11931..b6494f9509 100644 --- a/paddle/operators/auc_op.cc +++ b/paddle/operators/auc_op.cc @@ -39,7 +39,7 @@ class AucOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index dd7b038b00..0e984c38ba 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -306,7 +306,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) { diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc index 2e0513b37a..ed2e7b738a 100644 --- a/paddle/operators/beam_search_op.cc +++ b/paddle/operators/beam_search_op.cc @@ -39,7 +39,7 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, std::map> hash; framework::LoD new_lod; - auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); + auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); auto *scores_data = selected_scores->mutable_data(platform::CPUPlace()); @@ -66,7 +66,7 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, void BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, std::vector> *items) { - auto *pre_ids_data = pre_ids.data(); + auto *pre_ids_data = pre_ids.data(); for (size_t offset = 0; offset < items->size(); offset++) { auto prefix_id = pre_ids_data[offset]; @@ -127,7 +127,7 @@ bool BeamSearch::NextItemSet(std::vector *items) { auto abs_lod = framework::ToAbsOffset(ids.lod()); PADDLE_ENFORCE_GE(source_abs_two_level_lod.size(), 2UL); - auto *ids_data = ids.data(); + auto *ids_data = ids.data(); auto *scores_data = scores.data(); size_t instance_dim = 1; diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc index a040404266..44f667aead 100644 --- a/paddle/operators/chunk_eval_op.cc +++ b/paddle/operators/chunk_eval_op.cc @@ -55,10 +55,10 @@ class ChunkEvalOp : public framework::OperatorWithKernel { } 
protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType(framework::proto::DataType::FP32, - ctx.device_context()); + platform::CPUPlace()); } }; diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h index 74ab435c86..300aff90c0 100644 --- a/paddle/operators/chunk_eval_op.h +++ b/paddle/operators/chunk_eval_op.h @@ -145,6 +145,7 @@ class ChunkEvalKernel : public framework::OpKernel { context.Attr>("excluded_chunk_types").end()); auto* inference = context.Input("Inference"); + auto place = inference->place(); auto* label = context.Input("Label"); auto* precision = context.Output("Precision"); auto* recall = context.Output("Recall"); @@ -155,15 +156,15 @@ class ChunkEvalKernel : public framework::OpKernel { const int64_t* inference_data = inference->data(); const int64_t* label_data = label->data(); - T* precision_data = precision->mutable_data(context.GetPlace()); - T* racall_data = recall->mutable_data(context.GetPlace()); - T* f1_data = f1->mutable_data(context.GetPlace()); + T* precision_data = precision->mutable_data(place); + T* racall_data = recall->mutable_data(place); + T* f1_data = f1->mutable_data(place); int64_t* num_infer_chunks_data = - num_infer_chunks->mutable_data(context.GetPlace()); + num_infer_chunks->mutable_data(place); int64_t* num_label_chunks_data = - num_label_chunks->mutable_data(context.GetPlace()); + num_label_chunks->mutable_data(place); int64_t* num_correct_chunks_data = - num_correct_chunks->mutable_data(context.GetPlace()); + num_correct_chunks->mutable_data(place); *num_infer_chunks_data = 0; *num_label_chunks_data = 0; *num_correct_chunks_data = 0; diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc index 44665b7872..daa2c193b4 100644 --- a/paddle/operators/compare_op.cc +++ b/paddle/operators/compare_op.cc @@ -66,9 +66,9 @@ class CompareOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx); + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); // CompareOp kernel's device type is decided by input tensor place kt.place_ = ctx.Input("X")->place(); return kt; diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 0aa7dd48ca..0c5ed3e4e8 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -315,10 +315,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_KERNEL(conv2d, CUDNN, paddle::platform::CUDAPlace, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); - +// TODO(dzhwinter) : below register should be removed REGISTER_OP_CUDA_KERNEL(conv2d_cudnn, paddle::operators::CudnnConvOpKernel, paddle::operators::CudnnConvOpKernel); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index ad84524e17..1468e3eb96 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -230,7 +230,6 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { namespace ops = paddle::operators; REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad, 
ops::ConvOpGrad); -namespace ops = paddle::operators; REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index 024e1d061a..30626028c1 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -120,17 +120,11 @@ class CRFDecodingOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Emission")->type()), - ctx.device_context()); - } - - framework::OpKernelType GetExpectedKernelType( - const framework::OpKernelType& actual_kernel_type) const override { - return framework::OpKernelType(actual_kernel_type.data_type_, - platform::CPUPlace()); + platform::CPUPlace()); } }; } // namespace operators diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h index f6827b7b11..ce2f4e6622 100644 --- a/paddle/operators/crf_decoding_op.h +++ b/paddle/operators/crf_decoding_op.h @@ -28,9 +28,6 @@ template class CRFDecodingOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "The crf_decoding operator can only run on CPU."); - auto* emission_weights = ctx.Input("Emission"); auto* transition_weights = ctx.Input("Transition"); auto* label = ctx.Input("Label"); diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index fe39cb481a..7abd5b1c61 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -51,7 +51,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of cross_entropy // is determined by its input "X". - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -101,7 +101,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of cross_entropy // is determined by its input "X". - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt index f6bdc63cc2..571a75c9dc 100644 --- a/paddle/operators/detail/CMakeLists.txt +++ b/paddle/operators/detail/CMakeLists.txt @@ -1 +1 @@ -grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) +grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/operators/detail/grpc_client.cc new file mode 100644 index 0000000000..5a4db2d7e6 --- /dev/null +++ b/paddle/operators/detail/grpc_client.cc @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "grpc_client.h" +namespace paddle { +namespace operators { +namespace detail { + +bool RPCClient::AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + sendrecv::VariableMessage req; + auto* var = scope.FindVar(var_name); + SerializeToMessage(var_name, var, ctx, &req); + + // varhandle + VarHandle var_h; + var_h.ep = ep; + var_h.scope = &scope; + var_h.name = var_name; + var_h.ctx = &ctx; + + // stub context + auto ch = GetChannel(ep); + SendProcessor* s = new SendProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = NULL; + + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + + req_count_++; + + return true; +} + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& ret_msg) { + auto* outvar = var_h.scope->FindVar(var_h.name); + + std::istringstream iss(ret_msg.serialized()); + DeserializeFromMessage(ret_msg, *var_h.ctx, outvar); +} + +bool RPCClient::AsyncGetVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(var_name); + + auto* var = scope.FindVar(var_name); + SerializeToMessage(var_name, var, ctx, &req); + + // varhandle + VarHandle var_h; + var_h.ep = ep; + var_h.scope = &scope; + var_h.name = var_name; + var_h.ctx = &ctx; + + // stub context + auto ch = GetChannel(ep); + GetProcessor* s = new GetProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = ProcGetResponse; + + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + + req_count_++; + + return true; +} + +bool RPCClient::wait() { + bool ok = true; + + while (true) { + if (req_count_ <= 0) { + break; + } + + if (!Proceed()) { + LOG(ERROR) << "Get meets CompletionQueue error"; + return false; + } + } + + return ok; +} + +bool RPCClient::Proceed() { + void* tag = NULL; + bool ok = false; + + // request counts. + if (!cq_.Next(&tag, &ok)) { + return false; + } + req_count_--; + + GPR_ASSERT(ok); + PADDLE_ENFORCE(tag); + + // TODO(gongwb): add more retries. 
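// Each tag handed back by cq_.Next() is the SendProcessor or GetProcessor that
// was heap-allocated when the request was issued; from this point Proceed()
// owns it and deletes it on every path below, otherwise one object would leak
// per completed RPC.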
+ ClientBase* c = static_cast(tag); + if (!c->status_.ok()) { + delete c; + return true; + } + + c->Process(); + delete c; + return true; +} + +std::shared_ptr RPCClient::GetChannel(const std::string& ep) { + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + + auto ch = std::shared_ptr( + grpc::CreateChannel(ep, grpc::InsecureChannelCredentials())); + + channels_[ep] = ch; + return ch; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/grpc_client.h b/paddle/operators/detail/grpc_client.h new file mode 100644 index 0000000000..d27b5ced9e --- /dev/null +++ b/paddle/operators/detail/grpc_client.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/framework/data_type.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/operators/detail/sendrecvop_utils.h" +#include "paddle/operators/detail/simple_block_queue.h" + +namespace paddle { +namespace operators { +namespace detail { + +struct VarHandle { + std::string ep; + const platform::DeviceContext* ctx; + const framework::Scope* scope; + std::string name; + + std::string String() const { + std::ostringstream s; + s << "name:[" << name << "] ep:[" << ep << "]"; + return s.str(); + } +}; + +void ProcGetResponse(const VarHandle& var_h, + const sendrecv::VariableMessage& msg); + +class ClientBase { + public: + explicit ClientBase(std::shared_ptr ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + context_ = NULL; + } + + virtual ~ClientBase() {} + + virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + context_.reset(new grpc::ClientContext()); + var_h_ = var_info; + + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); + + context_->set_deadline(deadline); + } + + virtual void Process() = 0; + + std::unique_ptr stub_; + std::unique_ptr context_; + grpc::Status status_; + VarHandle var_h_; +}; + +typedef std::function + RequestSendCallBack; + +class SendProcessor : public ClientBase { + public: + explicit SendProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~SendProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + sendrecv::VoidMessage reply_; + RequestSendCallBack response_call_back_ = NULL; +}; + +typedef std::function + RequestGetCallBack; + +class GetProcessor : public ClientBase { + public: + explicit GetProcessor(std::shared_ptr ch) : ClientBase(ch) {} + + virtual ~GetProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + sendrecv::VariableMessage reply_; + RequestGetCallBack response_call_back_ = ProcGetResponse; +}; 
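The `RPCClient` declared below is the piece the send/recv operators are expected to drive. A minimal usage sketch, assuming a populated `framework::Scope` named `scope`, a `platform::DeviceContext` named `dev_ctx`, and a placeholder endpoint (none of these names appear in the patch itself):

    // Sketch only: queue one send and one get, then block until both finish.
    detail::RPCClient client;
    client.AsyncSendVariable("127.0.0.1:6174", dev_ctx, scope, "weight");
    client.AsyncGetVariable("127.0.0.1:6174", dev_ctx, scope, "weight");
    if (!client.wait()) {
      LOG(ERROR) << "completion queue reported an error";
    }

Each Async* call allocates one processor object and increments `req_count_`; `wait()` keeps calling `Proceed()` until every pending request has been drained from the completion queue.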
+ +class RPCClient { + public: + bool AsyncSendVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + + bool AsyncGetVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = 600 * 1000); + bool wait(); + + private: + bool Proceed(); + std::shared_ptr GetChannel(const std::string& ep); + + private: + grpc::CompletionQueue cq_; + std::map> channels_; + int64_t req_count_ = 0; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/grpc_server.cc b/paddle/operators/detail/grpc_server.cc new file mode 100644 index 0000000000..e8d561a57f --- /dev/null +++ b/paddle/operators/detail/grpc_server.cc @@ -0,0 +1,237 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/detail/grpc_server.h" + +using grpc::ServerAsyncResponseWriter; + +namespace paddle { +namespace operators { +namespace detail { + +enum CallStatus { PROCESS = 0, FINISH }; + +// reference: +// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server +class RequestBase { + public: + explicit RequestBase(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq) + : service_(service), cq_(cq), status_(PROCESS) {} + virtual ~RequestBase() {} + virtual void Process() { assert(false); } + + CallStatus Status() { return status_; } + void SetStatus(CallStatus status) { status_ = status; } + + protected: + grpc::ServerContext ctx_; + sendrecv::SendRecvService::AsyncService* service_; + grpc::ServerCompletionQueue* cq_; + CallStatus status_; +}; + +typedef std::pair MessageWithName; + +class RequestSend final : public RequestBase { + public: + explicit RequestSend(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, + SimpleBlockQueue* queue) + : RequestBase(service, cq), queue_(queue), responder_(&ctx_) { + service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_, + this); + } + + virtual ~RequestSend() {} + + virtual void Process() { + MessageWithName msg_with_name = + std::make_pair(request_.varname(), std::move(request_)); + queue_->Push(std::move(msg_with_name)); + // TODO(gongwb): check var's info. 
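// Finish() below only enqueues the reply; the same `this` pointer comes back
// on the completion queue once the write completes, and HandleRequest() then
// sees the FINISH status (set via SetFinishOrDelete) and deletes this request.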
+ responder_.Finish(reply_, grpc::Status::OK, this); + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VoidMessage reply_; + SimpleBlockQueue* queue_; + ServerAsyncResponseWriter responder_; +}; + +class RequestGet final : public RequestBase { + public: + explicit RequestGet(sendrecv::SendRecvService::AsyncService* service, + grpc::ServerCompletionQueue* cq, framework::Scope* scope) + : RequestBase(service, cq), responder_(&ctx_), scope_(scope) { + service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this); + } + + virtual ~RequestGet() {} + + virtual void Process() { + // proc request. + std::string var_name = request_.varname(); + auto* var = scope_->FindVar(var_name); + SerializeToMessage(var_name, var, platform::CPUDeviceContext(), &reply_); + // TODO(gongwb): check var's info. + responder_.Finish(reply_, grpc::Status::OK, this); + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VariableMessage reply_; + ServerAsyncResponseWriter responder_; + framework::Scope* scope_; +}; + +void AsyncGRPCServer::RunSyncUpdate() { + grpc::ServerBuilder builder; + builder.AddListeningPort(address_, grpc::InsecureServerCredentials()); + builder.RegisterService(&service_); + + cq_send_ = builder.AddCompletionQueue(); + cq_get_ = builder.AddCompletionQueue(); + server_ = builder.BuildAndStart(); + LOG(INFO) << "Server listening on " << address_ << std::endl; + + std::function send_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); + std::function get_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); + + t_send_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, false, + cq_send_.get(), "cq_send", send_register))); + + t_get_.reset( + new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, true, + cq_get_.get(), "cq_get", get_register))); + + // wait server + server_->Wait(); + t_send_->join(); + t_get_->join(); +} + +void AsyncGRPCServer::ShutdownQueue() { + std::unique_lock lock(cq_mutex_); + cq_send_->Shutdown(); + cq_get_->Shutdown(); + is_shut_down_ = true; +} + +// This URL explains why shutdown is complicate: +// https://stackoverflow.com/questions/35708348/grpc-what-is-the-recommended-way-to-shut-down-an-asynchronous-server-in-c +void AsyncGRPCServer::ShutDown() { + server_->Shutdown(); + ShutdownQueue(); +} + +void AsyncGRPCServer::TryToRegisterNewSendOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestSend* send = + new RequestSend(&service_, cq_send_.get(), &var_recv_queue_); + VLOG(4) << "create RequestSend status:" << send->Status(); +} + +void AsyncGRPCServer::TryToRegisterNewGetOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_); + VLOG(4) << "create Requestget status:" << get->Status(); +} + +void AsyncGRPCServer::SetFinishOrDelete(RequestBase*& last) { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + delete last; + last = NULL; + return; + } + + last->SetStatus(FINISH); + return; +} + +void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq, + std::string cq_name, + std::function TryToRegisterNewOne) { + TryToRegisterNewOne(); + + void* tag = NULL; + bool ok = false; + while (true) { + if (!cq->Next(&tag, &ok)) { + LOG(INFO) << cq_name << " get CompletionQueue shutdown!"; + break; + } + + if (wait && !done_) { + Wait(); + } + + RequestBase* base = (RequestBase*)tag; + if (!ok) { + VLOG(4) << 
cq_name << " recv no regular event"; + TryToRegisterNewOne(); + delete base; + continue; + } + + switch (base->Status()) { + case PROCESS: { + VLOG(4) << cq_name << " status:" << base->Status(); + TryToRegisterNewOne(); + base->Process(); + SetFinishOrDelete(base); + break; + } + case FINISH: { + VLOG(4) << cq_name << " status:" << base->Status(); + delete base; + break; + } + default: { assert(false); } + } + } +} + +void AsyncGRPCServer::Wait() { + std::unique_lock lock(this->mutex_); + condition_.wait(lock, [=] { return this->done_ == true; }); +} + +void AsyncGRPCServer::Reset() { + std::lock_guard lock(this->mutex_); + done_ = false; +} + +void AsyncGRPCServer::Done() { + { + std::lock_guard lock(this->mutex_); + done_ = true; + } + condition_.notify_all(); +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/grpc_server.h b/paddle/operators/detail/grpc_server.h new file mode 100644 index 0000000000..041fe05b2e --- /dev/null +++ b/paddle/operators/detail/grpc_server.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/framework/var_type.h" +#include "paddle/operators/detail/simple_block_queue.h" + +#include "paddle/operators/detail/send_recv.grpc.pb.h" +#include "paddle/operators/detail/send_recv.pb.h" + +#include +#include +#include +#include "paddle/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +typedef std::pair MessageWithName; +class RequestBase; + +class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { + public: + explicit AsyncGRPCServer(std::string address) { address_ = address; } + + void RunSyncUpdate(); + + void Reset(); + + void Done(); + + void SetScope(framework::Scope *scope) { scope_ = scope; } + + const MessageWithName Get() { return this->var_recv_queue_.Pop(); } + + void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); } + + void ShutDown(); + + protected: + void Wait(); + void HandleRequest(bool wait, grpc::ServerCompletionQueue *cq, + std::string cq_name, + std::function TryToRegisterNewOne); + void TryToRegisterNewSendOne(); + void TryToRegisterNewGetOne(); + void SetFinishOrDelete(RequestBase *&last); + void ShutdownQueue(); + + private: + std::mutex cq_mutex_; + volatile bool is_shut_down_ = false; + std::unique_ptr cq_send_; + std::unique_ptr cq_get_; + + sendrecv::SendRecvService::AsyncService service_; + std::unique_ptr server_; + + std::string address_; + framework::Scope *scope_; + // received variable from RPC, operators fetch variable from this queue. 
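// RequestSend::Process() is the producer: it pushes each incoming
// (varname, message) pair into this queue, and the recv-side operator drains
// it through Get(), which pops one message at a time.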
+ SimpleBlockQueue var_recv_queue_; + + // condition of the sub program + std::mutex mutex_; + volatile mutable bool done_; + std::condition_variable condition_; + + std::unique_ptr t_send_; + std::unique_ptr t_get_; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc deleted file mode 100644 index 319404e56a..0000000000 --- a/paddle/operators/detail/recv_impl.cc +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "send_recv_impl.h" - -namespace paddle { -namespace operators { -namespace detail { - -Status SendRecvServerImpl::SendVariable(ServerContext *context, - const VariableMessage *in_var, - VoidMessage *out_var) { - MessageWithName msg_with_name = - std::make_pair(in_var->varname(), std::move(*in_var)); - var_recv_queue_.Push(std::move(msg_with_name)); - return Status::OK; -} - -Status SendRecvServerImpl::GetVariable(ServerContext *context, - const VariableMessage *in_var, - VariableMessage *out_var) { - std::string get_var_name = in_var->varname(); - auto *var = scope_->FindVar(get_var_name); - - SerializeToMessage(get_var_name, var, platform::CPUDeviceContext(), out_var); - return Status::OK; -} - -Status SendRecvServerImpl::Wait(ServerContext *context, - const VoidMessage *in_var, - VoidMessage *out_var) { - { - std::unique_lock lock(this->mutex_); - condition_.wait(lock, [=] { return this->done_ == true; }); - } - return Status::OK; -} - -void SendRecvServerImpl::Reset() { - std::lock_guard lock(this->mutex_); - done_ = false; -} - -void SendRecvServerImpl::Done() { - { - std::lock_guard lock(this->mutex_); - done_ = true; - } - condition_.notify_all(); -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc deleted file mode 100644 index ae85cf2cec..0000000000 --- a/paddle/operators/detail/send_impl.cc +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "send_recv_impl.h" - -namespace paddle { -namespace operators { -namespace detail { - -bool RPCClient::SendVariable(const framework::Scope& scope, - const std::string& inname) { - ClientContext context; - VariableMessage msg; - VoidMessage out_msg; - // FIXME(typhoonzero): pass device context to here. 
- auto ctx = platform::CPUDeviceContext(); - auto* var = scope.FindVar(inname); - PADDLE_ENFORCE(var); - SerializeToMessage(inname, var, ctx, &msg); - - Status status = stub_->SendVariable(&context, msg, &out_msg); - if (!status.ok()) { - LOG(ERROR) << "gRPC error: " << status.error_message(); - return false; - } - return true; -} - -bool RPCClient::GetVariable(const framework::Scope& scope, - const std::string& outname) { - ClientContext context; - VariableMessage call_msg, ret_msg; - call_msg.set_varname(outname); - auto ctx = platform::CPUDeviceContext(); - Status status = stub_->GetVariable(&context, call_msg, &ret_msg); - auto* outvar = scope.FindVar(outname); - if (!status.ok()) { - LOG(ERROR) << "gRPC error: " << status.error_message(); - return false; - } - - std::istringstream iss(ret_msg.serialized()); - DeserializeFromMessage(ret_msg, ctx, outvar); - - return true; -} - -void RPCClient::Wait() { - ClientContext context; - VoidMessage call_msg, ret_msg; - stub_->Wait(&context, call_msg, &ret_msg); -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto index f141c755ce..8f962b4c69 100644 --- a/paddle/operators/detail/send_recv.proto +++ b/paddle/operators/detail/send_recv.proto @@ -21,8 +21,6 @@ service SendRecvService { rpc SendVariable(VariableMessage) returns (VoidMessage) {} // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} - // wait for one execution of the program - rpc Wait(VoidMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h deleted file mode 100644 index 1fe54f1f05..0000000000 --- a/paddle/operators/detail/send_recv_impl.h +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/framework/lod_tensor.h" -#include "paddle/framework/scope.h" -#include "paddle/framework/selected_rows.h" -#include "paddle/framework/var_type.h" -#include "paddle/operators/detail/simple_block_queue.h" - -#include "paddle/operators/detail/send_recv.grpc.pb.h" -#include "paddle/operators/detail/send_recv.pb.h" - -#include - -using grpc::Channel; -using grpc::Server; -using grpc::ServerContext; -using grpc::ServerReader; -using grpc::ServerBuilder; - -using grpc::ClientContext; -using grpc::ClientReader; -using grpc::ClientReaderWriter; -using grpc::ClientWriter; -using grpc::Status; -using sendrecv::SendRecvService; -using sendrecv::VariableMessage; -using sendrecv::VoidMessage; - -namespace paddle { -namespace operators { -namespace detail { - -typedef std::pair MessageWithName; - -class SendRecvServerImpl final : public SendRecvService::Service { - public: - explicit SendRecvServerImpl() {} - - Status SendVariable(ServerContext *context, const VariableMessage *in_var, - VoidMessage *out_var) override; - Status GetVariable(ServerContext *context, const VariableMessage *in_var, - VariableMessage *out_var) override; - Status Wait(ServerContext *context, const VoidMessage *in_var, - VoidMessage *out_var) override; - void Reset(); - void Done(); - void SetScope(framework::Scope *scope) { scope_ = scope; }; - - const MessageWithName Get() { return this->var_recv_queue_.Pop(); } - - void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); } - - private: - // received variable from RPC, operators fetch variable from this queue. - SimpleBlockQueue var_recv_queue_; - framework::Scope *scope_; - // condition of the sub program - std::mutex mutex_; - bool done_; - std::condition_variable condition_; -}; - -// RPCClient is a class to send tensors to pserver sub-network -// using different hashing methods. 
-class RPCClient { - public: - RPCClient(std::shared_ptr channel) - : stub_(SendRecvService::NewStub(channel)) {} - - bool SendVariable(const framework::Scope &scope, const std::string &inname); - bool GetVariable(const framework::Scope &scope, const std::string &outname); - void Wait(); - - private: - std::unique_ptr stub_; -}; - -inline void SerializeToMessage(const std::string &name, - const framework::Variable *var, - const platform::DeviceContext &ctx, - VariableMessage *msg) { - msg->set_varname(name); - std::ostringstream oss; - switch (framework::ToVarType(var->Type())) { - case framework::proto::VarDesc_VarType_LOD_TENSOR: - msg->set_type(sendrecv::VarType::LOD_TENSOR); - framework::SerializeToStream(oss, var->Get(), ctx); - break; - case framework::proto::VarDesc_VarType_SELECTED_ROWS: - msg->set_type(sendrecv::VarType::SELECTED_ROWS); - framework::SerializeToStream(oss, var->Get(), - ctx); - break; - default: { - PADDLE_THROW("Serialize does not support type: %s", - typeid(var->Type()).name()); - break; - } - } - msg->set_serialized(oss.str()); -} - -inline void DeserializeFromMessage(const VariableMessage &msg, - const platform::DeviceContext &ctx, - framework::Variable *var) { - using namespace paddle::framework::proto; - std::istringstream iss(msg.serialized()); - switch (msg.type()) { - case sendrecv::VarType::LOD_TENSOR: - DeserializeFromStream(iss, var->GetMutable(), ctx); - break; - case sendrecv::VarType::SELECTED_ROWS: { - DeserializeFromStream(iss, var->GetMutable(), - ctx); - break; - } - default: { - PADDLE_THROW("Deserialize does not support type: %s", - typeid(var->Type()).name()); - break; - } - } -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/detail/sendrecvop_utils.cc b/paddle/operators/detail/sendrecvop_utils.cc new file mode 100644 index 0000000000..7635b9e8db --- /dev/null +++ b/paddle/operators/detail/sendrecvop_utils.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg) { + msg->set_varname(name); + std::ostringstream oss; + switch (framework::ToVarType(var->Type())) { + case framework::proto::VarDesc_VarType_LOD_TENSOR: + msg->set_type(sendrecv::VarType::LOD_TENSOR); + framework::SerializeToStream(oss, var->Get(), ctx); + break; + case framework::proto::VarDesc_VarType_SELECTED_ROWS: + msg->set_type(sendrecv::VarType::SELECTED_ROWS); + framework::SerializeToStream(oss, var->Get(), + ctx); + break; + default: { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } + msg->set_serialized(oss.str()); +} + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var) { + std::istringstream iss(msg.serialized()); + switch (msg.type()) { + case sendrecv::VarType::LOD_TENSOR: + DeserializeFromStream(iss, var->GetMutable(), ctx); + break; + case sendrecv::VarType::SELECTED_ROWS: { + DeserializeFromStream(iss, var->GetMutable(), + ctx); + break; + } + default: { + PADDLE_THROW("Deserialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + } +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/sendrecvop_utils.h b/paddle/operators/detail/sendrecvop_utils.h new file mode 100644 index 0000000000..bc6581afab --- /dev/null +++ b/paddle/operators/detail/sendrecvop_utils.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include + +#include "paddle/framework/data_type.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/framework/var_type.h" + +#include "paddle/operators/detail/send_recv.grpc.pb.h" +#include "paddle/operators/detail/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace detail { + +void SerializeToMessage(const std::string& name, const framework::Variable* var, + const platform::DeviceContext& ctx, + sendrecv::VariableMessage* msg); + +void DeserializeFromMessage(const sendrecv::VariableMessage& msg, + const platform::DeviceContext& ctx, + framework::Variable* var); +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h index f8abd5b640..86285b748a 100644 --- a/paddle/operators/detection_output_op.h +++ b/paddle/operators/detection_output_op.h @@ -98,16 +98,16 @@ class DetectionOutputKernel : public framework::OpKernel { T* conf_data = conf_tensor.data(); if (platform::is_gpu_place(context.GetPlace())) { loc_cpu.mutable_data(loc_tensor.dims(), platform::CPUPlace()); - framework::CopyFrom(loc_tensor, platform::CPUPlace(), - context.device_context(), &loc_cpu); + framework::Copy(loc_tensor, platform::CPUPlace(), + context.device_context(), &loc_cpu); loc_data = loc_cpu.data(); conf_cpu.mutable_data(conf_tensor.dims(), platform::CPUPlace()); - framework::CopyFrom(conf_tensor, platform::CPUPlace(), - context.device_context(), &conf_cpu); + framework::Copy(conf_tensor, platform::CPUPlace(), + context.device_context(), &conf_cpu); conf_data = conf_cpu.data(); priorbox_cpu.mutable_data(in_priorbox->dims(), platform::CPUPlace()); - framework::CopyFrom(*in_priorbox, platform::CPUPlace(), - context.device_context(), &priorbox_cpu); + framework::Copy(*in_priorbox, platform::CPUPlace(), + context.device_context(), &priorbox_cpu); priorbox_data = priorbox_cpu.data(); } // get decode bboxes @@ -158,8 +158,8 @@ class DetectionOutputKernel : public framework::OpKernel { batch_size, all_indices, all_decoded_bboxes, out_data); if (platform::is_gpu_place(context.GetPlace())) { - framework::CopyFrom(out_cpu, platform::CUDAPlace(), - context.device_context(), out); + framework::Copy(out_cpu, platform::CUDAPlace(), context.device_context(), + out); } } }; diff --git a/paddle/operators/edit_distance_op.cc b/paddle/operators/edit_distance_op.cc new file mode 100644 index 0000000000..e383f07fa9 --- /dev/null +++ b/paddle/operators/edit_distance_op.cc @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/edit_distance_op.h" + +namespace paddle { +namespace operators { + +class EditDistanceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Hyps"), "Input(Hyps) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Refs"), "Input(Refs) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null."); + auto hyp_dims = ctx->GetInputDim("Hyps"); + auto ref_dims = ctx->GetInputDim("Refs"); + PADDLE_ENFORCE(hyp_dims.size() == 2 && hyp_dims[1] == 1, + "Input(Hyps) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + PADDLE_ENFORCE(ref_dims.size() == 2 && ref_dims[1] == 1, + "Input(Refs) must be a 2-D LoDTensor with the 2nd dimension " + "equal to 1."); + ctx->SetOutputDim("Out", ctx->GetInputDim("Refs")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(framework::proto::DataType::FP32, + ctx.device_context()); + } +}; + +class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Hyps", + "(2-D LoDTensor, 2nd dim. equal to 1) " + "The indices for hypothesis strings."); + AddInput("Refs", + "(2-D LoDTensor, 2nd dim. equal to 1) " + "The indices for reference strings."); + AddAttr("normalized", + "(bool, default false) Indicates whether to normalize " + "the edit distance by the length of the reference string.") + .SetDefault(false); + AddOutput("Out", + "(2-D Tensor with shape [`batch_size` x 1]) " + "The output edit distances of EditDistance operator."); + AddComment(R"DOC( + +EditDistance operator computes the edit distances between a batch of hypothesis +strings and their references. + +Edit distance, also called Levenshtein distance, measures how dissimilar two strings +are by counting the minimum number of operations to transform one string into another. +Here the operations include insertion, deletion, and substitution. For example, +given hypothesis string A = "kitten" and reference B = "sitting", the edit distance +is 3, since transforming A into B requires at least two substitutions and one +insertion: + + "kitten" -> "sitten" -> "sittin" -> "sitting" + +Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total +number denoted by `batch_size`, and the separation is specified by the LoD information. +The `batch_size` reference strings are arranged in the same order in the +LoDTensor Input(Refs). + +Output(Out) contains the `batch_size` results, each of which is the edit distance +for the corresponding pair of strings. If Attr(normalized) is true, the edit distance +will be divided by the length of the reference string. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(edit_distance, ops::EditDistanceOp, ops::EditDistanceOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + edit_distance, ops::EditDistanceKernel); diff --git a/paddle/operators/edit_distance_op.cu b/paddle/operators/edit_distance_op.cu new file mode 100644 index 0000000000..cf5ebc5c38 --- /dev/null +++ b/paddle/operators/edit_distance_op.cu @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors.
All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/framework/op_registry.h" +#include "paddle/platform/cuda_helper.h" +#include "paddle/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillFirstRow(T* dist, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < N + 1) { + dist[idx] = idx; + } +} + +template +__global__ void FillFirstColumn(T* dist, const int M, const int N) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx < M + 1) { + dist[idx * (N + 1)] = idx; + } +} + +template +__global__ void Levenshtein(T* dist, const int* x1, const int* x2, const int M, + const int N, const int start) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int offset = N; + int index = start + idx * offset; + int row = index / (N + 1); + int col = index % (N + 1); + if (row > 0 && col > 0 && row < M + 1 && col < N + 1) { + int cost = x1[row - 1] == x2[col - 1] ? 0 : 1; + int dels = dist[(row - 1) * (N + 1) + col] + 1; + int ins = dist[row * (N + 1) + col - 1] + 1; + int subs = dist[(row - 1) * (N + 1) + (col - 1)] + cost; + dist[index] = min(dels, min(ins, subs)); + } +} + +template +__global__ void SetOutput(T* out, const T* dist, const int M, const int N, + bool normalized) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx == 0) { + out[0] = normalized ? 
dist[M * (N + 1) + N] / N : dist[M * (N + 1) + N]; + } +} + +template +class EditDistanceGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + + auto normalized = ctx.Attr("normalized"); + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + + auto num_strs = hyp_lod.size() - 1; + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; num++) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + if (m == 0 || n == 0) { + distance = std::max(m, n); + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + memory::Copy(boost::get(ctx.GetPlace()), out + num, + platform::CPUPlace(), &distance, sizeof(T), stream); + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + + FillFirstColumn<<<1 + m / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n); + + FillFirstRow<<<1 + n / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, n); + // Compute the elements of distance matrix in the anti-diagonal direction + for (int64_t slice = 2; slice < m + n + 1; ++slice) { + int z_m = slice < m + 1 ? 0 : slice - m; + int z_n = slice < n + 1 ? 0 : slice - n; + int size = slice - (z_m + z_n) + 1; // number of elements in the same + // anti-diagonal line to update + // the start index to compute from + int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1; + Levenshtein<<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, x1, x2, + m, n, start); + } + SetOutput<<<1, 1, 0, stream>>>(out + num, dist, m, n, normalized); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + edit_distance, + ops::EditDistanceGPUKernel); diff --git a/paddle/operators/edit_distance_op.h b/paddle/operators/edit_distance_op.h new file mode 100644 index 0000000000..537e70281a --- /dev/null +++ b/paddle/operators/edit_distance_op.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class EditDistanceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + + auto* x1_t = ctx.Input("Hyps"); + auto* x2_t = ctx.Input("Refs"); + + auto normalized = ctx.Attr("normalized"); + + auto hyp_lod = x1_t->lod()[0]; + auto ref_lod = x2_t->lod()[0]; + PADDLE_ENFORCE( + hyp_lod.size() == ref_lod.size(), + "Input(Hyps) and Input(Refs) must have the same batch size."); + for (size_t i = 1; i < ref_lod.size(); ++i) { + PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], + "Reference string %d is empty.", i); + } + auto num_strs = hyp_lod.size() - 1; + + out_t->Resize({static_cast(num_strs), 1}); + out_t->mutable_data(ctx.GetPlace()); + auto out = out_t->data(); + + T distance = 0.0; + for (size_t num = 0; num < num_strs; ++num) { + auto m = static_cast(hyp_lod[num + 1] - hyp_lod[num]); + auto n = static_cast(ref_lod[num + 1] - ref_lod[num]); + + if (m == 0) { + distance = n; + } else if (n == 0) { + distance = m; + } else { + framework::Tensor dist_t; + dist_t.Resize({m + 1, n + 1}); + dist_t.mutable_data(ctx.GetPlace()); + auto dist = dist_t.data(); + auto x1 = x1_t->data() + hyp_lod[num]; + auto x2 = x2_t->data() + ref_lod[num]; + for (int64_t i = 0; i < m + 1; ++i) { + dist[i * (n + 1)] = i; + } + for (int64_t j = 0; j < n + 1; ++j) { + dist[j] = j; + } + for (int64_t i = 1; i < m + 1; ++i) { + for (int64_t j = 1; j < n + 1; ++j) { + int cost = x1[i - 1] == x2[j - 1] ? 0 : 1; + int dels = dist[(i - 1) * (n + 1) + j] + 1; + int ins = dist[i * (n + 1) + (j - 1)] + 1; + int subs = dist[(i - 1) * (n + 1) + (j - 1)] + cost; + dist[i * (n + 1) + j] = std::min(dels, std::min(ins, subs)); + } + } + distance = dist[m * (n + 1) + n]; + } + + if (normalized) { + PADDLE_ENFORCE(n > 0, + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n); + distance = distance / n; + } + out[num] = distance; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc index 70b7c9f2ec..37951fa758 100644 --- a/paddle/operators/elementwise_add_op.cc +++ b/paddle/operators/elementwise_add_op.cc @@ -21,7 +21,7 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker { public: ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { - SetComment("Add", "$Out = X + Y$"); + SetComment("Add", "Out = X + Y"); AddComment(comment_); } }; diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc index 1fa960866f..6ebd58b1b3 100644 --- a/paddle/operators/elementwise_div_op.cc +++ b/paddle/operators/elementwise_div_op.cc @@ -21,7 +21,7 @@ class ElementwiseDivOpMaker : public ElementwiseOpMaker { public: ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { - SetComment("Div", "$Out = X / Y$"); + SetComment("Div", "Out = X / Y"); AddComment(comment_); } }; diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index a6d1173619..450dd05c79 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -22,7 +22,7 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker { public: ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* 
op_checker) : ElementwiseOpMaker(proto, op_checker) { - SetComment("Mul", "$Out = X \\odot\\ Y$"); + SetComment("Mul", "Out = X \\odot\\ Y"); AddComment(comment_); } }; diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h index f308ee05e1..a342595b54 100644 --- a/paddle/operators/elementwise_op.h +++ b/paddle/operators/elementwise_op.h @@ -58,7 +58,8 @@ Limited Elementwise {name} Operator. The equation is: -{equation} +.. math:: + {equation} X is a tensor of any dimension and the dimensions of tensor Y must be smaller than or equal to the dimensions of X. @@ -71,15 +72,16 @@ For case 2: Y will be broadcasted to match the shape of X and axis should be the starting dimension index for broadcasting Y onto X. -example: - shape(X) = (2, 3, 4, 5), shape(Y) = (,) - shape(X) = (2, 3, 4, 5), shape(Y) = (5,) - shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) - shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 +For example + .. code-block:: python -Both the input X and Y can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD information with input X. + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + +Either of the inputs X and Y or none can carry the LoD (Level of Details) information. However, the output only shares the LoD information with input X. )DOC"; AddComment(comment_); diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc index 2a8d0845b1..d3c51f0a69 100644 --- a/paddle/operators/elementwise_sub_op.cc +++ b/paddle/operators/elementwise_sub_op.cc @@ -21,7 +21,7 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker { public: ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { - SetComment("Sub", "$Out = X - Y$"); + SetComment("Sub", "Out = X - Y"); AddComment(comment_); } }; diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h index 1d9012cd4a..a4994cf3a5 100644 --- a/paddle/operators/expand_op.h +++ b/paddle/operators/expand_op.h @@ -126,8 +126,7 @@ class ExpandGradKernel : public framework::OpKernel { auto* in0 = context.Input(framework::GradVarName("Out")); auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); - framework::CopyFrom(*in0, context.GetPlace(), context.device_context(), - out0); + framework::Copy(*in0, context.GetPlace(), context.device_context(), out0); } else { switch (dims) { REP_EXPAND_GRAD_TEMPLATE(72) diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 48da52c3b6..d738e1850c 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -52,7 +52,7 @@ class FeedOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - framework::CopyFrom(feed_item, place, dev_ctx, out_item); + framework::Copy(feed_item, place, dev_ctx, out_item); out_item->set_lod(feed_item.lod()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 387d1e0a74..7205ee2a87 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -53,9 +53,9 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator 
always generate // CPU outputs? platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); + auto &dev_ctx = *pool.Get(src_item.place()); - CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); + Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item); dev_ctx.Wait(); dst_item.set_lod(src_item.lod()); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 852ecdfe45..c74a5b6ced 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -49,7 +49,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc index 084ba1db62..4f5a2ed169 100644 --- a/paddle/operators/fill_op.cc +++ b/paddle/operators/fill_op.cc @@ -72,7 +72,7 @@ class FillOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - framework::CopyFrom(tensor, place, dev_ctx, &out); + framework::Copy(tensor, place, dev_ctx, &out); } } }; diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc index 45e9d8df70..597fdad079 100644 --- a/paddle/operators/gather_op.cc +++ b/paddle/operators/gather_op.cc @@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index 9ed493a7d0..2dca05760e 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -60,7 +60,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), diff --git a/paddle/operators/get_places_op.cc b/paddle/operators/get_places_op.cc new file mode 100644 index 0000000000..24fafb2307 --- /dev/null +++ b/paddle/operators/get_places_op.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/platform/gpu_info.h" +#endif + +namespace paddle { +namespace operators { + +static size_t CUDADevCount() { +#ifdef PADDLE_WITH_CUDA + return platform::GetCUDADeviceCount(); +#else + return 0UL; +#endif +} + +class GetPlacesOp : public framework::OperatorBase { + public: + GetPlacesOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + bool is_gpu; + if (Attr("device_type") == "AUTO") { + is_gpu = platform::is_gpu_place(place); + } else { + is_gpu = Attr("device_type") == "CUDA"; + } + auto device_count = static_cast(Attr("device_count")); + if (device_count == 0) { + device_count = + is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); + } + PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count", + is_gpu ? "GPU" : "CPU"); + + auto out_var_name = Output("Out"); + auto &places = + *(detail::Ref(scope.FindVar(out_var_name), + "Output variable %s cannot be found", out_var_name) + .GetMutable()); + places.reserve(device_count); + if (is_gpu) { + PADDLE_ENFORCE_LE(device_count, CUDADevCount(), + "Only %d CUDA devices found, cannot set to %d", + CUDADevCount(), device_count); + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CUDAPlace(static_cast(i))); + } + } else { + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CPUPlace()); + } + } + } +}; + +class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + GetPlacesOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("Out", "vector of Place"); + AddAttr("device_count", "device count").SetDefault(0); + AddAttr("device_type", "device type") + .InEnum({"CUDA", "CPU", "AUTO"}) + .SetDefault("AUTO"); + AddComment(R"DOC( +Returns a list of places based on flags. The list will be used for parallel +execution. 
+)DOC"); + } +}; + +class GetPlacesInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o_name : op_desc.Output("Out")) { + block->FindRecursiveOrCreateVar(o_name).SetType( + framework::proto::VarDesc::PLACE_LIST); + } + } +}; + +class GetPlacesInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + // Do nothing + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker, + ops::GetPlacesInferVarType, ops::GetPlacesInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index 666207ea07..975e394c78 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -183,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of linear_chain_crf // is determined by its input "Emission". - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Emission")->type()), @@ -242,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of output of the linear_chain_crf_grad // operator is determined by its input: gradients of LogLikelihood. - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index 19c6715ec8..f502ebefde 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -196,7 +196,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { auto copyLoDTensor = [](const platform::DeviceContext& ctx, const LoDTensor& src, LoDTensor* dst) { dst->mutable_data(src.dims(), platform::CPUPlace()); - framework::CopyFrom(src, platform::CPUPlace(), ctx, dst); + framework::Copy(src, platform::CPUPlace(), ctx, dst); }; copyLoDTensor(ctx, emission_weights_src, emission_weights_dst); @@ -204,8 +204,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { transition_weights_dst->mutable_data(transition_weights_src.dims(), platform::CPUPlace()); - framework::CopyFrom(transition_weights_src, platform::CPUPlace(), ctx, - transition_weights_dst); + framework::Copy(transition_weights_src, platform::CPUPlace(), ctx, + transition_weights_dst); } void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx, @@ -220,7 +220,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { dst->mutable_data(platform::CUDAPlace()); - framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst); + framework::Copy(src, platform::CUDAPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -410,12 +410,12 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { // Copy the inputs from GPU memory to 
CPU memory when this operators runs on // GPU device. label_dst->mutable_data(label_src.dims(), platform::CPUPlace()); - framework::CopyFrom(label_src, platform::CPUPlace(), ctx, label_dst); + framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst); auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { dst->mutable_data(src.dims(), platform::CPUPlace()); - framework::CopyFrom(src, platform::CPUPlace(), ctx, dst); + framework::Copy(src, platform::CPUPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -434,7 +434,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor* dst) { if (src && dst) { dst->mutable_data(platform::CUDAPlace()); - framework::CopyFrom(*src, platform::CUDAPlace(), ctx, dst); + framework::Copy(*src, platform::CUDAPlace(), ctx, dst); } }; copyTensor(ctx, emission_grad_src, emission_grad_dst); diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 7f551f101f..f886b423ac 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -53,7 +53,7 @@ class LoadOp : public framework::OperatorBase { out_var->Clear(); tensor = out_var->GetMutable(); tensor->set_lod(cpu_tensor.lod()); - CopyFrom(cpu_tensor, place, dev_ctx, tensor); + Copy(cpu_tensor, place, dev_ctx, tensor); } } }; diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc index 8711dd62c8..692b9bf371 100644 --- a/paddle/operators/lod_rank_table_op.cc +++ b/paddle/operators/lod_rank_table_op.cc @@ -66,7 +66,7 @@ class LoDRankTableInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { for (auto &o : op_desc.Output("Out")) { - block->FindRecursiveOrCreateVar(o)->SetType( + block->FindRecursiveOrCreateVar(o).SetType( framework::proto::VarDesc::LOD_RANK_TABLE); } } diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc index f3c0badf2a..3d7b15edcf 100644 --- a/paddle/operators/lod_reset_op.cc +++ b/paddle/operators/lod_reset_op.cc @@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -97,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h index 306373fb1f..c1bbba7a83 100644 --- a/paddle/operators/lod_reset_op.h +++ b/paddle/operators/lod_reset_op.h @@ -33,8 +33,8 @@ class LoDResetKernel : public framework::OpKernel { auto* lod = lod_t->data(); if (platform::is_gpu_place(ctx.GetPlace())) { framework::Tensor lod_cpu; - framework::CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context(), - &lod_cpu); + framework::Copy(*lod_t, platform::CPUPlace(), ctx.device_context(), + &lod_cpu); lod = lod_cpu.data(); } level0 = std::vector(lod, lod + lod_t->numel()); diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index 8d164b4abc..685a807a8a 
100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -92,9 +92,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase { platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - framework::CopyFrom(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), dev_ctx, &slice); + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); offset += len; } } diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc index 7417192479..fedd325cf4 100644 --- a/paddle/operators/logical_op.cc +++ b/paddle/operators/logical_op.cc @@ -99,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx); + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); // LogicalOp kernel's device type is decided by input tensor place kt.place_ = ctx.Input("X")->place(); return kt; diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 6e5cbd6f8c..bb03def439 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("W")->type()), @@ -98,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("W")->type()), diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index b8fcec0f29..3b90b64b4e 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), @@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 7ebcfb9ab9..fd59eef7d6 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -12,6 +12,7 @@ if(WITH_GPU) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor) + nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS 
lod_tensor device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) @@ -27,6 +28,7 @@ else() cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor) + cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(unpooling SRCS unpooling.cc DEPS device_context) @@ -38,3 +40,4 @@ cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor) +cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index 4036614086..218de9fb95 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -149,7 +149,7 @@ class ContextProjectFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data.Slice(k, k + padding_size); - framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub); + framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub); } } if (down_pad > 0) { // add down pad @@ -179,7 +179,7 @@ class ContextProjectFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data.Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub); + framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub); } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 26c038e435..1ba24325ff 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include "paddle/operators/math/im2col.h" #include -#include template void testIm2col() { @@ -63,7 +62,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - CopyFrom(input_tmp, *place, *context, &input); + Copy(input_tmp, *place, *context, &input); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -88,7 +87,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); + Copy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); out_cfo_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -99,9 +98,10 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); + Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); out_ocf_ptr = output_tmp.data(); } + for (int i = 0; i < 6; ++i) { EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); } @@ -119,7 +119,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - CopyFrom(input_tmp, *place, *context, &input); + Copy(input_tmp, *place, *context, &input); } col2im(*context, output_cfo, dilation, stride, padding, &input); @@ -128,7 +128,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -140,7 +140,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - CopyFrom(input_tmp, *place, *context, &input); + Copy(input_tmp, *place, *context, &input); } col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); @@ -148,12 +148,15 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { EXPECT_EQ(in_ptr[i], col2im_data[i]); } + + delete place; + delete context; } TEST(math, im2col) { diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index 4325a79664..d1139ac988 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) { auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); - paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); out_gpu.mutable_data({2, 2}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) { auto* gpu_place = new paddle::platform::CUDAPlace(0); 
paddle::platform::CUDADeviceContext context(*gpu_place); - paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); - paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu); out_gpu.mutable_data({3, 3}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); + paddle::framework::Copy(out_gpu, *cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) { auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); - paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu); - paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) { paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) { auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); - paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu); - paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu); + paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu); + paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu); + paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) { paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); + paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3); context.Wait(); EXPECT_EQ(input3_ptr[0], 0); @@ -205,15 +205,15 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CUDADeviceContext context(*gpu_place); - paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a); - paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b); + paddle::framework::Copy(mat_a, *gpu_place, context, &g_mat_a); + paddle::framework::Copy(vec_b, *gpu_place, context, &g_vec_b); paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - paddle::framework::CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context, - &vec_c); + paddle::framework::Copy(g_vec_c, paddle::platform::CPUPlace(), context, + &vec_c); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/operators/math/selected_rows_functor_test.cu 
b/paddle/operators/math/selected_rows_functor_test.cu index 0a2e36f68a..38808e1301 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - CopyFrom(*out_value, cpu_place, ctx, &out_cpu); + Copy(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) { add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; - CopyFrom(*tensor2, cpu_place, ctx, &tensor2_cpu); + Copy(*tensor2, cpu_place, ctx, &tensor2_cpu); ctx.Wait(); auto* tensor2_cpu_data = tensor2_cpu.data(); @@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - CopyFrom(*out_value, cpu_place, ctx, &out_cpu); + Copy(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -191,7 +191,7 @@ TEST(selected_rows_functor, gpu_add_to) { add_to_tensor_functor(ctx, *output, tensor1.get()); Tensor tensor1_cpu; - CopyFrom(*tensor1, cpu_place, ctx, &tensor1_cpu); + Copy(*tensor1, cpu_place, ctx, &tensor1_cpu); ctx.Wait(); auto* tensor1_cpu_data = tensor1_cpu.data(); diff --git a/paddle/operators/math/sequence_padding.cc b/paddle/operators/math/sequence_padding.cc new file mode 100644 index 0000000000..fd66455eae --- /dev/null +++ b/paddle/operators/math/sequence_padding.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + const size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + for (size_t i = 0; i < max_sequence_length; ++i) { + for (size_t j = 0; j < num_sequences; ++j) { + size_t start_pos = abs_offset_lod[level][j]; + size_t sequence_length = abs_offset_lod[level][j + 1] - start_pos; + if (i < sequence_length) { + // i > 0 => sequence_length > 0 + T scale = + norm_by_times ? 
(1.0f / static_cast(sequence_length)) : 1.0f; + for (size_t k = 0; k < sequence_width; ++k) { + padding_data[(i * num_sequences + j) * sequence_width + k] = + seq_data[(start_pos + i) * sequence_width + k] * scale; + } + } else { + memset(padding_data + (i * num_sequences + j) * sequence_width, 0, + sequence_width * sizeof(T)); + } + } + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The LoD of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + const size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + for (size_t i = 0; i < num_sequences; ++i) { + size_t start_pos = abs_offset_lod[level][i]; + size_t sequence_length = abs_offset_lod[level][i + 1] - start_pos; + for (size_t j = 0; j < sequence_length; ++j) { + // sequence_width > j > 0 + T scale = + norm_by_times ? (1.0f / static_cast(sequence_length)) : 1.0f; + for (size_t k = 0; k < sequence_width; ++k) { + seq_data[(start_pos + j) * sequence_width + k] = + padding_data[(j * num_sequences + i) * sequence_width + k] * + scale; + } + } + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/operators/math/sequence_padding.cu new file mode 100644 index 0000000000..e4be178f81 --- /dev/null +++ b/paddle/operators/math/sequence_padding.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/operators/math/sequence_padding.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void SequencePaddingKernel(T* padding, T* sequence, + const size_t* sequence_start_positions, + const size_t sequence_width, + const size_t max_sequence_length, + const size_t num_sequences) { + size_t padding_idx = blockIdx.y; + size_t start_pos = sequence_start_positions[padding_idx]; + size_t sequence_length = + sequence_start_positions[padding_idx + 1] - start_pos; + + size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t padding_base_idx = + (sequence_idx * num_sequences + padding_idx) * sequence_width; + size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width; + + if (sequence_idx < sequence_length) { + T scale = NormByTimes ? (1.0f / static_cast(sequence_length)) : 1.0f; + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i]; + } + } else { + /* padding -> sequence */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i]; + } + } + } else if (sequence_idx < max_sequence_length) { + if (Padding) { + /* sequence -> padding */ + for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) { + padding[padding_base_idx + i] = 0; + } + } + } +} + +template +class PaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor& seq, framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be the " + "maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be the " + "number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(seq, context.GetPlace(), context, &padding); + padding.Resize(padding_dims); + return; + } + + const size_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
+ */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* seq_data = seq.data(); + T* padding_data = padding.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + padding_data, const_cast(seq_data), abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } + } +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + framework::LoDTensor& seq, const framework::Tensor& padding, + bool norm_by_times) { + auto lod = seq.lod(); + PADDLE_ENFORCE_GT(lod.size(), 0UL, + "The lod of LoDTensor seq should not be null."); + + const size_t level = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + + auto seq_dims = seq.dims(); + PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(), + "The first dimension of LoDTensor seq should be " + "equal to the sum of all sequences's length."); + + auto padding_dims = padding.dims(); + PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL, + "The input padding should be a 3-D Tensor of shape " + "[max_sequence_length, num_sequences, sequence_width]."); + + size_t max_sequence_length = MaximumSequenceLength(lod, level); + PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length, + "The first dimension of Tensor padding should be " + "the maximum length of all sequences in LoDTensor seq."); + + const size_t num_sequences = abs_offset_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences, + "The second dimension of Tensor padding should be " + "the number of sequences in LoDTensor seq."); + + const size_t sequence_width = seq.numel() / seq_dims[0]; + PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width, + "The third dimension of Tensor padding should be the " + "width of sequence in LoDTensor seq."); + + if (!norm_by_times && num_sequences == 1UL) { + Copy(padding, context.GetPlace(), context, &seq); + seq.Resize(seq_dims); + return; + } + + const size_t kBlockSize = 512; + + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread.
+ */ + size_t block_dim_x = + std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = num_sequences; + dim3 grid(grid_dim_x, grid_dim_y); + + const T* padding_data = padding.data(); + T* seq_data = seq.data(); + if (norm_by_times) { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } else { + SequencePaddingKernel<<>>( + const_cast(padding_data), seq_data, abs_offset_lod[level].data(), + sequence_width, max_sequence_length, num_sequences); + } + } +}; + +template class PaddingLoDTensorFunctor; +template class UnpaddingLoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_padding.h b/paddle/operators/math/sequence_padding.h new file mode 100644 index 0000000000..8f586c5eb4 --- /dev/null +++ b/paddle/operators/math/sequence_padding.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/lod_tensor.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +inline static size_t MaximumSequenceLength(const framework::LoD& lod, + const size_t level) { + const size_t num_sequences = lod[level].size() - 1; + size_t max_sequence_length = 0; + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + for (size_t i = 0; i < num_sequences; ++i) { + max_sequence_length = + std::max(max_sequence_length, + abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]); + } + return max_sequence_length; +} + +/* + * \brief Padding/Unpadding LoDTensor to/from normal Tensor of the shape + * [max_sequence_length, num_sequences, sequence_width]. + * + * Padding sequence: + * padding[i] = seq[lod[level][i]] + * Unpadding sequence: + * seq[lod[level][i]] = padding[i] + * + * All sequences will be padded to the same length and stored in a transposed + * shape. + * Example: + * seq (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + * + * \param context device context of this functor. + * \param seq LoDTensor which is stored in sequence format, the shape + * is [total_sequence_length, sequence_width] where + * total_sequence_length is the sum of all sequences' + * length. + * \param padding Tensor which is padded to the same length, the shape is + * [max_sequence_length, num_sequences, sequence_width]. + * \param norm_by_times whether dividing sequence's length. + * + * \note transposition is also done in this functor. 
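 *
 * A minimal usage sketch (assuming float data and illustrative names dev_ctx,
 * seq, seq_back and padding, with padding already allocated to the shape above):
 *
 *   PaddingLoDTensorFunctor<platform::CPUDeviceContext, float> pad;
 *   pad(dev_ctx, seq, padding, false);        // norm_by_times = false
 *   UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float> unpad;
 *   unpad(dev_ctx, seq_back, padding, false);  // recovers the sequence layout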
+ */ +template +class PaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, const framework::LoDTensor& seq, + framework::Tensor& padding, bool norm_by_times); +}; + +template +class UnpaddingLoDTensorFunctor { + public: + void operator()(const DeviceContext& context, framework::LoDTensor& seq, + const framework::Tensor& padding, bool norm_by_times); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/sequence_padding_test.cc b/paddle/operators/math/sequence_padding_test.cc new file mode 100644 index 0000000000..9799bcd65d --- /dev/null +++ b/paddle/operators/math/sequence_padding_test.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/sequence_padding.h" +#include + +template +void TestSequencePadding(const paddle::framework::LoD& lod, + const size_t sequence_width) { + paddle::framework::LoDTensor cpu_seq; + paddle::framework::LoDTensor cpu_seq_back; + paddle::framework::LoDTensor seq; + paddle::framework::LoDTensor seq_back; + paddle::framework::Tensor padding; + + const size_t level = lod.size() - 1; + auto seq_dims = + paddle::framework::make_ddim({static_cast(lod[level].back()), + static_cast(sequence_width)}); + + cpu_seq.set_lod(lod); + cpu_seq.mutable_data(seq_dims, paddle::platform::CPUPlace()); + for (size_t i = 0; i < cpu_seq.numel(); ++i) { + cpu_seq.data()[i] = static_cast(i); + } + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + seq = cpu_seq; + } else { + Copy(cpu_seq, *place, *context, &seq); + seq.set_lod(lod); + } + + const size_t max_sequence_length = + paddle::operators::math::MaximumSequenceLength(lod, level); + const size_t num_sequences = lod[level].size() - 1; + auto padding_dims = + paddle::framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + padding.mutable_data(padding_dims, *place); + paddle::operators::math::PaddingLoDTensorFunctor()( + *context, seq, padding, false); + + seq_back.set_lod(lod); + seq_back.mutable_data(seq_dims, *place); + paddle::operators::math::UnpaddingLoDTensorFunctor()( + *context, seq_back, padding, false); + + if (paddle::platform::is_cpu_place(*place)) { + cpu_seq_back = seq_back; + } else { + Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back); + cpu_seq_back.set_lod(lod); + } + + EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel()); + EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims()); + for (size_t i = 0; i < cpu_seq.numel(); ++i) { + EXPECT_EQ(cpu_seq.data()[i], cpu_seq_back.data()[i]); + } + + delete place; + delete context; +}; + +TEST(Seq2BatchPadding, CPU) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} + +#ifdef 
PADDLE_WITH_CUDA +TEST(SequencePadding, CUDA) { + paddle::framework::LoD lod1; + lod1.push_back(std::vector{0, 10}); + TestSequencePadding(lod1, 16); + + paddle::framework::LoD lod2; + lod2.push_back(std::vector{0, 2, 7, 10}); + TestSequencePadding(lod2, 128); +} +#endif diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 3794f0e52d..7a308ca814 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -71,7 +71,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - CopyFrom(input_tmp, *place, *context, &input); + Copy(input_tmp, *place, *context, &input); } output.mutable_data({1, filter_size, filter_size, filter_size, output_depth, output_height, output_width}, @@ -85,7 +85,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - CopyFrom(output, paddle::platform::CPUPlace(), *context, &output_tmp); + Copy(output, paddle::platform::CPUPlace(), *context, &output_tmp); out_cfo_ptr = output_tmp.data(); } @@ -99,7 +99,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - CopyFrom(input_tmp, *place, *context, &input); + Copy(input_tmp, *place, *context, &input); } paddle::operators::math::Col2VolFunctor col2vol; @@ -109,7 +109,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); + Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc index 3f999e404f..87644d316d 100644 --- a/paddle/operators/merge_lod_tensor_op.cc +++ b/paddle/operators/merge_lod_tensor_op.cc @@ -49,7 +49,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { #ifdef PADDLE_WITH_CUDA - framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); + framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); #endif @@ -104,8 +104,8 @@ class MergeLoDTensorOp : public framework::OperatorBase { continue; } auto slice = out->Slice(out_offset, out_offset + len); - framework::CopyFrom(input->Slice(start_offset, end_offset), place, - dev_ctx, &slice); + framework::Copy(input->Slice(start_offset, end_offset), place, dev_ctx, + &slice); out_offset += len; (*in_idx) += 1; } diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index 11e047b5d5..78263da2fb 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -51,7 +51,7 @@ class MultiplexOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.MultiInput("X")[0]->type()), @@ -102,7 +102,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.MultiInput("X")[0]->type()), diff --git a/paddle/operators/multiplex_op.cu 
b/paddle/operators/multiplex_op.cu index f49ee71f10..4372dc2c65 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); platform::CUDAPlace place = boost::get(ctx.GetPlace()); @@ -69,7 +69,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); + Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index d39ca87d53..84ba3ead2b 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -63,7 +63,7 @@ class NCEOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), @@ -166,7 +166,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), diff --git a/paddle/operators/net_op.cc b/paddle/operators/net_op.cc index 78b5e27678..03302f5cbf 100644 --- a/paddle/operators/net_op.cc +++ b/paddle/operators/net_op.cc @@ -56,11 +56,11 @@ void NetOp::CompleteAddOp(bool calc) { std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs)); } -std::string NetOp::DebugString() const { +std::string NetOp::DebugStringEx(const framework::Scope* scope) const { std::ostringstream os; - os << OperatorBase::DebugString() << std::endl; + os << OperatorBase::DebugStringEx(scope) << std::endl; for (auto& op : ops_) { - std::istringstream is(op->DebugString()); + std::istringstream is(op->DebugStringEx(scope)); for (std::string line; std::getline(is, line);) { os << " " << line << std::endl; } diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h index 85d0153b32..b24042f5ef 100644 --- a/paddle/operators/net_op.h +++ b/paddle/operators/net_op.h @@ -106,7 +106,8 @@ class NetOp : public framework::OperatorBase { void CompleteAddOp(bool calculate = true); - std::string DebugString() const override; + std::string DebugStringEx( + const framework::Scope* scope = nullptr) const override; bool IsNetOp() const override; std::vector OutputVars(bool has_intermediate) const override; diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc index b198b76cd4..0eeafcaae0 100644 --- a/paddle/operators/norm_op.cc +++ b/paddle/operators/norm_op.cc @@ -39,7 +39,7 @@ class NormOpMaker : public framework::OpProtoAndCheckerMaker { "M = C * H * W"); AddComment(R"DOC( "Input shape: $(N, C, H, W)$ - Sclae shape: $(C, 1)$ + Scale shape: $(C, 1)$ Output shape: $(N, C, H, W)$ Where forward diff --git a/paddle/operators/norm_op.h b/paddle/operators/norm_op.h index 
7bee48919e..5759d6f1f0 100644 --- a/paddle/operators/norm_op.h +++ b/paddle/operators/norm_op.h @@ -66,7 +66,7 @@ class NormKernel : public framework::OpKernel { context.GetPlace()); auto tmp = framework::EigenVector::Flatten(tmp_tensor); - // get colsum and sqrt , inverse + // get colsum and sqrt , inverse auto dim = Eigen::array({{0}}); tmp.device(*place) = x_square_batch_eigen.sum(dim); tmp.device(*place) = (tmp + epsilon).sqrt().inverse(); diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc new file mode 100644 index 0000000000..e1bec0421e --- /dev/null +++ b/paddle/operators/parallel_do_op.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/framework/executor.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/threadpool.h" + +namespace paddle { +namespace operators { + +static constexpr char kInputs[] = "inputs"; +static constexpr char kParameters[] = "parameters"; +static constexpr char kPlaces[] = "places"; + +static constexpr char kOutputs[] = "outputs"; +static constexpr char kParallelScopes[] = "parallel_scopes"; + +static constexpr char kParallelBlock[] = "sub_block"; + +// using ParallelScopeVar = std::vector; +using LoDTensor = framework::LoDTensor; +using OperatorBase = framework::OperatorBase; + +void SplitTensorAndMoveTensorToScopes( + const framework::Scope &scope, + const std::vector &sub_scopes, + const std::vector &places, + const std::vector &names) { + PADDLE_ENFORCE_EQ(sub_scopes.size(), places.size()); + for (auto &argu : names) { + auto *var = scope.FindVar(argu); + const auto &tensor = var->Get(); + auto lod_tensors = tensor.SplitLoDTensor(places); + + for (auto &lod : lod_tensors) { + VLOG(3) << lod.dims(); + } + + for (size_t i = 0; i < sub_scopes.size(); ++i) { + *sub_scopes[i]->Var(argu)->GetMutable() = lod_tensors[i]; + } + } +} + +void WaitOnPlaces(const std::vector places) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + + for (auto &place : places) { + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); + } +} + +class ParallelDoOp : public framework::OperatorBase { + public: + ParallelDoOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &places = scope.FindVar(Input(kPlaces))->Get(); + + auto &sub_scopes = *scope.FindVar(Output(kParallelScopes)) + ->GetMutable>(); + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + sub_scopes.push_back(&scope.NewScope()); + } + + // split input + 
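// (SplitTensorAndMoveTensorToScopes, defined above, splits each input LoDTensor
// across the given places via SplitLoDTensor and moves one piece into the
// corresponding sub-scope.)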
SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places, + Inputs(kInputs)); + // copy parameter + for (auto ¶m : Inputs(kParameters)) { + PADDLE_ENFORCE(scope.FindVar(param)->IsType(), + "Only support parameter type as LoDTensor"); + auto &src = scope.FindVar(param)->Get(); + for (size_t i = 0; i < places.size(); ++i) { + auto &place = places[i]; + auto *sub_scope = sub_scopes[i]; + auto *dst = sub_scope->Var(param)->GetMutable(); + framework::Copy(src, place, dst); + } + } + WaitOnPlaces(places); + + std::vector> workers; + workers.reserve(places.size()); + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + VLOG(3) << "Run " << place_idx; + + auto &place = places[place_idx]; + auto *cur_scope = sub_scopes[place_idx]; + + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + // merge output + for (auto &o_name : Outputs(kOutputs)) { + std::vector lod_tensors; + lod_tensors.reserve(sub_scopes.size()); + for (auto *sub_scope : sub_scopes) { + lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get()); + } + + auto *lod_tensor_to_be_merged = + scope.FindVar(o_name)->GetMutable(); + lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace()); + } + WaitOnPlaces(places); + } +}; + +class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput(kInputs, "").AsDuplicable(); + AddInput(kParameters, "").AsDuplicable(); + AddInput(kPlaces, ""); + AddOutput(kOutputs, "").AsDuplicable(); + AddOutput(kParallelScopes, ""); + AddAttr(kParallelBlock, ""); + AddComment(R"DOC( +ParallelDo Operator. 
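This operator splits the "inputs" LoDTensors across the given "places", copies every
"parameters" variable to each place, runs the attached sub-block concurrently in one
sub-scope per place, and merges the per-place "outputs" back into single LoDTensors
in the parent scope.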
+)DOC"); + } +}; + +class ParallelDoGradOp : public OperatorBase { + public: + ParallelDoGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + // // get device context from pool + // platform::DeviceContextPool &pool = + // platform::DeviceContextPool::Instance(); + // auto &dev_ctx = *pool.Get(place); + + auto *block = Attr(kParallelBlock); + auto *program = block->Program(); + + auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) + ->Get>(); + + auto &places = scope.FindVar(Input(kPlaces))->Get(); + + // feed output@grad + SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places, + Inputs(framework::GradVarName(kOutputs))); + WaitOnPlaces(places); + + // for debugging + for (auto &s : Inputs(framework::GradVarName(kOutputs))) { + VLOG(3) << s; + VLOG(3) << scope.FindVar(s)->Get(); + for (auto *sub_scope : sub_scopes) { + VLOG(3) << sub_scope->FindVar(s)->Get(); + } + } + + // exe run + std::vector> workers; + for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) { + VLOG(3) << "Run " << place_idx; + + auto &place = places[place_idx]; + auto *cur_scope = sub_scopes[place_idx]; + + // execute + workers.emplace_back(framework::Async([program, cur_scope, place, block] { + framework::Executor executor(place); + executor.Run(*program, cur_scope, block->ID(), + false /*create_local_scope*/); + })); + } + for (auto &worker : workers) { + worker.wait(); + } + WaitOnPlaces(places); + + // merge grad + for (auto &s : Outputs(framework::GradVarName(kParameters))) { + VLOG(3) << "merge grad " << s; + + auto &t = sub_scopes[0]->FindVar(s)->Get(); + VLOG(3) << t; + + std::string s_buf = s + "@BUF"; + auto *t_buf = sub_scopes[0]->Var(s_buf)->GetMutable(); + + for (size_t place_idx = 1; place_idx < places.size(); ++place_idx) { + auto &tt = sub_scopes[place_idx]->FindVar(s)->Get(); + VLOG(3) << place_idx; + VLOG(3) << tt; + framework::Copy(tt, places[0], t_buf); + + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {s, s_buf}}}, {{"Out", {s}}}, + framework::AttributeMap{}); + sum_op->Run(*sub_scopes[0], places[0]); + WaitOnPlaces(places); + } + + VLOG(3) << t; + framework::Copy(t, place, scope.FindVar(s)->GetMutable()); + } + } +}; + +class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); + grad->SetType("parallel_do_grad"); + for (auto &input_param : this->InputNames()) { + VLOG(3) << input_param; + grad->SetInput(input_param, this->Input(input_param)); + if (input_param != kPlaces) { + grad->SetOutput(framework::GradVarName(input_param), + this->InputGrad(input_param, false)); + } + } + + for (auto &output_param : this->OutputNames()) { + if (output_param == kParallelScopes) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->Output(output_param)); + } else { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(framework::GradVarName(output_param), + this->OutputGrad(output_param)); + } + } + grad->SetAttrMap(this->Attrs()); + grad->SetBlockAttr(kParallelBlock, *grad_block_[0]); + + return std::unique_ptr(grad); + } +}; + +class 
ParallelDoGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + std::vector input{kParameters, kInputs}; + std::vector output{kOutputs}; + for (auto &s : input) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)), + "Cannot find the gradient variable %s", + framework::GradVarName(s)); + } + for (auto &s : output) { + PADDLE_ENFORCE(ctx->HasInputs(s)); + } + for (auto &s : input) { + ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); + } + if (ctx->HasInputs(kParameters)) { + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); + ctx->SetOutputsDim(framework::GradVarName(kParameters), + ctx->GetInputsDim(kParameters)); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, + paddle::operators::ParallelDoOpProtoMaker, + paddle::operators::ParallelDoGradOpDescMaker); +REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, + paddle::operators::ParallelDoGradOpShapeInference); diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index 76c5123527..1d31d813af 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -69,7 +69,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -90,7 +90,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/operators/positive_negative_pair_op.cc index a6b23c995b..5aa5167dbb 100644 --- a/paddle/operators/positive_negative_pair_op.cc +++ b/paddle/operators/positive_negative_pair_op.cc @@ -85,7 +85,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Score")->type()), diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc index c5753147ef..f1598d53ca 100644 --- a/paddle/operators/precision_recall_op.cc +++ b/paddle/operators/precision_recall_op.cc @@ -80,7 +80,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("MaxProbs")->type()), diff --git a/paddle/operators/print_op.cc b/paddle/operators/print_op.cc new file mode 100644 index 0000000000..89e41d806c --- /dev/null +++ b/paddle/operators/print_op.cc @@ -0,0 +1,206 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +#define CLOG std::cout + +struct Formater { + std::string message; + std::string name; + std::vector dims; + std::type_index dtype{typeid(char)}; + framework::LoD lod; + int summarize; + void* data{nullptr}; + + void operator()(size_t size) { + PrintMessage(); + PrintName(); + PrintDims(); + PrintDtype(); + PrintLod(); + PrintData(size); + } + + private: + void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message; } + void PrintName() { + if (!name.empty()) { + CLOG << "Tensor[" << name << "]" << std::endl; + } + } + void PrintDims() { + if (!dims.empty()) { + CLOG << "\tshape: ["; + for (auto i : dims) { + CLOG << i << ","; + } + CLOG << "]" << std::endl; + } + } + void PrintDtype() { + if (dtype.hash_code() != typeid(char).hash_code()) { + CLOG << "\tdtype: " << dtype.name() << std::endl; + } + } + void PrintLod() { + if (!lod.empty()) { + CLOG << "\tLoD: ["; + for (auto level : lod) { + CLOG << "[ "; + for (auto i : level) { + CLOG << i << ","; + } + CLOG << " ]"; + } + CLOG << "]" << std::endl; + } + } + + void PrintData(size_t size) { + PADDLE_ENFORCE_NOT_NULL(data); + // print float + if (dtype.hash_code() == typeid(float).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(double).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int).hash_code()) { + Display(size); + } + if (dtype.hash_code() == typeid(int64_t).hash_code()) { + Display(size); + } + } + + template + void Display(size_t size) { + auto* d = (T*)data; + CLOG << "\tdata: "; + if (summarize != -1) { + summarize = std::min(size, (size_t)summarize); + for (int i = 0; i < summarize; i++) { + CLOG << d[i] << ","; + } + } else { + for (size_t i = 0; i < size; i++) { + CLOG << d[i] << ","; + } + } + CLOG << std::endl; + } +}; + +// TODO(ChunweiYan) there should be some other printers for TensorArray +class TensorPrintOp : public framework::OperatorBase { + public: + TensorPrintOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + TensorPrintOp(const TensorPrintOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented"); + } + + void Run(const framework::Scope& scope, + const platform::Place& place) const override { + // Only run the `first_n` times. 
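// (`times_` is declared mutable at the bottom of this class so the counter can
// be advanced from inside this const Run() method.)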
+ int first_n = Attr("first_n"); + if (first_n > 0 && ++times_ > first_n) return; + + PADDLE_ENFORCE(!Inputs("input").empty(), "input should be set"); + auto* input_var = scope.FindVar(Input("input")); + PADDLE_ENFORCE_NOT_NULL(input_var); + auto& tensor = input_var->Get(); + + // TODO(ChunweiYan) support GPU + PADDLE_ENFORCE(platform::is_cpu_place(tensor.place())); + + Formater formater; + if (Attr("print_tensor_name")) { + formater.name = Inputs("input").front(); + } + if (Attr("print_tensor_type")) { + formater.dtype = tensor.type(); + } + if (Attr("print_tensor_shape")) { + formater.dims.assign(tensor.dims()[0], + tensor.dims()[tensor.dims().size() - 1]); + } + if (Attr("print_tensor_lod")) { + formater.lod = tensor.lod(); + } + formater.summarize = Attr("summarize"); + formater.data = (void*)tensor.data(); + formater(tensor.numel()); + } + + private: + mutable int times_{0}; +}; + +class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { + public: + PrintOpProtoAndCheckMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "the tensor that will be displayed."); + AddAttr("first_n", "Only log `first_n` number of times."); + AddAttr("message", "A string message to print as a prefix."); + AddAttr("summarize", "Print this number of elements in the tensor."); + AddAttr("print_tensor_name", "Whether to print the tensor name."); + AddAttr("print_tensor_type", "Whether to print the tensor's dtype."); + AddAttr("print_tensor_shape", "Whether to print the tensor's shape."); + AddAttr("print_tensor_lod", "Whether to print the tensor's lod."); + AddComment(R"DOC( + Creates a print op that will print when a tensor is accessed. + + Wraps the tensor passed in so that whenever that a tensor is accessed, + the message `message` is printed, along with the current value of the + tensor `t`.)DOC"); + } +}; + +class InferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("input"), "input should be set"); + } +}; + +class InferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override {} +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(print, paddle::operators::TensorPrintOp, + paddle::operators::PrintOpProtoAndCheckMaker, + paddle::operators::InferShape, + paddle::operators::InferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 056fa46949..a136c5b447 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -290,7 +290,7 @@ class RecurrentOp : public RecurrentBase { auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); // Explicit copy output since the local RNN scope can be destroyed // early. 
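Note on the print operator above: it only logs on its first `first_n` invocations (via the mutable counter) and, when `summarize` is non-negative, only prints the first `summarize` elements of the tensor. A minimal standalone sketch of that truncation logic; this is illustrative C++ only, independent of the framework classes, and all names here are made up.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Illustrative only: mimics Formater::Display and the first_n counter.
// summarize == -1 means "print everything"; any other value caps the count.
void PrintData(const std::vector<float>& data, int summarize) {
  size_t n = (summarize == -1)
                 ? data.size()
                 : std::min(data.size(), static_cast<size_t>(summarize));
  std::cout << "\tdata: ";
  for (size_t i = 0; i < n; ++i) std::cout << data[i] << ",";
  std::cout << std::endl;
}

int main() {
  int first_n = 2;   // like Attr("first_n"): log only the first two calls
  int times = 0;     // like the mutable times_ member
  std::vector<float> t = {1.f, 2.f, 3.f, 4.f, 5.f};
  for (int step = 0; step < 4; ++step) {
    if (first_n > 0 && ++times > first_n) continue;
    PrintData(t, /*summarize=*/3);  // prints only 1,2,3,
  }
  return 0;
}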
- framework::CopyFrom(src_tensor, place, dev_ctx, &dst_out); + framework::Copy(src_tensor, place, dev_ctx, &dst_out); }); scopes.Next(); @@ -376,7 +376,7 @@ class RecurrentGradOp : public RecurrentBase { auto *cur_grad_var = cur_scope.Var(cur_grad); auto cur_grad_tensor = cur_grad_var->GetMutable(); - framework::CopyFrom(ex_tensor, place, dev_ctx, cur_grad_tensor); + framework::Copy(ex_tensor, place, dev_ctx, cur_grad_tensor); } } @@ -450,7 +450,7 @@ class RecurrentGradOp : public RecurrentBase { } auto dst = outside->Slice(seq_offset, seq_offset + 1); - framework::CopyFrom(inside, place, dev_ctx, &dst); + framework::Copy(inside, place, dev_ctx, &dst); }); VLOG(5) << "Link outside gradient finished "; @@ -463,7 +463,7 @@ class RecurrentGradOp : public RecurrentBase { framework::LoDTensor *outside) { outside->Resize(inside.dims()); outside->mutable_data(place, inside.type()); - framework::CopyFrom(inside, place, dev_ctx, outside); + framework::Copy(inside, place, dev_ctx, outside); }); VLOG(5) << "Link initialize state gradient finished "; } diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 82fceb3da7..55b33343af 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -19,13 +19,13 @@ limitations under the License. */ #include -#include "paddle/framework/data_type.h" #include "paddle/framework/executor.h" #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/proto_desc.h" -#include "paddle/operators/detail/send_recv_impl.h" +#include "paddle/operators/detail/grpc_server.h" +#include "paddle/operators/detail/sendrecvop_utils.h" #include "paddle/operators/detail/simple_block_queue.h" #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" @@ -33,16 +33,23 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -void RunServer(Server **rpc_server, - std::shared_ptr service, - const std::string &server_address) { - ServerBuilder builder; - builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); - builder.RegisterService(service.get()); - std::unique_ptr server(builder.BuildAndStart()); - *rpc_server = server.get(); - LOG(INFO) << "Server listening on " << server_address; - server->Wait(); +void RunServer(std::shared_ptr service) { + service->RunSyncUpdate(); + VLOG(4) << "RunServer thread end"; +} + +static void CreateTensorFromMessageType(framework::Variable *var, + sendrecv::VarType var_type) { + if (var_type == sendrecv::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == sendrecv::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else { + PADDLE_THROW( + "VraibleMessage type %d is not in " + "[LoDTensor, SelectedRows]", + var_type); + } } class RecvOp : public framework::OperatorBase { @@ -52,10 +59,9 @@ class RecvOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) { if (!rpc_service_) { - rpc_service_.reset(new detail::SendRecvServerImpl()); std::string endpoint = Attr("endpoint"); - server_thread_.reset( - new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint)); + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + server_thread_.reset(new std::thread(RunServer, rpc_service_)); } } @@ -63,7 +69,7 @@ class RecvOp : public framework::OperatorBase { detail::MessageWithName term_msg; term_msg.first = LISTEN_TERMINATE_MESSAGE; rpc_service_->Push(term_msg); - rpc_server_->Shutdown(); + rpc_service_->ShutDown(); server_thread_->join(); } @@ -86,10 +92,12 @@ class RecvOp : public framework::OperatorBase { auto grad_list = Attr>("GradList"); auto trainer_count = Attr("Trainers"); size_t param_count = param_list.size(); + rpc_service_->Reset(); // TODO(typhoonzero): change this to a while_op for every cluster-batch. bool exit_flag = false; while (!exit_flag) { + // TODO(gognwb): simply this loop. // Get from multiple trainers, we don't care about order in which // the gradient arrives, just add suffix 0~n then average the gradient. for (size_t i = 0; i < param_count * trainer_count; ++i) { @@ -97,6 +105,7 @@ class RecvOp : public framework::OperatorBase { const detail::MessageWithName &v = rpc_service_->Get(); auto grad_var_name = v.first; if (grad_var_name == LISTEN_TERMINATE_MESSAGE) { + VLOG(4) << "received LISTEN_TERMINATE_MESSAGE and RunOp.Run() exit"; exit_flag = true; break; } @@ -105,15 +114,19 @@ class RecvOp : public framework::OperatorBase { if (it != grad_list.end()) { param_var_name = param_list[it - grad_list.begin()]; } else { - LOG(ERROR) << "grad have no paired param found!"; + LOG(ERROR) << "grad have no paired param found!\"" << grad_var_name + << "\""; } VLOG(3) << "recved grad: " << grad_var_name << " updating param: " << param_var_name; + auto *merged_grad = recv_scope.FindVar(grad_var_name); if (merged_grad == nullptr) { - // create output of merged var. 
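Note on the rewritten RecvOp above: it owns the RPC service, runs the blocking update loop on a dedicated thread at construction, and on teardown asks the service to shut down before joining that thread. A minimal sketch of that ownership pattern; ToyServer is a made-up stand-in, not the actual detail::AsyncGRPCServer.

#include <atomic>
#include <chrono>
#include <iostream>
#include <memory>
#include <thread>

// Illustrative stand-in for the gRPC service: a blocking run loop plus a
// ShutDown() that makes the loop return, so the owner can join() cleanly.
class ToyServer {
 public:
  void Run() {  // plays the role of RunSyncUpdate()
    while (!stop_.load()) {
      std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
    std::cout << "server loop exited\n";
  }
  void ShutDown() { stop_.store(true); }

 private:
  std::atomic<bool> stop_{false};
};

int main() {
  auto service = std::make_shared<ToyServer>();
  // Same shape as RecvOp: start the blocking loop on a separate thread...
  std::thread server_thread([service] { service->Run(); });
  // ...and on teardown ask it to stop, then join the thread.
  service->ShutDown();
  server_thread.join();
  return 0;
}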
- auto merged_var = recv_scope.Var(grad_var_name); - merged_var->GetMutable(); + auto *ptr = recv_scope.Var(grad_var_name); + CreateTensorFromMessageType(ptr, v.second.type()); + VLOG(3) << "Create Variable " << grad_var_name + << " on recv scope, which pointer is " << ptr << " type is " + << v.second.type(); } if (trainer_count > 1) { @@ -126,9 +139,11 @@ class RecvOp : public framework::OperatorBase { auto &dev_ctx = *pool.Get(dev_place); detail::DeserializeFromMessage(v.second, dev_ctx, var); } + if (exit_flag) { break; } + rpc_service_->Reset(); std::string program_str = Attr("OptimizeProgram"); @@ -143,17 +158,14 @@ class RecvOp : public framework::OperatorBase { } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } + rpc_service_->Done(); grads_counter_.clear(); } // while(true) } protected: - // grpc server instance to track status and gracefully shutdown. - // borrow an pointer from server thread. - Server *rpc_server_{nullptr}; - // grpc send/recv service implement to register. - std::shared_ptr rpc_service_; + std::shared_ptr rpc_service_; std::shared_ptr server_thread_; mutable std::unordered_map grads_counter_; }; diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index a3ff4a6ca0..172d28bb3b 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -77,6 +77,7 @@ class ReduceGradOp : public framework::OperatorWithKernel { auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); } } }; diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc index 8d652ff806..a055cdf7e8 100644 --- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc @@ -88,20 +88,33 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { std::vector GetAbsoluteOffsetAndLengthByLoDRankTable( const framework::LoDTensor &x) const { std::vector absolute_table; - size_t level = 0; - size_t size = x.lod()[level].size(); - for (size_t i = 0; i < size - 1; ++i) { - auto lod_offset = - framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); + if (x.lod().empty()) { + // For Tensor without lod, such as the output of sequence_pool_op + size_t size = x.dims()[0]; + absolute_table.reserve(size); + for (size_t i = 0; i < size; ++i) { + absolute_table.emplace_back(); + absolute_table.back().length = 1; + absolute_table.back().offset = i; + } + } else { + size_t level = 0; + size_t size = x.lod()[level].size(); + + for (size_t i = 0; i < size - 1; ++i) { + auto lod_offset = + framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); - auto &offset = lod_offset.second; + auto &offset = lod_offset.second; - absolute_table.emplace_back(); - absolute_table.back().length = offset.second - offset.first; - absolute_table.back().offset = offset.first; - absolute_table.back().lod = lod_offset.first; + absolute_table.emplace_back(); + absolute_table.back().length = offset.second - offset.first; + absolute_table.back().offset = offset.first; + absolute_table.back().lod = lod_offset.first; + } } + return absolute_table; } @@ -133,7 +146,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); + 
framework::Copy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); out_offset += len; return out_offset; } diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index a4eb34a0ad..d884b03cad 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -28,7 +28,7 @@ class ReshapeKernel : public framework::OpKernel { auto* in = ctx.Input("X"); auto out_dims = out->dims(); out->mutable_data(ctx.GetPlace()); - framework::CopyFrom(*in, ctx.GetPlace(), ctx.device_context(), out); + framework::Copy(*in, ctx.GetPlace(), ctx.device_context(), out); out->Resize(out_dims); } }; @@ -42,7 +42,7 @@ class ReshapeGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); auto in_dims = d_x->dims(); - framework::CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); + framework::Copy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); d_x->Resize(in_dims); } }; diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc index ef1804d976..a7351f11c5 100644 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -68,7 +68,7 @@ class ROIPoolOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -89,7 +89,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index 806dccc6ca..b653348906 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -49,7 +49,7 @@ class ScatterOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Ref")->type()), @@ -68,7 +68,7 @@ class ScatterGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Ref")->type()), diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc index 95c207221a..4d145250bd 100644 --- a/paddle/operators/send_op.cc +++ b/paddle/operators/send_op.cc @@ -19,59 +19,45 @@ limitations under the License. */ #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/detail/send_recv_impl.h" -#include "paddle/operators/detail/simple_block_queue.h" +#include +#include "paddle/operators/detail/grpc_client.h" namespace paddle { namespace operators { -// TODO(typhoonzero): this is a simple implementation which only send -// one tensor class SendOp : public framework::OperatorBase { public: - SendOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) { - // init client when the operator is created at runtime. 
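Note on the reorder_lod_tensor_by_rank change above: the absolute (offset, length) table is now built in two ways, one row per element when the input carries no LoD (e.g. sequence_pool output), and one entry per top-level sequence otherwise. A small standalone sketch of that table construction, assuming absolute top-level LoD offsets; the names are illustrative, not the framework API.

#include <cstddef>
#include <iostream>
#include <vector>

struct Item {
  size_t offset;
  size_t length;
};

// Sketch of GetAbsoluteOffsetAndLengthByLoDRankTable's two cases.
std::vector<Item> AbsoluteTable(const std::vector<size_t>& lod0, size_t rows) {
  std::vector<Item> table;
  if (lod0.empty()) {
    // Plain tensor: treat every row as its own unit of length 1.
    for (size_t i = 0; i < rows; ++i) table.push_back({i, 1});
  } else {
    // LoD tensor: one entry per top-level sequence.
    for (size_t i = 0; i + 1 < lod0.size(); ++i) {
      table.push_back({lod0[i], lod0[i + 1] - lod0[i]});
    }
  }
  return table;
}

int main() {
  for (auto& it : AbsoluteTable({}, 3)) {         // no LoD: 0:1 1:1 2:1
    std::cout << it.offset << ":" << it.length << " ";
  }
  std::cout << "\n";
  for (auto& it : AbsoluteTable({0, 2, 5}, 5)) {  // two sequences: 0:2 2:3
    std::cout << it.offset << ":" << it.length << " ";
  }
  std::cout << "\n";
  return 0;
}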
- std::vector endpoints = - Attr>("endpoints"); - for (auto ep : endpoints) { - client_map_[ep].reset(new detail::RPCClient( - grpc::CreateChannel(ep, grpc::InsecureChannelCredentials()))); - } - } + SendOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} - void Run(const framework::Scope &scope, - const platform::Place &dev_place) const override { + void Run(const framework::Scope& scope, + const platform::Place& dev_place) const override { auto ins = Inputs("X"); auto outs = Outputs("Out"); std::vector epmap = Attr>("epmap"); - // TODO(typhoonzero): use async calls to send multiple variable asyncly. - for (size_t i = 0; i < ins.size(); ++i) { - bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]); - if (!ret) { - LOG(ERROR) << "send variable error: " << ins[i]; - } + + // FIXME(gongwb): DeviceContext? + auto ctx = platform::CPUDeviceContext(); + for (size_t i = 0; i < ins.size(); i++) { + client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]); } - // TODO(typhoonzero): support async optimization - client_map_[epmap[0]]->Wait(); - for (size_t i = 0; i < outs.size(); ++i) { - bool ret = client_map_[epmap[i]]->GetVariable(scope, outs[i]); - if (!ret) { - LOG(ERROR) << "GetVariable error: " << outs[i]; - } + + for (size_t i = 0; i < outs.size(); i++) { + client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]); } + + client_.wait(); } - protected: - mutable std::unordered_map> - client_map_; + private: + mutable detail::RPCClient client_; }; class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: - SendOpMaker(OpProto *proto, OpAttrChecker *op_checker) + SendOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable(); AddOutput("Out", "(Tensor) Output tensor to get from server") diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index fa94424bf9..ea09169479 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -140,7 +140,7 @@ void StartServerNet(bool is_sparse) { TEST(SendRecvOp, CPUDense) { std::thread server_thread(StartServerNet, false); - sleep(3); // wait server to start + sleep(10); // wait server to start // local net f::Scope scope; p::CPUPlace place; diff --git a/paddle/operators/sequence_erase_op.cc b/paddle/operators/sequence_erase_op.cc new file mode 100644 index 0000000000..d17b268623 --- /dev/null +++ b/paddle/operators/sequence_erase_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
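Note on the SendOp rewrite above: instead of one blocking RPC per variable, it now issues every send and get asynchronously and blocks exactly once at the end with client_.wait(). The same fire-then-wait shape, sketched with std::async and a hypothetical FakeRpc standing in for the real RPCClient calls.

#include <future>
#include <iostream>
#include <string>
#include <vector>

// Illustrative only: stands in for a network call such as AsyncSendVariable.
std::string FakeRpc(const std::string& endpoint, const std::string& var) {
  return "sent " + var + " to " + endpoint;
}

int main() {
  std::vector<std::string> ins = {"w@GRAD", "b@GRAD"};
  std::vector<std::string> epmap = {"127.0.0.1:6174", "127.0.0.1:6175"};

  std::vector<std::future<std::string>> pending;
  for (size_t i = 0; i < ins.size(); ++i) {
    // Enqueue without blocking, like client_.AsyncSendVariable(...).
    pending.push_back(
        std::async(std::launch::async, FakeRpc, epmap[i], ins[i]));
  }
  for (auto& f : pending) {
    std::cout << f.get() << "\n";  // like client_.wait(): block until all done
  }
  return 0;
}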
*/ + +#include "paddle/operators/sequence_erase_op.h" + +namespace paddle { +namespace operators { + +class SequenceEraseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of SequenceEraseOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of SequenceEraseOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1, + "Input(X) of SequenceEraseOp should be a 2-D LoDTensor " + "with the 2nd dimension equal to 1."); + ctx->SetOutputDim("Out", x_dims); + } +}; + +class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(2-D LoDTensor with the 2nd dim. equal to 1) " + "Input LoDTensor of SequenceEraseOp."); + AddOutput("Out", + "(2-D LoDTensor with the 2nd dim. equal to 1) " + "Output LoDTensor of SequenceEraseOp."); + AddAttr>("tokens", + "(vector) Tokens need to be erased from " + "input sequences."); + AddComment(R"DOC( +Sequence Erase Operator. + +Sequence erase operator erases tokens specified by Attr(tokens) from the input +sequences Input(X), and outputs the remaining data and modifies the LoD +information at the same time. For example, given a 2-D LoDTensor + + X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T + +with lod = [[0, 3, 6, 10]], there are three sequences in the input: + + X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T. + +If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing +operation, the three sequences become + + X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T. + +Hence the LoDTensor Output(Out) should be + + Out = [[6, 1, 9, 6, 1, 0, 1]]^T, + +with lod = [[0, 1, 3, 7]]. + +An example usage for this operator is to remove the special tokens when +computing the edit distance between two strings, such as blank, start token, +and end token. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp, + ops::SequenceEraseOpMaker); +REGISTER_OP_CPU_KERNEL( + sequence_erase, + ops::SequenceEraseKernel); diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu new file mode 100644 index 0000000000..5da8eba3e1 --- /dev/null +++ b/paddle/operators/sequence_erase_op.cu @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/operators/sequence_erase_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; +using LoDTensor = framework::LoDTensor; + +template +__global__ void LabelErasedIdx(const T* in_dat, const int in_len, + const T* tokens, const int tokens_len, + int* num_erased) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + int erased = 0; + for (int i = 0; i < tokens_len; ++i) { + if (in_dat[index] == tokens[i]) { + erased = 1; + } + } + num_erased[index + 1] = erased; + if (index == 0) { + num_erased[0] = 0; + } + } +} + +template +__global__ void GetOutLod(const T* num_erased, const int* in_lod, + const int lod_len, int* out_lod0) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < lod_len) { + out_lod0[index] = in_lod[index] - num_erased[in_lod[index]]; + } +} + +template +__global__ void SetOutput(const T* in_dat, const int in_len, + const int* num_erased, T* out_dat) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < in_len) { + if (in_dat[index] != in_dat[index + 1]) { + out_dat[index - num_erased[index]] = in_dat[index]; + } + } +} + +template +class SequenceEraseOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto tokens_len = tokens.size(); + auto in_len = in->numel(); + auto in_dat = in->data(); + auto lod0 = lod[0]; + + thrust::host_vector host_tokens(tokens_len); + for (size_t i = 0; i < tokens.size(); ++i) { + host_tokens[i] = tokens[i]; + } + thrust::device_vector dev_tokens = host_tokens; + thrust::device_vector num_erased(in_len + 1); + + T* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data()); + int* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data()); + + auto stream = ctx.cuda_device_context().stream(); + LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + in_dat, in_len, dev_tokens_ptr, tokens_len, num_erased_ptr); + thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(), + num_erased.begin() + 1); + + // Calc LoD + auto lod_len = lod0.size(); + thrust::host_vector host_lod(lod_len); + for (size_t i = 0; i < lod_len; ++i) { + host_lod[i] = lod0[i]; + } + thrust::device_vector dev_in_lod = host_lod; + thrust::device_vector dev_out_lod(lod_len); + int* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data()); + int* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); + GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); + thrust::host_vector host_out_lod = dev_out_lod; + std::vector out_lod0(lod_len, 0); + for (size_t i = 0; i < lod_len; i++) { + out_lod0[i] = host_out_lod[i]; + } + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + + // Set output + out->Resize({out_lod0.back(), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len, + num_erased_ptr, out_dat); + } +}; 
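Note on the erase kernels: both the CUDA kernel above and the CPU kernel in sequence_erase_op.h below hinge on a prefix sum of erased counts, which simultaneously gives every surviving element its output slot and yields the shifted LoD offsets. A standalone host-side sketch that reproduces the worked example from the operator comment (tokens {2, 3, 5}); it is a simplified illustration, not the kernel itself.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  // The example from SequenceEraseOpMaker's comment.
  std::vector<int64_t> x = {2, 2, 6, 1, 3, 9, 6, 1, 0, 1};
  std::vector<size_t> lod = {0, 3, 6, 10};
  std::vector<int> tokens = {2, 3, 5};

  // num_erased[j] = how many of x[0..j) are erased (inclusive prefix sum).
  std::vector<size_t> num_erased(x.size() + 1, 0);
  for (size_t j = 1; j <= x.size(); ++j) {
    bool hit =
        std::find(tokens.begin(), tokens.end(), x[j - 1]) != tokens.end();
    num_erased[j] = num_erased[j - 1] + (hit ? 1 : 0);
  }

  // Compact the data and shift every LoD offset by the erased count.
  std::vector<int64_t> out;
  for (size_t j = 0; j < x.size(); ++j) {
    if (num_erased[j] == num_erased[j + 1]) out.push_back(x[j]);
  }
  std::vector<size_t> out_lod;
  for (size_t offset : lod) out_lod.push_back(offset - num_erased[offset]);

  assert((out == std::vector<int64_t>{6, 1, 9, 6, 1, 0, 1}));
  assert((out_lod == std::vector<size_t>{0, 1, 3, 7}));
  return 0;
}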
+ +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(sequence_erase, + paddle::operators::SequenceEraseOpCUDAKernel); diff --git a/paddle/operators/sequence_erase_op.h b/paddle/operators/sequence_erase_op.h new file mode 100644 index 0000000000..cb2d7be009 --- /dev/null +++ b/paddle/operators/sequence_erase_op.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class SequenceEraseKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto lod = in->lod(); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(), + "The actual size mismatches with the LoD information."); + auto tokens = ctx.Attr>("tokens"); + auto in_len = in->numel(); + auto in_dat = in->data(); + auto lod0 = lod[0]; + + std::vector num_erased(in_len + 1, 0); + std::vector out_lod0(1, 0); + for (size_t i = 0; i < lod0.size() - 1; ++i) { + size_t num_out = 0; + for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) { + num_erased[j] = num_erased[j - 1]; + if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) != + tokens.end()) { + num_erased[j] += 1; + } else { + num_out += 1; + } + } + out_lod0.push_back(out_lod0.back() + num_out); + } + + auto out_len = in_len - num_erased[in_len]; + out->Resize({static_cast(out_len), 1}); + auto out_dat = out->mutable_data(ctx.GetPlace()); + + for (int64_t i = 0; i < in_len; ++i) { + if (num_erased[i] == num_erased[i + 1]) { + out_dat[i - num_erased[i]] = in_dat[i]; + } + } + framework::LoD out_lod; + out_lod.push_back(out_lod0); + out->set_lod(out_lod); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index aea98744d8..549d9620ef 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -107,7 +107,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -115,12 +115,32 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { } }; +class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_pool_grad"); + op_desc_ptr->SetInput("X", Input("X")); + if (boost::get(GetAttr("pooltype")) == "MAX") { + op_desc_ptr->SetInput("MaxIndex", Output("MaxIndex")); + } + 
op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, - sequence_pool_grad, ops::SequencePoolGradOp); +REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, + ops::SequencePoolGradOpMaker); +REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp); REGISTER_OP_CPU_KERNEL( sequence_pool, ops::SequencePoolKernel); diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc index 98bd885490..f79106ff0f 100644 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -48,7 +48,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -69,7 +69,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h index 14bcaebbb4..0e4e4cf65f 100644 --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -66,13 +66,13 @@ class SequenceSliceOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(), - &offset_cpu); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(), - &length_cpu); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); length_data = length_cpu.data(); } @@ -127,13 +127,13 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(), - &offset_cpu); + framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(), - &length_cpu); + framework::Copy(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); length_data = length_cpu.data(); } diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index b37269b471..3f5b2a9b84 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/framework/lod_rank_table.h" +#include "paddle/framework/lod_tensor.h" #include "paddle/operators/array_operator.h" #include "paddle/operators/math/math_function.h" @@ -46,8 +47,21 @@ class ShrinkRNNMemoryOp : public ArrayOp { auto *out_var = scope.FindVar(Output("Out")); PADDLE_ENFORCE(out_var != nullptr, "Output Out must be set"); auto &out_tensor = *out_var->GetMutable(); + + size_t height = dst_num_rows; + + // do shrink for the top level LoD + if (x_tensor.lod().size() > 0 && + x_tensor.lod()[0].size() > static_cast(dst_num_rows)) { + auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0, + dst_num_rows, 0); + height = lod_offset.second.second; + auto out_lod = out_tensor.mutable_lod(); + framework::AppendLoD(out_lod, lod_offset.first); + } + if (dst_num_rows != 0) { - out_tensor.ShareDataWith(x_tensor.Slice(0, dst_num_rows)); + out_tensor.ShareDataWith(x_tensor.Slice(0, height)); } } }; @@ -64,11 +78,11 @@ class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(LoDTensor) The shrinked RNN step memory."); AddComment( R"DOC( - In dynamic RNN, we are able to handle sequences of different lengths. - Because of the multiple lengths, the size of each step input can be + In dynamic RNN, we are able to handle sequences of different lengths. + Because of the multiple lengths, the size of each step input can be different, which may lead to a mismatching between the input of - the current step and the memory generated by the previous one. This - operator shrinks memory according to the size of the next step input, + the current step and the memory generated by the previous one. This + operator shrinks memory according to the size of the next step input, to make sure that they can match each other. 
)DOC"); } @@ -115,7 +129,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { auto &dout_tensor = dout_var->Get(); auto height = dout_tensor.dims()[0]; auto slice = dx_tensor.Slice(0, static_cast(height)); - framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice); + framework::Copy(dout_tensor, dout_tensor.place(), dev_ctx, &slice); if (dx_tensor.dims()[0] > height) { auto rest_tensor = dx_tensor.Slice( static_cast(height), static_cast(dx_tensor.dims()[0])); @@ -132,6 +146,7 @@ class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase { PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X"))); context->SetOutputDim(framework::GradVarName("X"), context->GetInputDim("X")); + context->ShareLoD("X", framework::GradVarName("X")); } }; diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index e7306bc5f1..cef1f1fc99 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -31,6 +31,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(x_dims.size() == 2UL, "The input of softmax op must be a matrix."); ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index 41e65b701e..7135780c92 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -118,7 +118,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Logits")->type()), @@ -159,7 +159,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index 2d8787d740..bd93c49201 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -53,7 +53,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { #ifdef PADDLE_WITH_CUDA - framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); + framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); #endif @@ -111,9 +111,9 @@ class SplitLoDTensorOp : public framework::OperatorBase { // out[offset: offset+len] = x[each_range.begin: each_range.end] auto slice = out->Slice(static_cast(offset), static_cast(offset + len)); - framework::CopyFrom(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), dev_ctx, &slice); + framework::Copy(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); offset += len; } } diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index b86e826642..88ed67f7ba 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -53,7 +53,7 @@ class SumOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + 
framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); if (x_vars[0]->IsType()) { @@ -122,17 +122,17 @@ class SumOpVarTypeInference : public framework::VarTypeInference { for (auto& name : op_desc.Input("X")) { VLOG(10) << name << " " - << block->FindRecursiveOrCreateVar(name)->GetType(); + << block->FindRecursiveOrCreateVar(name).GetType(); } bool any_input_is_lod_tensor = std::any_of( inputs.begin(), inputs.end(), [block](const std::string& name) { - return block->FindRecursiveOrCreateVar(name)->GetType() == + return block->FindRecursiveOrCreateVar(name).GetType() == framework::proto::VarDesc::LOD_TENSOR; }); auto is_tensor_array = [block](const std::string& name) { - return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() == + return block->FindRecursiveOrCreateVar(name).GetType() == framework::proto::VarDesc::LOD_TENSOR_ARRAY; }; @@ -146,8 +146,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { std::ostringstream os; for (auto& each : inputs) { os << " " << each << " type is " - << detail::Ref(block->FindRecursiveOrCreateVar(each)).GetType() - << "\n"; + << block->FindRecursiveOrCreateVar(each).GetType() << "\n"; } PADDLE_ENFORCE(all_inputs_are_tensor_array, "Not all inputs are tensor array:\n%s", os.str()); @@ -158,7 +157,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { } auto out_var_name = op_desc.Output("Out").front(); - auto& out_var = detail::Ref(block->FindRecursiveOrCreateVar(out_var_name)); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); out_var.SetType(var_type); auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front())); out_var.SetDataType(in_var.GetDataType()); diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index 552b48f608..48201b344d 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -70,6 +70,7 @@ class SumKernel : public framework::OpKernel { } else if (out_var->IsType()) { PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now"); auto *out = context.Output("Out"); + out->mutable_rows()->clear(); auto *out_value = out->mutable_value(); // Runtime InferShape @@ -107,8 +108,8 @@ class SumKernel : public framework::OpKernel { out_array.resize(i + 1); } if (out_array[i].numel() == 0) { - framework::CopyFrom(in_array[i], in_array[i].place(), - context.device_context(), &out_array[i]); + framework::Copy(in_array[i], in_array[i].place(), + context.device_context(), &out_array[i]); out_array[i].set_lod(in_array[i].lod()); } else { PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); diff --git a/paddle/operators/tensor.save b/paddle/operators/tensor.save deleted file mode 100644 index c24308a7d0..0000000000 Binary files a/paddle/operators/tensor.save and /dev/null differ diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index d5ff3e3fce..a70be8b875 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -44,7 +44,7 @@ class WriteToArrayOp : public ArrayOp { platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - CopyFrom(x_tensor, place, dev_ctx, out_tensor); + Copy(x_tensor, place, dev_ctx, out_tensor); out_tensor->set_lod(x_tensor.lod()); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " @@ -106,8 +106,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { auto 
x_name = op_desc.Input("X")[0]; auto out_name = op_desc.Output("Out")[0]; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; - auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name), - "Cannot found %s", out_name); + auto &out = block->FindRecursiveOrCreateVar(out_name); out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); auto *x = block->FindVarRecursive(x_name); if (x != nullptr) { @@ -136,7 +135,7 @@ class ReadFromArrayOp : public ArrayOp { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); + framework::Copy(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); } else { VLOG(10) << "offset " << offset << " >= " << x_array.size(); diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc index bb72210bb6..a8ddd72973 100644 --- a/paddle/operators/top_k_op.cc +++ b/paddle/operators/top_k_op.cc @@ -41,6 +41,8 @@ class TopkOp : public framework::OperatorWithKernel { dims[dims.size() - 1] = k; ctx->SetOutputDim("Out", dims); ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); } }; diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 4d5dd86cb8..3a314bdb9b 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -63,7 +63,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index aeed9679b2..50cee11a7a 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -71,7 +71,7 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { class UnpoolOp : public framework::OperatorWithKernel { protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -110,7 +110,7 @@ class UnpoolOp : public framework::OperatorWithKernel { class UnpoolOpGrad : public framework::OperatorWithKernel { protected: - framework::OpKernelType GetActualKernelType( + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/warpctc_op.cc b/paddle/operators/warpctc_op.cc new file mode 100644 index 0000000000..bd0c5f9957 --- /dev/null +++ b/paddle/operators/warpctc_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/warpctc_op.h" + +namespace paddle { +namespace operators { + +class WarpCTCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("WarpCTCGrad"), + "Output(WarpCTCGrad) of WarpCTCOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of WarpCTCOp should not be null."); + + auto logits_dims = ctx->GetInputDim("Logits"); + int sequence_width = + static_cast(framework::product(logits_dims) / logits_dims[0]); + int blank = ctx->Attrs().Get("blank"); + PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width), + "The value of Attr(blank) should be in interval [0, %d).", + sequence_width); + // TODO(liuyiqun): it is tricky to set the wrong dimension here. + ctx->SetOutputDim("Loss", {logits_dims[0], 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "(LodTensor, default: LoDTensor), the unscaled " + "probabilities of variable-length sequences, which is a 2-D " + "Tensor with LoD information. It's shape is " + "[Lp, num_classes + 1], where Lp is the sum of all input " + "sequences' length and num_classes is the true number of classes " + "(not including the blank label)."); + AddInput("Label", + "(LodTensor, default: LoDTensor), the ground truth " + "of variable-length sequence, which is a 2-D Tensor with LoD " + "information. It is of the shape [Lg, 1], where Lg is th sum of " + "all labels' length."); + AddOutput("WarpCTCGrad", + "(Tensor, default: Tensor), a temporary " + "output Tensor to store the gradients of warp-ctc, which is " + "computed with loss together in one call. It is a 3-D Tensor of " + "the shape [max_sequence_length, batch_size, num_classes + 1].") + .AsIntermediate(); + AddOutput("Loss", + "(Tensor, default: Tensor), the Connectionist " + "Temporal Classification (CTC) loss, which is a 2-D Tensor of " + "the shape [batch_size, 1]"); + AddAttr("blank", + "(int, default: 0), the blank label of Connectionist " + "Temporal Classification (CTC) loss, which is in the " + "half-opened interval [0, num_classes + 1).") + .SetDefault(0); + AddAttr("norm_by_times", + "(bool, default: false), whether to " + "normalize the gradients by the number of time-step, " + "which is also the sequence's length.") + .SetDefault(false); + AddComment(R"DOC( +An operator integrating the open-source +[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in +[Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin]( +https://arxiv.org/pdf/1512.02595v1.pdf), +to compute Connectionist Temporal Classification (CTC) loss. +It can be aliased as softmax with ctc, since a native softmax activation is +interated to the warp-ctc library, to to normlize values for each row of the +input tensor. 
+ +More detail of CTC loss can be found by refering to +[Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with +Recurrent Neural Networks]( +http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf). +)DOC"); + } +}; + +class WarpCTCGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("WarpCTCGrad"), + "Input(WarpCTCGrad) of WarpCTCGradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Output(Logits@GRAD) of WarpCTCGradOp should not be null."); + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Logits")); + ctx->ShareLoD("Logits", /*->*/ framework::GradVarName("Logits")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Logits")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad, + ops::WarpCTCGradOp); +REGISTER_OP_CPU_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CPU_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/operators/warpctc_op.cu.cc b/paddle/operators/warpctc_op.cu.cc new file mode 100644 index 0000000000..7d8527ac75 --- /dev/null +++ b/paddle/operators/warpctc_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/warpctc_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + warpctc, ops::WarpCTCKernel); +REGISTER_OP_CUDA_KERNEL( + warpctc_grad, + ops::WarpCTCGradKernel); diff --git a/paddle/operators/warpctc_op.h b/paddle/operators/warpctc_op.h new file mode 100644 index 0000000000..41899c7fe0 --- /dev/null +++ b/paddle/operators/warpctc_op.h @@ -0,0 +1,218 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
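Note on the warpctc kernel defined below: it derives per-sequence lengths from the absolute LoD offsets and repacks the [Lp, num_classes + 1] input into the dense [max_sequence_length, num_sequences, sequence_width] layout that warp-ctc expects. A small host-side sketch of that bookkeeping with made-up lengths; in a row-major [T, N, W] layout, timestep t of sequence i occupies row t * N + i.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Absolute LoD offsets for three sequences of length 4, 2 and 3 (Lp = 9).
  std::vector<size_t> lod = {0, 4, 6, 9};
  size_t num_sequences = lod.size() - 1;

  std::vector<int> lengths(num_sequences);
  size_t max_sequence_length = 0;
  for (size_t i = 0; i < num_sequences; ++i) {
    lengths[i] = static_cast<int>(lod[i + 1] - lod[i]);
    max_sequence_length =
        std::max(max_sequence_length, static_cast<size_t>(lengths[i]));
  }

  size_t sequence_width = 5;  // num_classes + 1, arbitrary for this example
  std::cout << "padded shape: [" << max_sequence_length << ", "
            << num_sequences << ", " << sequence_width << "]\n";
  for (size_t i = 0; i < num_sequences; ++i) {
    std::cout << "sequence " << i << " length " << lengths[i] << "\n";
  }
  return 0;
}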
*/ + +#pragma once + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/sequence_padding.h" +#include "paddle/platform/dynload/warpctc.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class WarpCTCFunctor { + public: + /* + * \brief Compute the connectionist temporal classification loss, + * and optionally compute the gradient with respect to the inputs. + * + * If gradient is nullptr, it only computes the ctc loss, + * or computes both ctc loss and gradient. + * + * \param ctx execution context of this functor + * \param input batch matrix of input probabilities, in + * max_sequence_length x num_sequences x + * sequence_width, (row-major) format + * \param gradient batch matrix of gradient, with the same shape as + * input. + * \param cpu_labels labels always in CPU memory. + * \param cpu_label_lengths length of all labels in CPU memory. + * \param cpu_input_lengths length of all sequences in CPU memory. + * \param sequence_width number of possible output symbols. + * \param num_sequences number of sequence. + * \param blank blank label used in ctc loss function. + * \param cpu_losss cost of each sequence in CPU memory. + */ + void operator()(const framework::ExecutionContext& ctx, const float* input, + float* gradient, const int* cpu_labels, + const int* cpu_label_lengths, const int* cpu_input_lengths, + const size_t sequence_width, const size_t num_sequences, + const size_t blank, float* cpu_loss) { + // Init warp-ctc options + init(ctx, blank); + + // Compute the required workspace size. + // There is no memory allocated operations within warp-ctc. + size_t workspace_bytes = 0; + ctcStatus_t status = platform::dynload::get_workspace_size( + cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), + static_cast(num_sequences), options_, &workspace_bytes); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in get_workspace_size: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + PADDLE_ENFORCE_GT(workspace_bytes, 0UL, + "Bytes of workspace got by warp-ctc function, " + "get_workspace_size(), should be larger than 0."); + + Tensor workspace; + size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL; + float* workspace_data = workspace.mutable_data( + framework::make_ddim({static_cast(workspace_elements)}), + ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), &workspace, + static_cast(0)); + + // compute loss and gradient + status = platform::dynload::compute_ctc_loss( + input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths, + static_cast(sequence_width), static_cast(num_sequences), + cpu_loss, workspace_data, options_); + PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status, + "warp-ctc [version %d] Error in compute_ctc_loss: ", + warpctc_version_, + platform::dynload::ctcGetStatusString(status)); + } + + protected: + void init(const framework::ExecutionContext& ctx, const size_t blank) { + warpctc_version_ = platform::dynload::get_warpctc_version(); + + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + options_.loc = CTC_GPU; + options_.stream = reinterpret_cast( + ctx.device_context()) + .stream(); +#else + PADDLE_THROW("[warpctc init] GPU is not enabled."); +#endif + } else { + options_.loc = CTC_CPU; + options_.num_threads = 1; + } + + options_.blank_label = blank; + } + + private: + int 
warpctc_version_; + ctcOptions options_; +}; + +template +class WarpCTCKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* logits = ctx.Input("Logits"); + auto* label = ctx.Input("Label"); + auto* warpctc_grad = ctx.Output("WarpCTCGrad"); + auto* loss = ctx.Output("Loss"); + + const size_t level = 0; + + auto logits_lod = framework::ToAbsOffset(logits->lod()); + auto logits_dims = logits->dims(); + PADDLE_ENFORCE_EQ(logits_dims[0], + static_cast(logits_lod[level].back()), + "The first dimension of Input(Logits) should be equal to " + "the sum of all sequences' lengths."); + + auto label_lod = framework::ToAbsOffset(label->lod()); + auto label_dims = label->dims(); + PADDLE_ENFORCE_EQ( + label_dims[0], label->numel(), + "The width of each timestep in Input(Label) should be 1."); + + const size_t num_sequences = logits_lod[level].size() - 1; + PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1, + "The number of sequences of Input(Logits) should be " + "equal to that of Input(Label)."); + + const size_t sequence_width = logits->numel() / logits_dims[0]; + auto loss_dims = + framework::make_ddim({static_cast(num_sequences), 1}); + + // warpctc needs sequences data stored in transposed padding format + Tensor warpctc_logits; + const size_t max_sequence_length = + math::MaximumSequenceLength(logits_lod, level); + auto warpctc_logits_dims = + framework::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); + warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *logits, warpctc_logits, + false); + const T* warpctc_logits_data = warpctc_logits.data(); + + std::vector warpctc_label_lengths(num_sequences); + std::vector warpctc_logits_lengths(num_sequences); + + for (size_t i = 0; i < num_sequences; ++i) { + warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i]; + warpctc_logits_lengths[i] = + logits_lod[level][i + 1] - logits_lod[level][i]; + } + + // warpctc computes loss and gradient in one call, gradient data also stored + // in batch format + T* warpctc_grad_data = + warpctc_grad->mutable_data(warpctc_logits.dims(), ctx.GetPlace()); + + // warpctc accesses labels in CPU memory + Tensor warpctc_label; + Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label); + const int* warpctc_label_data = warpctc_label.data(); + + // warpctc stores loss in CPU memory + Tensor warpctc_loss; + T* warpctc_loss_data = + warpctc_loss.mutable_data(loss_dims, platform::CPUPlace()); + + const size_t blank = static_cast(ctx.Attr("blank")); + + WarpCTCFunctor()( + ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data, + warpctc_label_lengths.data(), warpctc_logits_lengths.data(), + sequence_width, num_sequences, blank, warpctc_loss_data); + + // Copy the loss back + Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss); + } +}; + +template +class WarpCTCGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* warpctc_grad = ctx.Input("WarpCTCGrad"); + auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + + bool norm_by_times = ctx.Attr("norm_by_times"); + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *logits_grad, + *warpctc_grad, norm_by_times); + } +}; + +} // namespace operators +} // namespace paddle diff --git 
a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 65d827e0e0..7a3400919e 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -211,59 +211,54 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { protected: std::unique_ptr Apply() const override { - auto *grad = new framework::OpDesc(); - grad->SetType("while_grad"); - grad->SetInput(kX, Input(kX)); + auto *while_grad = new framework::OpDesc(); + while_grad->SetType("while_grad"); + while_grad->SetInput(kX, Input(kX)); + while_grad->SetInput(kOutputs, Output(kOutputs)); + while_grad->SetInput(kStepScopes, Output(kStepScopes)); + + auto *grad_block = this->grad_block_[0]; + auto *fwd_block = grad_block->ParentBlock(); // Not all of IGs will be generated by inner gradient operators of while op. // Ignore IGs that is not generated by the inside block. - auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); - std::unordered_set all_outs; - for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) { - for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) { - all_outs.insert(oname); + std::unordered_set inner_op_outputs; + for (const auto *op : grad_block->AllOps()) { + for (auto &oname : op->OutputArgumentNames()) { + inner_op_outputs.insert(oname); } } + auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); for (auto &each_ig : igs) { - if (all_outs.find(each_ig) == all_outs.end()) { + if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) { VLOG(10) << "Ignore " << each_ig; each_ig = framework::kEmptyVarName; } } - - grad->SetOutput(framework::GradVarName(kX), igs); - - grad->SetInput(kOutputs, Output(kOutputs)); + while_grad->SetOutput(framework::GradVarName(kX), igs); // OG should be re-calculated by step blocks, since many outputs of while op // do not need to calculate gradients. std::unordered_set block_ins; - auto *fwd_block = this->grad_block_[0]->ParentBlock(); - { - for (auto &p : Input(kX)) { - block_ins.insert(p); - } - for (auto &o : Output(kOutputs)) { - block_ins.insert(o); - } + block_ins.reserve(Input(kX).size() + Output(kOutputs).size()); + for (auto &p : Input(kX)) { + block_ins.insert(p); + } + for (auto &o : Output(kOutputs)) { + block_ins.insert(o); } std::unordered_set extra_inputs; - for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) { - for (auto &input_name : grad_block_[0]->Op(i)->InputArgumentNames()) { - if (block_ins.find(input_name) != block_ins.end()) { - continue; - } - - // If the input of Op is generated by the forward block, do not make it - // as input again. - if (fwd_block->FindVar(input_name) != nullptr) { + for (const auto *op : grad_block->AllOps()) { + for (auto &input_name : op->InputArgumentNames()) { + // If the input of Op has been recorded or is generated by the forward + // block, do not make it as input again. 
+ if (block_ins.find(input_name) != block_ins.end() || + fwd_block->FindVar(input_name) != nullptr) { continue; } - extra_inputs.insert(input_name); } - - for (auto &output_name : grad_block_[0]->Op(i)->OutputArgumentNames()) { + for (auto &output_name : op->OutputArgumentNames()) { block_ins.insert(output_name); } } @@ -272,15 +267,15 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { extra_inputs_list.resize(extra_inputs.size()); std::copy(extra_inputs.begin(), extra_inputs.end(), extra_inputs_list.begin()); - grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list); - grad->SetInput(kStepScopes, Output(kStepScopes)); - grad->SetAttrMap(this->Attrs()); - grad->SetBlockAttr(kStepBlock, *grad_block_[0]); + while_grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list); + + while_grad->SetAttrMap(this->Attrs()); + while_grad->SetBlockAttr(kStepBlock, *grad_block); // record the original output gradient names, since the gradient name of // while operator could be renamed. - grad->SetAttr("original_output_grad", extra_inputs_list); + while_grad->SetAttr("original_output_grad", extra_inputs_list); - return std::unique_ptr(grad); + return std::unique_ptr(while_grad); } }; diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 8c4803b973..44f6d85cd1 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -21,10 +21,16 @@ ELSE() set(GPU_CTX_DEPS) ENDIF() +IF(WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +ELSE() + set(MKLDNN_CTX_DEPS) +ENDIF() + # memcpy deoends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator - system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) + system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 4bf643e048..9d9348079a 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -168,5 +168,69 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; } #endif +#ifdef PADDLE_WITH_MKLDNN +MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) + : CPUDeviceContext(place), ready_(false) { + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0)); +} + +template +void MKLDNNDeviceContext::AddElement(const std::string& op_key, + const T& value) { + if (GetElement(op_key)) { + return; + } + GetElementPool().emplace(op_key, std::move(value)); +} + +template +const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const { + auto it = GetElementPool().find(op_key); + return it == GetElementPool().end() ? 
nullptr : it->second; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return memory_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_pool_; +} + +template <> +const std::unordered_map>& +MKLDNNDeviceContext::GetElementPool() const { + return primitive_desc_pool_; +} + +void MKLDNNDeviceContext::Execute(bool block) { + if (pipeline_.empty()) { + return; + } + ResetStream(); + stream_->submit(pipeline_).wait(block); + ready_ = false; + pipeline_.clear(); +} + +void MKLDNNDeviceContext::ResetStream() { + if (ready_) { + return; + } + // TODO(TJ): change me when mkldnn have specific method to reset this state + stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); + ready_ = true; +} + +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 609ea4bd3a..9826a64276 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -21,6 +21,10 @@ limitations under the License. */ #define EIGEN_USE_GPU #endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/platform/mkldnn_helper.h" +#endif + #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -105,6 +109,54 @@ struct DefaultDeviceContextType { #endif +#ifdef PADDLE_WITH_MKLDNN +class MKLDNNDeviceContext : public CPUDeviceContext { + public: + explicit MKLDNNDeviceContext(CPUPlace place); + + /* \brief Add new element: memory, primitive or primitive desc */ + template + void AddElement(const std::string& op_key, const T& value); + + /* \brief Get existed element: memory, primitive or primitive desc */ + template + const T& GetElement(const std::string& op_key) const; + + /* \brief Get element pool: memory, primitive or primitive desc pool */ + template + const std::unordered_map>& + GetElementPool() const; + + /* \brief Get the active engine */ + const MKLDNNEngine& engine() const { return *engine_; } + + /* \brief Submit primitive to pipeline */ + void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); } + + /*! \brief Execute all submitted primitives in pipeline */ + void Execute(bool block = true); + + protected: + /*! \brief Reset the stream to prepare next exectue */ + void ResetStream(); + + private: + std::unordered_map> + memory_pool_; + std::unordered_map> + primitive_pool_; + std::unordered_map> + primitive_desc_pool_; + std::vector pipeline_; + MKLDNNStreamPtr stream_; + MKLDNNEnginePtr engine_; + bool ready_; +}; +#endif + /*! 
\brief device context pool singleton */ class DeviceContextPool { public: @@ -133,6 +185,8 @@ class DeviceContextPool { const typename DefaultDeviceContextType::TYPE*>(Get(place)); } + size_t size() const { return device_contexts_.size(); } + private: static DeviceContextPool* pool; constexpr static int LEFT_SHIFT = 8; diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt index f4fda65907..cf2081b434 100644 --- a/paddle/platform/dynload/CMakeLists.txt +++ b/paddle/platform/dynload/CMakeLists.txt @@ -1,3 +1,4 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc DEPS dynamic_loader nccl) +cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc index 9cd2a1f565..6aca716657 100644 --- a/paddle/platform/dynload/cublas.cc +++ b/paddle/platform/dynload/cublas.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "paddle/platform/dynload/cublas.h" namespace paddle { namespace platform { diff --git a/paddle/platform/dynload/warpctc.cc b/paddle/platform/dynload/warpctc.cc new file mode 100644 index 0000000000..9b7d01a6e8 --- /dev/null +++ b/paddle/platform/dynload/warpctc.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/dynload/warpctc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +WARPCTC_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynload/warpctc.h b/paddle/platform/dynload/warpctc.h new file mode 100644 index 0000000000..acafcaff2c --- /dev/null +++ b/paddle/platform/dynload/warpctc.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "ctc.h" +#include "paddle/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag warpctc_dso_flag; +extern void* warpctc_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ + std::call_once(warpctc_dso_flag, \ + paddle::platform::dynload::GetWarpCTCDsoHandle, \ + &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + DYNAMIC_LOAD_WARPCTC_WRAP(__name) + +#define WARPCTC_ROUTINE_EACH(__macro) \ + __macro(get_warpctc_version); \ + __macro(ctcGetStatusString); \ + __macro(compute_ctc_loss); \ + __macro(get_workspace_size) + +WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/mkldnn_helper.h b/paddle/platform/mkldnn_helper.h new file mode 100644 index 0000000000..cd52a8b4c4 --- /dev/null +++ b/paddle/platform/mkldnn_helper.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
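The DYNAMIC_LOAD_WARPCTC_WRAP macro above generates, per routine, a small struct that opens the warp-ctc shared library once and resolves the symbol on first call. A rough Python/ctypes analogue of that lazy-loading pattern (the library path and helper names below are assumptions for illustration, not the actual loader):

```python
import ctypes
import threading

_handle = None
_handle_lock = threading.Lock()

def _warpctc_handle(path="libwarpctc.so"):
    """Open the warp-ctc shared library once and cache the handle."""
    global _handle
    with _handle_lock:
        if _handle is None:
            _handle = ctypes.CDLL(path)   # the dlopen happens only on first use
    return _handle

def call_warpctc(func_name, *args):
    """Look up a symbol by name and forward the call, like DynLoad__<name>."""
    return getattr(_warpctc_handle(), func_name)(*args)

# e.g. version = call_warpctc("get_warpctc_version")
```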
*/ + +#pragma once + +#include + +namespace paddle { +namespace platform { + +using MKLDNNStream = mkldnn::stream; +using MKLDNNEngine = mkldnn::engine; +using MKLDNNMemory = mkldnn::memory; +using MKLDNNPrimitive = mkldnn::primitive; +using MKLDNNPrimitiveDesc = mkldnn::handle; + +typedef std::unique_ptr MKLDNNStreamPtr; +typedef std::unique_ptr MKLDNNEnginePtr; +typedef std::unique_ptr MKLDNNMemoryPtr; +typedef std::unique_ptr MKLDNNPrimitivePtr; +typedef std::unique_ptr MKLDNNPrimitiveDescPtr; + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 249527e3e1..f05260ccac 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -51,6 +51,18 @@ bool places_are_same_class(const Place &p1, const Place &p2) { return p1.which() == p2.which(); } +bool is_same_place(const Place &p1, const Place &p2) { + if (places_are_same_class(p1, p2)) { + if (is_cpu_place(p1)) { + return true; + } else { + return boost::get(p1) == boost::get(p2); + } + } else { + return false; + } +} + std::ostream &operator<<(std::ostream &os, const Place &p) { detail::PlacePrinter printer(os); boost::apply_visitor(printer, p); diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 76b5c502cc..fbb43fa043 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -52,6 +52,8 @@ struct IsCUDAPlace : public boost::static_visitor { typedef boost::variant Place; +using PlaceList = std::vector; + void set_place(const Place &); const Place &get_place(); @@ -61,6 +63,7 @@ const CPUPlace default_cpu(); bool is_gpu_place(const Place &); bool is_cpu_place(const Place &); bool places_are_same_class(const Place &, const Place &); +bool is_same_place(const Place &, const Place &); std::ostream &operator<<(std::ostream &, const Place &); diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 4e89e5c600..7e2e2d968e 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -13,12 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/platform/profiler.h" +#include +#include +#include "glog/logging.h" namespace paddle { namespace platform { // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; +// To record which timer the profiler used, CUDA or CPU. 
+static std::string g_profiler_place = ""; // The thread local event list only can be accessed by the specific thread // The thread index of each thread static thread_local int32_t g_thread_id; @@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() { Event::Event(EventKind kind, std::string name, uint32_t thread_id, DeviceContext* dev_ctx) - : kind_(kind), - name_(std::move(name)), - thread_id_(thread_id), - has_cuda_(false) { + : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { #ifdef PADDLE_WITH_CUDA auto* cuda_dev_ctx = static_cast(dev_ctx); if (cuda_dev_ctx) { @@ -72,11 +74,11 @@ std::string Event::kind() const { PADDLE_THROW("Unknown EventKind."); } -double Event::CpuElapsedUs(const Event& e) const { - return (e.cpu_ns_ - cpu_ns_) / (1000.0); +double Event::CpuElapsedMs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000000.0); } -double Event::CudaElapsedUs(const Event& e) const { +double Event::CudaElapsedMs(const Event& e) const { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE(e.has_cuda() && has_cuda()); PADDLE_ENFORCE(e.device() == device()); @@ -84,7 +86,7 @@ double Event::CudaElapsedUs(const Event& e) const { PADDLE_ENFORCE(cudaEventSynchronize(e.event())); float ms; PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); - return ms * 1000.0; + return ms; #else PADDLE_THROW("CUDA is not enabled"); #endif @@ -113,21 +115,27 @@ inline EventList& GetEventList() { } void Mark(const std::string& name, DeviceContext* dev_ctx) { - GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id, - dev_ctx); + GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); +} + +void PushEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); +} + +void PopEvent(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); } RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { if (g_state == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; - GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id, - dev_ctx_); + name_ = name; + PushEvent(name_, dev_ctx_); } RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled) return; - GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id, - dev_ctx_); + PopEvent(name_, dev_ctx_); } void EnableProfiler(ProfilerState state) { @@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) { "The profiling state should be disabled when calling ", "EnableProfiler."); g_state = state; + g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { // Generate some dummy evenets first to reduce the startup overhead. 
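Two things change in the hunks above: elapsed times are reported in milliseconds (a nanosecond difference divided by 1e6), and the push/pop markers become the standalone PushEvent/PopEvent calls. A minimal Python sketch of the same pair-and-measure idea (the list and function names are invented for illustration):

```python
import time

events = []   # stands in for the per-thread event list

def push_event(name):
    events.append(("push", name, time.monotonic_ns()))

def pop_event(name):
    events.append(("pop", name, time.monotonic_ns()))

push_event("op_1")
_ = sum(range(100000))        # code being profiled
pop_event("op_1")

start = next(e for e in events if e[0] == "push" and e[1] == "op_1")
stop = next(e for e in reversed(events) if e[0] == "pop" and e[1] == "op_1")
print("op_1 took %.3f ms" % ((stop[2] - start[2]) / 1e6))
```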
@@ -169,5 +178,152 @@ std::vector> DisableProfiler() { return result; } +void ParseEvents(std::vector>& events, + EventSortingKey sorted_by) { + if (g_profiler_place == "") return; + + std::string sorted_domain; + std::function sorted_func; + switch (sorted_by) { + case EventSortingKey::kCalls: + sorted_domain = "number of calls"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.calls > b.calls; + }; + break; + case EventSortingKey::kTotal: + sorted_domain = "total time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.total_time > b.total_time; + }; + break; + case EventSortingKey::kMin: + sorted_domain = "minimum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.min_time > b.min_time; + }; + break; + case EventSortingKey::kMax: + sorted_domain = "maximum time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.max_time > b.max_time; + }; + break; + case EventSortingKey::kAve: + sorted_domain = "average time"; + sorted_func = [](const EventItem& a, const EventItem& b) { + return a.ave_time > b.ave_time; + }; + break; + default: + sorted_domain = "event end time"; + } + + std::vector> events_table; + size_t max_name_width = 0; + for (size_t i = 0; i < events.size(); i++) { + std::list pushed_events; + std::vector event_items; + std::unordered_map event_idx; + + for (size_t j = 0; j < events[i].size(); j++) { + if (events[i][j].kind() == "push") { + pushed_events.push_back(events[i][j]); + } else if (events[i][j].kind() == "pop") { + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend() && + rit->name() != events[i][j].name()) { + ++rit; + } + + if (rit != pushed_events.rend()) { + double event_time = (g_profiler_place == "CUDA") + ? 
rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); + std::string event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + + if (event_idx.find(event_name) == event_idx.end()) { + event_idx[event_name] = event_items.size(); + EventItem event_item = {event_name, 1, event_time, + event_time, event_time, event_time}; + event_items.push_back(event_item); + } else { + int index = event_idx[event_name]; + event_items[index].calls += 1; + // total time + event_items[index].total_time += event_time; + // min time + event_items[index].min_time = + std::min(event_time, event_items[index].min_time); + // max time + event_items[index].max_time = + std::max(event_time, event_items[index].max_time); + } + + // remove the push marker from the list + pushed_events.erase((++rit).base()); + } else { + LOG(WARNING) << "Cannot find the push marker of event \'" + << events[i][j].name() + << "\', which will be ignored in profiling report."; + } + } + } + // average time + for (auto& item : event_items) { + item.ave_time = item.total_time / item.calls; + } + // sort + if (sorted_by != EventSortingKey::kDefault) { + std::sort(event_items.begin(), event_items.end(), sorted_func); + } + + events_table.push_back(event_items); + // log warning if there are events with `push` but without `pop` + std::list::reverse_iterator rit = pushed_events.rbegin(); + while (rit != pushed_events.rend()) { + LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name() + << "\', which will be ignored in profiling report."; + ++rit; + } + } + + // Print report + PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12); +} + +void PrintProfilingReport(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width) { + // Output header information + std::cout << "\n------------------------->" + << " Profiling Report " + << "<-------------------------\n\n"; + std::cout << "Place: " << g_profiler_place << std::endl; + std::cout << "Time unit: ms" << std::endl; + std::cout << "Sorted by " << sorted_domain + << " in descending order in the same thread\n\n"; + // Output events table + std::cout.setf(std::ios::left); + std::cout << std::setw(name_width) << "Event" << std::setw(data_width) + << "Calls" << std::setw(data_width) << "Total" + << std::setw(data_width) << "Min." << std::setw(data_width) + << "Max." << std::setw(data_width) << "Ave." 
<< std::endl; + for (size_t i = 0; i < events_table.size(); ++i) { + for (size_t j = 0; j < events_table[i].size(); ++j) { + EventItem& event_item = events_table[i][j]; + std::cout << std::setw(name_width) << event_item.name + << std::setw(data_width) << event_item.calls + << std::setw(data_width) << event_item.total_time + << std::setw(data_width) << event_item.min_time + << std::setw(data_width) << event_item.max_time + << std::setw(data_width) << event_item.ave_time << std::endl; + } + } + std::cout << std::endl; +} + } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index 47104ea9d0..6df48ef880 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -33,6 +33,7 @@ class Event { std::string kind() const; std::string name() const { return name_; } + uint32_t thread_id() const { return thread_id_; } bool has_cuda() const { return has_cuda_; } #ifdef PADDLE_WITH_CUDA @@ -40,8 +41,8 @@ class Event { int device() const { return device_; } #endif - double CpuElapsedUs(const Event& e) const; - double CudaElapsedUs(const Event& e) const; + double CpuElapsedMs(const Event& e) const; + double CudaElapsedMs(const Event& e) const; private: EventKind kind_; @@ -94,6 +95,10 @@ enum ProfilerState { void Mark(const std::string& name, DeviceContext* dev_ctx); +void PushEvent(const std::string& name, DeviceContext* dev_ctx); + +void PopEvent(const std::string& name, DeviceContext* dev_ctx); + struct RecordEvent { explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx); @@ -101,6 +106,8 @@ struct RecordEvent { // The device context is used by Event to get the current cuda stream. DeviceContext* dev_ctx_; + // Event name + std::string name_; }; // Enable the profiling function. @@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state); // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. 
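ParseEvents and PrintProfilingReport above fold matched push/pop pairs into per-event statistics and print them sorted by the chosen key. The aggregation reduces to roughly the following sketch (the timing data is made up; only the Calls/Total/Min./Max./Ave. columns mirror the report):

```python
timings = {"op_1": [1.2, 0.9, 1.1], "op_2": [4.5, 4.7]}  # elapsed ms per call

rows = []
for name, samples in timings.items():
    rows.append({
        "name": name,
        "calls": len(samples),
        "total": sum(samples),
        "min": min(samples),
        "max": max(samples),
        "ave": sum(samples) / len(samples),
    })

rows.sort(key=lambda r: r["total"], reverse=True)   # EventSortingKey::kTotal
header = ("Event", "Calls", "Total", "Min.", "Max.", "Ave.")
print("%-16s%8s%12s%12s%12s%12s" % header)
for r in rows:
    print("%-16s%8d%12.3f%12.3f%12.3f%12.3f"
          % (r["name"], r["calls"], r["total"], r["min"], r["max"], r["ave"]))
```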
std::vector> DisableProfiler(); +// The information of each event given in the profiling report +struct EventItem { + std::string name; + int calls; + double total_time; + double min_time; + double max_time; + double ave_time; +}; + +// Candidate keys to sort the profiling report +enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; + +// Parse the event list and output the profiling report +void ParseEvents(std::vector>&, + EventSortingKey sorted_by = EventSortingKey::kDefault); + +// Print results +void PrintProfilingReport(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width); } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 47cf7be146..13dea713c7 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) { counter++; } Event stop_event(EventKind::kPopRange, "test", 0, nullptr); - EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); + EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); } #ifdef PADDLE_WITH_CUDA @@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) { counter++; } Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); - EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0); + EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); } #endif @@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) { using paddle::platform::EventKind; using paddle::platform::RecordEvent; using paddle::platform::ProfilerState; + using paddle::platform::EventSortingKey; ProfilerState state = ProfilerState::kCPU; DeviceContext* dev_ctx = nullptr; @@ -67,13 +68,45 @@ TEST(RecordEvent, RecordEvent) { #endif EnableProfiler(state); + /* Usage 1: + * PushEvent(evt_name, dev_ctx); + * ... + * code to be analyzed + * ... + * PopEvent(evt_name, dev_ctx); + */ + for (int loop = 0; loop < 3; ++loop) { + for (int i = 1; i < 5; ++i) { + std::string name = "op_" + std::to_string(i); + PushEvent(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + PopEvent(name, dev_ctx); + } + } + + /* Usage 2: + * { + * RecordEvent record_event(name, dev_ctx); + * ... + * code to be analyzed + * ... 
+ * } + */ for (int i = 1; i < 5; ++i) { - std::string name = "op_" + std::to_string(i); + std::string name = "evs_op_" + std::to_string(i); RecordEvent record_event(name, dev_ctx); int counter = 1; while (counter != i * 1000) counter++; } + + // Bad Usage: + PushEvent("event_without_pop", dev_ctx); + PopEvent("event_without_push", dev_ctx); std::vector> events = paddle::platform::DisableProfiler(); + // Will remove parsing-related code from test later + ParseEvents(events, EventSortingKey::kTotal); + int cuda_startup_count = 0; int start_profiler_count = 0; int stop_profiler_count = 0; @@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) { if (events[i][j].name() == "push") { EXPECT_EQ(events[i][j + 1].name(), "pop"); #ifdef PADDLE_WITH_CUDA - EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0); + EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0); #else - EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0); + EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0); #endif } } diff --git a/paddle/pybind/const_value.cc b/paddle/pybind/const_value.cc index 761635aa5e..b13ad42ea2 100644 --- a/paddle/pybind/const_value.cc +++ b/paddle/pybind/const_value.cc @@ -23,11 +23,6 @@ void BindConstValue(pybind11::module& m) { m.def("kTempVarName", [] { return framework::kTempVarName; }); m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); - - // for kernel_hint key - m.def("kUseCPU", [] { return framework::kUseCPU; }); - m.def("kUseCUDNN", [] { return framework::kUseCUDNN; }); - m.def("kUseMKLDNN", [] { return framework::kUseMKLDNN; }); } } // namespace pybind diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 564a370001..4f95948153 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -231,7 +231,8 @@ void BindVarDsec(py::module &m) { .value("FETCH_LIST", proto::VarDesc::FETCH_LIST) .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES) .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE) - .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY); + .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY) + .value("PLACE_LIST", proto::VarDesc::PLACE_LIST); } void BindOpDesc(py::module &m) { diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 364db62cba..5d170c66e9 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -430,6 +430,12 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("init_glog", framework::InitGLOG); m.def("init_devices", &framework::InitDevices); + m.def("use_cpu", framework::UseCPU); + m.def("use_mkldnn", framework::UseMKLDNN); + m.def("use_cuda", framework::UseCUDA); + m.def("use_cudnn", framework::UseCUDNN); + m.def("use_all", framework::UseALL); + m.def("is_compile_gpu", IsCompileGPU); m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 92039ec6b0..e70d04d901 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -193,6 +193,16 @@ EOF EOF } +function gen_capi_package() { + if [[ ${WITH_C_API} == "ON" ]]; then + install_prefix="/paddle/build/capi_output" + rm -rf $install_prefix + make DESTDIR="$install_prefix" install + cd $install_prefix/usr/local + ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz + fi +} + set -xe cmake_gen ${PYTHON_ABI:-""} @@ -200,6 +210,11 @@ run_build run_test gen_docs gen_dockerfile - -printf "If you need to install PaddlePaddle in develop docker image," -printf "please make install or pip install build/python/dist/*.whl.\n" +gen_capi_package + +if [[ ${WITH_C_API:-OFF} == "ON" ]]; then + printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" +else + printf "If you need to install PaddlePaddle in develop docker image," + printf "please make install or pip install build/python/dist/*.whl.\n" +fi diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 8a352b0078..80fa0c72af 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -49,7 +49,18 @@ function cpu_config() { if [ "@WITH_MKL@" == "OFF" ]; then return 0 fi - ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` + platform="`uname -s`" + ht=0 + if [ $platform == "Linux" ]; then + ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` + elif [ $platform == "Darwin" ]; then + if [`sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu`]; then + # HT is OFF + ht=1 + fi + else + return 0 + fi if [ $ht -eq 1 ]; then # HT is OFF if [ -z "$KMP_AFFINITY" ]; then export KMP_AFFINITY="granularity=fine,compact,0,0" @@ -72,7 +83,15 @@ function threads_config() { # according to trainer_count and total processors # only when MKL enabled # auto set OPENBLAS_NUM_THREADS when do not use MKL - processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` + platform="`uname -s`" + processors=0 + if [ $platform == "Linux" ]; then + processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` + elif [ $platform == "Darwin" ]; then + processors=`sysctl -n hw.logicalcpu` + else + return 0 + fi trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` if [ -z $trainers ]; then trainers=1 @@ -92,6 +111,9 @@ function threads_config() { if [ -z "$OPENBLAS_NUM_THREADS" ]; then export OPENBLAS_NUM_THREADS=$threads fi + if [ $threads -gt 1 ] && [ -z "$OPENBLAS_MAIN_FREE" ]; then + export OPENBLAS_MAIN_FREE=1 + fi fi } @@ -145,11 +167,7 @@ else: sys.exit(0) EOF -if [ "`uname -s`" == "Linux" ]; then - # only support on linux yet, with mac can use v2 - cpu_config -fi - +cpu_config # echo $KMP_AFFINITY $OMP_DYNAMIC case "$1" in diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 108ff335bf..a7fb50ee41 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -34,11 +34,11 @@ int main(int argc, char** argv) { 
google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); testing::InitGoogleTest(&argc, argv); paddle::memory::Used(paddle::platform::CPUPlace()); - std::vector devs = {"CPU"}; + #ifdef PADDLE_WITH_CUDA paddle::memory::Used(paddle::platform::CUDAPlace(0)); - devs.push_back("GPU:0"); #endif - paddle::framework::InitDevices(devs); + + paddle::framework::InitDevices(); return RUN_ALL_TESTS(); } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 6f589e9169..36919ab00b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -29,8 +29,8 @@ if(WITH_MKLML) endif() if(WITH_MKLDNN) - list(APPEND MKL_SHARED_LIBS "${MKLDNN_LIB}" "${MKLDNN_LIB}.0") - list(APPEND MKL_DEPENDS mkldnn) + list(APPEND MKL_SHARED_LIBS "${MKLDNN_SHARED_LIB}") + list(APPEND MKL_DEPENDS mkldnn mkldnn_shared_lib) endif() if(WITH_GPU) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index ecba871910..e6f87ce61b 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -58,12 +58,12 @@ def is_compatible_with(x, Type): class HookAttribute(object): """ - Hook Attribute object. As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs + Hook Attribute object. As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs during training process of a layer with parameters, such as img_conv layer, fc layer. - :param type: Hook type, currently supported types: + :param type: Hook type, currently supported types: 'pruning' : user specify a sparsity_ratio before training started, and the - network will prune the parameters based on the sparsity_ratio. + network will prune the parameters based on the sparsity_ratio. eg: The definition of Hook object can be hk = HookAttribute('pruning', 0.6) The specific usage can be paddle.layer.img_conv(input=img, filter_size=3, num_channels=3, num_filters=64, @@ -71,10 +71,10 @@ class HookAttribute(object): The pruning details can be found https://arxiv.org/pdf/1506.02626.pdf :type type: string - :param sparsity_ratio: Must be specified if hook type is 'pruning', + :param sparsity_ratio: Must be specified if hook type is 'pruning', it represents the ratio of the zero elements to be set by the Parameter. :type sparsity_ratio: float or None - + """ def __init__(self, type, sparsity_ratio=None): @@ -130,10 +130,12 @@ class ParameterAttribute(object): :param sparse_update: Enable sparse update for this parameter. It will enable both local and remote sparse update. :type sparse_update: bool + :param update_hooks: A HookAttribute object. + :type update_hooks: HookAttribute :param initializer: If not None, it should be a callable object which accepts a parameter name and returns numpy array for the initial value of the parameter - :param initializer: callable object + :type initializer: callable object """ def __init__(self, diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 19e2ab1b7d..eac2cb3168 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2542,15 +2542,21 @@ def img_conv_layer(input, what-are-deconvolutional-layers/>`_ . The num_channel means input image's channel number. It may be 1 or 3 when input is raw pixels of image(mono or RGB), or it may be the previous layer's - num_filters * num_group. + num_filters. 
There are several groups of filters in PaddlePaddle implementation. - Each group will process some channels of the input. For example, if - num_channel = 256, group = 4, num_filter=32, the PaddlePaddle will create - 32*4 = 128 filters to process the input. The channels will be split into 4 - pieces. First 256/4 = 64 channels will be processed by first 32 filters. The - rest channels will be processed by the rest groups of filters. - + If the groups attribute is greater than 1, for example groups=2, + the input will be splitted into 2 parts along the channel axis, and + the filters will also be splitted into 2 parts. The first half of the filters + is only connected to the first half of the input channels, while the second + half of the filters is only connected to the second half of the input. After + the computation of convolution for each part of input, + the output will be obtained by concatenating the two results. + + The details of grouped convolution, please refer to: + `ImageNet Classification with Deep Convolutional Neural Networks + `_ + The example usage is: .. code-block:: python @@ -2575,7 +2581,8 @@ def img_conv_layer(input, :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter is not set, it will be set automatically according to filter_size. :type filter_size_y: int - :param num_filters: Each filter group's number of filter + :param num_filters: The number of filters. It is as same as the output image channel. + :type num_filters: int :param act: Activation type. ReluActivation is the default activation. :type act: BaseActivation :param groups: The group number. 1 is the default group number. @@ -7177,7 +7184,7 @@ def img_conv3d_layer(input, :param filter_size: The dimensions of the filter kernel along three axises. If the parameter is set to one integer, the three dimensions will be same. :type filter_size: int | tuple | list - :param num_filters: The number of filters in each group. + :param num_filters: The number of filters. It is as same as the output image channel. :type num_filters: int :param act: Activation type. ReluActivation is the default activation. :type act: BaseActivation diff --git a/python/paddle/utils/dump_v2_config.py b/python/paddle/utils/dump_v2_config.py new file mode 100644 index 0000000000..5dc2111e37 --- /dev/null +++ b/python/paddle/utils/dump_v2_config.py @@ -0,0 +1,62 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections + +from paddle.trainer_config_helpers.layers import LayerOutput +from paddle.v2.layer import parse_network +from paddle.proto import TrainerConfig_pb2 + +__all__ = ["dump_v2_config"] + + +def dump_v2_config(topology, save_path, binary=False): + """ Dump the network topology to a specified file. + + This function is only used to dump network defined by using PaddlePaddle V2 + APIs. This function will NOT dump configurations related to PaddlePaddle + optimizer. 
+ + :param topology: The output layers (can be more than one layers given in a + Python List or Tuple) of the entire network. Using the + specified layers (if more than one layer is given) as root, + traversing back to the data layer(s), all the layers + connected to the specified output layers will be dumped. + Layers not connceted to the specified will not be dumped. + :type topology: LayerOutput|List|Tuple + :param save_path: The path to save the dumped network topology. + :type save_path: str + :param binary: Whether to dump the serialized network topology or not. + The default value is false. NOTE that, if you call this + function to generate network topology for PaddlePaddle C-API, + a serialized version of network topology is required. When + using PaddlePaddle C-API, this flag MUST be set to True. + :type binary: bool + """ + + if isinstance(topology, LayerOutput): + topology = [topology] + elif isinstance(topology, collections.Sequence): + for out_layer in topology: + assert isinstance(out_layer, LayerOutput), ( + "The type of each element in the parameter topology " + "should be LayerOutput.") + else: + raise RuntimeError("Error input type for parameter topology.") + + model_str = parse_network(topology) + with open(save_path, "w") as fout: + if binary: + fout.write(model_str.SerializeToString()) + else: + fout.write(str(model_str)) diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py index 421e953d27..2b10020772 100644 --- a/python/paddle/utils/merge_model.py +++ b/python/paddle/utils/merge_model.py @@ -30,7 +30,8 @@ def merge_v2_model(net, param_file, output_file): which ends with .tar.gz. @param net The output layer of the network for inference. - @param param_file Path of the parameters (.tar.gz) which is stored by v2 api. + @param param_file Path of the parameters (.tar.gz) which is stored by + v2 api. @param output_file Path of the merged file which will be generated. Usage: diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 0de417df2c..df710c33d0 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -62,12 +62,15 @@ __all__ = [ cp.begin_parse() -def set_omp_mkl_env_vars(trainer_count): +def set_env_vars(trainer_count): '''Auto set CPU environment if have not set before. - export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status. - export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count. + For MKL: + export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status. + export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count. + For OpenBLAS: + export OPENBLAS_NUM_THREADS, OPENBLAS_MAIN_FREE according to trainer_count. 
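The docstring above states the heuristic the code applies: split the logical processors evenly across trainers, then export the count to whichever BLAS backend was built in (MKL or OpenBLAS). A hedged sketch of that rule, with an invented function name and example numbers:

```python
def pick_thread_env(num_processors, trainer_count, use_mkl):
    """Divide logical processors among trainers and build the env vars to export."""
    threads = max(1, num_processors // trainer_count)
    env = {}
    if use_mkl:
        env["OMP_NUM_THREADS"] = str(threads)
        env["MKL_NUM_THREADS"] = str(threads)
    else:
        env["OPENBLAS_NUM_THREADS"] = str(threads)
        if threads > 1:
            env["OPENBLAS_MAIN_FREE"] = "1"
    return env

print(pick_thread_env(8, 2, use_mkl=False))
# {'OPENBLAS_NUM_THREADS': '4', 'OPENBLAS_MAIN_FREE': '1'}
```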
''' - import platform + import platform, paddle if not platform.system() in ['Linux', 'Darwin']: return @@ -103,16 +106,22 @@ def set_omp_mkl_env_vars(trainer_count): num_cores = num_physical_cores() num_processors = num_logical_processors() - if num_processors > num_cores: # Hyper Threading is enabled - set_env("OMP_DYNAMIC", "true") - set_env("KMP_AFFINITY", "granularity=fine,compact,1,0") - else: - set_env("OMP_DYNAMIC", "false") - set_env("KMP_AFFINITY", "granularity=fine,compact,0,0") + if paddle.version.mkl() == 'ON': + if num_processors > num_cores: # Hyper Threading is enabled + set_env("OMP_DYNAMIC", "true") + set_env("KMP_AFFINITY", "granularity=fine,compact,1,0") + else: + set_env("OMP_DYNAMIC", "false") + set_env("KMP_AFFINITY", "granularity=fine,compact,0,0") threads = num_processors / trainer_count threads = '1' if threads < 1 else str(threads) - set_env("OMP_NUM_THREADS", threads) - set_env("MKL_NUM_THREADS", threads) + if paddle.version.mkl() == 'ON': + set_env("OMP_NUM_THREADS", threads) + set_env("MKL_NUM_THREADS", threads) + else: + set_env("OPENBLAS_NUM_THREADS", threads) + if threads > 1: + set_env("OPENBLAS_MAIN_FREE", '1') def init(**kwargs): @@ -129,7 +138,7 @@ def init(**kwargs): for key in args_dict.keys(): args.append('--%s=%s' % (key, str(args_dict[key]))) - set_omp_mkl_env_vars(kwargs.get('trainer_count', 1)) + set_env_vars(kwargs.get('trainer_count', 1)) if 'use_gpu' in kwargs: cp.g_command_config_args['use_gpu'] = kwargs['use_gpu'] diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index a14422ee92..2dbdebf3fa 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -20,12 +20,14 @@ from core import LoDTensor, CPUPlace, CUDAPlace from distribute_transpiler import DistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler import clip +from memory_optimization_transpiler import memory_optimize Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', - 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr' - 'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler' + 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr', + 'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler', + 'memory_optimize' ] @@ -58,15 +60,11 @@ def __bootstrap__(): read_env_flags = ['use_pinned_memory', 'check_nan_inf'] if core.is_compile_gpu(): - read_env_flags.append('fraction_of_gpu_memory_to_use') + read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync'] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) - - if core.is_compile_gpu(): - core.init_devices(["CPU", "GPU:0"]) - else: - core.init_devices(["CPU"]) + core.init_devices() __bootstrap__() diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index ac60bf5436..cea2d1e090 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -1,13 +1,14 @@ from paddle.v2.fluid import framework as framework from . 
import core import collections +import copy -__all__ = ['append_backward'] +__all__ = ['append_backward', 'calc_gradient'] def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): """ - Traverse all ops in op_descs[begin_idx : end_idx], + Traverse all ops in op_descs[begin_idx : end_idx], if any op has inputs/outputs named "old_name", rename it as 'new_name' """ if begin_idx is None: @@ -65,6 +66,18 @@ def _all_in_set_(cands, s): return True +def _some_in_set_(cands, s): + """ + Test if some elements of 'cands' are in set 's' + """ + if len(cands) == 0: + return False + for c in cands: + if c in s: + return True + return False + + def _strip_grad_suffix_(name): """ Strip the grad suffix from the given varibale name @@ -162,15 +175,15 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): if core.grad_var_suffix() in arg and arg in no_grad_set: to_insert.append((_create_op_desc_("fill_zeros_like", { "X": [_strip_grad_suffix_(arg)] - }, {"Y": [arg]}, {}), idx)) + }, {"Out": [arg]}, {}), idx)) map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert)) return op_descs -def _append_backward_ops_(target, - block, +def _append_backward_ops_(block, + ops, target_block, no_grad_dict, grad_to_var, @@ -179,32 +192,43 @@ def _append_backward_ops_(target, Create all grad ops, and insert them into given block Args: - target(Variable): the target variable of forward pass block(Block): the block where forward ops are + ops(Op): the forward operators whose backward ops need to be added target_block(Block): the block which is going to hold new generated grad ops - no_grad_dict(dict): + no_grad_dict(dict): key(int) block index val(set) a set of varibale names. These varibales have no gradient grad_to_var(dict)(output argument): key(str): grad variable name val(str): corresponding forward variable name + callback(callable object): a callable object used to decorate new generated grad ops """ + if callback is None: + + def empty_callback(block, context): + pass + + callback = empty_callback + elif not hasattr(callback, '__call__'): + raise ValueError("'callback' must be a callable object.") + # grad_op_descs holds created grad_op, and will be appended to target_block grad_op_descs = [] program = block.program - for op in reversed(block.ops): + for op in reversed(ops): grad_sub_block_list = [] # If the op has its own sub-block, deal with the sub-block first if op.has_attr("sub_block"): sub_block = program.block(op.block_attr("sub_block")) grad_sub_block = program.create_block(parent_idx=sub_block.idx) - _append_backward_ops_(target, sub_block, grad_sub_block, - no_grad_dict, grad_to_var, callback) + _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, + no_grad_dict, grad_to_var) grad_sub_block_list.append(grad_sub_block.desc) # Getting op's corresponding grad_op grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, no_grad_dict[block.idx], grad_sub_block_list) + grad_op_descs.extend(grad_op_desc) grad_to_var.update(op_grad_to_var) @@ -213,18 +237,11 @@ def _append_backward_ops_(target, grad_op_descs = _remove_no_grad_branch_(grad_op_descs, no_grad_dict[block.idx]) - if target_block.idx == 0: - grad_op_descs.insert( - 0, - _create_op_desc_("fill_constant", {}, { - "Out": [_append_grad_suffix_(target.name)] - }, {"shape": [1], - "value": 1.0, - "dtype": target.dtype})) # append op_desc in grad_op_descs to target_block for op_desc in grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) + callback(block=target_block, 
context=grad_to_var) def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): @@ -240,7 +257,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): In most cases, this dict is generated by _append_backward_ops_() grad_info_map(dict)(output argument): key(str): forward variable name - val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index + val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block containing grad variable """ for op_idx in range(start_op_idx, block.desc.op_size()): op_desc = block.desc.op(op_idx) @@ -267,41 +284,63 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): _infer_var_data_type_(arg, block) -def append_backward(loss, parameter_list=None, no_grad_set=None): +def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): + var_map = copy.copy(target_grad_map) + for op_idx in range(start_op_idx, block.desc.op_size()): + op_desc = block.desc.op(op_idx) + for name in op_desc.input_arg_names(): + if name in var_map: + op_desc.rename_input(name, var_map[name]) + + for name in op_desc.output_arg_names(): + if block.desc.find_var(name.encode("ascii")): + new_name = "%s_%s" % (name, core.unique_integer(name)) + op_desc.rename_output(name, new_name) + var_map[name] = new_name + + for g, ng in var_map.iteritems(): + if g in grad_to_var: + grad_to_var[ng] = grad_to_var[g] + grad_to_var.pop(g) + + +def _get_stop_gradients_(program): + no_grad_dict = dict() + assert isinstance(program, framework.Program) + for block in program.blocks: + assert isinstance(block, framework.Block) + block_no_grad_set = set() + for var in block.vars.itervalues(): + assert isinstance(var, framework.Variable) + if var.stop_gradient: + block_no_grad_set.add(_append_grad_suffix_(var.name)) + no_grad_dict[block.idx] = block_no_grad_set + return no_grad_dict + + +def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None): """ Append backward part to main_program Args: loss(Variable): The variable generated by cost function. - parameter_list(list): Parameters that need to be updated by optimizer. - If None, it means all parameters need to be updated. - no_grad_set(set): Variables that have no gradients in Block 0. - If None, the set will be generated inside the function and - contains all variables with `step_gradient=True` from all blocks. + parameter_list(list[string]): Parameters that need to be updated by + optimizer. If None, it means all parameters need to be updated. + no_grad_set(set): Variables that have no gradients in Block 0. + All variables with `step_gradient=True` from all blocks will be + automatically added. Return: - (list[Variable]): list of (parameters, gradients) pair. + (list[(Variable,Variable)]): list of (parameter, gradient) pair. 
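A hedged usage sketch of the interface documented above (the small network is made up for illustration and the layer names are assumptions; only the call pattern and the returned (parameter, gradient) pairs follow from this docstring):

```python
import paddle.v2.fluid as fluid

x = fluid.layers.data(name="x", shape=[13], dtype="float32")
y = fluid.layers.data(name="y", shape=[1], dtype="float32")
y_pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(x=fluid.layers.square_error_cost(input=y_pred, label=y))

# One (parameter, gradient) pair per trainable parameter of the network.
params_and_grads = fluid.backward.append_backward(loss)
for param, grad in params_and_grads:
    print(param.name, None if grad is None else grad.name)
```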
""" assert isinstance(loss, framework.Variable) program = loss.block.program - no_grad_dict = dict() if no_grad_set is None: - assert isinstance(program, framework.Program) - for block in program.blocks: - assert isinstance(block, framework.Block) - block_no_grad_set = set() - for var in block.vars.itervalues(): - assert isinstance(var, framework.Variable) - if var.stop_gradient: - block_no_grad_set.add(_append_grad_suffix_(var.name)) - no_grad_dict[block.idx] = block_no_grad_set - elif isinstance(no_grad_set, set): - no_grad_dict = { - 0: set([_append_grad_suffix_(name) for name in no_grad_set]) - } - else: - raise ValueError("'no_grad_set' should be a set or None.") + no_grad_set = set() + no_grad_set = copy.copy(no_grad_set) + no_grad_dict = _get_stop_gradients_(program) + no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set)) grad_info_map = dict() root_block = program.block(0) @@ -310,8 +349,25 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): current_block_idx = program.current_block_idx grad_to_var = dict() - _append_backward_ops_(loss, root_block, root_block, no_grad_dict, - grad_to_var) + op_desc = _create_op_desc_("fill_constant", {}, { + "Out": [_append_grad_suffix_(loss.name)] + }, {"shape": [1], + "value": 1.0, + "dtype": loss.dtype}) + root_block.desc.append_op().copy_from(op_desc) + + block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) + op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set) + no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set)) + + _append_backward_ops_(root_block, op_path, root_block, no_grad_dict, + grad_to_var, callback) + + # Because calc_gradient may be called multiple times, + # we need rename the internal gradient variables so that they have + # different names. 
+ _rename_grad_(root_block, fwd_op_num, grad_to_var, {}) + _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) program.current_block_idx = current_block_idx @@ -322,6 +378,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): else: params = program.global_block().all_parameters() parameters = [param.name for param in params] + params_and_grads = [] for param in parameters: if param not in grad_info_map: @@ -339,3 +396,147 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): else: params_and_grads.append((param_var, None)) return params_and_grads + + +def _as_list(x): + if x is None: + return [] + return list(x) if isinstance(x, collections.Sequence) else [x] + + +def _find_op_path_(block, outputs, inputs, no_grad_set): + """ + no_grad_set will also be changed + """ + input_names = set([inp.name for inp in inputs]) + output_names = set([out.name for out in outputs]) + + relevant_op_flags = [True] * len(block.ops) + + # All the inputs of the block are used if inputs is empty, + if inputs: + for i, op in enumerate(block.ops): + if _some_in_set_(op.desc.input_arg_names(), input_names): + for name in op.desc.output_arg_names(): + if name not in no_grad_set: + input_names.add(name) + else: + relevant_op_flags[i] = False + + for i, op in reversed(list(enumerate(block.ops))): + if _some_in_set_(op.desc.output_arg_names(), output_names): + for name in op.desc.input_arg_names(): + if name not in no_grad_set: + output_names.add(name) + else: + relevant_op_flags[i] = False + + op_path = [ + block.ops[i] for i in range(len(block.ops)) if relevant_op_flags[i] + ] + + if inputs: + for op in op_path: + for name in op.desc.input_arg_names(): + if name not in input_names: + no_grad_set.add(name) + + return op_path + + +def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): + """ + Backpropagate the graidents of targets to inputs. + + Args: + targets(Variable|list[Variable]): The target variables + inputs(Variable|list[Variable]): The input variables + no_grad_set(set[string]): The names of variables that have no gradients + in Block 0. All variables with `stop_gradient=True` from all blocks + will be automatically added. 
+ + Return: + (list[Variable]): list of gradients for inputs + If an input does not affect targets, the corresponding gradient variable + will be None + """ + targets = _as_list(targets) + inputs = _as_list(inputs) + target_gradients = _as_list(target_gradients) + + block = targets[0].block + prog = block.program + block_idx = block.idx + + if not target_gradients: + target_gradients = [None] * len(targets) + + if len(targets) != len(target_gradients): + raise ValueError( + "Should have the same number of target_gradients as targets") + + if no_grad_set is None: + no_grad_set = set() + no_grad_set = copy.copy(no_grad_set) + no_grad_dict = _get_stop_gradients_(prog) + no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set)) + + fwd_op_num = block.desc.op_size() + + target_grad_map = {} + for i, grad in enumerate(target_gradients): + target = targets[i] + if grad is None: + grad_name = _append_grad_suffix_(target.name) + op_desc = _create_op_desc_("fill_constant_batch_size_like", + {"Input": [target.name]}, + {"Out": [grad_name]}, { + "shape": target.shape, + "value": 1.0, + "dtype": target.dtype, + 'input_dim_idx': 0, + 'output_dim_idx': 0 + }) + block.desc.append_op().copy_from(op_desc) + else: + if target.block.idx != block_idx or target.block.program != prog: + raise ValueError("all targets must be in the same block") + if target.shape != grad.shape: + raise ValueError( + "The shapes of target and grad are different: %s %s" % ( + target.name, grad.name)) + target_grad_map[_append_grad_suffix_(target.name)] = grad.name + + for input in inputs: + if input.block.program != prog: + raise "input must be in the same program as targets" + + block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) + op_path = _find_op_path_(block, targets, inputs, block_no_grad_set) + no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set)) + grad_to_var = dict() + grad_info_map = dict() + _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var) + + # Because calc_gradient may be called multiple times, + # we need rename the internal gradient variables so that they have + # different names. + _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map) + + _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map) + prog.sync_with_cpp() + + grad_vars = [] + for input_var in inputs: + if input_var.name not in grad_info_map: + grad_vars.append(None) + else: + grad_info = grad_info_map[input_var.name] + grad_block = grad_info[1] + grad_var = grad_block.var(grad_info[0]) + grad_vars.append(grad_var) + + if len(grad_vars) == 1: + return grad_vars[0] + else: + return grad_vars diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py index d7ec2fbe13..b1fd1c2b65 100644 --- a/python/paddle/v2/fluid/clip.py +++ b/python/paddle/v2/fluid/clip.py @@ -1,7 +1,46 @@ import functools import layers +from . 
import core -__all__ = ['GradientClipByValue', 'append_gradient_clip_ops'] +__all__ = [ + 'GradientClipByValue', 'append_gradient_clip_ops', 'error_clip_callback' +] + + +class BaseErrorClipAttr(object): + def append_clip_op(self, block, grad_name): + raise NotImplementedError() + + +class ErrorClipByValue(BaseErrorClipAttr): + def __init__(self, max, min=None): + max = float(max) + if min is None: + min = -max + else: + min = float(min) + self.max = max + self.min = min + + def append_clip_op(self, block, grad_name): + block.append_op( + type="clip", + inputs={"X": grad_name}, + outputs={"Out": grad_name}, + attrs={"min": self.min, + "max": self.max}) + + +def error_clip_callback(block, context): + # the context is a grad_to_var map + grad_to_var = context + op_desc = block.desc.op(block.desc.op_size() - 1) + for grad_n in filter(lambda n: grad_to_var.has_key(n), + op_desc.output_arg_names()): + fwd_var = block.var_recursive(grad_to_var[grad_n]) + error_clip = getattr(fwd_var, "error_clip", None) + if error_clip is not None: + error_clip.append_clip_op(block, grad_n) class BaseGradientClipAttr(object): diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index b66a8bce5f..3ef6b33192 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -17,10 +17,6 @@ TEMP_VAR_NAME = core.kTempVarName() GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() -USE_CPU = core.kUseCPU() -USE_CUDNN = core.kUseMKLDNN() -USE_MKLDNN = core.kUseMKLDNN() - def grad_var_name(var_name): """ @@ -147,9 +143,11 @@ class Variable(object): dtype=None, lod_level=None, persistable=None, + error_clip=None, stop_gradient=False, **kwargs): self.block = block + self.error_clip = error_clip if name is None: name = Variable._unique_var_name_() @@ -238,6 +236,9 @@ class Variable(object): __repr__ = __str__ + def set_desc(self, input): + self.desc = input + @property def persistable(self): return self.desc.persistable() @@ -452,7 +453,7 @@ class Operator(object): no_kernel_op_set = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', - 'recv' + 'recv', 'parallel_do' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) @@ -626,6 +627,17 @@ class Block(object): raise ValueError("var %s not in this block" % name) return v + def var_recursive(self, name): + if self.has_var(name): + return self.var(name) + else: + if self.idx == 0: + raise ValueError("var %s is not in block(%d) nor its parents." % + name, self.idx) + else: + parent_block = self.program.block(self.parent_idx) + return parent_block.var_recursive(name) + def all_parameters(self): return list(self.iter_parameters()) @@ -744,6 +756,7 @@ class Block(object): optimize_attr=p.optimize_attr, regularizer=p.regularizer, clip_attr=p.clip_attr, + error_clip=p.error_clip, name=v.name) self.vars[new_p.name] = new_p @@ -763,6 +776,9 @@ class Program(object): proto = framework_pb2.ProgramDesc.FromString(str(protostr)) return _debug_string_(proto, throw_on_error) + def get_desc(self): + return self.desc + def clone(self): p = Program() p.desc = core.ProgramDesc(self.desc) diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index 926327b70c..eef1e283c2 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py @@ -11,6 +11,16 @@ __all__ = [ def is_parameter(var): + """Check whether the variable is a Parameter. 
+ + This function checks whether the input variable is a Parameter. + + Args: + var : The input variable. + + Returns: + boolean result whether the variable is a Parameter. + """ return isinstance(var, Parameter) @@ -212,6 +222,11 @@ def save_inference_model(dirname, "fetch_var_names": fetch_var_names }, f, -1) + # Save only programDesc of inference_program in binary format + # in another file: __model__.dat + with open(model_file_name + ".dat", "wb") as fp: + fp.write(inference_program.desc.serialize_to_string()) + save_params(executor, dirname, main_program) diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 4469f7285e..325735e679 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -120,11 +120,12 @@ class LayerHelper(object): raise ValueError("no Parameter name %s found" % name) return param - def create_tmp_variable(self, dtype): + def create_tmp_variable(self, dtype, stop_gradient=False): return self.main_program.current_block().create_var( name=unique_name(".".join([self.name, 'tmp'])), dtype=dtype, - persistable=False) + persistable=False, + stop_gradient=stop_gradient) def create_variable(self, *args, **kwargs): return self.main_program.current_block().create_var(*args, **kwargs) diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/v2/fluid/layers/__init__.py index 249f570e13..50ac0aba01 100644 --- a/python/paddle/v2/fluid/layers/__init__.py +++ b/python/paddle/v2/fluid/layers/__init__.py @@ -8,6 +8,8 @@ import tensor from tensor import * import control_flow from control_flow import * +import device +from device import * __all__ = [] __all__ += nn.__all__ @@ -15,3 +17,4 @@ __all__ += io.__all__ __all__ += tensor.__all__ __all__ += control_flow.__all__ __all__ += ops.__all__ +__all__ += device.__all__ diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index acc22bef98..0cf17f3083 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -6,12 +6,13 @@ import contextlib from ..registry import autodoc __all__ = [ - 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard', - 'StaticRNNMemoryLink', 'WhileGuard', 'While', 'lod_rank_table', - 'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor', - 'increment', 'array_write', 'create_array', 'less_than', 'array_read', - 'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock', - 'StaticRNN', 'reorder_lod_tensor_by_rank' + 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', + 'BlockGuardWithCompletion', 'StaticRNNMemoryLink', 'WhileGuard', 'While', + 'lod_rank_table', 'max_sequence_len', 'topk', 'lod_tensor_to_array', + 'array_to_lod_tensor', 'increment', 'array_write', 'create_array', + 'less_than', 'array_read', 'shrink_memory', 'array_length', 'IfElse', + 'DynamicRNN', 'ConditionalBlock', 'StaticRNN', 'reorder_lod_tensor_by_rank', + 'ParallelDo', 'Print' ] @@ -109,6 +110,61 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0): return out +def Print(input, + first_n=-1, + message=None, + summarize=-1, + print_tensor_name=True, + print_tensor_type=True, + print_tensor_shape=True, + print_tensor_lod=True): + ''' + **Print operator** + + This creates a print op that will print when a tensor is accessed. 
+ + Wraps the tensor passed in so that whenever that a tensor is accessed, + the message `message` is printed, along with the current value of the + tensor `t`. + + Args: + input(Variable): A Tensor to print. + summarize(int): Print this number of elements in the tensor, will print all + if left negative. + message(str): A string message to print as a prefix. + first_n(int): Only log `first_n` number of times. + print_tensor_name(bool): Print the tensor name. + print_tensor_type(bool): Print the tensor type. + print_tensor_shape(bool): Print the tensor shape. + print_tensor_lod(bool): Print the tensor lod. + + Returns: + None + + Examples: + .. code-block:: python + + value = some_layer(...) + Print(value, summarize=10, + message="The content of some_layer: ") + ''' + helper = LayerHelper('print', **locals()) + out = helper.create_tmp_variable(dtype='int32') + helper.append_op( + type='print', + inputs={'input': input}, + attrs={ + 'first_n': first_n, + 'summarize': summarize, + 'message': message or "", + 'print_tensor_name': print_tensor_name, + 'print_tensor_type': print_tensor_type, + 'print_tensor_shape': print_tensor_shape, + 'print_tensor_lod': print_tensor_lod, + }) + return out + + class BlockGuard(object): """ BlockGuard class. @@ -132,29 +188,129 @@ class BlockGuard(object): return True -class StaticRNNGuard(BlockGuard): +class ParallelDo(object): + """ + ParallelDo class. + + ParallelDo class is used to create a ParallelDo. + """ + + def __init__(self, places, name=None): + self.helper = LayerHelper("parallel_do", name=name) + self.inputs = [] + self.places = places + self.outputs = [] + self.status = StaticRNN.BEFORE_RNN_BLOCK + + def do(self): + return BlockGuardWithCompletion(self) + + def parent_block(self): + prog = self.helper.main_program + parent_idx = prog.current_block().parent_idx + assert parent_idx >= 0 + parent_block = prog.block(parent_idx) + return parent_block + + def __call__(self, *args, **kwargs): + if self.status != StaticRNN.AFTER_RNN_BLOCK: + raise ValueError("RNN output can only be retrieved after rnn block") + if len(self.outputs) == 0: + raise ValueError("RNN has no output") + elif len(self.outputs) == 1: + return self.outputs[0] + else: + return self.outputs + + def read_input(self, var): + self.inputs.append(var) + return var + + def write_output(self, var): + self.outputs.append(var) + + def get_parameters(self): + main_program = self.helper.main_program + current_block = main_program.current_block() + parent_block = self.parent_block() + + local_inputs = set() + + for op in current_block.ops: + for oname in op.output_names: + for out_var_name in op.output(oname): + local_inputs.add(out_var_name) + + for var in self.inputs: + local_inputs.add(var.name) + + params = list() + for op in current_block.ops: + for iname in op.input_names: + for in_var_name in op.input(iname): + if in_var_name not in local_inputs: + params.append(in_var_name) + + return [parent_block.var(name) for name in params] + + def complete_op(self): + main_program = self.helper.main_program + current_block = main_program.current_block() + parent_block = self.parent_block() + + step_scope = parent_block.create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + + self.outputs = [ + parent_block.create_var( + name=o.name, + shape=o.shape, + dtype=o.dtype, + lod_level=o.lod_level, + persistable=o.persistable, + stop_gradient=o.stop_gradient) for o in self.outputs + ] + + inputs = [parent_block.var(i.name) for i in self.inputs] + outputs = [parent_block.var(o.name) for o in self.outputs] + 
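# --- Editor's note: an illustrative sketch, not part of the patch. ---
# complete_op() finishes below by appending a single parallel_do op to the
# parent block. From the layer API, the intended usage mirrors
# test_parallel_op.py later in this patch, roughly:
import paddle.v2.fluid as fluid

img = fluid.layers.data(name='img', shape=[784], dtype='float32')
places = fluid.layers.get_places()      # device list resolved at run time
pd = fluid.layers.ParallelDo(places)

with pd.do():                           # enters the parallel_do sub-block
    x = pd.read_input(img)              # registers img as a split input
    hidden = fluid.layers.fc(input=x, size=200)
    pd.write_output(fluid.layers.mean(x=hidden))

loss = pd()                             # gathered output of the sub-block
avg_loss = fluid.layers.mean(x=loss)
# --- End of editor's note. ---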
+ parent_block.append_op( + type='parallel_do', + inputs={ + 'inputs': inputs, + 'parameters': self.get_parameters(), + 'places': self.places + }, + outputs={'outputs': outputs, + 'parallel_scopes': [step_scope]}, + attrs={'sub_block': current_block}) + + +class BlockGuardWithCompletion(BlockGuard): """ - StaticRNNGuard class. + BlockGuardWithCompletion class. - StaticRNNGuard class is used to create a StaticRNN block in a program. + BlockGuardWithCompletion class is used to create an op with a block in a program. """ def __init__(self, rnn): - if not isinstance(rnn, StaticRNN): - raise TypeError("StaticRNNGuard takes a StaticRNN") - super(StaticRNNGuard, self).__init__(rnn.helper.main_program) + if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)): + raise TypeError( + "BlockGuardWithCompletion takes a StaticRNN or ParallelDo") + super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program) self.rnn = rnn def __enter__(self): self.rnn.status = StaticRNN.IN_RNN_BLOCK - return super(StaticRNNGuard, self).__enter__() + return super(BlockGuardWithCompletion, self).__enter__() def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None: return False self.rnn.status = StaticRNN.AFTER_RNN_BLOCK - self.rnn.complete_rnn_op() - return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb) + self.rnn.complete_op() + return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val, + exc_tb) class StaticRNNMemoryLink(object): @@ -200,7 +356,7 @@ class StaticRNN(object): self.seq_len = None def step(self): - return StaticRNNGuard(self) + return BlockGuardWithCompletion(self) def _assert_in_rnn_block_(self, method): if self.status != StaticRNN.IN_RNN_BLOCK: @@ -316,7 +472,7 @@ class StaticRNN(object): else: return self.outputs - def complete_rnn_op(self): + def complete_op(self): main_program = self.helper.main_program rnn_block = main_program.current_block() parent_block = self.parent_block() @@ -464,7 +620,7 @@ def lod_rank_table(x, level=0): """LoD Rank Table Operator. Given an input variable **x** and a level number of LoD, this layer creates a LodRankTable object. A LoDRankTable object contains a list of bi-element tuples. Each tuple consists of an index and - a length, both of which are int type. Reffering to specified level of LoD, + a length, both of which are int type. Refering to specified level of LoD, the index is the sequence index number and the length representes the sequence length. Please note that the list is ranked in descending order by the length. The following is an example: @@ -897,7 +1053,7 @@ class ConditionalBlock(object): out_list = [ parent_block.var(var_name) for var_name in parent_block.vars - if var_name not in intermediate + if var_name in intermediate ] step_scope = parent_block.create_var( diff --git a/python/paddle/v2/fluid/layers/device.py b/python/paddle/v2/fluid/layers/device.py new file mode 100644 index 0000000000..775d40e5b5 --- /dev/null +++ b/python/paddle/v2/fluid/layers/device.py @@ -0,0 +1,25 @@ +""" +All util layers. 
+""" + +from ..layer_helper import LayerHelper +from ..framework import unique_name +from ..registry import autodoc + +__all__ = ['get_places'] + + +@autodoc +def get_places(device_count=None, device_type=None): + helper = LayerHelper('get_places', **locals()) + out_places = helper.create_variable(name=unique_name(helper.name + ".out")) + attrs = dict() + if device_count is not None: + attrs['device_count'] = int(device_count) + if device_type is not None: + attrs['device_type'] = str(device_type) + + helper.append_op( + type='get_places', outputs={"Out": [out_places]}, attrs=attrs) + + return out_places diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 6883630ac6..1fb6523f55 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -14,7 +14,7 @@ __all__ = [ 'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', - 'sequence_first_step', 'sequence_last_step' + 'sequence_first_step', 'sequence_last_step', 'dropout' ] @@ -64,14 +64,14 @@ def fc(input, is flattened: the first `num_flatten_dims` dimensions will be flatten to form the first dimension of the final matrix (height of the - matrix), and the rest `rank(X) - num_col_dims` + matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to form the second dimension of the final matrix (width of the matrix). For example, suppose `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and - `x_num_col_dims` = 3. Then, the flattened matrix + `num_flatten_dims` = 3. Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. - By default, `x_num_col_dims` is set to 1. + By default, `num_flatten_dims` is set to 1. param_attr(ParamAttr|list): The parameter attribute for learnable parameters/weights of the fully connected layer. @@ -243,18 +243,21 @@ def gru_unit(input, r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r) - ch_t & = actNode(xc_t + W_c dot(r_t, h_{t-1}) + b_c) + m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m) - h_t & = dot((1-u_t), ch_{t-1}) + dot(u_t, h_t) + h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1}) The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms of the equation above, the :math:`z_t` is split into 3 parts - - :math:`xu_t`, :math:`xr_t` and :math:`xc_t`. This means that in order to + :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to implement a full GRU unit operator for an input, a fully connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`. - This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t - 1})` - and concatenation of :math:`u_t`, :math:`r_t` and :math:`ch_t`. + The terms :math:`u_t` and :math:`r_t` represent the update and reset gates + of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is + an intermediate candidate hidden output, which is denoted by :math:`m_t`. + This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})` + and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`. Args: input (Variable): The fc transformed input value of current step. 
@@ -383,6 +386,21 @@ def cos_sim(X, Y, **kwargs): return out +def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs): + helper = LayerHelper('dropout', **kwargs) + out = helper.create_tmp_variable(dtype=x.dtype) + mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True) + helper.append_op( + type='dropout', + inputs={'X': [x]}, + outputs={'Out': [out], + 'Mask': [mask]}, + attrs={'dropout_prob': dropout_prob, + 'is_test': is_test, + 'seed': seed}) + return out + + def cross_entropy(input, label, **kwargs): """ **Cross Entropy Layer** @@ -813,6 +831,11 @@ def sequence_pool(input, pool_type, **kwargs): "MaxIndex": max_index}, attrs={"pooltype": pool_type.upper()}) + # when pool_type is max, variable max_index is initialized, + # so we stop the gradient explicitly here + if pool_type == 'max': + max_index.stop_gradient = True + return pool_out @@ -960,14 +983,20 @@ def batch_norm(input, default_initializer=Constant(1.0)) bias = helper.create_parameter( - attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True) + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) mean = helper.create_global_variable( - dtype=input.dtype, shape=param_shape, persistable=True) + dtype=input.dtype, + shape=param_shape, + persistable=True, + stop_gradient=True) helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) variance = helper.create_global_variable( - dtype=input.dtype, shape=param_shape, persistable=True) + dtype=input.dtype, + shape=param_shape, + persistable=True, + stop_gradient=True) helper.set_variable_initializer(var=variance, initializer=Constant(1.0)) # create output @@ -975,8 +1004,8 @@ def batch_norm(input, mean_out = mean # variance and variance out share the same memory variance_out = variance - saved_mean = helper.create_tmp_variable(dtype) - saved_variance = helper.create_tmp_variable(dtype) + saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) batch_norm_out = helper.create_tmp_variable(dtype) diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py index 23fe13f9bb..d3a5b70785 100644 --- a/python/paddle/v2/fluid/layers/ops.py +++ b/python/paddle/v2/fluid/layers/ops.py @@ -7,7 +7,6 @@ __activations__ = [ __all__ = [ 'mean', 'mul', - 'dropout', 'reshape', 'scale', 'transpose', diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index 9ce25a9e08..57668a7983 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -1,7 +1,12 @@ from ..layer_helper import LayerHelper +from ..param_attr import ParamAttr +from ..framework import convert_np_dtype_to_dtype_ +from ..framework import Variable +from ..core import DataType +import numpy __all__ = [ - 'create_tensor', 'cast', 'concat', 'sums', 'assign', + 'create_tensor', 'create_parameter', 'cast', 'concat', 'sums', 'assign', 'fill_constant_batch_size_like', 'fill_constant', 'ones', 'zeros' ] @@ -11,6 +16,33 @@ def create_tensor(dtype, name=None): return helper.create_variable(name=helper.name, dtype=dtype) +def create_parameter(shape, + dtype, + attr=None, + is_bias=False, + default_initializer=None): + """ + Create a parameter + Args: + shape(list[int]): shape of the parameter + dtype(string): element type of the parameter + attr(ParamAttr): attributes of the parameter + is_bias(bool): This can affect which default initializer is chosen + when default_initializer is None. 
If is_bias, + initializer.Constant(0.0) will be used. Otherwise, + Xavier() will be used. + default_initializer(Initializer): initializer for the parameter + + Returns: + Parameter: the created parameter + """ + helper = LayerHelper("create_parameter") + if attr is None: + attr = ParamAttr() + return helper.create_parameter(attr, shape, dtype, is_bias, + default_initializer) + + def cast(x, dtype): """ This function takes in the input with input_dtype @@ -93,7 +125,7 @@ def assign(input, output): This function copies the *input* Variable to the *output* Variable. Args: - input(Variable): The source variable + input(Variable|numpy.ndarray): The source variable output(Variable): The destination variable Returns: @@ -106,11 +138,37 @@ def assign(input, output): fluid.layers.assign(hidden, out) """ helper = LayerHelper('assign', **locals()) - helper.append_op( - type='scale', - inputs={'X': [input]}, - outputs={'Out': [output]}, - attrs={'scale': 1.0}) + if isinstance(input, Variable): + helper.append_op( + type='scale', + inputs={'X': [input]}, + outputs={'Out': [output]}, + attrs={'scale': 1.0}) + elif isinstance(input, numpy.ndarray): + dtype = convert_np_dtype_to_dtype_(input.dtype) + if dtype == DataType.FP32: + value_name = "fp32_values" + values = [float(v) for v in input.flat] + elif dtype == DataType.INT32: + value_name = "int32_values" + values = [int(v) for v in input.flat] + else: + raise ValueError("Unsupported dtype %s", input.dtype) + if input.size > 1024 * 1024: + raise ValueError("The size of input is too big. Please consider " + "saving it to file and 'load_op' to load it") + + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) + else: + raise ValueError("Wrong type for assign input: %s" % type(input)) + return output @@ -180,7 +238,8 @@ def fill_constant_batch_size_like(input, Examples: .. code-block:: python - data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64') + data = fluid.layers.fill_constant_batch_size_like( + input=like, shape=[1], value=0, dtype='int64') """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py new file mode 100644 index 0000000000..6800d7ddbb --- /dev/null +++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py @@ -0,0 +1,156 @@ +from collections import defaultdict +import framework +from framework import Program, default_main_program, Parameter, Variable +import backward +from backward import _rename_arg_ +from . 
import core + +dtype_to_size = { + core.DataType.FP16: 2, + core.DataType.FP32: 4, + core.DataType.FP64: 8, + core.DataType.INT16: 2, + core.DataType.INT32: 4, + core.DataType.INT64: 8, + core.DataType.BOOL: 1 +} + + +class ControlFlowGraph(object): + def __init__(self, Program): + self._program = Program + self._succesors = defaultdict(set) + self._presucessors = defaultdict(set) + self._uses = defaultdict(set) + self._defs = defaultdict(set) + self._live_in = defaultdict(set) + self._live_out = defaultdict(set) + + def _add_connections(self, connections): + for node1, node2 in connections: + self._add(node1, node2) + + def _add(self, node1, node2): + self._succesors[node1].add(node2) + self._presucessors[node2].add(node1) + + def _build_graph(self): + program_desc = self._program.get_desc() + block_size = program_desc.num_blocks() + + # TODO(qijun) handle Program with if/while operators + self.global_block_desc = program_desc.block(0) + self.op_size = self.global_block_desc.op_size() + + op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)] + self._add_connections(op_node_connections) + + self.ops = [self.global_block_desc.op(i) for i in range(self.op_size)] + + for i in range(self.op_size): + self._uses[i].update(self.ops[i].input_arg_names()) + self._defs[i].update(self.ops[i].output_arg_names()) + + def _update_graph(self, old_name, new_name, begin_idx=0): + for i in range(begin_idx, self.op_size): + if old_name in self._uses[i]: + self._uses[i].remove(old_name) + self._uses[i].add(new_name) + if old_name in self._defs[i]: + self._defs[i].remove(old_name) + self._defs[i].add(new_name) + if old_name in self._live_in[i]: + self._live_in[i].remove(old_name) + self._live_out[i].add(new_name) + if old_name in self._live_out[i]: + self._live_out[i].remove(old_name) + self._live_out[i].add(new_name) + + def _reach_fixed_point(self, live_in, live_out): + if len(live_in) != len(self._live_in): + return False + if len(live_out) != len(self._live_out): + return False + for i in range(self.op_size): + if live_in[i] != self._live_in[i]: + return False + for i in range(self.op_size): + if live_out[i] != self._live_out[i]: + return False + return True + + def _dataflow_analyze(self): + self._build_graph() + live_in = defaultdict(set) + live_out = defaultdict(set) + while True: + for i in range(self.op_size): + live_in[i] = set(self._live_in[i]) + live_out[i] = set(self._live_out[i]) + self._live_in[i] = self._uses[i] | ( + self._live_out[i] - self._defs[i]) + for s in self._succesors[i]: + self._live_out[i] |= self._live_in[s] + + if self._reach_fixed_point(live_in, live_out): + break + + def _get_diff(self, a, b): + u = a & b + return a - u, b - u + + def memory_optimize(self): + self._build_graph() + self._dataflow_analyze() + self.pool = [] + for i in range(self.op_size): + if self.pool: + out_pair = [(x, self.global_block_desc.var(str(x)).shape()) + for x in self._defs[i]] + for x, x_shape in out_pair: + if not self.global_block_desc.var(str(x)).persistable(): + for index, cache_pair in enumerate(self.pool): + cache_var = cache_pair[0] + cache_shape = cache_pair[1] + if x_shape == cache_shape: + x_dtype = self.global_block_desc.var(str( + x)).dtype() + cache_dtype = self.global_block_desc.var( + str(cache_var)).dtype() + # TODO(qijun): actually, we should compare dtype_to_size[x_dtype] + # and dtype_to_size[cache_dtype] + if x_dtype == cache_dtype: + print( + "Hit Cache !!!! 
cache pool index is %d, var name is %s, cached var name is %s, var shape is %s " + % + (index, x, cache_var, str(cache_shape))) + self.pool.pop(index) + _rename_arg_( + self.ops, x, cache_var, begin_idx=i) + self._program.current_block().var(str( + x)).desc = self.global_block_desc.var( + str(cache_var)) + self._update_graph( + x, cache_var, begin_idx=i) + break + + in_diff, out_diff = self._get_diff(self._live_in[i], + self._live_out[i]) + can_optimize = filter( + lambda x: not self.global_block_desc.var(str(x)).persistable(), + in_diff) + if can_optimize: + for var_name in can_optimize: + self.pool.append( + (var_name, + self.global_block_desc.var(str(var_name)).shape())) + + def get_program(self): + return self._program + + +def memory_optimize(input_program): + graph = ControlFlowGraph(input_program) + graph.memory_optimize() + result_program = graph.get_program() + return result_program diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index ff3e5315a2..40721b5e97 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -6,7 +6,7 @@ from framework import unique_name, program_guard from initializer import Constant from layer_helper import LayerHelper from regularizer import append_regularization_ops -from clip import append_gradient_clip_ops +from clip import append_gradient_clip_ops, error_clip_callback __all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad'] @@ -197,7 +197,8 @@ class Optimizer(object): This method combines interface `append_backward()` and `create_optimization_pass()` into one. """ - params_grads = append_backward(loss, parameter_list, no_grad_set) + params_grads = append_backward(loss, parameter_list, no_grad_set, + error_clip_callback) params_grads = append_gradient_clip_ops(params_grads) diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 8acd470c5e..74ca56182c 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -4,6 +4,7 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 import paddle.v2.fluid as fluid +import time word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) @@ -160,7 +161,8 @@ def main(): paddle.reader.shuffle( paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) - place = fluid.CPUPlace() + #place = fluid.CPUPlace() + place = fluid.CUDAPlace(0) feeder = fluid.DataFeeder( feed_list=[ word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target @@ -174,6 +176,7 @@ def main(): embedding_param.set( load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place) + start_time = time.time() batch_id = 0 for pass_id in xrange(PASS_NUM): chunk_evaluator.reset(exe) @@ -191,6 +194,9 @@ def main(): f1_score) + " pass_precision:" + str( pass_precision) + " pass_recall:" + str(pass_recall) + " pass_f1_score:" + str(pass_f1_score)) + if batch_id != 0: + print("second per batch: " + str((time.time() - start_time) + / batch_id)) # exit early for CI exit(0) diff --git a/python/paddle/v2/fluid/tests/book_distribute/test_dist_fit_a_line.py b/python/paddle/v2/fluid/tests/book_distribute/test_dist_fit_a_line.py new file mode 100644 index 0000000000..bb339c440b --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/test_dist_fit_a_line.py @@ -0,0 +1,62 @@ +import numpy as np 
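# --- Editor's note: an illustrative sketch, not part of the patch. ---
# The distributed book tests added below take their configuration from
# environment variables rather than flags. The endpoint values here are only
# an example of a single-pserver local run; real launches export them per
# process.
import os

os.environ.setdefault("PSERVERS", "127.0.0.1:6174")         # all pserver endpoints
os.environ.setdefault("SERVER_ENDPOINT", "127.0.0.1:6174")  # this node's endpoint
os.environ.setdefault("TRAINING_ROLE", "TRAINER")           # or "PSERVER"
# --- End of editor's note. ---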
+import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import os + +x = fluid.layers.data(name='x', shape=[13], dtype='float32') + +y_predict = fluid.layers.fc(input=x, size=1, act=None) + +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 20 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +t = fluid.DistributeTranspiler() +# all parameter server endpoints list for spliting parameters +pserver_endpoints = os.getenv("PSERVERS") +# server endpoint for current node +current_endpoint = os.getenv("SERVER_ENDPOINT") +# run as trainer or parameter server +training_role = os.getenv("TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + +if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) +else: + trainer_prog = t.get_trainer_program() + + exe.run(fluid.default_startup_program()) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + fluid.io.save_persistables(exe, "./fit_a_line.model/") + fluid.io.load_persistables(exe, "./fit_a_line.model/") + for data in train_reader(): + avg_loss_value, = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + + if avg_loss_value[0] < 10.0: + exit(0) +exit(1) diff --git a/python/paddle/v2/fluid/tests/book_distribute/test_dist_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book_distribute/test_dist_label_semantic_roles.py new file mode 100644 index 0000000000..5fa5e0e5f3 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/test_dist_label_semantic_roles.py @@ -0,0 +1,225 @@ +import math + +import numpy as np +import paddle.v2 as paddle +import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.fluid as fluid +import time +import os + +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_len = len(verb_dict) + +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 512 +depth = 8 +mix_hidden_lr = 1e-3 + +IS_SPARSE = True +PASS_NUM = 10 +BATCH_SIZE = 20 + +embedding_name = 'emb' + + +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. 
+ return np.fromfile(f, dtype=np.float32).reshape(h, w) + + +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + **ignored): + # 8 features + predicate_embedding = fluid.layers.embedding( + input=predicate, + size=[pred_len, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='vemb') + + mark_embedding = fluid.layers.embedding( + input=mark, + size=[mark_dict_len, mark_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + fluid.layers.embedding( + size=[word_dict_len, word_dim], + input=x, + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0_layers = [ + fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers + ] + + hidden_0 = fluid.layers.sums(input=hidden_0_layers) + + lstm_0 = fluid.layers.dynamic_lstm( + input=hidden_0, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid') + + # stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=hidden_dim), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim) + ]) + + lstm = fluid.layers.dynamic_lstm( + input=mix_hidden, + size=hidden_dim, + candidate_activation='relu', + gate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=((i % 2) == 1)) + + input_tmp = [mix_hidden, lstm] + + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=label_dict_len), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len) + ]) + + return feature_out + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + # define network topology + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) + avg_cost = fluid.layers.mean(x=crf_cost) + + # TODO(qiao) + # check other optimizers and check why out will be NAN + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) + + # TODO(qiao) + # add dependency track and move this config before optimizer + crf_decode 
= fluid.layers.crf_decoding( + input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + chunk_evaluator = fluid.evaluator.ChunkEvaluator( + input=crf_decode, + label=target, + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=BATCH_SIZE) + place = fluid.CPUPlace() + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target + ], + place=place) + exe = fluid.Executor(place) + + t = fluid.DistributeTranspiler() + pserver_endpoints = os.getenv("PSERVERS") + # server endpoint for current node + current_endpoint = os.getenv("SERVER_ENDPOINT") + # run as trainer or parameter server + training_role = os.getenv( + "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver + t.transpile( + optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) + elif training_role == "TRAINER": + trainer_prog = t.get_trainer_program() + start_time = time.time() + batch_id = 0 + exe.run(fluid.default_startup_program()) + embedding_param = fluid.global_scope().find_var( + embedding_name).get_tensor() + embedding_param.set( + load_parameter(conll05.get_embedding(), word_dict_len, word_dim), + place) + for pass_id in xrange(PASS_NUM): + chunk_evaluator.reset(exe) + for data in train_data(): + cost, precision, recall, f1_score = exe.run( + trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost] + chunk_evaluator.metrics) + pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval( + exe) + + if batch_id % 10 == 0: + print("avg_cost:" + str(cost) + " precision:" + str( + precision) + " recall:" + str(recall) + " f1_score:" + + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str( + pass_f1_score)) + if batch_id != 0: + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) + + batch_id = batch_id + 1 + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/test_dist_word2vec.py b/python/paddle/v2/fluid/tests/book_distribute/test_dist_word2vec.py new file mode 100644 index 0000000000..b41853784d --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/test_dist_word2vec.py @@ -0,0 +1,96 @@ +from __future__ import print_function +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import os + +PASS_NUM = 100 +EMBED_SIZE = 32 +HIDDEN_SIZE = 256 +N = 5 +BATCH_SIZE = 32 +IS_SPARSE = True +TRAINERS = 2 + +word_dict = paddle.dataset.imikolov.build_dict() +dict_size = len(word_dict) + +first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') +second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') +third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') +forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') +next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + +embed_first = fluid.layers.embedding( + input=first_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') +embed_second = fluid.layers.embedding( + input=second_word, + 
size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') +embed_third = fluid.layers.embedding( + input=third_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') +embed_forth = fluid.layers.embedding( + input=forth_word, + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + +concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) +hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') +predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') +cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) +avg_cost = fluid.layers.mean(x=cost) +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) +train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) + +t = fluid.DistributeTranspiler() +# all parameter server endpoints list for spliting parameters +pserver_endpoints = os.getenv("PSERVERS") +# server endpoint for current node +current_endpoint = os.getenv("SERVER_ENDPOINT") +# run as trainer or parameter server +training_role = os.getenv("TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver +t.transpile( + optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS) +if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) +elif training_role == "TRAINER": + feeder = fluid.DataFeeder( + feed_list=[first_word, second_word, third_word, forth_word, next_word], + place=place) + exe.run(fluid.default_startup_program()) + for pass_id in range(PASS_NUM): + for data in train_reader(): + avg_cost_np = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + print("avg_cost_np", avg_cost_np) + if avg_cost_np[0] < 5.0: + exit( + 0) # if avg cost less than 10.0, we think our code is good. 
+else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") +exit(1) diff --git a/python/paddle/v2/fluid/tests/book_distribute/test_understand_sentiment_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/test_understand_sentiment_conv_dist.py new file mode 100644 index 0000000000..db419e23ab --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/test_understand_sentiment_conv_dist.py @@ -0,0 +1,110 @@ +from __future__ import print_function +import os +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid + + +def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, + hid_dim=32): + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + conv_4 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=hid_dim, + filter_size=4, + act="tanh", + pool_type="sqrt") + prediction = fluid.layers.fc(input=[conv_3, conv_4], + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + optimize_ops, params_grads = adam_optimizer.minimize(avg_cost) + accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) + return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + BATCH_SIZE = 100 + PASS_NUM = 5 + + word_dict = paddle.dataset.imdb.word_dict() + dict_dim = len(word_dict) + class_dim = 2 + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost, accuracy, acc_out, optimize_ops, params_grads = convolution_net( + data, label, input_dim=dict_dim, class_dim=class_dim) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=1000), + batch_size=BATCH_SIZE) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + t = fluid.DistributeTranspiler() + + # all parameter server endpoints list for spliting parameters + pserver_endpoints = os.getenv("PSERVERS") + # server endpoint for current node + current_endpoint = os.getenv("SERVER_ENDPOINT") + # run as trainer or parameter server + training_role = os.getenv( + "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver + t.transpile( + optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + + exe.run(fluid.default_startup_program()) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops) + exe.run(pserver_prog) + elif training_role == "TRAINER": + trainer_prog = t.get_trainer_program() + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + + for pass_id in xrange(PASS_NUM): + accuracy.reset(exe) + for data in train_data(): + cost_val, acc_val = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) + pass_acc = accuracy.eval(exe) + print("cost=" + 
str(cost_val) + " acc=" + str(acc_val) + + " pass_acc=" + str(pass_acc)) + if cost_val < 1.0 and pass_acc > 0.8: + exit(0) + else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/test_assign_value_op.py b/python/paddle/v2/fluid/tests/test_assign_value_op.py new file mode 100644 index 0000000000..51b99d0918 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_assign_value_op.py @@ -0,0 +1,40 @@ +import paddle.v2.fluid as fluid +import paddle.v2.fluid.layers as layers +import op_test +import numpy +import unittest +import paddle.v2.fluid.framework as framework + + +class TestAssignValueOp(op_test.OpTest): + def setUp(self): + self.op_type = "assign_value" + x = numpy.random.random(size=(2, 5)).astype(numpy.float32) + self.inputs = {} + self.outputs = {'Out': x} + self.attrs = { + 'shape': x.shape, + 'dtype': framework.convert_np_dtype_to_dtype_(x.dtype), + 'fp32_values': [float(v) for v in x.flat] + } + + def test_forward(self): + self.check_output() + + def test_assign(self): + val = ( + -100 + 200 * numpy.random.random(size=(2, 5))).astype(numpy.int32) + x = layers.create_tensor(dtype="float32") + layers.assign(input=val, output=x) + exe = fluid.Executor(fluid.CPUPlace()) + fetched_x = exe.run(fluid.default_main_program(), + feed={}, + fetch_list=[x])[0] + self.assertTrue( + numpy.array_equal(fetched_x, val), + "fetch_x=%s val=%s" % (fetched_x, val)) + self.assertEqual(fetched_x.dtype, val.dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index abbd48d2b8..ac9418549f 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -341,9 +341,6 @@ class TestBatchNormOp(OpTest): if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): places.append(core.CUDAPlace(0)) - core.init_devices(["CPU", "GPU:0"]) - else: - core.init_devices(["CPU"]) for place in places: for data_format in ["NCHW", "NHWC"]: test_with_place(place, data_format, [2, 3, 4, 5]) diff --git a/python/paddle/v2/fluid/tests/test_beam_search_op.py b/python/paddle/v2/fluid/tests/test_beam_search_op.py index 595f132fa8..319a7e49e3 100644 --- a/python/paddle/v2/fluid/tests/test_beam_search_op.py +++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py @@ -37,13 +37,13 @@ class BeamSearchOpTester(unittest.TestCase): print 'lod', selected_ids.lod() def _create_pre_ids(self): - np_data = np.array([[1, 2, 3, 4]], dtype='int32') + np_data = np.array([[1, 2, 3, 4]], dtype='int64') tensor = create_tensor(self.scope, "pre_ids", np_data) def _create_ids(self): self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]] np_data = np.array( - [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int32') + [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64') tensor = create_tensor(self.scope, "ids", np_data) tensor.set_lod(self.lod) diff --git a/python/paddle/v2/fluid/tests/test_calc_gradient.py b/python/paddle/v2/fluid/tests/test_calc_gradient.py new file mode 100644 index 0000000000..c34c8ff6d1 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_calc_gradient.py @@ -0,0 +1,25 @@ +import unittest + +import paddle.v2.fluid as fluid +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.optimizer as optimizer +from paddle.v2.fluid.backward import calc_gradient + + +class TestCalcGradient(unittest.TestCase): 
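# --- Editor's note: an illustrative sketch, not part of the patch. ---
# In the test below, mean_out = mean(mul_out) with mul_out = x.dot(y),
# x of shape [5, 10] and y of shape [10, 8]. Assuming the usual chain rule,
# the expected gradients are:
#
#   d(mean_out)/d(mul_out) = ones([5, 8]) / 40                # 40 = 5 * 8 elements
#   d(mean_out)/d(x)       = (ones([5, 8]) / 40).dot(y.T)     # shape [5, 10]
#
# so calc_gradient should return dense gradient variables of those shapes even
# though the user only built forward ops.
# --- End of editor's note. ---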
+ def test_calc_gradient(self): + x = layers.create_parameter(dtype="float32", shape=[5, 10]) + y = layers.create_parameter(dtype="float32", shape=[10, 8]) + mul_out = layers.mul(x=x, y=y) + mean_out = layers.mean(x=mul_out) + a = calc_gradient(mean_out, mul_out) + b = calc_gradient(mean_out, x) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index e82e3ab0c9..958300e655 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -1,5 +1,7 @@ import unittest import numpy as np + +import paddle.v2.fluid.core as core from op_test import OpTest @@ -47,6 +49,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): class TestConv2dOp(OpTest): def setUp(self): + core.use_cuda() self.init_op_type() self.init_group() self.init_dilation() @@ -167,26 +170,31 @@ class TestWithDilation(TestConv2dOp): #----------------Conv2dCudnn---------------- class TestCudnn(TestConv2dOp): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWithPad(TestWithPad): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWithStride(TestWithStride): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWithGroup(TestWithGroup): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" class TestCudnnWith1x1(TestWith1x1): def init_op_type(self): + core.use_cudnn() self.op_type = "conv2d_cudnn" diff --git a/python/paddle/v2/fluid/tests/test_edit_distance_op.py b/python/paddle/v2/fluid/tests/test_edit_distance_op.py new file mode 100644 index 0000000000..38e87728b3 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_edit_distance_op.py @@ -0,0 +1,94 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def Levenshtein(hyp, ref): + """ Compute the Levenshtein distance between two strings. 
+ + :param hyp: hypothesis string in index + :type hyp: list + :param ref: reference string in index + :type ref: list + """ + m = len(hyp) + n = len(ref) + if m == 0: + return n + if n == 0: + return m + + dist = np.zeros((m + 1, n + 1)).astype("float32") + for i in range(0, m + 1): + dist[i][0] = i + for j in range(0, n + 1): + dist[0][j] = j + + for i in range(1, m + 1): + for j in range(1, n + 1): + cost = 0 if hyp[i - 1] == ref[j - 1] else 1 + deletion = dist[i - 1][j] + 1 + insertion = dist[i][j - 1] + 1 + substitution = dist[i - 1][j - 1] + cost + dist[i][j] = min(deletion, insertion, substitution) + return dist[m][n] + + +class TestEditDistanceOp(OpTest): + def setUp(self): + self.op_type = "edit_distance" + normalized = False + x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int32") + x2 = np.array([[0, 12, 4, 7, 8]]).astype("int32") + x1 = np.transpose(x1) + x2 = np.transpose(x2) + x1_lod = [0, 1, 5] + x2_lod = [0, 3, 4] + + num_strs = len(x1_lod) - 1 + distance = np.zeros((num_strs, 1)).astype("float32") + for i in range(0, num_strs): + distance[i] = Levenshtein( + hyp=x1[x1_lod[i]:x1_lod[i + 1]], + ref=x2[x2_lod[i]:x2_lod[i + 1]]) + if normalized is True: + len_ref = x2_lod[i + 1] - x2_lod[i] + distance[i] = distance[i] / len_ref + self.attrs = {'normalized': normalized} + self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} + self.outputs = {'Out': distance} + + def test_check_output(self): + self.check_output() + + +class TestEditDistanceOpNormalized(OpTest): + def setUp(self): + self.op_type = "edit_distance" + normalized = True + x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int32") + x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int32") + x1 = np.transpose(x1) + x2 = np.transpose(x2) + x1_lod = [0, 1, 3, 6] + x2_lod = [0, 2, 3, 5] + + num_strs = len(x1_lod) - 1 + distance = np.zeros((num_strs, 1)).astype("float32") + for i in range(0, num_strs): + distance[i] = Levenshtein( + hyp=x1[x1_lod[i]:x1_lod[i + 1]], + ref=x2[x2_lod[i]:x2_lod[i + 1]]) + if normalized is True: + len_ref = x2_lod[i + 1] - x2_lod[i] + distance[i] = distance[i] / len_ref + self.attrs = {'normalized': normalized} + self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} + self.outputs = {'Out': distance} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_get_places_op.py b/python/paddle/v2/fluid/tests/test_get_places_op.py new file mode 100644 index 0000000000..c4346f6786 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_get_places_op.py @@ -0,0 +1,17 @@ +import paddle.v2.fluid as fluid +import decorators +import unittest + + +class TestGetPlaces(unittest.TestCase): + @decorators.prog_scope() + def test_get_places(self): + places = fluid.layers.get_places() + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(fluid.default_main_program()) + self.assertEqual(places.type, fluid.core.VarDesc.VarType.PLACE_LIST) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 77f0f11f1b..a56277d216 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -196,6 +196,13 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.sequence_softmax(x=seq)) print(str(program)) + def test_get_places(self): + program = Program() + with program_guard(program): + x = layers.get_places(device_count=4) + self.assertIsNotNone(x) + 
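# --- Editor's note: an illustrative sketch, not part of the patch. ---
# get_places(device_count=4) above only inserts a get_places op; the resulting
# variable has type PLACE_LIST (asserted in test_get_places_op.py earlier in
# this patch), and the concrete places are filled in when the program is
# executed, narrowed by the optional device_count / device_type attributes
# defined in layers/device.py.
# --- End of editor's note. ---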
print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py b/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py new file mode 100644 index 0000000000..5cce75ddb8 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py @@ -0,0 +1,33 @@ +from __future__ import print_function +import unittest + +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.optimizer as optimizer +from paddle.v2.fluid.framework import Program, program_guard +from paddle.v2.fluid.memory_optimization_transpiler import memory_optimize + + +class TestControlFlowGraph(unittest.TestCase): + def setUp(self): + program = Program() + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + y_predict = layers.fc(input=x, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(x=cost) + opt = optimizer.SGD(learning_rate=0.001) + opt = opt.minimize(avg_cost) + + self.program = program + + def test_control_flow_graph(self): + print("before optimization") + print(str(self.program)) + result_program = memory_optimize(self.program) + print("after optimization") + print(str(result_program)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py new file mode 100644 index 0000000000..2b51a1f504 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_parallel_op.py @@ -0,0 +1,157 @@ +import unittest +import paddle.v2.fluid as fluid +import numpy + + +class BaseParallelForTest(unittest.TestCase): + def run_test(self, callback, feed, fetch): + """ + Run the unittest for parallel.for + Args: + callback(callable): A callable function returns a generator. There + are two yields in the generator function. The first yield + returns the data layers, and the second yield returns the loss. + The modified data variables will be sent back during the first + yield. + + feed(dict): The executor feeding dictionary. + fetch(list|basestr): The fetch name lists. + + Returns: + None + + Raises: + AssertionError when the computation of cpu, parallel.for in cpu, + gpu, parallel.for in gpu are different. + + """ + cpu = fluid.CPUPlace() + result_cpu = self._run_test_impl_( + callback=callback, + feed=feed, + fetch=fetch, + place=cpu, + use_parallel=False) + result_cpu_parallel = self._run_test_impl_( + callback=callback, + feed=feed, + fetch=fetch, + place=cpu, + use_parallel=True) + if fluid.core.is_compile_gpu(): + gpu = fluid.CUDAPlace(0) + result_gpu = self._run_test_impl_( + callback=callback, + feed=feed, + fetch=fetch, + place=gpu, + use_parallel=False) + result_gpu_parallel = self._run_test_impl_( + callback=callback, + feed=feed, + fetch=fetch, + place=gpu, + use_parallel=True) + self._assert_same_(fetch, result_cpu, result_cpu_parallel, + result_gpu, result_gpu_parallel) + else: + self._assert_same_(fetch, result_cpu, result_cpu_parallel) + + def _run_test_impl_(self, callback, feed, fetch, place, use_parallel=False): + """ + Run a single test, returns the fetch values + Args: + place(Place): the computation place. + use_parallel(bool): Whether use parallel.for or not. + + Returns: + Fetched numpy arrays. 
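+            One numpy array is returned for each name in `fetch`.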
+ + """ + if isinstance(fetch, basestring): + fetch = [fetch] + main = fluid.Program() + startup = fluid.Program() + # Fix seed + main.random_seed = 10 + startup.random_seed = 10 + + with fluid.program_guard(main, startup): + generator = callback() + # Automatically insert parallel do if use_parallel = True + if use_parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + data = next(generator) + + if isinstance(data, fluid.Variable): + data = [data] + + with pd.do(): + ins = map(pd.read_input, data) + if len(ins) == 1: + ins = ins[0] + loss = generator.send(ins) # patch input + pd.write_output(loss) + + loss = pd() + else: + data = next(generator) + loss = generator.send(data) + self.assertIsNotNone(loss) + avg_loss = fluid.layers.mean(x=loss) + fluid.backward.append_backward(loss=avg_loss) + + exe = fluid.Executor(place) + exe.run(startup) + return exe.run(main, feed=feed, fetch_list=fetch) + + def _assert_same_(self, fetch, *args): + """ + Assert the return values of `run_test` are same. + Args: + fetch: Fetch list. Used for print error message + *args: The fetch result lists of each situations. + + Returns: + None + + Raises: + AssertionError + + """ + + def _impl_(a, b, fetch_id, item_id): + item_str = ['CPU', 'ParallelCPU', 'GPU', 'ParallelGPU'] + flag = numpy.allclose(a, b, rtol=0.1) + self.assertTrue(flag, "The {0} are different in {1}".format( + fetch[fetch_id], item_str[item_id])) + + for i, items in enumerate(zip(*args)): + self.assertGreater(len(items), 0) + for j in range(1, len(items)): + _impl_(items[0], items[j], fetch_id=i, item_id=j) + + +class ParallelOpTest(BaseParallelForTest): + def test_simple_fc(self): + def __network__(): + x = fluid.layers.data(shape=[784], dtype='float32', name='img') + # FIXME: This is a bug of parallel.do + x.stop_gradient = False + x = yield x + hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') + loss = fluid.layers.mean(x=hidden) + yield loss + + self.run_test( + callback=__network__, + feed={ + 'img': + numpy.random.random(size=(128 * 3, 784)).astype('float32') + }, + fetch='fc1.w@GRAD') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_print_op.py b/python/paddle/v2/fluid/tests/test_print_op.py new file mode 100644 index 0000000000..86a701a020 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_print_op.py @@ -0,0 +1,21 @@ +import unittest +import numpy as np +from paddle.v2.fluid.executor import Executor +import paddle.v2.fluid.core as core +import paddle.v2.fluid.layers as pd + + +class TestSumOp(unittest.TestCase): + def test_tensor(self): + i = pd.zeros(shape=[2, 10], dtype='float32') + + pd.Print(i, message="I am a message", summarize=10) + + cpu = core.CPUPlace() + exe = Executor(cpu) + + exe.run() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py index 7c136f6360..8b79d448e2 100644 --- a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py +++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py @@ -1,46 +1,186 @@ import unittest import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core import numpy class TestReorderLoDTensor(unittest.TestCase): - def test_reorder(self): - dat = fluid.layers.data(name='input', shape=[1], lod_level=2) + num_seq = 5 + # [name, dim, lod_level] pair indicating data info of source and target + data_desc = (['input', 9, 0], ['ref', 5, 1]) + + @classmethod + def 
setUpClass(cls): + cls.set_program() + + @classmethod + def set_program(cls): + dat = fluid.layers.data( + name=cls.data_desc[0][0], shape=[cls.data_desc[0][1]]) dat.stop_gradient = False - rank_dat = fluid.layers.data(name='ref', shape=[1], lod_level=1) + rank_dat = fluid.layers.data( + name=cls.data_desc[1][0], shape=[cls.data_desc[1][1]]) table = fluid.layers.lod_rank_table(rank_dat) new_dat = fluid.layers.reorder_lod_tensor_by_rank( x=dat, rank_table=table) - loss = fluid.layers.mean(x=new_dat) + loss = fluid.layers.reduce_sum(new_dat) fluid.backward.append_backward(loss=loss) + cls.fetch_list = [new_dat, cls.data_desc[0][0] + '@GRAD'] + + def run_program(self): + outputs = [] + input_grads = [] + places = [core.CPUPlace()] + if core.is_compile_gpu(): + places.append(core.CUDAPlace(0)) + for place in places: + self.set_inputs(place) + exe = fluid.Executor(place) + output, input_grad = exe.run(fluid.default_main_program(), + feed=self.inputs, + fetch_list=self.fetch_list, + return_numpy=False) + outputs.append(output) + input_grads.append(input_grad) + self.actual_outputs = outputs + self.actual_grads = input_grads + + def set_data(self): + self.data = {} + for desc in self.data_desc: + data_name = desc[0] + data_dim = desc[1] + data_lod_level = desc[2] + data_lod = [] + for i in range(data_lod_level): + lod_level_i = numpy.random.randint( + low=1, + high=5, + size=self.num_seq if i == 0 else lod_level_i[-1]) + lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist() + data_lod.append(lod_level_i) + data_value = numpy.random.random(size=[ + data_lod[-1][-1] if data_lod else self.num_seq, data_dim + ]).astype('float32') + self.data[data_name] = (data_value, data_lod) + + def set_inputs(self, place): + self.inputs = {} + for desc in self.data_desc: + tensor = fluid.Tensor() + tensor.set(self.data[desc[0]][0], place) + if self.data[desc[0]][1]: + tensor.set_lod(self.data[desc[0]][1]) + self.inputs[desc[0]] = tensor + + def reorder(self): + level = 0 + + # compute the rank_table according to ref_lod + ref_lod = self.data[self.data_desc[1][0]][1][level] + rank_table = [] # list of (index, length) + for i in range(len(ref_lod) - 1): + rank_table.append((i, ref_lod[i + 1] - ref_lod[i])) + rank_table = sorted(rank_table, lambda x, y: y[1] - x[1]) + + # compute the input sequence info according to input_lod + input_value, input_lod = self.data[self.data_desc[0][0]] + + input_table = [] # list of (offset, length, sub_lod) + if input_lod: + for i in range(len(input_lod[level]) - 1): + start_idx = i + end_idx = i + 1 + sub_lod = [] + for lod_level_i in input_lod[level:]: + sub_lod_i = [] + for idx in range(start_idx, end_idx): + sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[ + idx]) + sub_lod.append(sub_lod_i) + start_idx = lod_level_i[start_idx] + end_idx = lod_level_i[end_idx] + input_table.append((start_idx, end_idx - start_idx, sub_lod)) + else: + input_table = [(i, 1, []) for i in range(len(rank_table))] + + # reorder by rank_table + output_value = numpy.zeros_like(input_value) + output_lod = [] + offset = 0 + for index, length in rank_table: + input_seq_start = input_table[index][0] + input_seq_len = input_table[index][1] + input_seq_end = input_seq_start + input_seq_len + output_value[offset:offset + input_seq_len] = input_value[ + input_seq_start:input_seq_end] + offset += input_seq_len + + input_seq_sub_lod = input_table[index][2] + if len(output_lod) == 0: + output_lod = [[0] for i in input_seq_sub_lod] + for i, sub_lod_i in enumerate(input_seq_sub_lod): + for idx_sub in sub_lod_i: + 
output_lod[i].append(output_lod[i][-1] + idx_sub) + return output_value, output_lod + + def test_reorder_lod_tensor(self): + self.data_desc[0][-1] = 2 # input is lod_tensor + self.set_data() + self.run_program() + # check output + expect_output, expect_output_lod = self.reorder() + for actual_output in self.actual_outputs: + self.assertTrue( + numpy.allclose( + numpy.array(actual_output), expect_output, atol=0.001)) + self.assertEqual(expect_output_lod, actual_output.lod()) + # check gradient + expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) + expect_grad_lod = self.data[self.data_desc[0][0]][1] + for actual_grad in self.actual_grads: + self.assertTrue( + numpy.allclose( + numpy.array(actual_grad), expect_grad, atol=0.001)) + self.assertEqual(expect_grad_lod, actual_grad.lod()) + + def test_reorder_tensor(self): + self.data_desc[0][-1] = 0 # input is tensor + self.set_data() + self.run_program() + # check output + expect_output, expect_output_lod = self.reorder() + for actual_output in self.actual_outputs: + self.assertTrue( + numpy.allclose( + numpy.array(actual_output), expect_output, atol=0.001)) + self.assertEqual(expect_output_lod, actual_output.lod()) + # check gradient + expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) + expect_grad_lod = self.data[self.data_desc[0][0]][1] + for actual_grad in self.actual_grads: + self.assertTrue( + numpy.allclose( + numpy.array(actual_grad), expect_grad, atol=0.001)) + self.assertEqual(expect_grad_lod, actual_grad.lod()) + global outputs_from_tensor_implicit_lod + outputs_from_tensor_implicit_lod = self.actual_outputs - cpu = fluid.CPUPlace() - exe = fluid.Executor(cpu) - exe.run(fluid.default_startup_program()) - - ref = fluid.Tensor() - ref_lod = [0, 3, 4, 7, 8, 14] - ref.set_lod([ref_lod]) - - ref.set(numpy.random.random(size=[14, 1]).astype('float32'), cpu) - input = fluid.Tensor() - lod_level_0 = numpy.random.randint(low=1, high=5, size=5) - lod_level_0 = [0] + numpy.cumsum(lod_level_0).tolist() - lod_level_1 = numpy.random.randint(low=1, high=5, size=lod_level_0[-1]) - lod_level_1 = [0] + numpy.cumsum(lod_level_1).tolist() - - input.set_lod([lod_level_0, lod_level_1]) - input.set( - numpy.random.random(size=[lod_level_1[-1], 1]).astype('float32'), - cpu) - - ig = exe.run(fluid.default_main_program(), - feed={'input': input, - 'ref': ref}, - fetch_list=['input@GRAD'], - return_numpy=False)[0] - self.assertAlmostEqual(numpy.array(ig).sum(), 1.0, delta=0.001) - self.assertEqual(input.lod(), ig.lod()) + # compare outputs between LodTensors with explicit and implicit lod + # use the same data but set the input lod explicitly + input_lod = [[ + i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1) + ]] + self.inputs[self.data_desc[0][0]].set_lod(input_lod) + # preserve the output of LodTensor with implicit lod to compare + expect_output = [ + numpy.array(actual_output) for actual_output in self.actual_outputs + ] + self.run_program() + for actual_output in self.actual_outputs: + self.assertTrue( + numpy.allclose( + numpy.array(actual_output), expect_output, atol=0.001)) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_sequence_erase_op.py b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py new file mode 100644 index 0000000000..bf257fefea --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py @@ -0,0 +1,35 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def sequence_erase(in_seq, lod0, tokens): + new_lod0 = [0] + out_seq = [] + for 
i in range(0, len(lod0) - 1): + num_out = 0 + for dat in in_seq[lod0[i]:lod0[i + 1]]: + if dat not in tokens: + out_seq.append(dat) + num_out += 1 + new_lod0.append(new_lod0[-1] + num_out) + return np.array(out_seq).astype("int32"), new_lod0 + + +class TestSequenceEraseOp(OpTest): + def setUp(self): + self.op_type = "sequence_erase" + in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") + lod = [[0, 9, 13, 24, 30]] + tokens = [2, 3, 5] + out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens) + self.attrs = {'tokens': tokens} + self.inputs = {'X': (in_seq, lod)} + self.outputs = {'Out': (out_seq, [new_lod0])} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py index b54a56aa6d..8bffdd5856 100644 --- a/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py +++ b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py @@ -1,13 +1,7 @@ import unittest import numpy as np from op_test import OpTest - - -def stable_softmax(x): - """Compute the softmax of vector x in a numerically stable way.""" - shiftx = x - np.max(x).clip(-64.) - exps = np.exp(shiftx) - return exps / np.sum(exps) +from test_softmax_op import stable_softmax class TestSequenceSoftmaxOp(OpTest): diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py index be1588fc2d..a14721b9aa 100644 --- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py +++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py @@ -3,43 +3,86 @@ import paddle.v2.fluid.core as core from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.layers as layers from paddle.v2.fluid.backward import append_backward -from paddle.v2.fluid.framework import default_main_program -import numpy +from paddle.v2.fluid.framework import default_main_program, switch_main_program +from paddle.v2.fluid.framework import Program +import numpy as np -main_program = default_main_program() - -class TestShrinkRNNMemory(unittest.TestCase): - def test_shrink_rnn_memory(self): +class TestShrinkRNNMemoryBase(unittest.TestCase): + def setUp(self): + self.main_program = Program() + switch_main_program(self.main_program) x = layers.data('x', shape=[100], dtype='float32') x.stop_gradient = False - table = layers.lod_rank_table(x=x) + rank_table_tensor = layers.data( + 'rank_table_tensor', shape=[1], dtype='float32', lod_level=1) + table = layers.lod_rank_table(x=rank_table_tensor) i = layers.zeros(dtype='int64', shape=[1]) - mem1 = layers.shrink_memory(x=x, i=i, table=table) + self.mem1 = layers.shrink_memory(x=x, i=i, table=table) i = layers.increment(x=i) i.stop_gradient = True - mem2 = layers.shrink_memory(x=mem1, i=i, table=table) + self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table) i = layers.increment(x=i) i.stop_gradient = True - mem3 = layers.shrink_memory(x=mem2, i=i, table=table) + self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table) + mem3_mean = layers.mean(x=self.mem3) + append_backward(loss=mem3_mean) + self.x_grad = self.main_program.global_block().var('x@GRAD') + + def sum_lodtensor(self, tensor): + sum_res = 0.0 + for i in xrange(np.product(tensor.get_dims())): + sum_res += tensor.get_float_element(i) + return sum_res + +class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase): + def test_refer_lod(self): cpu = core.CPUPlace() - tensor = core.LoDTensor() - tensor.set_lod([[0, 
2, 5, 6]]) - tensor_np = numpy.random.random(size=(3, 100)).astype('float32') - tensor.set(tensor_np, cpu) + x_tensor = core.LoDTensor() + x_tensor.set_lod([[0, 2, 5, 6]]) + tensor_np = np.random.random(size=(6, 100)).astype('float32') + x_tensor.set(tensor_np, cpu) + + rank_table_tensor = core.LoDTensor() + rank_table_tensor.set_lod([[0, 1, 3, 6]]) + rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'), + cpu) + exe = Executor(cpu) - outs = exe.run(feed={'x': tensor}, fetch_list=[mem1, mem2, mem3]) - self.assertTrue(numpy.allclose(tensor_np[0:3], outs[0])) - self.assertTrue(numpy.allclose(tensor_np[0:2], outs[1])) - self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2])) + outs = exe.run( + feed={'x': x_tensor, + 'rank_table_tensor': rank_table_tensor}, + fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad], + return_numpy=False) + self.assertTrue(np.allclose(tensor_np[0:6], outs[0])) + self.assertTrue(np.allclose(tensor_np[0:5], outs[1])) + self.assertTrue(np.allclose(tensor_np[0:2], outs[2])) + self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01) - mem3_mean = layers.mean(x=mem3) - append_backward(loss=mem3_mean) - x_grad = exe.run( - feed={'x': tensor}, - fetch_list=[main_program.global_block().var('x@GRAD')])[0] - self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1) + +class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase): + def test_no_lod(self): + cpu = core.CPUPlace() + x_tensor = core.LoDTensor() + tensor_np = np.random.random(size=(3, 100)).astype('float32') + x_tensor.set(tensor_np, cpu) + + rank_table_tensor = core.LoDTensor() + rank_table_tensor.set_lod([[0, 1, 3, 6]]) + rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'), + cpu) + + exe = Executor(cpu) + outs = exe.run( + feed={'x': x_tensor, + 'rank_table_tensor': rank_table_tensor}, + fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad], + return_numpy=False) + self.assertTrue(np.allclose(tensor_np[0:3], outs[0])) + self.assertTrue(np.allclose(tensor_np[0:2], outs[1])) + self.assertTrue(np.allclose(tensor_np[0:1], outs[2])) + self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_warpctc_op.py b/python/paddle/v2/fluid/tests/test_warpctc_op.py new file mode 100644 index 0000000000..59390d5303 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_warpctc_op.py @@ -0,0 +1,200 @@ +import sys +import unittest +import numpy as np +from op_test import OpTest +from test_softmax_op import stable_softmax + + +class CTCForward(object): + def __init__(self, softmax, softmax_lod, labels, labels_lod, blank, + norm_by_times): + self.softmax = softmax + self.softmax_lod = softmax_lod + assert labels.shape[1] == 1 + self.labels = labels + self.labels_lod = labels_lod + self.blank = blank + self.norm_by_times = norm_by_times + + self.level = 0 + self.num_classes = softmax.shape[1] + self.batch_size = len(softmax_lod[self.level]) - 1 + assert self.batch_size == len(labels_lod[self.level]) - 1 + + self.loss = np.zeros([self.batch_size, 1], dtype="float32") + self.gradient = np.zeros(self.softmax.shape, dtype="float32") + + # float64 + self.EXP_MAX = sys.float_info.max + self.EXP_MIN = sys.float_info.min + self.LOG_ZERO = np.log(self.EXP_MIN) + self.LOG_INFINITY = np.log(self.EXP_MAX) + + def safe_exp(self, x): + if x <= self.LOG_ZERO: + return 0.0 + if x >= self.LOG_INFINITY: + return self.EXP_MAX + return np.exp(x) + + def safe_log(self, x): + if x <= self.EXP_MIN: + return self.LOG_ZERO + 
return np.log(x) + + # x = lna and y = lnb are in log scale, ln(a / b) = lna - lnb + def log_div(self, x, y): + res = x - y + if res <= self.LOG_ZERO: + return self.LOG_ZERO + if res >= self.LOG_INFINITY: + return self.LOG_INFINITY + return res + + # x = lna and y = lnb are in log scale, ln(a * b) = lna + lnb + def log_mul(self, x, y): + res = x + y + if res <= self.LOG_ZERO: + return self.LOG_ZERO + if res >= self.LOG_INFINITY: + return self.LOG_INFINITY + return res + + # x = lna and y = lnb are in log scale, + # ln(a + b) = lna + ln(1 + exp(lnb - lna)), where b > a + def log_add(self, x, y): + if x < y: + t = y + y = x + x = t + return x + self.safe_log(1 + self.safe_exp(y - x)) + + def segment_range(self, time, total_times, total_segments): + start = max(0, total_segments - (2 * (total_times - time))) + end = min(total_segments, 2 * (time + 1)) + return start, end + + def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence): + total_times = softmax_a_sequence.shape[0] + total_segments = labels_a_sequence.shape[0] * 2 + 1 + + required_times = labels_a_sequence.shape[0] + old_label = -1 + for i in range(labels_a_sequence.shape[0]): + # two contingous labels with the same value + if labels_a_sequence[i, 0] == old_label: + required_times = required_times + 1 + old_label = labels_a_sequence[i, 0] + + if total_times < required_times: + return 0 + + # calculate the forward and backward variables, + # reference Chapter 7.3 of "Alex Grave, Supervised Sequence + # Labelling with Recurrent Neural Networks" + log_acts = np.zeros([total_times, self.num_classes], dtype="float32") + for i in range(total_times): + for j in range(self.num_classes): + log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j]) + + # calculate the forward variables + forward_vars = np.zeros([total_times, total_segments], dtype="float32") + for i in range(total_times): + for j in range(total_segments): + forward_vars[i, j] = self.LOG_ZERO + + for i in range(total_times): + # dp initialization at t0 + if i == 0: + forward_vars[i, 0] = log_acts[0, self.blank] + if total_segments > 1: + forward_vars[i, 1] = log_acts[0, labels_a_sequence[i, 0]] + continue + + # dp from t1 + start, end = self.segment_range(i, total_times, total_segments) + for k in range(end - start): + j = k + start + if j & 1 == 1: + label_idx = j / 2 + label_val = labels_a_sequence[label_idx, 0] + fv = self.log_add(forward_vars[i - 1, j], + forward_vars[i - 1, j - 1]) + if j > 1 and label_val != labels_a_sequence[label_idx - 1, + 0]: + fv = self.log_add(fv, forward_vars[i - 1, j - 2]) + fv = self.log_mul(fv, log_acts[i, label_val]) + else: + fv = forward_vars[i - 1, j] + if j > 0: + fv = self.log_add(fv, forward_vars[i - 1, j - 1]) + fv = self.log_mul(fv, log_acts[i, self.blank]) + forward_vars[i, j] = fv + + # sum the last two value as log_prob + log_prob = forward_vars[total_times - 1, total_segments - 1] + if total_segments > 1: + log_prob = self.log_add( + log_prob, forward_vars[total_times - 1, total_segments - 2]) + + return -log_prob + + def forward(self): + for i in range(self.batch_size): + softmax_start_i = self.softmax_lod[self.level][i] + softmax_end_i = self.softmax_lod[self.level][i + 1] + labels_start_i = self.labels_lod[self.level][i] + labels_end_i = self.labels_lod[self.level][i + 1] + + softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :] + labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] + self.loss[i] = self.forward_a_sequence(softmax_a_sequence, + labels_a_sequence) + return self.loss + + +class 
TestWarpCTCOp(OpTest): + def setUp(self): + self.op_type = "warpctc" + + batch_size = 4 + num_classes = 8 + logits_lod = [[0, 4, 5, 8, 11]] + logits = np.random.uniform(0.1, 1.0, + [11, num_classes]).astype("float32") + softmax = np.apply_along_axis(stable_softmax, 1, logits) + labels_lod = [[0, 3, 4, 8, 12]] + # labels should not be blank + labels = np.random.randint(0, num_classes - 1, [12, 1], dtype="int32") + + blank = num_classes - 1 + norm_by_times = False + + ctc = CTCForward(softmax, logits_lod, labels, labels_lod, blank, + norm_by_times) + loss = ctc.forward() + + max_sequence_length = 0 + for i in range(batch_size): + max_sequence_length = max(max_sequence_length, + logits_lod[0][i + 1] - logits_lod[0][i]) + gradient = np.zeros( + [max_sequence_length, batch_size, num_classes], dtype="float32") + + self.inputs = { + "Logits": (logits, logits_lod), + "Label": (labels, labels_lod) + } + self.outputs = {"Loss": loss} + self.attrs = {"blank": blank, "norm_by_times": norm_by_times} + + def test_check_output(self): + self.check_output() + + +# def test_check_grad(self): +# self.outputs["WarpCTCGrad"] = None +# self.check_grad(["Logits"], "Loss", max_relative_error=0.01) + +if __name__ == "__main__": + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 66ccfe8087..65ec58ecf9 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -31,6 +31,7 @@ patch = '%(patch)d' rc = '%(rc)d' istaged = %(istaged)s commit = '%(commit)s' +with_mkl = '%(with_mkl)s' def show(): if istaged: @@ -41,6 +42,9 @@ def show(): print 'rc:', rc else: print 'commit:', commit + +def mkl(): + return with_mkl ''' commit = git_commit() with open(filename, 'w') as f: @@ -51,7 +55,8 @@ def show(): 'rc': RC, 'version': '${PADDLE_VERSION}', 'commit': commit, - 'istaged': ISTAGED}) + 'istaged': ISTAGED, + 'with_mkl': '@WITH_MKL@'}) write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py') diff --git a/tools/manylinux1/Dockerfile.android b/tools/manylinux1/Dockerfile.android new file mode 100644 index 0000000000..b6cae228a0 --- /dev/null +++ b/tools/manylinux1/Dockerfile.android @@ -0,0 +1,55 @@ +FROM ubuntu:16.04 +MAINTAINER PaddlePaddle Authors + +ARG UBUNTU_MIRROR +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + +# ENV variables +ARG ANDROID_ABI +ARG ANDROID_API + +ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} +ENV ANDROID_API=${ANDROID_API:-21} + +ENV HOME=/root \ + ANDROID_NDK_HOME=/opt/android-ndk-linux \ + ANDROID_TOOLCHAINS_DIR=/opt/toolchains + +RUN apt-get update && \ + apt-get install -y \ + git python-dev python-pip python-numpy \ + wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ + apt-get clean -y + +# Install Go and glide +RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +RUN pip install --upgrade pip && \ + pip install -U 'protobuf==3.1.0' && \ + pip install -U wheel sphinx && \ + pip install pre-commit + +# Android NDK +RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \ + mkdir -p /opt/android-ndk-tmp && \ + cd /opt/android-ndk-tmp && \ + wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ + unzip -q android-ndk-r14b-linux-x86_64.zip && \ + mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ + rm -rf /opt/android-ndk-tmp + +CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"] + diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 new file mode 100644 index 0000000000..2c6ba650a5 --- /dev/null +++ b/tools/manylinux1/Dockerfile.x64 @@ -0,0 +1,54 @@ +# NOTE The manylinux1 policy mandates CentOS-5. We replace it with CentOS-6 in +# order to satisfy the build of capnproto library (a nupic.core dependency), +# which requires some headers and symbols not present on CentOS-5 (e.g., +# signalfd.h, pipe2, O_NONBLOCK, SOCK_NONBLOCK, etc.). See +# https://github.com/sandstorm-io/capnproto/issues/350. +FROM nvidia/cuda: +MAINTAINER Numenta, based on the ManyLinux project + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 +ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH +ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} +ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig + +COPY build_scripts /build_scripts +RUN bash build_scripts/build.sh && rm -r build_scripts + +ENV SSL_CERT_FILE=/opt/_internal/certs.pem + +# for paddle +RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src + + +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH} + +# protobuf 3.1.0 +RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.1.0/protobuf-cpp-3.1.0.tar.gz && \ + tar xzf protobuf-cpp-3.1.0.tar.gz && \ + cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. 
&& rm -f protobuf-cpp-3.1.0.tar.gz
+
+
+RUN yum install -y sqlite-devel zlib-devel openssl-devel boost boost-devel pcre-devel vim tk-devel tkinter libtool
+
+RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
+
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
+    go get github.com/Masterminds/glide && \
+    rm -rf /root/requirements.txt
+
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python
+
+RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
+    cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
+
+RUN mkdir -p /src && cd /src && git clone https://github.com/NVIDIA/nccl.git nccl && cd nccl &&\
+    make -j `nproc` install && cd .. && rm -rf nccl
diff --git a/tools/manylinux1/README.md b/tools/manylinux1/README.md
new file mode 100644
index 0000000000..cb0a9ac22c
--- /dev/null
+++ b/tools/manylinux1/README.md
@@ -0,0 +1,30 @@
+# buildtools
+
+We release PaddlePaddle and PaddlePaddle Fluid as shared libraries,
+which we hope to publish as wheel packages on PyPI, so we need to make
+sure that the build follows the
+[manylinux1](https://www.python.org/dev/peps/pep-0513/) standard.
+
+The manylinux standard suggests building Python modules on an old
+system, because a module inevitably depends on some shared libraries,
+and binaries linked against newer versions of those libraries cannot
+run on systems that only provide older versions. The suggested build
+environment is as old as CentOS 5. However, PaddlePaddle relies on
+CUDA, and the earliest CentOS release that
+[works with CUDA is CentOS 6](https://hub.docker.com/r/nvidia/cuda/).
+So here we provide a Docker image based on CentOS 6 and CUDA for
+building PaddlePaddle, making the release "as manylinux as possible"
+or "sufficiently many Linux", following [this
+discussion](https://mail.python.org/pipermail/wheel-builders/2016-July/000175.html).
+
+The build output of our Docker image includes multiple wheel files --
+some contain the CPU-only binary, others support CUDA; some are
+compatible with the cp27m Python ABI, others with cp27mu.
+
+To build these wheels, please run the following commands:
+
+```bash
+git clone https://github.com/paddlepaddle/paddle
+cd paddle/tools/manylinux1
+REPO=[yourrepo] ./build_all.sh
+```
diff --git a/tools/manylinux1/build_all.sh b/tools/manylinux1/build_all.sh
new file mode 100755
index 0000000000..097bedb526
--- /dev/null
+++ b/tools/manylinux1/build_all.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -xe
+
+REPO="${REPO:-typhoon1986}"
+
+# NOTE: version matches are determined!
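+# Each block below renders Dockerfile.x64 for one CUDA/cuDNN combination
+# (substituting the base image tag and the NVCC gencode flags), then builds
+# and pushes a ${REPO}/paddle_manylinux_devel image with a matching tag.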
+sed 's//7.5-cudnn5-devel-centos6/g' Dockerfile.x64 | \ +sed 's//NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52"/g'> Dockerfile.tmp +docker build -t ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5 -f Dockerfile.tmp . +docker push ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5 + +sed 's//8.0-cudnn5-devel-centos6/g' Dockerfile.x64 | \ +sed 's//NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp +docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5 -f Dockerfile.tmp . +docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5 + +sed 's//8.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \ +sed 's//NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp + +docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7 -f Dockerfile.tmp . +docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7 + +sed 's//9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \ +sed 's//NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp +docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp . +docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh new file mode 100644 index 0000000000..93591fa9dd --- /dev/null +++ b/tools/manylinux1/build_scripts/build.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# Top-level build script called from Dockerfile + +# Stop at any error, show all commands +set -ex + +# Python versions to be installed in /opt/$VERSION_NO +# NOTE Only need python 2.7.11 for nupic.core/nupic.bindings at this time, so +# remove others to expedite build and reduce docker image size. The original +# manylinux docker image project builds many python versions. 
+# NOTE We added back 3.5.1, since auditwheel requires python 3.3+ +CPYTHON_VERSIONS="2.7.11 3.5.1" + +# openssl version to build, with expected sha256 hash of .tar.gz +# archive +OPENSSL_ROOT=openssl-1.0.2l +OPENSSL_HASH=ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c +EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d +DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc +PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb +CURL_ROOT=curl-7.49.1 +CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1 +AUTOCONF_ROOT=autoconf-2.69 +AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 + +# Dependencies for compiling Python that we want to remove from +# the final image after compiling Python +PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel" + +# Libraries that are allowed as part of the manylinux1 profile +MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel" + +# Get build utilities +MY_DIR=$(dirname "${BASH_SOURCE[0]}") +source $MY_DIR/build_utils.sh + +# EPEL support +yum -y install wget curl +curl -sLO https://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm +check_sha256sum epel-release-6-8.noarch.rpm $EPEL_RPM_HASH + +# Dev toolset (for LLVM and other projects requiring C++11 support) +curl -sLO http://people.centos.org/tru/devtools-2/devtools-2.repo +check_sha256sum devtools-2.repo $DEVTOOLS_HASH +mv devtools-2.repo /etc/yum.repos.d/devtools-2.repo +rpm -Uvh --replacepkgs epel-release-6*.rpm +rm -f epel-release-6*.rpm + +# Development tools and libraries +yum -y install bzip2 make git patch unzip bison yasm diffutils \ + automake which file \ + kernel-devel-`uname -r` \ + devtoolset-2-binutils devtoolset-2-gcc \ + devtoolset-2-gcc-c++ devtoolset-2-gcc-gfortran \ + ${PYTHON_COMPILE_DEPS} + +# Install more recent version of cmake +# curl -O https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.sh +# /bin/sh cmake-3.8.1-Linux-x86_64.sh --prefix=/usr/local --skip-license +# rm cmake-3.8.1-Linux-x86_64.sh + +wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz && tar xzf cmake-3.5.2.tar.gz && \ +cd cmake-3.5.2 && ./bootstrap && \ +make -j4 && make install && cd .. && rm cmake-3.5.2.tar.gz + + +# Install newest autoconf +build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH +autoconf --version + +# Compile the latest Python releases. +# (In order to have a proper SSL module, Python is compiled +# against a recent openssl [see env vars above], which is linked +# statically. We delete openssl afterwards.) +build_openssl $OPENSSL_ROOT $OPENSSL_HASH +mkdir -p /opt/python +build_cpythons $CPYTHON_VERSIONS + +PY35_BIN=/opt/python/cp35-cp35m/bin +# NOTE Since our custom manylinux image builds pythons with shared +# libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running +# python. 
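+# (LD_LIBRARY_PATH is therefore set explicitly on every python/pip
+# invocation below, and restored at the very end of this script.)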
+ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" + +# Our openssl doesn't know how to find the system CA trust store +# (https://github.com/pypa/manylinux/issues/53) +# And it's not clear how up-to-date that is anyway +# So let's just use the same one pip and everyone uses +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install certifi +ln -s $($PY35_BIN/python -c 'import certifi; print(certifi.where())') \ + /opt/_internal/certs.pem +# If you modify this line you also have to modify the versions in the +# Dockerfiles: +export SSL_CERT_FILE=/opt/_internal/certs.pem + +# Install newest curl +build_curl $CURL_ROOT $CURL_HASH +rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc +hash -r +curl --version +curl-config --features + +# Now we can delete our built SSL +rm -rf /usr/local/ssl + +# Install patchelf (latest with unreleased bug fixes) +curl -sLO https://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz +check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH +tar -xzf patchelf-0.9njs2.tar.gz +(cd patchelf-0.9njs2 && ./configure && make && make install) +rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2 + +# Install latest pypi release of auditwheel +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel +ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel + +# Clean up development headers and other unnecessary stuff for +# final image +yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \ + avahi freetype bitstream-vera-fonts \ + ${PYTHON_COMPILE_DEPS} > /dev/null 2>&1 +yum -y install ${MANYLINUX1_DEPS} +yum -y clean all > /dev/null 2>&1 +yum list installed +# we don't need libpython*.a, and they're many megabytes +find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f +# Strip what we can -- and ignore errors, because this just attempts to strip +# *everything*, including non-ELF files: +find /opt/_internal -type f -print0 \ + | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true +# We do not need the Python test suites, or indeed the precompiled .pyc and +# .pyo files. 
Partially cribbed from: +# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile +find /opt/_internal \ + \( -type d -a -name test -o -name tests \) \ + -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \ + -print0 | xargs -0 rm -f + +for PYTHON in /opt/python/*/bin/python; do + # Add matching directory of libpython shared library to library lookup path + LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" + + # Smoke test to make sure that our Pythons work, and do indeed detect as + # being manylinux compatible: + LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py + # Make sure that SSL cert checking works + LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py +done + +# Restore LD_LIBRARY_PATH +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}" diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh new file mode 100755 index 0000000000..10422ae3bd --- /dev/null +++ b/tools/manylinux1/build_scripts/build_utils.sh @@ -0,0 +1,173 @@ +#!/bin/bash +# Helper utilities for build + +PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python +# XXX: the official https server at www.openssl.org cannot be reached +# with the old versions of openssl and curl in Centos 5.11 hence the fallback +# to the ftp mirror: +# OPENSSL_DOWNLOAD_URL=ftp://ftp.openssl.org/source +OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source +# Ditto the curl sources +CURL_DOWNLOAD_URL=http://curl.askapache.com/download + +GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py + +AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf + + +function check_var { + if [ -z "$1" ]; then + echo "required variable not defined" + exit 1 + fi +} + + +function lex_pyver { + # Echoes Python version string padded with zeros + # Thus: + # 3.2.1 -> 003002001 + # 3 -> 003000000 + echo $1 | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}' +} + + +function do_cpython_build { + local py_ver=$1 + check_var $py_ver + local ucs_setting=$2 + check_var $ucs_setting + tar -xzf Python-$py_ver.tgz + pushd Python-$py_ver + if [ "$ucs_setting" = "none" ]; then + unicode_flags="" + dir_suffix="" + else + local unicode_flags="--enable-unicode=$ucs_setting" + local dir_suffix="-$ucs_setting" + fi + local prefix="/opt/_internal/cpython-${py_ver}${dir_suffix}" + mkdir -p ${prefix}/lib + # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6 + + # NOTE --enable-shared for generating libpython shared library needed for + # linking of some of the nupic.core test executables. + CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null + make -j2 > /dev/null + make install > /dev/null + popd + echo "ZZZ looking for libpython" + find / -name 'libpython*.so*' + rm -rf Python-$py_ver + # Some python's install as bin/python3. Make them available as + # bin/python. 
+ if [ -e ${prefix}/bin/python3 ]; then + ln -s python3 ${prefix}/bin/python + fi + # NOTE Make libpython shared library visible to python calls below + LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py + LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel + local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py) + ln -s ${prefix} /opt/python/${abi_tag} +} + + +function build_cpython { + local py_ver=$1 + check_var $py_ver + check_var $PYTHON_DOWNLOAD_URL + wget -q $PYTHON_DOWNLOAD_URL/$py_ver/Python-$py_ver.tgz + if [ $(lex_pyver $py_ver) -lt $(lex_pyver 3.3) ]; then + # NOTE We only need wide unicode for nupic.bindings wheel + do_cpython_build $py_ver ucs2 + do_cpython_build $py_ver ucs4 + else + do_cpython_build $py_ver none + fi + rm -f Python-$py_ver.tgz +} + + +function build_cpythons { + check_var $GET_PIP_URL + curl -sLO $GET_PIP_URL + for py_ver in $@; do + build_cpython $py_ver + done + rm get-pip.py +} + + +function do_openssl_build { + ./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null + make > /dev/null + make install > /dev/null +} + + +function check_sha256sum { + local fname=$1 + check_var ${fname} + local sha256=$2 + check_var ${sha256} + + echo "${sha256} ${fname}" > ${fname}.sha256 + sha256sum -c ${fname}.sha256 + rm ${fname}.sha256 +} + + +function build_openssl { + local openssl_fname=$1 + check_var ${openssl_fname} + local openssl_sha256=$2 + check_var ${openssl_sha256} + check_var ${OPENSSL_DOWNLOAD_URL} + curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz + check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256} + tar -xzf ${openssl_fname}.tar.gz + (cd ${openssl_fname} && do_openssl_build) + rm -rf ${openssl_fname} ${openssl_fname}.tar.gz +} + + +function do_curl_build { + LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_curl { + local curl_fname=$1 + check_var ${curl_fname} + local curl_sha256=$2 + check_var ${curl_sha256} + check_var ${CURL_DOWNLOAD_URL} + curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2 + check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256} + tar -jxf ${curl_fname}.tar.bz2 + (cd ${curl_fname} && do_curl_build) + rm -rf ${curl_fname} ${curl_fname}.tar.bz2 +} + + +function do_standard_install { + ./configure > /dev/null + make > /dev/null + make install > /dev/null +} + + +function build_autoconf { + local autoconf_fname=$1 + check_var ${autoconf_fname} + local autoconf_sha256=$2 + check_var ${autoconf_sha256} + check_var ${AUTOCONF_DOWNLOAD_URL} + curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz + check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256} + tar -zxf ${autoconf_fname}.tar.gz + (cd ${autoconf_fname} && do_standard_install) + rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz +} diff --git a/tools/manylinux1/build_scripts/manylinux1-check.py b/tools/manylinux1/build_scripts/manylinux1-check.py new file mode 100644 index 0000000000..47fd3d673b --- /dev/null +++ b/tools/manylinux1/build_scripts/manylinux1-check.py @@ -0,0 +1,56 @@ +# Logic copied from PEP 513 + + +def is_manylinux1_compatible(): + # Only Linux, and only x86-64 / i686 + from distutils.util import get_platform + if get_platform() not in ["linux-x86_64", "linux-i686"]: + return False + + # Check for presence of _manylinux module + try: + import _manylinux + return bool(_manylinux.manylinux1_compatible) + except (ImportError, AttributeError): + # Fall through to heuristic check 
below
+        pass
+
+    # Check glibc version. CentOS 5 uses glibc 2.5.
+    return have_compatible_glibc(2, 5)
+
+
+def have_compatible_glibc(major, minimum_minor):
+    import ctypes
+
+    process_namespace = ctypes.CDLL(None)
+    try:
+        gnu_get_libc_version = process_namespace.gnu_get_libc_version
+    except AttributeError:
+        # Symbol doesn't exist -> therefore, we are not linked to
+        # glibc.
+        return False
+
+    # Call gnu_get_libc_version, which returns a string like "2.5".
+    gnu_get_libc_version.restype = ctypes.c_char_p
+    version_str = gnu_get_libc_version()
+    # py2 / py3 compatibility:
+    if not isinstance(version_str, str):
+        version_str = version_str.decode("ascii")
+
+    # Parse string and check against requested version.
+    version = [int(piece) for piece in version_str.split(".")]
+    assert len(version) == 2
+    if major != version[0]:
+        return False
+    if minimum_minor > version[1]:
+        return False
+    return True
+
+
+import sys
+if is_manylinux1_compatible():
+    print("%s is manylinux1 compatible" % (sys.executable, ))
+    sys.exit(0)
+else:
+    print("%s is NOT manylinux1 compatible" % (sys.executable, ))
+    sys.exit(1)
diff --git a/tools/manylinux1/build_scripts/python-tag-abi-tag.py b/tools/manylinux1/build_scripts/python-tag-abi-tag.py
new file mode 100644
index 0000000000..301fbf07a4
--- /dev/null
+++ b/tools/manylinux1/build_scripts/python-tag-abi-tag.py
@@ -0,0 +1,7 @@
+# Utility script to print the python tag + the abi tag for a Python
+# See PEP 425 for exactly what these are, but an example would be:
+# cp27-cp27mu
+
+from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
+
+print("{0}{1}-{2}".format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))
diff --git a/tools/manylinux1/build_scripts/ssl-check.py b/tools/manylinux1/build_scripts/ssl-check.py
new file mode 100644
index 0000000000..a85d91978c
--- /dev/null
+++ b/tools/manylinux1/build_scripts/ssl-check.py
@@ -0,0 +1,32 @@
+# cf. https://github.com/pypa/manylinux/issues/53
+
+GOOD_SSL = "https://google.com"
+BAD_SSL = "https://self-signed.badssl.com"
+
+import sys
+
+print("Testing SSL certificate checking for Python:", sys.version)
+
+if (sys.version_info[:2] < (2, 7) or (sys.version_info[0] == 3 and sys.version_info[:2] < (3, 4))):
+    print("This version never checks SSL certs; skipping tests")
+    sys.exit(0)
+
+if sys.version_info[0] >= 3:
+    from urllib.request import urlopen
+    EXC = OSError
+else:
+    from urllib import urlopen
+    EXC = IOError
+
+print("Connecting to %s should work" % (GOOD_SSL, ))
+urlopen(GOOD_SSL)
+print("...it did, yay.")
+
+print("Connecting to %s should fail" % (BAD_SSL, ))
+try:
+    urlopen(BAD_SSL)
+    # If we get here then we failed:
+    print("...it DIDN'T!!!!!11!!1one!")
+    sys.exit(1)
+except EXC:
+    print("...it did, yay.")
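
The three helper scripts above close out the image build: `python-tag-abi-tag.py` names the `/opt/python/<tag>` symlinks created by `build_utils.sh`, while `manylinux1-check.py` and `ssl-check.py` are the smoke tests `build.sh` runs against every interpreter. As a rough illustration of how these checks fit together, below is a small, hypothetical standalone script (call it `check_env.py`; it is not part of this patch) that reports the same information in one pass. It assumes a Linux host and that the old `wheel.pep425tags` module used by `python-tag-abi-tag.py` is importable; on newer `wheel` releases that import fails and the tag is simply reported as unavailable.

```python
# check_env.py -- illustrative only; mirrors the checks performed by
# manylinux1-check.py, python-tag-abi-tag.py and ssl-check.py above.
import ctypes
from distutils.util import get_platform


def glibc_version():
    """Return the glibc version as a tuple of ints, or None if not glibc."""
    libc = ctypes.CDLL(None)
    try:
        gnu_get_libc_version = libc.gnu_get_libc_version
    except AttributeError:
        # Symbol missing -> this interpreter is not linked against glibc.
        return None
    gnu_get_libc_version.restype = ctypes.c_char_p
    version_str = gnu_get_libc_version()
    if not isinstance(version_str, str):  # bytes on Python 3
        version_str = version_str.decode("ascii")
    return tuple(int(piece) for piece in version_str.split("."))


if __name__ == "__main__":
    glibc = glibc_version()
    manylinux1_ok = (get_platform() in ["linux-x86_64", "linux-i686"] and
                     glibc is not None and glibc >= (2, 5))
    print("platform   : %s" % get_platform())
    print("glibc      : %s" % (glibc, ))
    print("manylinux1 : %s" % manylinux1_ok)
    try:
        # Present only in old releases of the `wheel` package, as assumed
        # by python-tag-abi-tag.py above.
        from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
        print("wheel tag  : %s%s-%s" %
              (get_abbr_impl(), get_impl_ver(), get_abi_tag()))
    except ImportError:
        print("wheel tag  : (wheel.pep425tags not available)")
```

Run with each `/opt/python/*/bin/python` inside the finished image (with `LD_LIBRARY_PATH` pointing at the matching `lib/` directory, as `build.sh` does), it should print the platform string, the glibc version, whether the manylinux1 heuristic passes, and a tag such as `cp27-cp27mu`.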