Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into quantize_transpiler_update

Branch: revert-13637-optimize-opyreader
Author: Dang Qingqing
Commit: 182b24ce3c

@@ -213,9 +213,11 @@ include(configure)  # add paddle env configuration
 if(WITH_GPU)
     include(cuda)
     include(tensorrt)
+endif()
+
+if(WITH_MKL OR WITH_MKLML)
     include(external/anakin)
 elseif()
-    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()

 include(generic)  # simplify cmake module

@@ -11,6 +11,7 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
 # exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle
@@ -27,5 +28,6 @@ ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
 ADD models/ /workspace/models/

@@ -17,7 +17,8 @@ import argparse

 __all__ = ['parse_args', ]

 BENCHMARK_MODELS = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
+    "stacked_dynamic_lstm", "resnet_with_preprocess"
 ]
@@ -67,12 +68,12 @@ def parse_args():
         '--cpus',
         type=int,
         default=1,
-        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+        help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
     parser.add_argument(
         '--data_set',
         type=str,
         default='flowers',
-        choices=['cifar10', 'flowers'],
+        choices=['cifar10', 'flowers', 'imagenet'],
         help='Optional dataset for benchmark.')
     parser.add_argument(
         '--infer_only', action='store_true', help='If set, run forward only.')
@@ -122,6 +123,11 @@ def parse_args():
         type=str,
         default="",
         help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--test_data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the test data (NOT recordio).')
     parser.add_argument(
         '--use_inference_transpiler',
         action='store_true',
@@ -130,5 +136,9 @@ def parse_args():
         '--no_random',
         action='store_true',
         help='If set, keep the random seed and do not shuffle the data.')
+    parser.add_argument(
+        '--use_lars',
+        action='store_true',
+        help='If set, use lars for optimizers, ONLY support resnet module.')
     args = parser.parse_args()
     return args
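
For orientation, a hypothetical invocation exercising the new options together (the model name comes from BENCHMARK_MODELS; the data paths are placeholders, not values taken from this diff):

    python fluid_benchmark.py --model resnet --data_set imagenet \
        --data_path /path/to/imagenet/train --test_data_path /path/to/imagenet/val \
        --use_lars --no_random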

[File diff suppressed because it is too large]

[File diff suppressed because it is too large]

@@ -163,6 +163,19 @@ def gen_job():
         volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
         volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})

+    # add ceph volumes
+    volumes.append({
+        "name": "ceph-data",
+        "cephfs": {
+            "monitors": ["192.168.16.23:6789"],
+            "secretRef": {
+                "name": "ceph-secret"
+            },
+            "user": "admin",
+        }
+    })
+    volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
+
     tn["spec"]["template"]["spec"]["volumes"] = volumes
     tn_container["volumeMounts"] = volumeMounts

@@ -13,5 +13,6 @@
 # limitations under the License.

 __all__ = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
+    "resnet_with_preprocess"
 ]

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """seq2seq model for fluid."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -181,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor):
     return ndarray

-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
     if args.use_reader_op:
         raise Exception("machine_translation do not support reader op for now.")
     embedding_dim = 512
@@ -190,6 +191,9 @@ def get_model(args):
     dict_size = 30000
     beam_size = 3
     max_length = 250
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
             avg_cost, feeding_list = seq_to_seq_net(
                 embedding_dim,
                 encoder_size,
@@ -199,21 +203,15 @@ def get_model(args):
                 False,
                 beam_size=beam_size,
                 max_length=max_length)
-    # clone from default main program
-    inference_program = fluid.default_main_program().clone()
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    train_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size * args.gpus)
-    test_batch_generator = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
-    return avg_cost, inference_program, optimizer, train_batch_generator, \
-        test_batch_generator, None
+            if is_train:
+                optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)
+    batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size)
+            if is_train else paddle.dataset.wmt14.test(dict_size),
+            buf_size=1000),
+        batch_size=args.batch_size * args.gpus)
+    return avg_cost, optimizer, [], batch_generator, None

@@ -65,61 +65,50 @@ def cnn_model(data):
     return predict

-def get_model(args):
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1, 1, 28, 28], (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = cnn_model(pd.read_input(images))
-            label = pd.read_input(label)
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        # Train program
-        predict = cnn_model(images)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        # Evaluator
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
-
-    # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
+def get_model(args, is_train, main_prog, startup_prog):
+    # NOTE: mnist is small, we don't implement data sharding yet.
+    filelist = [
+        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+    ]
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1, 1, 28, 28], (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                input, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='pixel', shape=[1, 28, 28], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+            predict = cnn_model(images)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            # Evaluator
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+            # Optimization
+            if is_train:
+                opt = fluid.optimizer.AdamOptimizer(
+                    learning_rate=0.001, beta1=0.9, beta2=0.999)
+                opt.minimize()
+            if args.memory_optimize:
+                fluid.memory_optimize(main_prog)
+
+    # Reader
+    if is_train:
+        reader = paddle.dataset.mnist.train()
+    else:
+        reader = paddle.dataset.mnist.test()
+    batched_reader = paddle.batch(
+        reader, batch_size=args.batch_size * args.gpus)
+    return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
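
This hunk and the matching ones below move every benchmark model to a single contract: get_model(args, is_train, main_prog, startup_prog) builds its layers inside the passed program pair and returns (loss, optimizer, metrics, batched_reader, reader_handle). A minimal sketch of a caller under that contract (hypothetical driver code, assuming only the signature and return order shown in the diff):

    import paddle.fluid as fluid
    from models.mnist import get_model

    train_prog, startup_prog = fluid.Program(), fluid.Program()
    # optimizer is only constructed when is_train=True; reader_handle is the
    # open_files/py_reader handle, or None when --use_reader_op is not set.
    loss, opt, metrics, batched_reader, reader_handle = get_model(
        args, True, train_prog, startup_prog)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)
    for batch in batched_reader():
        # feeding through fluid.DataFeeder is elided in this sketch
        exe.run(train_prog, fetch_list=[loss.name] + [m.name for m in metrics])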

@@ -27,10 +27,17 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
-from recordio_converter import imagenet_train, imagenet_test
+# from recordio_converter import imagenet_train, imagenet_test
+from imagenet_reader import train, val

-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  act='relu',
+                  is_train=True):
     conv1 = fluid.layers.conv2d(
         input=input,
         filter_size=filter_size,
@@ -39,29 +46,31 @@ def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
         padding=padding,
         act=None,
         bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
+    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)

-def shortcut(input, ch_out, stride):
+def shortcut(input, ch_out, stride, is_train=True):
     ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
     if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        return conv_bn_layer(
+            input, ch_out, 1, stride, 0, None, is_train=is_train)
     else:
         return input

-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+def basicblock(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+def bottleneck(input, ch_out, stride, is_train=True):
+    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
+    conv3 = conv_bn_layer(
+        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
     return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
@@ -72,7 +81,11 @@ def layer_warp(block_func, input, ch_out, count, stride):
     return res_out

-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+def resnet_imagenet(input,
+                    class_dim,
+                    depth=50,
+                    data_format='NCHW',
+                    is_train=True):
     cfg = {
         18: ([2, 2, 2, 1], basicblock),
@@ -115,8 +128,9 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
     return out

-def get_model(args):
+def _model_reader_dshape_classdim(args, is_train):
     model = resnet_cifar10
+    reader = None
     if args.data_set == "cifar10":
         class_dim = 10
         if args.data_format == 'NCHW':
@@ -124,8 +138,10 @@ def get_model(args):
         else:
             dshape = [32, 32, 3]
         model = resnet_cifar10
-        train_reader = paddle.dataset.cifar.train10()
-        test_reader = paddle.dataset.cifar.test10()
+        if is_train:
+            reader = paddle.dataset.cifar.train10()
+        else:
+            reader = paddle.dataset.cifar.test10()
     elif args.data_set == "flowers":
         class_dim = 102
         if args.data_format == 'NCHW':
@@ -133,8 +149,10 @@ def get_model(args):
         else:
             dshape = [224, 224, 3]
         model = resnet_imagenet
-        train_reader = paddle.dataset.flowers.train()
-        test_reader = paddle.dataset.flowers.test()
+        if is_train:
+            reader = paddle.dataset.flowers.train()
+        else:
+            reader = paddle.dataset.flowers.test()
     elif args.data_set == "imagenet":
         class_dim = 1000
         if args.data_format == 'NCHW':
@@ -145,64 +163,89 @@ def get_model(args):
         if not args.data_path:
             raise Exception(
                 "Must specify --data_path when training with imagenet")
-        train_reader = imagenet_train(args.data_path)
-        test_reader = imagenet_test(args.data_path)
-
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + dshape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        input, label = fluid.layers.read_file(data_file)
-    else:
-        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if args.device == 'CPU' and args.cpus > 1:
-        places = fluid.layers.get_places(args.cpus)
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            predict = model(pd.read_input(input), class_dim)
-            label = pd.read_input(label)
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
-            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-            pd.write_output(avg_cost)
-            pd.write_output(batch_acc)
-        avg_cost, batch_acc = pd()
-        avg_cost = fluid.layers.mean(avg_cost)
-        batch_acc = fluid.layers.mean(batch_acc)
-    else:
-        predict = model(input, class_dim)
-        cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        batch_acc = fluid.layers.accuracy(input=predict, label=label)
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc])
-
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-
-    batched_train_reader = paddle.batch(
-        train_reader if args.no_random else paddle.reader.shuffle(
-            train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus,
-        drop_last=True)
-    batched_test_reader = paddle.batch(
-        test_reader, batch_size=args.batch_size, drop_last=True)
-
-    return avg_cost, inference_program, optimizer, batched_train_reader,\
-        batched_test_reader, batch_acc
+        if not args.use_reader_op:
+            if is_train:
+                reader = train()
+            else:
+                reader = val()
+        else:
+            if is_train:
+                reader = train(xmap=False)
+            else:
+                reader = val(xmap=False)
+    return model, reader, dshape, class_dim

+def get_model(args, is_train, main_prog, startup_prog):
+    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
+                                                                     is_train)
+    pyreader = None
+    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                pyreader = fluid.layers.py_reader(
+                    capacity=args.batch_size * args.gpus,
+                    shapes=([-1] + dshape, (-1, 1)),
+                    dtypes=('float32', 'int64'),
+                    name="train_reader" if is_train else "test_reader",
+                    use_double_buffer=True)
+                input, label = fluid.layers.read_file(pyreader)
+            else:
+                input = fluid.layers.data(
+                    name='data', shape=dshape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+            predict = model(input, class_dim, is_train=is_train)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+            # configure optimize
+            optimizer = None
+            if is_train:
+                if args.use_lars:
+                    lars_decay = 1.0
+                else:
+                    lars_decay = 0.0
+                total_images = 1281167 / trainer_count
+                step = int(total_images / args.batch_size + 1)
+                epochs = [30, 60, 80, 90]
+                bd = [step * e for e in epochs]
+                base_lr = args.learning_rate
+                lr = []
+                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=base_lr,
+                    #learning_rate=fluid.layers.piecewise_decay(
+                    #    boundaries=bd, values=lr),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                optimizer.minimize(avg_cost)
+                if args.memory_optimize:
+                    fluid.memory_optimize(main_prog)
+
+    # config readers
+    if not args.use_reader_op:
+        batched_reader = paddle.batch(
+            reader if args.no_random else paddle.reader.shuffle(
+                reader, buf_size=5120),
+            batch_size=args.batch_size * args.gpus,
+            drop_last=True)
+    else:
+        batched_reader = None
+        pyreader.decorate_paddle_reader(
+            paddle.batch(
+                reader if args.no_random else paddle.reader.shuffle(
+                    reader, buf_size=5120),
+                batch_size=args.batch_size))
+
+    return avg_cost, optimizer, [batch_acc1,
+                                 batch_acc5], batched_reader, pyreader
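
The reader-op path above now builds a fluid.layers.py_reader and feeds it by decorating a batched paddle reader, replacing the old open_files pipeline. A sketch of the loop that drives it (hypothetical driver code; the start/EOFException/reset protocol is the standard py_reader usage rather than something shown in this diff):

    pyreader.start()  # launch the feeding thread for this pass
    try:
        while True:
            # no feed= argument: batches arrive from the decorated reader
            loss_v, acc1, acc5 = exe.run(
                main_prog,
                fetch_list=[avg_cost.name, batch_acc1.name, batch_acc5.name])
    except fluid.core.EOFException:
        pyreader.reset()  # drain the queue before starting the next pass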

[File diff suppressed because it is too large]

[File diff suppressed because it is too large]

@@ -26,7 +26,6 @@ import numpy
 import paddle
 import paddle.dataset.imdb as imdb
 import paddle.fluid as fluid
-import paddle.batch as batch
 import paddle.fluid.profiler as profiler

 word_dict = imdb.word_dict()
@@ -43,19 +42,7 @@ def crop_sentence(reader, crop_size):
     return __impl__

-def get_model(args):
-    if args.use_reader_op:
-        raise Exception(
-            "stacked_dynamic_lstm do not support reader op for now.")
-    lstm_size = 512
-    emb_dim = 512
-    crop_size = 1500
-    data = fluid.layers.data(
-        name="words", shape=[1], lod_level=1, dtype='int64')
-    sentence = fluid.layers.embedding(
-        input=data, size=[len(word_dict), emb_dim])
+def lstm_net(sentence, lstm_size):
     sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')

     rnn = fluid.layers.DynamicRNN()
@@ -97,6 +84,24 @@ def get_model(args):
     last = fluid.layers.sequence_pool(rnn(), 'last')
     logit = fluid.layers.fc(input=last, size=2, act='softmax')
+    return logit

+def get_model(args, is_train, main_prog, startup_prog):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
+    lstm_size = 512
+    emb_dim = 512
+    crop_size = 1500
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            data = fluid.layers.data(
+                name="words", shape=[1], lod_level=1, dtype='int64')
+            sentence = fluid.layers.embedding(
+                input=data, size=[len(word_dict), emb_dim])
+            logit = lstm_net(sentence, lstm_size)
             loss = fluid.layers.cross_entropy(
                 input=logit,
                 label=fluid.layers.data(
@@ -108,20 +113,18 @@ def get_model(args):
             batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
                 shape=[1], dtype='int64'), total=batch_size_tensor)

-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    adam = fluid.optimizer.Adam()
-
-    train_reader = batch(
-        paddle.reader.shuffle(
-            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size * args.gpus)
-    test_reader = batch(
-        paddle.reader.shuffle(
-            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)
-
-    return loss, inference_program, adam, train_reader, test_reader, batch_acc
+            if is_train:
+                adam = fluid.optimizer.Adam()
+                adam.minimize(loss)
+
+    if is_train:
+        reader = crop_sentence(imdb.train(word_dict), crop_size)
+    else:
+        reader = crop_sentence(imdb.test(word_dict), crop_size)
+    batched_reader = paddle.batch(
+        paddle.reader.shuffle(
+            reader, buf_size=25000),
+        batch_size=args.batch_size * args.gpus)
+
+    return loss, adam, [batch_acc], batched_reader, None

@@ -25,7 +25,7 @@ import functools
 import os

-def vgg16_bn_drop(input):
+def vgg16_bn_drop(input, is_train=True):
     def conv_block(input, num_filter, groups, dropouts):
         return fluid.nets.img_conv_group(
             input=input,
@@ -46,13 +46,13 @@ def vgg16_bn_drop(input):
     drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
     fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
     drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
     fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
     return fc2

-def get_model(args):
+def get_model(args, is_train, main_prog, startup_prog):
     if args.data_set == "cifar10":
         classdim = 10
         if args.data_format == 'NCHW':
@@ -65,29 +65,31 @@ def get_model(args):
         data_shape = [3, 224, 224]
     else:
         data_shape = [224, 224, 3]
-    if args.use_reader_op:
-        filelist = [
-            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-        ]
-        data_file = fluid.layers.open_files(
-            filenames=filelist,
-            shapes=[[-1] + data_shape, (-1, 1)],
-            lod_levels=[0, 0],
-            dtypes=["float32", "int64"],
-            thread_num=args.gpus,
-            pass_num=args.pass_num)
-        data_file = fluid.layers.double_buffer(
-            fluid.layers.batch(
-                data_file, batch_size=args.batch_size))
-        images, label = fluid.layers.read_file(data_file)
-    else:
-        images = fluid.layers.data(
-            name='data', shape=data_shape, dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    filelist = [
+        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+    ]
+    with fluid.program_guard(main_prog, startup_prog):
+        if args.use_reader_op:
+            data_file_handle = fluid.layers.open_files(
+                filenames=filelist,
+                shapes=[[-1] + data_shape, (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"],
+                thread_num=1,
+                pass_num=1)
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(
+                    data_file_handle, batch_size=args.batch_size))
+        with fluid.unique_name.guard():
+            if args.use_reader_op:
+                images, label = fluid.layers.read_file(data_file)
+            else:
+                images = fluid.layers.data(
+                    name='data', shape=data_shape, dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
             # Train program
-    net = vgg16_bn_drop(images)
+            net = vgg16_bn_drop(images, is_train=is_train)
             predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
             cost = fluid.layers.cross_entropy(input=predict, label=label)
             avg_cost = fluid.layers.mean(x=cost)
@@ -96,26 +98,23 @@ def get_model(args):
             batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
             batch_acc = fluid.layers.accuracy(
                 input=predict, label=label, total=batch_size_tensor)
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
             # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+            if is_train:
+                optimizer = fluid.optimizer.Adam(
+                    learning_rate=args.learning_rate)
+                optimizer.minimize(avg_cost)
     # data reader
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size * args.gpus)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)
-    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
+    if is_train:
+        reader = paddle.dataset.cifar.train10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
+    else:
+        reader = paddle.dataset.cifar.test10() \
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
+    batched_reader = paddle.batch(
+        paddle.reader.shuffle(
+            reader, buf_size=5120),
+        batch_size=args.batch_size * args.gpus)
+    return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle

@@ -16,16 +16,6 @@ set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
 set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
 set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)

-# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
-set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
-set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
-execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
-
 include_directories(${ANAKIN_INCLUDE})
 include_directories(${ANAKIN_INCLUDE}/saber/)
 include_directories(${ANAKIN_INCLUDE}/saber/core/)
@@ -48,21 +38,24 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
     -Wno-reorder
     -Wno-error=cpp)

+if(WITH_GPU)
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=YES -DCUDNN_ROOT=${CUDNN_ROOT} -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR})
+else()
+    set(CMAKE_ARGS_PREFIX -DUSE_GPU_PLACE=NO)
+endif()
 ExternalProject_Add(
     extern_anakin
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS ${MKLML_PROJECT}
     GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
-    GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2"
+    GIT_TAG "3c8554f4978628183566ab7dd6c1e7e66493c7cd"
     PREFIX ${ANAKIN_SOURCE_DIR}
     UPDATE_COMMAND ""
-    CMAKE_ARGS -DUSE_GPU_PLACE=YES
+    CMAKE_ARGS ${CMAKE_ARGS_PREFIX}
               -DUSE_X86_PLACE=YES
               -DBUILD_WITH_UNIT_TEST=NO
               -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
               -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
-              -DCUDNN_ROOT=${CUDNN_ROOT}
-              -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
               -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
               ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}

@@ -145,12 +145,12 @@ copy(memory_lib
 set(inference_deps paddle_fluid_shared paddle_fluid)

 set(module "inference/api")
-if (WITH_ANAKIN AND WITH_GPU)
+if (WITH_ANAKIN AND WITH_MKL)
     copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
         SRCS
         ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api*  # compiled anakin api
         ${ANAKIN_INSTALL_DIR}  # anakin release
-        DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
+        DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
     list(APPEND inference_deps anakin_inference_lib)
 endif()

@@ -822,6 +822,14 @@ pad
 .. autofunction:: paddle.fluid.layers.pad
     :noindex:

+.. _api_fluid_layers_pad_constant_like:
+
+pad_constant_like
+-----------------
+
+.. autofunction:: paddle.fluid.layers.pad_constant_like
+    :noindex:
+
 .. _api_fluid_layers_label_smooth:

 label_smooth
@@ -1145,6 +1153,14 @@ sigmoid
 .. autofunction:: paddle.fluid.layers.sigmoid
     :noindex:

+.. _api_fluid_layers_hsigmoid:
+
+hsigmoid
+--------
+
+.. autofunction:: paddle.fluid.layers.hsigmoid
+    :noindex:
+
 .. _api_fluid_layers_logsigmoid:

 logsigmoid

[File diff suppressed because one or more lines are too long]

@@ -66,7 +66,7 @@ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'pla
 paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
 paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspilerConfig.__init__
-paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
+paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None))
 paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None

@@ -1,14 +1,21 @@
 set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
 file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n")
 file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
-function(pass_library TARGET)
+
+# Usage: pass_library(target inference) will append to paddle_inference_pass.h
+function(pass_library TARGET DEST)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass)
-    file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
-    set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass ${op_library_DEPS})
+    # add more DEST here, such as train, dist and collect USE_PASS into a file automatically.
+    if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference")
+        message(STATUS "add pass ${TARGET} ${DEST}")
+        file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
+        set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+    endif()
 endfunction()

 cc_library(node SRCS node.cc DEPS proto_desc)
@@ -18,13 +25,15 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)

-pass_library(graph_to_program_pass)
-pass_library(graph_viz_pass)
-pass_library(fc_fuse_pass)
-pass_library(attention_lstm_fuse_pass)
-pass_library(infer_clean_graph_pass)
-pass_library(fc_lstm_fuse_pass)
-pass_library(seq_concat_fc_fuse_pass)
+pass_library(graph_to_program_pass base)
+pass_library(graph_viz_pass base)
+pass_library(fc_fuse_pass inference)
+pass_library(attention_lstm_fuse_pass inference)
+pass_library(infer_clean_graph_pass inference)
+pass_library(fc_lstm_fuse_pass inference)
+pass_library(fc_gru_fuse_pass inference)
+pass_library(seq_concat_fc_fuse_pass inference)
+
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")

 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)

@@ -29,39 +29,27 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
   std::unordered_set<Node*> nodes2delete;

   GraphPatternDetector gpd;
-  // BuildFCPattern(gpd.mutable_pattern());
   auto* x = gpd.mutable_pattern()
                 ->NewNode("fc_fuse/x")
                 ->AsInput()
                 ->assert_is_op_input("mul", "X");
-  patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/);
-
-#define GET_NODE(id)                                                         \
-  PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
-                 "pattern has no Node called %s", #id);                      \
-  auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id));        \
-  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);
+  patterns::FC fc_pattern(gpd.mutable_pattern(), "fc_fuse");
+  fc_pattern(x, true /*with bias*/);

   int found_fc_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
     VLOG(4) << "handle FC fuse";
-    // Currently, there is no FC op available, so I will just simulate the
-    // scenerio.
-    // FC's fusion is simple, just op fuse, no need to process the
-    // parameters.
-    GET_NODE(x);                // x
-    GET_NODE(w);                // Y
-    GET_NODE(fc_bias);          // bias
-    GET_NODE(fc_out);           // Out
-    GET_NODE(mul);              // MUL op
-    GET_NODE(elementwise_add);  // ELEMENT_ADD op
-    GET_NODE(mul_out);          // tmp
-#undef GET_NODE
+    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);

     // Create an FC Node.
     OpDesc desc;
-    std::string fc_x_in = x->Name();
+    std::string fc_x_in = subgraph.at(x)->Name();
     std::string fc_Y_in = w->Name();
     std::string fc_bias_in = fc_bias->Name();
     std::string fc_out_out = fc_out->Name();
@@ -73,7 +61,8 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
     auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
     GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});

-    IR_NODE_LINK_TO(x, fc_node);
+    PADDLE_ENFORCE(subgraph.count(x));
+    IR_NODE_LINK_TO(subgraph.at(x), fc_node);
     IR_NODE_LINK_TO(w, fc_node);
     IR_NODE_LINK_TO(fc_bias, fc_node);
     IR_NODE_LINK_TO(fc_node, fc_out);

@@ -0,0 +1,185 @@ (new file: paddle/fluid/framework/ir/fc_gru_fuse_pass.cc)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
static int BuildFusion(Graph* graph, const std::string& name_scope,
Scope* scope, bool with_fc_bias) {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
// Create pattern.
patterns::FC fc_pattern(pattern, name_scope);
patterns::GRU gru_pattern(pattern, name_scope);
PDNode* x =
pattern->NewNode(patterns::UniqueKey("x"))->assert_var_not_persistable();
auto* fc_out = fc_pattern(x, with_fc_bias);
fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
gru_pattern(fc_out);
// Create New OpDesc
auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h,
Node* bias, Node* hidden, Node* fc_bias) {
OpDesc op_desc;
op_desc.SetType("fusion_gru");
#define NEW_NAME(x) name_scope + "/at." #x ".new"
#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
SET_IN(X, x);
SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h);
if (with_fc_bias) {
op_desc.SetInput("Bias", {NEW_NAME(bias) + bias->Name()});
} else {
SET_IN(Bias, bias);
}
#undef SET_IN
op_desc.SetInput("H0", {});
op_desc.SetOutput("Hidden", {hidden->Name()});
op_desc.SetAttr("is_reverse", gru->Op()->GetAttr("is_reverse"));
// TODO(TJ): This should be a option for infer
op_desc.SetAttr("use_seq", true);
#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)})
SET_IMTERMEDIATE_OUT(ReorderedH0);
SET_IMTERMEDIATE_OUT(XX);
SET_IMTERMEDIATE_OUT(BatchedInput);
SET_IMTERMEDIATE_OUT(BatchedOut);
#undef SET_IMTERMEDIATE_OUT
auto* op = graph->CreateOpNode(&op_desc);
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get<Scope*>(kParamScopeAttr);
PADDLE_ENFORCE(scope);
if (with_fc_bias) {
// Fusion GRU bias = fcbias + grubias
auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias->Name());
auto* out_bias_tensor =
fusion_bias_var->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE(fusion_bias_var);
auto* gru_bias_var = scope->FindVar(bias->Name());
auto* fc_bias_var = scope->FindVar(fc_bias->Name());
PADDLE_ENFORCE(gru_bias_var);
PADDLE_ENFORCE(fc_bias_var);
const auto& gru_bias_tenosr = gru_bias_var->Get<framework::LoDTensor>();
const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
// new bias = fc bias + gru bias
out_bias_tensor->Resize(gru_bias_tenosr.dims());
auto* data = out_bias_tensor->mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < out_bias_tensor->numel(); i++) {
data[i] =
fc_bias_tensor.data<float>()[i] + gru_bias_tenosr.data<float>()[i];
}
}
#undef GET_NODE
#define NEW_IMTERMEDIATE_OUT(key) \
scope->Var(NEW_NAME(key))->GetMutable<framework::LoDTensor>()
NEW_IMTERMEDIATE_OUT(ReorderedH0);
NEW_IMTERMEDIATE_OUT(XX);
NEW_IMTERMEDIATE_OUT(BatchedInput);
NEW_IMTERMEDIATE_OUT(BatchedOut);
#undef NEW_NAME
#undef NEW_IMTERMEDIATE_OUT
IR_NODE_LINK_TO(x, op);
IR_NODE_LINK_TO(weight_x, op);
IR_NODE_LINK_TO(weight_h, op);
IR_NODE_LINK_TO(bias, op); // actually should link to new bias if have
IR_NODE_LINK_TO(op, hidden);
// h0?
return op;
};
int fusion_count{0};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
auto* x_n = subgraph.at(x);
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, gru_pattern);
// nodes need be removed
GET_IR_NODE_FROM_SUBGRAPH(BatchGate, BatchGate, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(BatchResetHiddenPrev, BatchGate, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchGate, gru_pattern);
if (with_fc_bias) {
GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul, gru, elementwise_add, fc_bias, fc_out, mul_out, BatchGate,
BatchResetHiddenPrev, BatchHidden});
GraphSafeRemoveNodes(graph, marked_nodes);
} else {
gru_creater(gru, x_n, w, Weight, Bias, Hidden, nullptr);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul, gru, BatchGate, BatchResetHiddenPrev, BatchHidden});
GraphSafeRemoveNodes(graph, marked_nodes);
}
#undef GET_NODE
++fusion_count;
};
gpd(graph, handler);
return fusion_count;
}
std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
false /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
true /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);

@@ -0,0 +1,50 @@ (new file: paddle/fluid/framework/ir/fc_gru_fuse_pass.h)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
// The FCGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op.
class FCGRUFusePass : public FusePassBase {
public:
virtual ~FCGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_gru_fuse"};
};
// Just FC without bias
class MulGRUFusePass : public FusePassBase {
public:
virtual ~MulGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
const std::string name_scope_{"fc_nobias_gru_fuse"};
};
} // namespace ir
} // namespace framework
} // namespace paddle

@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
 #include <string>
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -19,44 +20,29 @@ namespace paddle {
 namespace framework {
 namespace ir {

-std::string GenNodeName(const std::string& prefix, const std::string& name) {
-  return prefix + "/" + name;
-}
-
-void BuildPattern(PDPattern* pattern, const std::string& name_scope,
-                  bool with_fc_bias) {
-  PDNode* x = pattern->NewNode(name_scope, "x")
-                  ->assert_is_op_input("mul")
-                  ->assert_var_not_persistable();
-  auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
-  fc_out->AsIntermediate();  // fc_out is a tmp var, will be removed after fuse.
-  patterns::LSTM(pattern, name_scope, fc_out);
-  // LOG(INFO) << "\n" << pattern->DotString();
-}
-
 int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
                 bool with_fc_bias) {
   GraphPatternDetector gpd;
   auto* pattern = gpd.mutable_pattern();
-  BuildPattern(pattern, name_scope, with_fc_bias);
+
+  // Build pattern
+  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
+                  ->assert_is_op_input("mul")
+                  ->assert_var_not_persistable();
+  patterns::FC fc_pattern(pattern, name_scope);
+
+  // fc_out is a tmp var, will be removed after fuse, so marked as intermediate.
+  auto* fc_out = fc_pattern(x, with_fc_bias)->AsIntermediate();
+  patterns::LSTM lstm_pattern(pattern, name_scope);
+  lstm_pattern(fc_out);

   // Create New OpDesc
-  auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
-                          int bias, int hidden, int cell, int xx, int fc_bias) {
-#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
-    GET_NODE(input);
-    GET_NODE(weight_x);
-    GET_NODE(weight_h);
-    GET_NODE(bias);
-    GET_NODE(hidden);
-    GET_NODE(cell);
-    GET_NODE(xx);
-    GET_NODE(lstm);
+  auto lstm_creator = [&](Node* lstm, Node* input, Node* weight_x,
+                          Node* weight_h, Node* bias, Node* hidden, Node* cell,
+                          Node* xx, Node* fc_bias) {
     OpDesc op_desc;
     op_desc.SetType("fusion_lstm");
-#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()});
+#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
     SET_IN(X, input);
     SET_IN(WeightX, weight_x);
     SET_IN(WeightH, weight_h);
@@ -69,13 +55,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
       auto* bias_var = scope->Var(new_bias_var);
       PADDLE_ENFORCE(bias_var);
       auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
-      auto* lstm_bias_var = scope->FindVar(bias_n->Name());
+      auto* lstm_bias_var = scope->FindVar(bias->Name());
       PADDLE_ENFORCE(lstm_bias_var);
       const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
       bias_tensor->Resize(lstm_bias_tensor.dims());
-      GET_NODE(fc_bias);
-      auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
+      auto* fc_bias_var = scope->FindVar(fc_bias->Name());
       const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();

       auto* data = bias_tensor->mutable_data<float>(platform::CPUPlace());
@@ -86,31 +71,36 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
       }
       op_desc.SetInput("Bias", {new_bias_var});
     }
-#undef GET_NODE

     // Create temp variables.
-    scope->Var(name_scope + "/BatchedInput.new")
-        ->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/BatchCellPreAct.new")
-        ->GetMutable<framework::LoDTensor>();
-    scope->Var(name_scope + "/BatchedGate.new")
-        ->GetMutable<framework::LoDTensor>();
+    const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
+    const std::string BatchedCellPreAct =
+        patterns::UniqueKey("BatchedCellPreAct");
+    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+
+    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();

     op_desc.SetInput("H0", {});
     op_desc.SetInput("C0", {});
-    op_desc.SetOutput("Hidden", {hidden_n->Name()});
-    op_desc.SetOutput("Cell", {cell_n->Name()});
-    op_desc.SetOutput("XX", {xx_n->Name()});
-    op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"});
-    op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"});
-    op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
-    op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
-    op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
+    op_desc.SetOutput("Hidden", {hidden->Name()});
+    op_desc.SetOutput("Cell", {cell->Name()});
+    op_desc.SetOutput("XX", {xx->Name()});
+    op_desc.SetOutput("BatchedGate", {BatchedGate});
+    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
+    op_desc.SetOutput("BatchedInput", {BatchedInput});
+    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
+    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
     // TODO(TJ): get from attr
     op_desc.SetAttr("use_seq", true);

-#define TMP_NAME(x) "at.new.tmp." #x
-#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+#define OP_SET_OUT(x)                            \
+  const std::string x = patterns::UniqueKey(#x); \
+  op_desc.SetOutput(#x, {x});                    \
+  scope->Var(x)->GetMutable<LoDTensor>()
     OP_SET_OUT(BatchedCell);
     OP_SET_OUT(BatchedHidden);
     OP_SET_OUT(ReorderedH0);
@@ -118,22 +108,11 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
 #undef OP_SET_OUT

     auto* op = graph->CreateOpNode(&op_desc);
-    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
-
-#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable<LoDTensor>()
-    TMP_NEW(BatchedCell);
-    TMP_NEW(BatchedHidden);
-    TMP_NEW(ReorderedH0);
-    TMP_NEW(ReorderedC0);
-#undef TMP_NEW
-#undef TMP_NAME
-
-    IR_NODE_LINK_TO(input_n, op);
-    IR_NODE_LINK_TO(weight_x_n, op);
-    IR_NODE_LINK_TO(weight_h_n, op);
-    IR_NODE_LINK_TO(bias_n, op);
-    IR_NODE_LINK_TO(op, hidden_n);
+    IR_NODE_LINK_TO(input, op);
+    IR_NODE_LINK_TO(weight_x, op);
+    IR_NODE_LINK_TO(weight_h, op);
+    IR_NODE_LINK_TO(bias, op);
+    IR_NODE_LINK_TO(op, hidden);

     return op;
   };
@@ -141,39 +120,32 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-#define GET_NODE(name__)                                \
-  std::string name__##key = name_scope + "/" + #name__; \
-  auto* name__##n = pattern->RetrieveNode(name__##key); \
-  PADDLE_ENFORCE(name__##n);                            \
-  PADDLE_ENFORCE(subgraph.count(name__##n));            \
-  Node* name__##_n = subgraph.at(name__##n);            \
-  int name__ __attribute__((unused)) = name__##_n->id();
-
-    GET_NODE(x);
-    GET_NODE(w);
-    GET_NODE(mul);
-    GET_NODE(fc_out);
-    GET_NODE(Weight);
-    GET_NODE(lstm);
-    GET_NODE(Bias);
-    GET_NODE(Hidden);
-    GET_NODE(Cell);
+    GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);

     if (with_fc_bias) {
-      GET_NODE(fc_bias);
-      GET_NODE(elementwise_add);
-      lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+      lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
+                   fc_bias);
       // Remove unneeded nodes.
       std::unordered_set<const Node*> marked_nodes(
-          {mul_n, lstm_n, elementwise_add_n});
+          {mul, lstm, elementwise_add});
       GraphSafeRemoveNodes(graph, marked_nodes);
     } else {
-      lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
+      lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
+                   nullptr);
       // Remove unneeded nodes.
-      std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
+      std::unordered_set<const Node*> marked_nodes({mul, lstm});
       GraphSafeRemoveNodes(graph, marked_nodes);
     }
-#undef GET_NODE

     ++fusion_count;
   };

@@ -21,6 +21,7 @@
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/printf.h"

 namespace paddle {
 namespace framework {
@@ -106,8 +107,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
   for (auto& pdnode : pattern_.nodes()) {
     if (!pdnodes2nodes_.count(pdnode.get())) {
       VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
-      // return false;
+      return false;
     }
   }
   for (auto& item : pdnodes2nodes_) {
@@ -517,61 +517,51 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {
   return false;
 }

-PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
-                     PDNode* x, bool with_bias) {
-  // Create Operators
-  PDNode* elementwise_add_op{nullptr};
-  auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
-  if (with_bias) {
-    elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
-                             ->assert_is_op("elementwise_add");
-  }
-  // Create variables
-  // w
-  auto* mul_weight_var = pattern->NewNode(name_scope, "w")
-                             ->AsInput()
-                             ->assert_is_persistable_var()
-                             ->assert_is_op_nth_input("mul", "Y", 0);
-  PDNode* mul_out_var{nullptr};
-  if (with_bias) {
-    // intermediate variable, will be removed in the IR after fuse.
-    mul_out_var = pattern->NewNode(name_scope, "mul_out")
-                      ->AsIntermediate()
-                      ->assert_is_only_output_of_op("mul")
-                      ->assert_is_op_input("elementwise_add");
-  }
-  PDNode *bias{nullptr}, *fc_out{nullptr};
-  if (with_bias) {
-    // bias
-    bias = pattern->NewNode(name_scope, "fc_bias")
-               ->assert_is_op_input("elementwise_add")
-               ->AsInput();
-    // output
-    fc_out = pattern->NewNode(name_scope, "fc_out")
-                 ->AsOutput()
-                 ->assert_is_op_output("elementwise_add");
-  } else {
-    fc_out = pattern->NewNode(name_scope, "fc_out")
-                 ->AsOutput()
-                 ->assert_is_op_output("mul");
-  }
-  if (with_bias) {
-    mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
-    elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
-  } else {
-    mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
-  }
-  return fc_out;
-}
+PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x,
+                                 bool with_bias) {
+  // Create shared nodes.
+  x->assert_is_op_input("mul", "X");
+  auto* mul = pattern->NewNode(mul_repr())->assert_is_op("mul");
+
+  auto* mul_w_var = pattern->NewNode(w_repr())
+                        ->AsInput()
+                        ->assert_is_persistable_var()
+                        ->assert_is_op_input("mul", "Y");
+
+  auto* mul_out_var =
+      pattern->NewNode(mul_out_repr())->assert_is_op_output("mul");
+
+  if (!with_bias) {  // not with bias
+    // Add links.
+    mul->LinksFrom({x, mul_w_var}).LinksTo({mul_out_var});
+    return mul_out_var;
+  } else {  // with bias
+    mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
+    // Create operators.
+    auto* elementwise_add = pattern->NewNode(elementwise_add_repr())
+                                ->assert_is_op("elementwise_add");
+    // Create variables.
+    auto* bias = pattern->NewNode(bias_repr())
+                     ->assert_is_op_input("elementwise_add")
+                     ->AsInput();
+
+    auto* fc_out = pattern->NewNode(Out_repr())
+                       ->AsOutput()
+                       ->assert_is_op_output("elementwise_add");
+
+    mul->LinksFrom({mul_w_var, x}).LinksTo({mul_out_var});
+    elementwise_add->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
+    return fc_out;
+  }
+}

-PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
-                       PDNode* x) {
+PDNode* patterns::LSTM::operator()(PDNode* x) {
   x->assert_is_op_input("lstm", "Input");
-  auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
+  auto* lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm");

-#define NEW_NODE(arg__, io__)                        \
-  auto* arg__ = pattern->NewNode(name_scope, #arg__) \
-                    ->assert_is_op_##io__("lstm", #arg__);
+#define NEW_NODE(arg__, io__) \
+  auto* arg__ =               \
+      pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__);

   // Currently, the H0 and C0 are optional
   // TODO(Superjomn) upgrade the fuse framework to support optional.
@@ -584,11 +574,42 @@ PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
   NEW_NODE(Cell, output);
   NEW_NODE(BatchGate, output);
   NEW_NODE(BatchCellPreAct, output);
+#undef NEW_NODE

   lstm_op->LinksFrom({x, Weight, Bias});
   lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
   return Hidden;
 }

+PDNode* patterns::GRU::operator()(PDNode* x) {
+  x->assert_is_op_input("gru", "Input");
+  auto* gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru");
+#define NEW_NODE(arg__, io__) \
+  auto* arg__ =               \
+      pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__);
+
+  NEW_NODE(Weight, input);
+  // TODO(Superjomn): upgrade the fuse framework to support optional.
+  // H0 and bias are optional
+  NEW_NODE(Bias, input);  // also optional
+  // NEW_NODE(H0, input);
+
+  NEW_NODE(Hidden, output);
+  // below are intermediate
+  NEW_NODE(BatchGate, output);
+  NEW_NODE(BatchResetHiddenPrev, output);
+  NEW_NODE(BatchHidden, output);
+#undef NEW_NODE
+
+  BatchGate->AsIntermediate();
+  BatchResetHiddenPrev->AsIntermediate();
+  BatchHidden->AsIntermediate();
+
+  gru_op->LinksFrom({x, Weight, Bias});
+  gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
+  return Hidden;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

[Some files were not shown because too many files have changed in this diff]
