Merge branch 'develop' into core_inference_multi_thread

8 years ago · bdb21f6bc3
parent 90f3a421c7 47a4ec0672
commit bdb21f6bc3
73 changed files with 2976 additions and 808 deletions
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/machine_translation.py
@ -48,6 +48,13 @@ parser.add_argument(
    type=int,
    default=16,
    help="The sequence number of a mini-batch data. (default: %(default)d)")
 parser.add_argument(
    '--skip_batch_num',
    type=int,
    default=5,
    help='The first num of minibatch num to skip, for better performance test')
 parser.add_argument(
    '--iterations', type=int, default=80, help='The number of minibatches.')
 parser.add_argument(
    "--dict_size",
    type=int,
@ -72,16 +79,21 @@ parser.add_argument(
    default=3,
    help="The width for beam searching. (default: %(default)d)")
 parser.add_argument(
-    "--use_gpu",
+    '--device',
-    type=distutils.util.strtobool,
+    type=str,
-    default=True,
+    default='GPU',
-    help="Whether to use gpu. (default: %(default)d)")
+    choices=['CPU', 'GPU'],
    help="The device type.")
 parser.add_argument(
    "--max_length",
    type=int,
    default=250,
    help="The maximum length of sequence when doing generation. "
    "(default: %(default)d)")
 parser.add_argument(
    '--with_test',
    action='store_true',
    help='If set, test the testset during training.')
 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
@ -281,7 +293,7 @@ def train():
            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
        batch_size=args.batch_size)
-    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = Executor(place)
    exe.run(framework.default_startup_program())
@ -307,14 +319,20 @@ def train():
        return total_loss / count
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in xrange(args.pass_num):
-        pass_start_time = time.time()
+        train_accs = []
-        words_seen = 0
+        train_losses = []
        for batch_id, data in enumerate(train_batch_generator()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            words_seen += word_num
+            num_samples += word_num
            trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            words_seen += word_num
+            num_samples += word_num
            lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
            fetch_outs = exe.run(framework.default_main_program(),
@ -325,24 +343,36 @@ def train():
                                 },
                                 fetch_list=[avg_cost])
-            avg_cost_val = np.array(fetch_outs[0])
+            iters += 1
-            print('pass_id=%d, batch_id=%d, train_loss: %f' %
+            loss = np.array(fetch_outs[0])
-                  (pass_id, batch_id, avg_cost_val))
+            print(
-
+                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
-        pass_end_time = time.time()
+            )  # The accuracy is the accumulation of batches, but not the current batch.
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            test_loss = do_validation()
-        time_consumed = pass_end_time - pass_start_time
+        exit(0)
        words_per_sec = words_seen / time_consumed
        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
              (pass_id, test_loss, words_per_sec, time_consumed))
 def infer():
    pass
 def print_arguments(args):
    print('----------- seq2seq Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)
    if args.infer_only:
        infer()
    else:
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@ -35,6 +35,12 @@ def parse_args():
    parser = argparse.ArgumentParser("mnist model benchmark.")
    parser.add_argument(
        '--batch_size', type=int, default=128, help='The minibatch size.')
    parser.add_argument(
        '--skip_batch_num',
        type=int,
        default=5,
        help='The first num of minibatch num to skip, for better performance test'
    )
    parser.add_argument(
        '--iterations', type=int, default=35, help='The number of minibatches.')
    parser.add_argument(
@ -53,19 +59,14 @@ def parse_args():
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
    parser.add_argument(
        '--with_test',
        action='store_true',
        help='If set, test the testset during training.')
    args = parser.parse_args()
    return args
 def print_arguments(args):
    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                vars(args)['device'] == 'GPU')
    print('-----------  Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 def cnn_model(data):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
@ -161,16 +162,22 @@ def run_benchmark(model, args):
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
    accuracy = fluid.average.WeightedAverage()
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        accuracy.reset()
-        pass_start = time.time()
+        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            img_data = np.array(
                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])
            start = time.time()
            outs = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
@ -178,21 +185,36 @@ def run_benchmark(model, args):
                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
            )  # The accuracy is the accumulation of batches, but not the current batch.
            accuracy.add(value=outs[1], weight=outs[2])
-            end = time.time()
+            iters += 1
            num_samples += len(y_data)
            loss = np.array(outs[0])
            acc = np.array(outs[1])
-            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+            train_losses.append(loss)
-                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+            train_accs.append(acc)
-
+            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
-        pass_end = time.time()
+                  (pass_id, iters, loss, acc))
-
+
-        train_avg_acc = accuracy.eval()
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                     inference_program)
        exit(0)
-        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
+
-              (pass_id, train_avg_acc, test_avg_acc,
+def print_arguments(args):
-               (pass_end - pass_start) / 1000))
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                vars(args)['device'] == 'GPU')
    print('----------- mnist Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == '__main__':
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
@ -87,15 +87,6 @@ def parse_args():
    return args
 def print_arguments(args):
    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                vars(args)['device'] == 'GPU')
    print('-----------  Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
    conv1 = fluid.layers.conv2d(
        input=input,
@ -279,32 +270,31 @@ def run_benchmark(model, args):
                      'label': label},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            iters += 1
-            num_samples += label[0]
+            num_samples += len(label)
            accuracy.add(value=acc, weight=weight)
            train_losses.append(loss)
            train_accs.append(acc)
            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                  (pass_id, iters, loss, acc))
        pass_train_acc = accuracy.eval()
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
        train_elapsed = time.time() - start_time
        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-
+        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
        exit(0)
-    if args.use_cprof:
+
-        pr.disable()
+def print_arguments(args):
-        s = StringIO.StringIO()
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-        sortby = 'cumulative'
+                                vars(args)['device'] == 'GPU')
-        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
+    print('----------- resnet Configuration Arguments -----------')
-        ps.print_stats()
+    for arg, value in sorted(vars(args).iteritems()):
-        print(s.getvalue())
+        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == '__main__':
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@ -1,7 +1,9 @@
 #!/bin/bash
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
-export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+
 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 export CUDNN_PATH=/paddle/cudnn_v5
 # disable openmp and mkl parallel
 #https://github.com/PaddlePaddle/Paddle/issues/7199
@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0
 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
 # only query the gpu used
 nohup stdbuf -oL nvidia-smi \
      --id=${CUDA_VISIBLE_DEVICES} \
      --query-gpu=timestamp \
      --query-compute-apps=pid,process_name,used_memory \
      --format=csv \
      --filename=mem.log  \
      -l 1 &
 # mnist
 # mnist gpu mnist 128
 FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=500 \
               2>&1 | tee -a mnist_gpu_128.log
 # vgg16
-# cifar10 gpu cifar10 128
+# gpu cifar10 128
-FLAGS_benchmark=true python fluid/vgg.py \
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 > vgg16_gpu_128.log
+               2>&1 | tee -a vgg16_gpu_128.log
 # flowers gpu  128
 FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
               --device=GPU \
               --batch_size=32 \
               --data_set=flowers \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a vgg16_gpu_flowers_32.log
 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true python fluid/resnet.py \
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
               --device=GPU \
               --batch_size=128 \
               --data_set=cifar10 \
               --model=resnet_cifar10 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 > resnet50_gpu_128.log
+               2>&1 | tee -a resnet50_gpu_128.log
 # resnet50 gpu flowers 64
 FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
               --device=GPU \
               --batch_size=64 \
               --data_set=flowers \
               --model=resnet_imagenet \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a resnet50_gpu_flowers_64.log
 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
 FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
               --device=GPU \
               --batch_size=32 \
               --skip_batch_num=5 \
               --iterations=30 \
               --hidden_dim=512 \
               --emb_dim=512 \
               --crop_size=1500 \
               2>&1 | tee -a lstm_gpu_32.log
 # seq2seq
 # seq2seq gpu wmb 128
 FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a lstm_gpu_128.log
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@ -37,6 +37,14 @@ def parse_args():
        type=int,
        default=32,
        help='The sequence number of a batch data. (default: %(default)d)')
    parser.add_argument(
        '--skip_batch_num',
        type=int,
        default=5,
        help='The first num of minibatch num to skip, for better performance test'
    )
    parser.add_argument(
        '--iterations', type=int, default=80, help='The number of minibatches.')
    parser.add_argument(
        '--emb_dim',
        type=int,
@ -64,6 +72,10 @@ def parse_args():
        default=int(os.environ.get('CROP_SIZE', '1500')),
        help='The max sentence length of input. Since this model use plain RNN,'
        ' Gradient could be explored if sentence is too long')
    parser.add_argument(
        '--with_test',
        action='store_true',
        help='If set, test the testset during training.')
    args = parser.parse_args()
    return args
@ -157,20 +169,23 @@ def main():
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    def train_loop(pass_num, crop_size):
        with profiler.profiler(args.device, 'total') as prof:
            for pass_id in range(pass_num):
    train_reader = batch(
        paddle.reader.shuffle(
-                        crop_sentence(imdb.train(word_dict), crop_size),
+            crop_sentence(imdb.train(word_dict), args.crop_size),
            buf_size=25000),
        batch_size=args.batch_size)
-                word_nums = 0
+
-                pass_start_time = time.time()
+    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            tensor_words = to_lodtensor([x[0] for x in data], place)
                    for x in data:
                        word_nums += len(x[0])
            label = numpy.array([x[1] for x in data]).astype("int64")
            label = label.reshape((-1, 1))
            loss_np, acc, weight = exe.run(
@ -178,16 +193,19 @@ def main():
                feed={"words": tensor_words,
                      "label": label},
                fetch_list=[loss, batch_acc, batch_size_tensor])
-                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
+            iters += 1
-                          (pass_id, batch_id, loss_np, acc))
+            for x in data:
-
+                num_samples += len(x[0])
-                pass_end_time = time.time()
+            print(
-                time_consumed = pass_end_time - pass_start_time
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
-                words_per_sec = word_nums / time_consumed
+                (pass_id, iters, loss_np, acc)
-                print("pass_id=%d, sec/pass: %f, words/s: %f" %
+            )  # The accuracy is the accumulation of batches, but not the current batch.
                      (pass_id, time_consumed, words_per_sec))
-    train_loop(args.pass_num, args.crop_size)
+        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
        exit(0)
 def to_lodtensor(data, place):
@ -205,5 +223,14 @@ def to_lodtensor(data, place):
    return res
 def print_arguments(args):
    print('----------- lstm Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    main()
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@ -191,25 +191,29 @@ def main():
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            accuracy.add(value=acc, weight=weight)
            iters += 1
-            num_samples += len(data)
+            num_samples += len(y_data)
            print(
                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss, acc)
            )  # The accuracy is the accumulation of batches, but not the current batch.
-        pass_train_acc = accuracy.eval()
+        # pass_train_acc = accuracy.eval()
        train_losses.append(loss)
        train_accs.append(acc)
        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
+        exit(0)
        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
 def print_arguments():
-    print('-----------  Configuration Arguments -----------')
+    print('----------- vgg Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
@ -0,0 +1,180 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import argparse
 import time
 import numpy as np
 import tensorflow as tf
 import paddle.v2 as paddle
 DTYPE = tf.float32
 def parse_args():
    parser = argparse.ArgumentParser("mnist model benchmark.")
    parser.add_argument(
        '--batch_size', type=int, default=128, help='The minibatch size.')
    parser.add_argument(
        '--iterations', type=int, default=35, help='The number of minibatches.')
    parser.add_argument(
        '--pass_num', type=int, default=5, help='The number of passes.')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    args = parser.parse_args()
    return args
 def run_benchmark(args):
    def weight_variable(dtype, shape):
        initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
        return tf.Variable(initial)
    def bias_variable(dtype, shape):
        initial = tf.constant(0.1, shape=shape, dtype=dtype)
        return tf.Variable(initial)
    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
    with tf.device(device):
        images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
        labels = tf.placeholder(tf.int64, shape=(None, ))
        # conv1, relu, pool1
        conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
        conv1_bias = bias_variable(DTYPE, [20])
        conv1 = tf.nn.conv2d(
            images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
        pool1 = tf.nn.max_pool(
            relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
        # conv2, relu, pool2
        conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
        conv2_bias = bias_variable(DTYPE, [50])
        conv2 = tf.nn.conv2d(
            pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
        pool2 = tf.nn.max_pool(
            relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
        # FC 
        pool_shape = pool2.get_shape().as_list()
        hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
        reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
        fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
        fc_bias = bias_variable(DTYPE, [10])
        logits = tf.matmul(reshape, fc_weights) + fc_bias
        # Get prediction
        prediction = tf.nn.softmax(logits)
        # Loss 
        one_hot_labels = tf.one_hot(labels, depth=10)
        cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
        avg_cost = tf.reduce_mean(cost)
        # Get accuracy
        correct = tf.equal(tf.argmax(prediction, 1), labels)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
        # metrics, g_accuracy
        with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
            g_accuracy = tf.metrics.accuracy(
                labels, tf.argmax(
                    prediction, axis=1))
            vars = tf.contrib.framework.get_variables(
                scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
            g_accuracy_reset_op = tf.variables_initializer(vars)
        # Optimizer 
        opt = tf.train.AdamOptimizer(
            learning_rate=0.001, beta1=0.9, beta2=0.999)
        train_op = opt.minimize(avg_cost)
        # train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    def eval_test():
        sess.run(g_accuracy_reset_op)
        for batch_id, data in enumerate(test_reader()):
            images_data = np.array(
                map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
            labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
            loss, acc, g_acc = sess.run(
                [avg_cost, accuracy, g_accuracy],
                feed_dict={images: images_data,
                           labels: labels_data})
        return g_acc[1]
    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        init_g = tf.global_variables_initializer()
        init_l = tf.local_variables_initializer()
        sess.run(init_g)
        sess.run(init_l)
        for pass_id in range(args.pass_num):
            sess.run(g_accuracy_reset_op)
            pass_start = time.time()
            for batch_id, data in enumerate(train_reader()):
                images_data = np.array(
                    map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
                labels_data = np.array(map(lambda x: x[1], data)).astype(
                    "int64")
                start = time.time()
                _, loss, acc, g_acc = sess.run(
                    [train_op, avg_cost, accuracy, g_accuracy],
                    feed_dict={images: images_data,
                               labels: labels_data})
                end = time.time()
                print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
                      (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
            pass_end = time.time()
            test_avg_acc = eval_test()
            print(
                "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
                % (pass_id, g_acc[1], test_avg_acc,
                   (pass_end - pass_start) / 1000))
 def print_arguments(args):
    print('-----------  Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    run_benchmark(args)
--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@ -0,0 +1,220 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import numpy as np
 import argparse
 import time
 import tensorflow as tf
 import paddle.v2 as paddle
 def parse_args():
    parser = argparse.ArgumentParser("LSTM model benchmark.")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=32,
        help='The sequence number of a batch data. (default: %(default)d)')
    parser.add_argument(
        '--stacked_num',
        type=int,
        default=5,
        help='Number of lstm layers to stack. (default: %(default)d)')
    parser.add_argument(
        '--embedding_dim',
        type=int,
        default=512,
        help='Dimension of embedding table. (default: %(default)d)')
    parser.add_argument(
        '--hidden_dim',
        type=int,
        default=512,
        help='Hidden size of lstm unit. (default: %(default)d)')
    parser.add_argument(
        '--pass_num',
        type=int,
        default=10,
        help='Epoch number to train. (default: %(default)d)')
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.0002,
        help='Learning rate used to train. (default: %(default)f)')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    args = parser.parse_args()
    return args
 def print_arguments(args):
    print('-----------  Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 def dynamic_lstm_model(dict_size,
                       embedding_dim,
                       hidden_dim,
                       stacked_num,
                       class_num=2,
                       is_train=True):
    word_idx = tf.placeholder(tf.int64, shape=[None, None])
    sequence_length = tf.placeholder(tf.int64, shape=[None, ])
    embedding_weights = tf.get_variable('word_embeddings',
                                        [dict_size, embedding_dim])
    embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
    lstm_cell = tf.nn.rnn_cell.LSTMCell(
        num_units=hidden_dim, use_peepholes=False)
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
    # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
    _, final_state = tf.nn.dynamic_rnn(
        cell=stacked_cell,
        inputs=embedding,
        dtype=tf.float32,
        sequence_length=sequence_length)
    w = tf.Variable(
        tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
    bias = tf.Variable(
        tf.constant(
            value=0.0, shape=[class_num], dtype=tf.float32))
    prediction = tf.matmul(final_state[-1][1], w) + bias
    if not is_train:
        return (word_idx, sequence_length), tf.nn.softmax(prediction)
    label = tf.placeholder(tf.int64, shape=[None, ])
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(label, 2), logits=prediction)
    avg_loss = tf.reduce_mean(loss)
    correct_count = tf.equal(tf.argmax(prediction, 1), label)
    acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
    with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
        g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
        vars = tf.contrib.framework.get_variables(
            scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
        reset_op = tf.variables_initializer(vars)
    return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
 def padding_data(data, padding_size, value):
    data = data + [value] * padding_size
    return data[:padding_size]
 def train(args):
    word_dict = paddle.dataset.imdb.word_dict()
    dict_size = len(word_dict)
    feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
        dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
    adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    train_op = adam_optimizer.minimize(avg_loss)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(word_dict), buf_size=25000),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.test(word_dict), buf_size=25000),
        batch_size=args.batch_size)
    def do_validation(sess):
        sess.run(reset_op)
        for batch_id, data in enumerate(test_reader()):
            word_idx = map(lambda x: x[0], data)
            sequence_length = np.array(
                [len(seq) for seq in word_idx]).astype('int64')
            maxlen = np.max(sequence_length)
            word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
            word_idx = np.array(word_idx).astype('int64')
            label = np.array(map(lambda x: x[1], data)).astype('int64')
            _, loss, fetch_acc, fetch_g_acc = sess.run(
                [train_op, avg_loss, acc, g_acc],
                feed_dict={
                    feeding_list[0]: word_idx,
                    feeding_list[1]: sequence_length,
                    feeding_list[2]: label
                })
        return fetch_g_acc[1]
    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        init_g = tf.global_variables_initializer()
        init_l = tf.local_variables_initializer()
        sess.run(init_l)
        sess.run(init_g)
        for pass_id in xrange(args.pass_num):
            # clear accuracy local variable 
            sess.run(reset_op)
            pass_start_time = time.time()
            words_seen = 0
            for batch_id, data in enumerate(train_reader()):
                word_idx = map(lambda x: x[0], data)
                sequence_length = np.array(
                    [len(seq) for seq in word_idx]).astype('int64')
                words_seen += np.sum(sequence_length)
                maxlen = np.max(sequence_length)
                word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
                word_idx = np.array(word_idx).astype('int64')
                label = np.array(map(lambda x: x[1], data)).astype('int64')
                _, loss, fetch_acc, fetch_g_acc = sess.run(
                    [train_op, avg_loss, acc, g_acc],
                    feed_dict={
                        feeding_list[0]: word_idx,
                        feeding_list[1]: sequence_length,
                        feeding_list[2]: label
                    })
                print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
                      % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
            pass_end_time = time.time()
            time_consumed = pass_end_time - pass_start_time
            words_per_sec = words_seen / time_consumed
            test_acc = do_validation(sess)
            print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
                  (pass_id, test_acc, words_per_sec, time_consumed))
 if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    if args.infer_only:
        pass
    else:
        train(args)
--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -244,11 +244,11 @@ function(cc_test TARGET_NAME)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
    endif()
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(nv_test)
@ -387,8 +387,8 @@ function(hip_test TARGET_NAME)
    endif()
    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
-    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
-    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(hip_test)
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
@ -1,3 +1,4 @@
 .timestamp
 *.o
 *.a
 .svn
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -7,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place memory device_context framework_proto)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place memory device_context framework_proto)
 endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@ -21,9 +21,9 @@ endif()
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name,
                 "Tensor %s contains NAN", name);
 }
 void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
                               int block_id) {
  auto& global_block = pdesc.Block(block_id);
  const Scope* ancestor_scope = scope;
  while (ancestor_scope->parent()) {
    ancestor_scope = ancestor_scope->parent();
  }
  if (ancestor_scope != scope) {
    for (auto& var : global_block.AllVars()) {
      if (var->Name() == framework::kEmptyVarName) {
        continue;
      }
      if (var->Persistable()) {
        auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create Variable " << var->Name()
                << " global, which pointer is " << ptr;
      } else {
        auto* ptr = scope->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create Variable " << var->Name()
                << " locally, which pointer is " << ptr;
      }
    }
  } else {
    for (auto& var : global_block.AllVars()) {
      auto* ptr = scope->Var(var->Name());
      InitializeVariable(ptr, var->GetType());
      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
              << ptr;
    }
  }
 }
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                   bool create_local_scope, bool create_vars) {
  platform::RecordBlock b(block_id);
@ -184,8 +221,8 @@ static bool has_fetch_operators(
 void Executor::Run(const ProgramDesc& program, Scope* scope,
                   std::map<std::string, const LoDTensor*>& feed_targets,
                   std::map<std::string, LoDTensor*>& fetch_targets,
-                   const std::string& feed_holder_name,
+                   bool create_vars, const std::string& feed_holder_name,
-                   const std::string& fetch_holder_name, bool create_vars) {
+                   const std::string& fetch_holder_name) {
  platform::RecordBlock b(kProgramId);
  bool has_feed_ops =
      has_feed_operators(program.Block(0), feed_targets, feed_holder_name);
@ -296,38 +333,13 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                  bool create_local_scope, bool create_vars) {
  auto& block = ctx->prog_.Block(ctx->block_id_);
  Scope* local_scope = scope;
  if (create_vars) {
    if (create_local_scope) {
      local_scope = &scope->NewScope();
      for (auto& var : block.AllVars()) {
        if (var->Name() == framework::kEmptyVarName) {
          continue;
    }
-
+    CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
        if (var->Persistable()) {
          auto* ptr = scope->Var(var->Name());
          InitializeVariable(ptr, var->GetType());
          VLOG(3) << "Create Variable " << var->Name()
                  << " global, which pointer is " << ptr;
        } else {
          auto* ptr = local_scope->Var(var->Name());
          InitializeVariable(ptr, var->GetType());
          VLOG(3) << "Create Variable " << var->Name()
                  << " locally, which pointer is " << ptr;
        }
      }
    } else {
      for (auto& var : block.AllVars()) {
        auto* ptr = local_scope->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
                << ptr;
  }
    }  // if (create_local_scope)
  }    // if (create_vars)
  for (auto& op : ctx->ops_) {
    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@ -54,9 +54,9 @@ class Executor {
  void Run(const ProgramDesc& program, Scope* scope,
           std::map<std::string, const LoDTensor*>& feed_targets,
           std::map<std::string, LoDTensor*>& fetch_targets,
           bool create_vars = true,
           const std::string& feed_holder_name = "feed",
-           const std::string& fetch_holder_name = "fetch",
+           const std::string& fetch_holder_name = "fetch");
           bool create_vars = true);
  static std::unique_ptr<ExecutorPrepareContext> Prepare(
      const ProgramDesc& program, int block_id);
@ -64,6 +64,8 @@ class Executor {
  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
      const ProgramDesc& program, const std::vector<int>& block_ids);
  void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                          bool create_local_scope = true,
                          bool create_vars = true);
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 #include <string>
 #include <vector>
@ -24,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace framework {
@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
 #endif
 };
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
  return member_->local_scopes_;
 }
 ParallelExecutor::ParallelExecutor(
    size_t num_threads, bool use_event,
    const std::vector<platform::Place> &places,
    const std::unordered_set<std::string> &params,
-    const ProgramDesc &startup_program, const ProgramDesc &main_program,
+    const std::unordered_set<std::string> &bcast_vars,
-    const std::string &loss_var_name, Scope *scope, bool allow_op_delay)
+    const ProgramDesc &main_program, const std::string &loss_var_name,
    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;
-  // Step 1. RunStartupProgram and Bcast the params to devs.
+  // Step 1. Bcast the params to devs.
  Executor exe(places[0]);
  exe.Run(startup_program, scope, 0);
  // Create local scopes
  if (local_scopes.empty()) {
    for (size_t i = 0; i < member_->places_.size(); ++i) {
      member_->local_scopes_.push_back(&scope->NewScope());
    }
  } else {
    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
    for (size_t i = 0; i < member_->places_.size(); ++i) {
      member_->local_scopes_.push_back(local_scopes[i]);
    }
  }
 // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
 #endif
-  if (platform::is_gpu_place(places[0]) &&
+  if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
-      member_->local_scopes_.size() != 1) {  // Is CUDA
+      local_scopes.empty()) {  // Is CUDA
-    BCastParamsToGPUs(startup_program);
+    BCastParamsToGPUs(bcast_vars);
  }
 // Startup Program has been run. All local scopes has correct parameters.
@ -99,16 +109,17 @@ ParallelExecutor::ParallelExecutor(
 }
 void ParallelExecutor::BCastParamsToGPUs(
-    const ProgramDesc &startup_program) const {
+    const std::unordered_set<std::string> &vars) const {
 #ifdef PADDLE_WITH_CUDA
  auto *main_scope = member_->local_scopes_[0];
-  for (auto *var_desc : startup_program.Block(0).AllVars()) {
+  for (auto &var : vars) {
-    size_t idx = var_desc->Name().find("@GRAD");
+    auto *main_var = main_scope->FindVar(var);
-    if (idx != std::string::npos) continue;
+    if (!main_var->IsType<LoDTensor>()) {
-    if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
+      continue;
-      auto &main_tensor =
+    }
-          main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
+
    auto &main_tensor = main_var->Get<LoDTensor>();
    auto &dims = main_tensor.dims();
@ -123,8 +134,7 @@ void ParallelExecutor::BCastParamsToGPUs(
          buffer = const_cast<void *>(main_tensor.data<void>());
        } else {
          auto local_scope = member_->local_scopes_[i];
-            auto *t =
+          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
                local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
          t->Resize(dims);
          buffer = t->mutable_data(place, main_tensor.type());
        }
@ -136,13 +146,12 @@ void ParallelExecutor::BCastParamsToGPUs(
      platform::CPUPlace cpu;
      for (size_t i = 1; i < member_->places_.size(); ++i) {
        auto local_scope = member_->local_scopes_[i];
-          auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
        t->Resize(dims);
        t->mutable_data(cpu, main_tensor.type());
        paddle::framework::TensorCopy(main_tensor, cpu, t);
      }
    }
    }
    member_->nccl_ctxs_->WaitAll();
  }
 #else
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@ -36,11 +36,14 @@ class ParallelExecutor {
  explicit ParallelExecutor(size_t num_threads, bool use_event,
                            const std::vector<platform::Place>& places,
                            const std::unordered_set<std::string>& params,
-                            const ProgramDesc& startup_program,
+                            const std::unordered_set<std::string>& bcast_vars,
                            const ProgramDesc& main_program,
                            const std::string& loss_var_name, Scope* scope,
                            const std::vector<Scope*>& local_scopes,
                            bool allow_op_delay);
  std::vector<Scope*>& GetLocalScopes();
  void Run(const std::vector<std::string>& fetch_tensors,
           const std::string& fetched_var_name,
           const std::unordered_map<std::string, LoDTensor>& feed_tensors);
@ -51,7 +54,7 @@ class ParallelExecutor {
  ParallelExecutorPrivate* member_;
-  void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 };
 }  // namespace framework
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include <memory>  // for unique_ptr
 #include <mutex>   // for call_once
 #include <set>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/threadpool.h"
@ -39,6 +38,7 @@ Scope::~Scope() {
 }
 Scope& Scope::NewScope() const {
  std::unique_lock<std::mutex> lock(mutex_);
  kids_.push_back(new Scope(this));
  return *kids_.back();
 }
@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 void Scope::DeleteScope(Scope* scope) {
  std::unique_lock<std::mutex> lock(mutex_);
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) {
  }
 }
-void Scope::EraseVars(std::vector<std::string>& var_names) {
+void Scope::EraseVars(const std::vector<std::string>& var_names) {
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <list>
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>
@ -51,13 +52,13 @@ class Scope {
  /// Create a variable with a scope-unique name.
  Variable* Var(std::string* name = nullptr);
-  void EraseVars(std::vector<std::string>& var_names);
+  void EraseVars(const std::vector<std::string>& var_names);
  /// Find a variable in the scope or any of its ancestors.  Returns
  /// nullptr if cannot find.
  Variable* FindVar(const std::string& name) const;
-  const Scope& parent() const { return *parent_; }
+  const Scope* parent() const { return parent_; }
  /// Find the scope or an ancestor scope that contains the given variable.
  const Scope* FindScope(const Variable* var) const;
@ -88,6 +89,9 @@ class Scope {
  Scope const* parent_{nullptr};
  DISABLE_COPY_AND_ASSIGN(Scope);
 private:
  mutable std::mutex mutex_;
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@ -1,4 +1,4 @@
-set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
 cc_library(paddle_fluid_api
    SRCS io.cc
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@ -46,8 +46,8 @@ TEST(inference, image_classification) {
  // Run inference on CPU
  LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
+  TestInference<paddle::platform::CPUPlace, false>(dirname, cpu_feeds,
-                                            FLAGS_repeat);
+                                                   cpu_fetchs1, FLAGS_repeat);
  LOG(INFO) << output1.dims();
 #ifdef PADDLE_WITH_CUDA
@ -57,8 +57,8 @@ TEST(inference, image_classification) {
  // Run inference on CUDA GPU
  LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
+  TestInference<paddle::platform::CUDAPlace, false>(dirname, cpu_feeds,
-                                             FLAGS_repeat);
+                                                    cpu_fetchs2, FLAGS_repeat);
  LOG(INFO) << output2.dims();
  CheckError<float>(output1, output2);
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@ -88,7 +88,7 @@ void CheckError(const paddle::framework::LoDTensor& output1,
  EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }
-template <typename Place>
+template <typename Place, bool CreateVars = true>
 void TestInference(const std::string& dirname,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
@ -166,8 +166,16 @@ void TestInference(const std::string& dirname,
  // 6. Run the inference program
  {
    if (!CreateVars) {
      // If users don't want to create and destroy variables every time they
      // run, they need to set `create_vars` to false and manually call
      // `CreateVariables` before running.
      executor.CreateVariables(*inference_program, scope, 0);
    }
    // Ignore the profiling results of the first run
-    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets,
                 CreateVars);
    // Enable the profiler
    paddle::platform::EnableProfiler(state);
@ -178,7 +186,8 @@ void TestInference(const std::string& dirname,
          "run_inference",
          paddle::platform::DeviceContextPool::Instance().Get(place));
-      executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+      executor.Run(*inference_program, scope, feed_targets, fetch_targets,
                   CreateVars);
    }
    // Disable the profiler and print the timing information
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@ -1,20 +1,15 @@
 add_subdirectory(detail)
-cc_library(memory SRCS memory.cc DEPS place enforce)
+cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
 cc_library(memcpy SRCS memcpy.cc DEPS place)
-cc_library(paddle_memory
+cc_library(memory
        DEPS
-        memory
+        malloc
-        memcpy
+        memcpy)
        meta_data
        meta_cache
        memory_block
        buddy_allocator
        system_allocator)
-cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
+cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
 #if (WITH_GPU)
-#   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place paddle_memory)
+#   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place memory)
 #endif()
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@ -1,3 +1,5 @@
 cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)
 if(${WITH_GPU})
  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
@ -6,10 +8,4 @@ endif(${WITH_GPU})
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
-cc_library(meta_data SRCS meta_data.cc)
+cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
 cc_library(meta_cache SRCS meta_cache.cc)
 cc_library(memory_block SRCS memory_block.cc)
 cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
--- a/Show More
+++ b/Show More