Merge branch develop

7 years ago · ac8ac33e45
parent 9b6c5397c5 dd75fbde81
commit ac8ac33e45
71 changed files with 6544 additions and 162 deletions
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/machine_translation.py
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@ -0,0 +1,205 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("mnist model benchmark.")
+    parser.add_argument(
+        '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--iterations', type=int, default=35, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=5, help='The number of passes.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed settting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+    test_pass_acc = fluid.average.WeightedAverage()
+    for batch_id, data in enumerate(test_reader()):
+        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+                                data)).astype(DTYPE)
+        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+        y_data = y_data.reshape([len(y_data), 1])
+
+        acc, weight = exe.run(inference_program,
+                              feed={"pixel": img_data,
+                                    "label": y_data},
+                              fetch_list=[batch_acc, batch_size_tensor])
+        test_pass_acc.add(value=acc, weight=weight)
+        pass_acc = test_pass_acc.eval()
+    return pass_acc
+
+
+def run_benchmark(model, args):
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    start_time = time.time()
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    predict = model(images)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+    opt.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # Initialize executor
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+
+    accuracy = fluid.average.WeightedAverage()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        pass_start = time.time()
+        for batch_id, data in enumerate(train_reader()):
+            img_data = np.array(
+                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([len(y_data), 1])
+
+            start = time.time()
+            outs = exe.run(
+                fluid.default_main_program(),
+                feed={"pixel": img_data,
+                      "label": y_data},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+            accuracy.add(value=outs[1], weight=outs[2])
+            end = time.time()
+            loss = np.array(outs[0])
+            acc = np.array(outs[1])
+            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+
+        pass_end = time.time()
+
+        train_avg_acc = accuracy.eval()
+        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                 inference_program)
+
+        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
+              (pass_id, train_avg_acc, test_avg_acc,
+               (pass_end - pass_start) / 1000))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    if args.use_nvprof and args.device == 'GPU':
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            run_benchmark(cnn_model, args)
+    else:
+        run_benchmark(cnn_model, args)
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@ -0,0 +1,49 @@
+#!/bin/bash
+# This script benchmarking the PaddlePaddle Fluid on
+# single thread single GPU.
+export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+
+# disable openmp and mkl parallel
+#https://github.com/PaddlePaddle/Paddle/issues/7199
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+if [ $ht -eq 1 ]; then # HT is OFF
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,0,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+        export OMP_DYNAMIC="FALSE"
+    fi
+else # HT is ON
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,1,0"
+    fi
+fi
+# disable multi-gpu if have more than one
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
+
+
+# vgg16
+# cifar10 gpu cifar10 128
+FLAGS_benchmark=true python fluid/vgg.py \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=30  \
+               2>&1 > vgg16_gpu_128.log
+
+# resnet50
+# resnet50 gpu cifar10 128
+FLAGS_benchmark=true python fluid/resnet.py \
+               --device=GPU \
+               --batch_size=128 \
+               --data_set=cifar10 \
+               --model=resnet_cifar10 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 > resnet50_gpu_128.log
+
+# lstm
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@ -0,0 +1,209 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import time
+
+import numpy
+import paddle.v2 as paddle
+import paddle.v2.dataset.imdb as imdb
+import paddle.fluid as fluid
+from paddle.v2 import batch
+import paddle.fluid.profiler as profiler
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--emb_dim',
+        type=int,
+        default=512,
+        help='Dimension of embedding table. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=512,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--pass_num',
+        type=int,
+        default=100,
+        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='CPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--crop_size',
+        type=int,
+        default=int(os.environ.get('CROP_SIZE', '1500')),
+        help='The max sentence length of input. Since this model use plain RNN,'
+        ' Gradient could be explored if sentence is too long')
+    args = parser.parse_args()
+    return args
+
+
+word_dict = imdb.word_dict()
+
+
+def crop_sentence(reader, crop_size):
+    unk_value = word_dict['<unk>']
+
+    def __impl__():
+        for item in reader():
+            if len([x for x in item[0] if x != unk_value]) < crop_size:
+                yield item
+
+    return __impl__
+
+
+def main():
+    args = parse_args()
+    lstm_size = args.hidden_dim
+
+    data = fluid.layers.data(
+        name="words", shape=[1], lod_level=1, dtype='int64')
+    sentence = fluid.layers.embedding(
+        input=data, size=[len(word_dict), args.emb_dim])
+
+    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+        def gate_common(
+                ipt,
+                hidden,
+                size, ):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            gate = fluid.layers.sums(input=[gate0, gate1])
+            return gate
+
+        forget_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        input_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        output_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        cell_gate = fluid.layers.tanh(
+            x=gate_common(word, prev_hidden, lstm_size))
+
+        cell = fluid.layers.sums(input=[
+            fluid.layers.elementwise_mul(
+                x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+                    x=input_gate, y=cell_gate)
+        ])
+
+        hidden = fluid.layers.elementwise_mul(
+            x=output_gate, y=fluid.layers.tanh(x=cell))
+
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
+
+    last = fluid.layers.sequence_pool(rnn(), 'last')
+    logit = fluid.layers.fc(input=last, size=2, act='softmax')
+    loss = fluid.layers.cross_entropy(
+        input=logit,
+        label=fluid.layers.data(
+            name='label', shape=[1], dtype='int64'))
+    loss = fluid.layers.mean(x=loss)
+
+    # add acc
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+                shape=[1], dtype='int64'), total=batch_size_tensor)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    adam = fluid.optimizer.Adam()
+    adam.minimize(loss)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    def train_loop(pass_num, crop_size):
+        with profiler.profiler(args.device, 'total') as prof:
+            for pass_id in range(pass_num):
+                train_reader = batch(
+                    paddle.reader.shuffle(
+                        crop_sentence(imdb.train(word_dict), crop_size),
+                        buf_size=25000),
+                    batch_size=args.batch_size)
+                word_nums = 0
+                pass_start_time = time.time()
+                for batch_id, data in enumerate(train_reader()):
+                    tensor_words = to_lodtensor([x[0] for x in data], place)
+                    for x in data:
+                        word_nums += len(x[0])
+                    label = numpy.array([x[1] for x in data]).astype("int64")
+                    label = label.reshape((-1, 1))
+                    loss_np, acc, weight = exe.run(
+                        fluid.default_main_program(),
+                        feed={"words": tensor_words,
+                              "label": label},
+                        fetch_list=[loss, batch_acc, batch_size_tensor])
+                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
+                          (pass_id, batch_id, loss_np, acc))
+
+                pass_end_time = time.time()
+                time_consumed = pass_end_time - pass_start_time
+                words_per_sec = word_nums / time_consumed
+                print("pass_id=%d, sec/pass: %f, words/s: %f" %
+                      (pass_id, time_consumed, words_per_sec))
+
+    train_loop(args.pass_num, args.crop_size)
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+if __name__ == '__main__':
+    main()
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@ -0,0 +1,220 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NCHW',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, now only support NCHW.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+parser.add_argument(
+    '--with_test',
+    action='store_true',
+    help='If set, test the testset during training.')
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+def main():
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    opts = optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # Initialize executor
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=args.batch_size)
+
+    # test
+    def test(exe):
+        test_accuracy = fluid.average.WeightedAverage()
+        for batch_id, data in enumerate(test_reader()):
+            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            acc, weight = exe.run(inference_program,
+                                  feed={"pixel": img_data,
+                                        "label": y_data},
+                                  fetch_list=[batch_acc, batch_size_tensor])
+            test_accuracy.add(value=acc, weight=weight)
+        return test_accuracy.eval()
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    accuracy = fluid.average.WeightedAverage()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            loss, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"pixel": img_data,
+                      "label": y_data},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+            accuracy.add(value=acc, weight=weight)
+            iters += 1
+            num_samples += len(data)
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss, acc)
+            )  # The accuracy is the accumulation of batches, but not the current batch.
+
+        pass_train_acc = accuracy.eval()
+        train_losses.append(loss)
+        train_accs.append(acc)
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        train_elapsed = time.time() - start_time
+        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+    print_arguments()
+    main()
--- a/doc/v2/faq/build_and_install/index_en.rst
+++ b/doc/v2/faq/build_and_install/index_en.rst
@ -1,5 +1,143 @@
-############################
-Install, Build and Unit test
-############################
+.. _install_faq:

-TBD
+###############################
+Compile, Install, and Unit Test
+###############################
+
+..  contents::
+
+1. Insufficient CUDA driver version
+----------------------------------------------------------------
+
+Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory.
+You can solve the issue by running the following commands:
+
+..  code-block:: bash
+
+    $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+For more infomation about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation <http://www.paddlepaddle.org/docs/0.11.0/documentation/zh/getstarted/build_and_install/docker_install_en.html>`_ .
+
+
+2. Version mismatch between PythonLibs and PythonInterpreter
+----------------------------------------------------------------
+
+It is a common bug when CMake looks up Python. If you install multiple versions of Python, Cmake may find the version mismatch between PythonLibs and PythonInterpreter . You are forced to specify a Python version, as follows.
+
+    ..  code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
+
+You should specify ``<exc_path>``, ``<lib_path>``, ``<inc_path>`` to your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue would happen when you run the code  `paddle version` or `cmake ..`
+
+..  code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake`
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause for this issue is that it can not find the correct PaddlePaddle installation package that matches your current system.The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 os including Python2.7 and Pip 9.0.1.
+
+You can upgrade Pip with the following command\:
+
+..  code-block:: bash
+
+    pip install --upgrade pip
+
+If it does not work for you, you can run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the suffix of Python package which your system may support and then compare it with the suffix of your installation.
+
+If the system supports :code:`linux_x86_64` and  the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest 
+
+if the system supports :code:`manylinux_x86_64` and the local installation package is :code:`linux1_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+..  code-block:: bash
+
+    pip uninstall py_paddle paddle
+
+Then install Python for PaddlePaddle , enter the build directory and run the following commands
+
+pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. Illegal instruction
+-----------------------
+This issue may be caused by the wrong usage of PaddlePaddle binary version which uses avx SIMD instructions to increase the performance of cpu. Please choose the correct version.
+
+7.  Python unittest fails
+--------------------------------
+
+If the following python unittest testcases fail:
+
+..  code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs which may suggest the following:
+
+..  code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove old PaddlePaddle to make a clean environment for the unit tests. If PaddlePaddle package is already in Python's site-packages, unit tests would refer Python package in site-packages instead of Python package in the :code:`/python` directory of the source directory.  Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package.
+
+
+8. Failed to download the MKLML library
+----------------------------------------------
+
+..  code-block:: bash
+
+    make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
+    make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
+    make[1]: *** waiting for the unfinished  jobs....
+
+Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully.
+
+The solution is: manually download and install, the specific steps are as follows.
+
+..  code-block:: bash
+
+    // 1. enter the directory
+    cd build/third_party/mklml/src/extern_mklml
+
+    // 2. check the size of the package, normally 75M, if less than 75M, the download fails
+    du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+    // 3. manually download and unzip and make the download success tag:
+    wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz 
+    tar zxf mklml_lnx_2018.0.1.20171007.tgz
+    touch ../extern_mklml-stamp/extern_mklml-download
+
+    // 4. then compile
+    
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@ -55,6 +55,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    const ProgramDesc &program) const {
  auto graph = new SSAGraph();
  SSAGraph &result = *graph;
+  std::unordered_set<std::string> og_has_been_broadcast;
  result.vars_.resize(places_.size());

  bool is_forwarding = true;
@ -122,9 +123,15 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(

    if (!is_forwarding) {
      auto var_names = op->OutputArgumentNames();
+      // Currently, we assume that once gradient is generated, it can be
+      // broadcast, and each gradient is only broadcast once. But there are no
+      // other cases, for example, we need to adjust the gradient according to
+      // the input when we get the gradient, which is not considered at present.
      for (auto &og : var_names) {
-        if (grad_names_.count(og) != 0) {  // is param grad
-                                           // Insert NCCL AllReduce Op
+        if (grad_names_.count(og) != 0 &&
+            og_has_been_broadcast.count(og) == 0) {  // is param grad
+                                                     // Insert NCCL AllReduce Op
+          og_has_been_broadcast.insert(og);
 #ifdef PADDLE_WITH_CUDA
          result.ops_.emplace_back(
              new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@ -22,7 +22,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-struct OpHandleBase;
+class OpHandleBase;

 // VarHandleBase is the var node in the dependency graph.
 // A variable can only be generated by a single operator. i.e.
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/parallel_executor.h"
-#include <string>

-#include "ThreadPool.h"
+#include <string>
+#include <vector>

 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@ -10,6 +10,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
+#include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"

@ -52,7 +55,7 @@ class SelectedRows {

 private:
  // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
-  // SelectedRows are simplely concated when adding together. Until a
+  // SelectedRows are simply concated when adding together. Until a
  // SelectedRows add a Tensor, will the duplicate rows be handled.
  Vector<int64_t> rows_;
  std::unique_ptr<Tensor> value_{nullptr};
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@ -2,7 +2,7 @@ if(WITH_DISTRIBUTE)
  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
      grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  set_source_files_properties(serde_test.cc grpc_server_test PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
  cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
      cares zlib protobuf sendrecvop_grpc)
  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "grpc_client.h"
-#include <sys/time.h>
+#include "paddle/fluid/operators/detail/grpc_client.h"
+
+#include <limits>
+
 #include "paddle/fluid/framework/threadpool.h"

 namespace paddle {
@ -52,7 +54,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
    auto call = s->stub_g_.PrepareUnaryCall(
        s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
    call->StartCall();
-    call->Finish(&s->reply_, &s->status_, (void*)s);
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  });

  req_count_++;
@ -70,8 +72,7 @@ void ProcGetResponse(const VarHandle& var_h,
 template <typename T>
 void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
  ::grpc::Slice slice(proto.ByteSizeLong());
-  proto.SerializeWithCachedSizesToArray(
-      const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(slice.begin())));
+  proto.SerializeWithCachedSizesToArray(const_cast<uint8_t*>(slice.begin()));
  ::grpc::ByteBuffer tmp(&slice, 1);
  result->Swap(&tmp);
 }
@ -109,7 +110,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
    auto call = s->stub_g_.PrepareUnaryCall(
        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
    call->StartCall();
-    call->Finish(&s->reply_, &s->status_, (void*)s);
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  });

  req_count_++;
@ -153,7 +154,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
        s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
        &cq_);
    call->StartCall();
-    call->Finish(&s->reply_, &s->status_, (void*)s);
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  });

  req_count_++;
@ -169,7 +170,7 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
  sendrecv::VariableMessage req;
  req.set_varname(BATCH_BARRIER_MESSAGE);
  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  req_count_++;
 }

@ -181,7 +182,7 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
  sendrecv::VariableMessage req;
  req.set_varname(FETCH_BARRIER_MESSAGE);
  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
  req_count_++;
 }

--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@ -14,6 +14,9 @@ limitations under the License. */

 #include "paddle/fluid/operators/detail/grpc_server.h"

+#include <limits>
+#include <string>
+
 using ::grpc::ServerAsyncResponseWriter;

 namespace paddle {
@ -156,6 +159,8 @@ class RequestPrefetch final : public RequestBase {
    ::grpc::ByteBuffer reply;
    // TODO(Yancey1989): execute the Block which containers prefetch ops

+    VLOG(3) << "RequestPrefetch Process in";
+
    responder_.Finish(reply, ::grpc::Status::OK, this);
    status_ = FINISH;
  }
@ -221,6 +226,7 @@ void AsyncGRPCServer::ShutdownQueue() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  cq_send_->Shutdown();
  cq_get_->Shutdown();
+  cq_prefetch_->Shutdown();
 }

 // This URL explains why shutdown is complicate:
@ -233,6 +239,7 @@ void AsyncGRPCServer::ShutDown() {
 void AsyncGRPCServer::TryToRegisterNewSendOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
+    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
    return;
  }
  RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_,
@ -243,6 +250,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
 void AsyncGRPCServer::TryToRegisterNewGetOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
+    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
    return;
  }
  RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
@ -253,6 +261,7 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
 void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
+    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
    return;
  }
  RequestPrefetch* prefetch =
@ -270,25 +279,28 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,

  void* tag = NULL;
  bool ok = false;
+
  while (true) {
+    VLOG(3) << "HandleRequest for " << cq_name << " while in";
    if (!cq->Next(&tag, &ok)) {
      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
      break;
    }
+    VLOG(3) << "HandleRequest for " << cq_name << " while after Next";

    PADDLE_ENFORCE(tag);
    // FIXME(typhoonzero): de-couple the barriers with recv_op
    if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
    if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);

-    RequestBase* base = (RequestBase*)tag;
+    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
    if (!ok) {
-      LOG(WARNING) << cq_name << " recv no regular event:argument name"
-                   << base->GetReqName();
+      LOG(WARNING) << cq_name << " recv no regular event:argument name["
+                   << base->GetReqName() << "]";
      TryToRegisterNewOne();
      delete base;
      continue;
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@ -15,7 +15,8 @@ limitations under the License. */
 #pragma once

 #include <grpc++/grpc++.h>
-#include <thread>
+#include <string>
+#include <utility>

 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@ -93,6 +94,7 @@ class AsyncGRPCServer final {

  // received variable from RPC, operators fetch variable from this queue.
  SimpleBlockQueue<MessageWithName> var_get_queue_;
+  // client send variable to this queue.
  ReceivedQueue var_recv_queue_;

  // condition of the sub program
--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@ -28,6 +28,7 @@ std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;

 void StartServer(const std::string& endpoint) {
  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+  rpc_service_->RunSyncUpdate();
 }

 TEST(PREFETCH, CPU) {
@ -39,13 +40,23 @@ TEST(PREFETCH, CPU) {
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
  // create var on local scope
-  std::string var_name("tmp_0");
-  auto var = scope.Var(var_name);
-  auto tensor = var->GetMutable<framework::LoDTensor>();
-  tensor->Resize({10, 10});
+  std::string in_var_name("in");
+  std::string out_var_name("out");
+  auto* in_var = scope.Var(in_var_name);
+  auto* in_tensor = in_var->GetMutable<framework::LoDTensor>();
+  in_tensor->Resize({10, 10});
+  VLOG(3) << "before mutable_data";
+  in_tensor->mutable_data<int>(place);

+  scope.Var(out_var_name);
+
+  VLOG(3) << "before fetch";
  detail::RPCClient client;
-  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, var_name, "");
+  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
+                               out_var_name);
+  client.Wait();
+
+  rpc_service_->ShutDown();
  server_thread.join();
  rpc_service_.reset(nullptr);
 }
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@ -80,7 +80,7 @@ enum class GrpcMethod {
 };

 static const int kGrpcNumMethods =
-    static_cast<int>(GrpcMethod::kGetVariable) + 1;
+    static_cast<int>(GrpcMethod::kPrefetchVariable) + 1;

 inline const char* GrpcMethodName(GrpcMethod id) {
  switch (id) {
@ -89,7 +89,7 @@ inline const char* GrpcMethodName(GrpcMethod id) {
    case GrpcMethod::kGetVariable:
      return "/sendrecv.SendRecvService/GetVariable";
    case GrpcMethod::kPrefetchVariable:
-      return "/sendrecv.SendREcvService/PrefetchVariable";
+      return "/sendrecv.SendRecvService/PrefetchVariable";
  }

  // Shouldn't be reached.
@ -117,5 +117,5 @@ class GrpcService final {
 };

 }  // namespace detail
-}  // namespace operator
+}  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@ -13,22 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <stdint.h>
-#include <sys/stat.h>
 #include <ostream>
-#include <thread>
-
-#include <unistd.h>

 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/proto_desc.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-#include "paddle/fluid/operators/detail/simple_block_queue.h"
-#include "paddle/fluid/string/printf.h"

 namespace paddle {
 namespace operators {
@ -111,6 +102,11 @@ class ListenAndServOp : public framework::OperatorBase {

    framework::Executor executor(dev_place);

+    // TODO(qiao) set proper fields for table lookup and update
+    rpc_service_->SetExecutor(&executor);
+    rpc_service_->SetPrefetchBlkdId(0);
+    rpc_service_->SetProgram(program);
+
    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
    bool exit_flag = false;
    // Record received sparse variables, so that
@ -173,7 +169,8 @@ class ListenAndServOp : public framework::OperatorBase {
      }
      ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope);

-      VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts;
+      VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts
+              << "(ms)";

      // Reset the received sparse variables, the sum operator would not
      // sum the input sparse variables which rows is empty at the next
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@ -18,6 +18,22 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+static inline framework::OpKernelType ExpectedKernelType(
+    const framework::ExecutionContext& ctx) {
+  auto* table_var = ctx.InputVar("W");
+  if (table_var->IsType<LoDTensor>()) {
+    return framework::OpKernelType(
+        framework::ToDataType(table_var->Get<LoDTensor>().type()),
+        ctx.device_context());
+  } else if (table_var->IsType<SelectedRows>()) {
+    return framework::OpKernelType(
+        framework::ToDataType(table_var->Get<SelectedRows>().value().type()),
+        ctx.device_context());
+  } else {
+    PADDLE_THROW("W should be LoDTensor or SelectedRows");
+  }
+}
+
 class LookupTableOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@ -51,9 +67,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
-        ctx.device_context());
+    return ExpectedKernelType(ctx);
  }
 };

@ -84,7 +98,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                     "If the value is -1, it makes no effect to lookup. "
                     "Otherwise the given value indicates padding the output "
                     "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(-1);
+        .SetDefault(kNoPadding);
    AddComment(R"DOC(
 Lookup Table Operator.

@ -124,9 +138,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
-        ctx.device_context());
+    return ExpectedKernelType(ctx);
  }
 };

--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@ -14,6 +14,9 @@ limitations under the License. */

 #pragma once

+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@ -25,16 +28,37 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+static constexpr int64_t kNoPadding = -1;
+
+inline size_t getIndex(const std::vector<int64_t> &rows, int64_t value) {
+  auto it = std::find(rows.begin(), rows.end(), value);
+  PADDLE_ENFORCE(it != rows.end(), "id should be in rows");
+  return static_cast<size_t>(std::distance(rows.begin(), it));
+}

 template <typename T>
 class LookupTableKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* table_t = context.Input<LoDTensor>("W");
-    auto* ids_var = context.InputVar("Ids");
-    Tensor* output_t = context.Output<Tensor>("Out");
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_var = context.InputVar("W");
+    auto *ids_var = context.InputVar("Ids");
+    Tensor *output_t = context.Output<Tensor>("Out");
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+
+    DDim table_dim;

-    int64_t* ids;
+    if (table_var->IsType<LoDTensor>()) {
+      table_dim = context.Input<LoDTensor>("W")->dims();
+    } else if (table_var->IsType<SelectedRows>()) {
+      auto *table_t = context.Input<SelectedRows>("W");
+      table_dim = table_t->value().dims();
+    } else {
+      PADDLE_THROW("table only support LoDTensor and SelectedRows");
+    }
+
+    int64_t *ids;
    int64_t ids_numel;

    // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
@ -42,39 +66,50 @@ class LookupTableKernel : public framework::OpKernel<T> {
    // when Ids's type is SelectedRows, the rows of Ids contains the
    // ids to be looked up in W.
    if (ids_var->IsType<LoDTensor>()) {
-      auto* ids_t = context.Input<LoDTensor>("Ids");
-      ids = const_cast<int64_t*>(ids_t->data<int64_t>());
+      auto *ids_t = context.Input<LoDTensor>("Ids");
+      ids = const_cast<int64_t *>(ids_t->data<int64_t>());
      ids_numel = ids_t->numel();
    } else if (ids_var->IsType<SelectedRows>()) {
-      auto* ids_t = context.Input<SelectedRows>("Ids");
-      ids = const_cast<int64_t*>(ids_t->rows().data());
+      auto *ids_t = context.Input<SelectedRows>("Ids");
+      ids = const_cast<int64_t *>(ids_t->rows().data());
      ids_numel = ids_t->rows().size();
-      output_t->Resize({ids_numel, table_t->dims()[1]});
+      output_t->Resize({ids_numel, table_dim[1]});
    } else {
      PADDLE_THROW("Unsupported Variable Type of Ids");
    }

-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+    if (table_var->IsType<LoDTensor>()) {
+      auto *table_t = context.Input<LoDTensor>("W");
+      int64_t row_number = table_t->dims()[0];
+      int64_t row_width = table_t->dims()[1];

-    int N = table_t->dims()[0];
-    int D = table_t->dims()[1];
-    auto* table = table_t->data<T>();
-    auto* output = output_t->mutable_data<T>(context.GetPlace());
+      auto *table = table_t->data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());

-    if (padding_idx == -1) {
      for (int64_t i = 0; i < ids_numel; ++i) {
-        PADDLE_ENFORCE_LT(ids[i], N);
-        PADDLE_ENFORCE_GE(ids[i], 0);
-        memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_LT(ids[i], row_number);
+          PADDLE_ENFORCE_GE(ids[i], 0);
+          memcpy(output + i * row_width, table + ids[i] * row_width,
+                 row_width * sizeof(T));
+        }
      }
-    } else {
+    } else if (table_var->IsType<SelectedRows>()) {
+      const auto &table_t = table_var->Get<SelectedRows>();
+      int64_t row_width = table_t.value().dims()[1];
+      const auto *table = table_t.value().data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
+
      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (ids[i] == padding_idx) {
-          memset(output + i * D, 0, D * sizeof(T));
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
        } else {
-          PADDLE_ENFORCE_LT(ids[i], N);
          PADDLE_ENFORCE_GE(ids[i], 0);
-          memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+          auto id_index = getIndex(table_t.rows(), ids[i]);
+          memcpy(output + i * row_width, table + id_index * row_width,
+                 row_width * sizeof(T));
        }
      }
    }
@ -84,17 +119,27 @@ class LookupTableKernel : public framework::OpKernel<T> {
 template <typename T>
 class LookupTableGradKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_var = context.InputVar("W");
+    DDim table_dim;
+    if (table_var->IsType<LoDTensor>()) {
+      table_dim = context.Input<LoDTensor>("W")->dims();
+    } else if (table_var->IsType<SelectedRows>()) {
+      auto *table_t = context.Input<SelectedRows>("W");
+      table_dim = table_t->value().dims();
+    } else {
+      PADDLE_THROW("table only support LoDTensor and SelectedRows");
+    }
+
    bool is_sparse = context.Attr<bool>("is_sparse");
    // Since paddings are not trainable and fixed in forward, the gradient of
    // paddings makes no sense and we don't deal with it in backward.
    if (is_sparse) {
-      auto* ids = context.Input<LoDTensor>("Ids");
-      auto* table = context.Input<LoDTensor>("W");
-      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));

-      auto* ids_data = ids->data<int64_t>();
+      auto *ids_data = ids->data<int64_t>();
      auto ids_dim = ids->dims();

      framework::Vector<int64_t> new_rows;
@ -104,31 +149,30 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
      }
      d_table->set_rows(new_rows);

-      auto* d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_dim[0], table->dims()[1]});
+      auto *d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_dim[0], table_dim[1]});
      d_table_value->mutable_data<T>(context.GetPlace());

-      d_table->set_height(table->dims()[0]);
+      d_table->set_height(table_dim[0]);

-      auto* d_output_data = d_output->data<T>();
-      auto* d_table_data = d_table_value->data<T>();
+      auto *d_output_data = d_output->data<T>();
+      auto *d_table_data = d_table_value->data<T>();

      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
    } else {
-      auto* ids = context.Input<LoDTensor>("Ids");
-      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto* d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
-      auto* table = context.Input<LoDTensor>("W");
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));

-      auto* ids_data = ids->data<int64_t>();
+      auto *ids_data = ids->data<int64_t>();
      auto ids_dim = ids->dims();

-      int N = table->dims()[0];
+      int N = table_dim[0];
      int D = d_output->dims()[1];

-      auto* d_output_data = d_output->data<T>();
-      auto* d_table_data = d_table->mutable_data<T>(context.GetPlace());
+      auto *d_output_data = d_output->data<T>();
+      auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());

      memset(d_table_data, 0, d_table->numel() * sizeof(T));

--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
@ -137,6 +137,8 @@ class NCCLTester : public ::testing::Test {
 TEST_F(NCCLTester, ncclInitOp) {}

 // ncclAllReduceOp with desc
+// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
+/*
 TEST_F(NCCLTester, ncclAllReduceOp) {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  op2->SetType("ncclAllReduce");
@ -184,6 +186,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
    }
  }
 }
+*/

 // ncclReduceOp with desc
 TEST_F(NCCLTester, ncclReduceOp) {
@ -236,6 +239,8 @@ TEST_F(NCCLTester, ncclReduceOp) {
 }

 // ncclBcastOp with desc
+// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
+/*
 TEST_F(NCCLTester, ncclBcastOp) {
  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 0;
@ -281,3 +286,4 @@ TEST_F(NCCLTester, ncclBcastOp) {
    ASSERT_NEAR(ct[j], result, 1e-5);
  }
 }
+*/
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@ -20,12 +20,29 @@ namespace paddle {
 namespace operators {
 namespace reader {

-static constexpr size_t kDoubleBufferSize = 2;
+// 'Double buffer' means we shall maintain two batches of input data at the same
+// time. So the kCacheSize shoul be at least 2.
+static constexpr size_t kCacheSize = 2;
+// There will be two bacthes out of the channel during training:
+// 1. the one waiting to be sent to the channel
+// 2. the one just be received from the channel, which is also being used by
+// subsequent operators.
+// So the channel size should be kChacheSize - 2
+static constexpr size_t kChannelSize = 0;  // kCacheSize - 2

 class DoubleBufferReader : public framework::DecoratedReader {
 public:
  struct Item {
    Item() : ctx_(nullptr) {}
+    Item(Item&& b) {
+      payloads_ = std::move(b.payloads_);
+      ctx_ = std::move(b.ctx_);
+    }
+    Item& operator=(Item&& b) {
+      payloads_ = std::move(b.payloads_);
+      ctx_ = std::move(b.ctx_);
+      return *this;
+    }

    std::vector<framework::LoDTensor> payloads_;
    platform::DeviceContext* ctx_;
@ -34,42 +51,44 @@ class DoubleBufferReader : public framework::DecoratedReader {
  explicit DoubleBufferReader(
      ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
      : DecoratedReader(reader), place_(target_place) {
-    for (size_t i = 0; i < kDoubleBufferSize; ++i) {
-      if (platform::is_gpu_place(place_)) {
 #ifdef PADDLE_WITH_CUDA
+    for (size_t i = 0; i < kCacheSize; ++i) {
+      if (platform::is_gpu_place(place_)) {
        ctxs_.emplace_back(new platform::CUDADeviceContext(
            boost::get<platform::CUDAPlace>(place_)));
-#endif
      }
    }
-
-    start_thread();
-  }
-
-  void start_thread() {
-    buffer_ = framework::MakeChannel<Item>(kDoubleBufferSize);
-    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
+#endif
+    StartPrefetcher();
  }

+  bool HasNext() const override;
  void ReadNext(std::vector<framework::LoDTensor>* out) override;
  void ReInit() override;

-  ~DoubleBufferReader() {
-    buffer_->Close();
-    prefetcher_.join();
-    delete buffer_;
+  ~DoubleBufferReader() { EndPrefetcher(); }
+
+ private:
+  void StartPrefetcher() {
+    channel_ = framework::MakeChannel<Item>(kChannelSize);
+    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
  }

-  bool HasNext() const override;
+  void EndPrefetcher() {
+    channel_->Close();
+    if (prefetcher_.joinable()) {
+      prefetcher_.join();
+    }
+    delete channel_;
+    channel_ = nullptr;
+  }

- private:
  void PrefetchThreadFunc();

  std::thread prefetcher_;
-  framework::Channel<Item>* buffer_;
+  framework::Channel<Item>* channel_;
  platform::Place place_;
  std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
-  mutable Item local_buffer_;
 };

 class CreateDoubleBufferReaderOp : public framework::OperatorBase {
@ -123,70 +142,70 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
  }
 };

+bool DoubleBufferReader::HasNext() const {
+  while (!channel_->IsClosed() && !channel_->CanReceive()) {
+  }
+  return channel_->CanReceive();
+}
+
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
  if (!HasNext()) {
    PADDLE_THROW("There is no next data!");
  }

-  if (local_buffer_.payloads_.empty()) {
-    buffer_->Receive(&local_buffer_);
-  }
-  *out = local_buffer_.payloads_;
-  local_buffer_.payloads_.clear();
-  if (local_buffer_.ctx_) {
-    local_buffer_.ctx_->Wait();
+  Item batch;
+  channel_->Receive(&batch);
+  *out = batch.payloads_;
+  if (batch.ctx_) {
+    batch.ctx_->Wait();
  }
 }

 void DoubleBufferReader::ReInit() {
  reader_->ReInit();
-  buffer_->Close();
-  prefetcher_.join();
-  delete buffer_;
-  start_thread();
+  EndPrefetcher();
+  StartPrefetcher();
 }

 void DoubleBufferReader::PrefetchThreadFunc() {
  VLOG(5) << "A new prefetch thread starts.";
-  size_t gpu_ctx_offset = 0;
+  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize);
+  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize);
+  size_t cached_tensor_id = 0;
+
  while (reader_->HasNext()) {
    Item batch;
-    reader_->ReadNext(&batch.payloads_);
+    auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
+    reader_->ReadNext(&cpu_batch);
    if (platform::is_gpu_place(place_)) {
-      std::vector<framework::LoDTensor> gpu_batch;
-      auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++];
-      gpu_ctx_offset %= this->ctxs_.size();
-      gpu_batch.resize(batch.payloads_.size());
-      for (size_t i = 0; i < batch.payloads_.size(); ++i) {
-        framework::TensorCopy(batch.payloads_[i], place_, *gpu_ctx,
-                              &gpu_batch[i]);
-        gpu_batch[i].set_lod(batch.payloads_[i].lod());
+      auto& gpu_batch = gpu_tensor_cache[cached_tensor_id];
+      auto* gpu_ctx = ctxs_[cached_tensor_id].get();
+      gpu_batch.resize(cpu_batch.size());
+      for (size_t i = 0; i < cpu_batch.size(); ++i) {
+        framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
+        gpu_batch[i].set_lod(cpu_batch[i].lod());
      }
-      batch.ctx_ = gpu_ctx.get();
-      std::swap(gpu_batch, batch.payloads_);
+      batch.payloads_ = gpu_batch;
+      batch.ctx_ = gpu_ctx;
+    } else {
+      // CPUPlace
+      batch.payloads_ = cpu_batch;
    }
+    ++cached_tensor_id;
+    cached_tensor_id %= kCacheSize;

    try {
-      buffer_->Send(&batch);
+      channel_->Send(&batch);
    } catch (paddle::platform::EnforceNotMet e) {
      VLOG(5) << "WARNING: The double buffer channel has been closed. The "
                 "prefetch thread will terminate.";
      break;
    }
  }
-  buffer_->Close();
+  channel_->Close();
  VLOG(5) << "Prefetch thread terminates.";
 }

-bool DoubleBufferReader::HasNext() const {
-  if (local_buffer_.payloads_.empty()) {
-    bool ok = buffer_->Receive(&local_buffer_);
-    return ok;
-  } else {
-    return true;
-  }
-}
-
 }  // namespace reader
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@ -104,7 +104,9 @@ EOF
        # make install should also be test when unittest
        make install -j `nproc`
        pip install /usr/local/opt/paddle/share/wheels/*.whl
-        paddle version
+        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
+            paddle version
+        fi
    fi
 }

@ -183,6 +185,14 @@ EOF
        NCCL_DEPS=""
    fi

+    if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
+        PADDLE_VERSION="paddle version"
+        CMD='"paddle", "version"'
+    else
+        PADDLE_VERSION="true"
+        CMD='"true"'
+    fi
+
    cat >> /paddle/build/Dockerfile <<EOF
    ADD python/dist/*.whl /
    # run paddle version to install python packages first
@ -192,7 +202,7 @@ EOF
        pip install /*.whl; apt-get install -f -y && \
        apt-get clean -y && \
        rm -f /*.whl && \
-        paddle version && \
+        ${PADDLE_VERSION} && \
        ldconfig
    ${DOCKERFILE_CUDNN_DSO}
    ${DOCKERFILE_GPU_ENV}
@ -200,7 +210,7 @@ EOF
    ADD go/cmd/pserver/pserver /usr/bin/
    ADD go/cmd/master/master /usr/bin/
    # default command shows the paddle version and exit
-    CMD ["paddle", "version"]
+    CMD [${CMD}]
 EOF
 }

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@ -81,6 +81,7 @@ if (WITH_TESTING)
      # enable v2 API unittest only when paddle swig api is compiled
      add_subdirectory(paddle/v2/tests)
      add_subdirectory(paddle/v2/plot/tests)
+      add_subdirectory(paddle/v2/reader/tests)
    endif()
  endif()
  add_subdirectory(paddle/fluid/tests)
--- a/python/paddle/dataset/init.py
+++ b/python/paddle/dataset/init.py
@ -37,7 +37,7 @@ __all__ = [
    'cifar',
    'movielens',
    'conll05',
-    'sentiment'
+    'sentiment',
    'uci_housing',
    'wmt14',
    'wmt16',
--- a/Show More
+++ b/Show More