Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into add-python-pad
commit 3c370ee0d5
@@ -0,0 +1,205 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import argparse
import cProfile  # needed by run_benchmark when --use_cprof is set
import time

import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

SEED = 1
DTYPE = "float32"

# The random seed must be set before configuring the network.
# fluid.default_startup_program().random_seed = SEED


def parse_args():
    parser = argparse.ArgumentParser("mnist model benchmark.")
    parser.add_argument(
        '--batch_size', type=int, default=128, help='The minibatch size.')
    parser.add_argument(
        '--iterations', type=int, default=35, help='The number of minibatches.')
    parser.add_argument(
        '--pass_num', type=int, default=5, help='The number of passes.')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    parser.add_argument(
        '--use_cprof', action='store_true', help='If set, use cProfile.')
    parser.add_argument(
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
    args = parser.parse_args()
    return args


def print_arguments(args):
    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                vars(args)['device'] == 'GPU')
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


def cnn_model(data):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")

    # TODO(dzhwinter): refine the initializer and random seed setting
    SIZE = 10
    input_shape = conv_pool_2.shape
    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5

    predict = fluid.layers.fc(
        input=conv_pool_2,
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=scale)))
    return predict


def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    test_pass_acc = fluid.average.WeightedAverage()
    for batch_id, data in enumerate(test_reader()):
        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
                                data)).astype(DTYPE)
        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
        y_data = y_data.reshape([len(y_data), 1])

        acc, weight = exe.run(inference_program,
                              feed={"pixel": img_data,
                                    "label": y_data},
                              fetch_list=[batch_acc, batch_size_tensor])
        test_pass_acc.add(value=acc, weight=weight)
    pass_acc = test_pass_acc.eval()
    return pass_acc


def run_benchmark(model, args):
    if args.use_cprof:
        pr = cProfile.Profile()
        pr.enable()
    start_time = time.time()
    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    predict = model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)
    opt.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)

    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        pass_start = time.time()
        for batch_id, data in enumerate(train_reader()):
            img_data = np.array(
                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])

            start = time.time()
            outs = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
            )  # The accuracy is the accumulation of batches, but not the current batch.
            accuracy.add(value=outs[1], weight=outs[2])
            end = time.time()
            loss = np.array(outs[0])
            acc = np.array(outs[1])
            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))

        pass_end = time.time()

        train_avg_acc = accuracy.eval()
        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                 inference_program)

        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
              (pass_id, train_avg_acc, test_avg_acc,
               (pass_end - pass_start) / 1000))


if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    if args.use_nvprof and args.device == 'GPU':
        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
            run_benchmark(cnn_model, args)
    else:
        run_benchmark(cnn_model, args)
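Note on the accuracy bookkeeping above: the pass-level accuracy is a weighted average of per-batch accuracies, weighted by the batch sizes fetched through batch_size_tensor, so a short final batch does not distort the result. A minimal NumPy sketch of that bookkeeping (a hypothetical stand-in mirroring fluid.average.WeightedAverage, not part of this diff):

import numpy as np

class WeightedAverageSketch(object):
    """Accumulate (value, weight) pairs and return the weighted mean."""

    def __init__(self):
        self.numerator = 0.0
        self.denominator = 0.0

    def reset(self):
        self.numerator = 0.0
        self.denominator = 0.0

    def add(self, value, weight):
        self.numerator += float(np.asarray(value)) * float(np.asarray(weight))
        self.denominator += float(np.asarray(weight))

    def eval(self):
        return self.numerator / self.denominator

acc_avg = WeightedAverageSketch()
acc_avg.add(value=0.90, weight=128)  # batch 1: 90% accuracy over 128 samples
acc_avg.add(value=0.50, weight=32)   # batch 2: smaller final batch
print(acc_avg.eval())                # ~0.82, not the plain mean of 0.90 and 0.50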
@@ -0,0 +1,49 @@
#!/bin/bash
# This script benchmarks PaddlePaddle Fluid on a single thread
# and a single GPU.
export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib

# Disable OpenMP and MKL parallelism
# https://github.com/PaddlePaddle/Paddle/issues/7199
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
if [ $ht -eq 1 ]; then # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
        export OMP_DYNAMIC="FALSE"
    fi
else # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
fi
# Use a single GPU even if more than one is visible
export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH


# vgg16
# cifar10 gpu cifar10 128
FLAGS_benchmark=true python fluid/vgg.py \
  --device=GPU \
  --batch_size=128 \
  --skip_batch_num=5 \
  --iterations=30 \
  2>&1 > vgg16_gpu_128.log

# resnet50
# resnet50 gpu cifar10 128
FLAGS_benchmark=true python fluid/resnet.py \
  --device=GPU \
  --batch_size=128 \
  --data_set=cifar10 \
  --model=resnet_cifar10 \
  --skip_batch_num=5 \
  --iterations=30 \
  2>&1 > resnet50_gpu_128.log

# lstm
@@ -0,0 +1,209 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import cPickle
import os
import random
import time

import numpy
import paddle.v2 as paddle
import paddle.v2.dataset.imdb as imdb
import paddle.fluid as fluid
from paddle.v2 import batch
import paddle.fluid.profiler as profiler


def parse_args():
    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=32,
        help='The number of sequences in a batch. (default: %(default)d)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=512,
        help='Dimension of the embedding table. (default: %(default)d)')
    parser.add_argument(
        '--hidden_dim',
        type=int,
        default=512,
        help='Hidden size of the LSTM unit. (default: %(default)d)')
    parser.add_argument(
        '--pass_num',
        type=int,
        default=100,
        help='Epoch number to train. (default: %(default)d)')
    parser.add_argument(
        '--device',
        type=str,
        default='CPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--crop_size',
        type=int,
        default=int(os.environ.get('CROP_SIZE', '1500')),
        help='The max sentence length of the input. Since this model uses a '
        'plain RNN, the gradient could explode if the sentence is too long.')
    args = parser.parse_args()
    return args


word_dict = imdb.word_dict()


def crop_sentence(reader, crop_size):
    unk_value = word_dict['<unk>']

    def __impl__():
        for item in reader():
            if len([x for x in item[0] if x != unk_value]) < crop_size:
                yield item

    return __impl__


def main():
    args = parse_args()
    lstm_size = args.hidden_dim

    data = fluid.layers.data(
        name="words", shape=[1], lod_level=1, dtype='int64')
    sentence = fluid.layers.embedding(
        input=data, size=[len(word_dict), args.emb_dim])

    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')

    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        word = rnn.step_input(sentence)
        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])

        def gate_common(
                ipt,
                hidden,
                size, ):
            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
            gate = fluid.layers.sums(input=[gate0, gate1])
            return gate

        forget_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        input_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        output_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        cell_gate = fluid.layers.tanh(
            x=gate_common(word, prev_hidden, lstm_size))

        cell = fluid.layers.sums(input=[
            fluid.layers.elementwise_mul(
                x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
                    x=input_gate, y=cell_gate)
        ])

        hidden = fluid.layers.elementwise_mul(
            x=output_gate, y=fluid.layers.tanh(x=cell))

        rnn.update_memory(prev_cell, cell)
        rnn.update_memory(prev_hidden, hidden)
        rnn.output(hidden)

    last = fluid.layers.sequence_pool(rnn(), 'last')
    logit = fluid.layers.fc(input=last, size=2, act='softmax')
    loss = fluid.layers.cross_entropy(
        input=logit,
        label=fluid.layers.data(
            name='label', shape=[1], dtype='int64'))
    loss = fluid.layers.mean(x=loss)

    # add acc
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
        shape=[1], dtype='int64'), total=batch_size_tensor)

    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    adam = fluid.optimizer.Adam()
    adam.minimize(loss)

    fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    def train_loop(pass_num, crop_size):
        with profiler.profiler(args.device, 'total') as prof:
            for pass_id in range(pass_num):
                train_reader = batch(
                    paddle.reader.shuffle(
                        crop_sentence(imdb.train(word_dict), crop_size),
                        buf_size=25000),
                    batch_size=args.batch_size)
                word_nums = 0
                pass_start_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    tensor_words = to_lodtensor([x[0] for x in data], place)
                    for x in data:
                        word_nums += len(x[0])
                    label = numpy.array([x[1] for x in data]).astype("int64")
                    label = label.reshape((-1, 1))
                    loss_np, acc, weight = exe.run(
                        fluid.default_main_program(),
                        feed={"words": tensor_words,
                              "label": label},
                        fetch_list=[loss, batch_acc, batch_size_tensor])
                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
                          (pass_id, batch_id, loss_np, acc))

                pass_end_time = time.time()
                time_consumed = pass_end_time - pass_start_time
                words_per_sec = word_nums / time_consumed
                print("pass_id=%d, sec/pass: %f, words/s: %f" %
                      (pass_id, time_consumed, words_per_sec))

    train_loop(args.pass_num, args.crop_size)


def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


if __name__ == '__main__':
    main()
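The DynamicRNN block above hand-writes a standard LSTM cell out of fc, sigmoid, tanh, sums, and elementwise_mul. A plain NumPy sketch of one step may make the gate wiring easier to follow; the weight layout, dimensions, and the single bias per gate are illustrative only (the fluid code puts the bias on the word projection and omits it on the hidden projection):

import numpy as np

def lstm_step(word_emb, prev_hidden, prev_cell, weights):
    """One step of the hand-written LSTM cell above, in plain NumPy."""
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))

    def gate(name, activation):
        W_x, W_h, b = weights[name]  # hypothetical per-gate parameters
        return activation(word_emb.dot(W_x) + prev_hidden.dot(W_h) + b)

    f = gate('forget', sigmoid)          # forget_gate
    i = gate('input', sigmoid)           # input_gate
    o = gate('output', sigmoid)          # output_gate
    c_tilde = gate('cell', np.tanh)      # cell_gate
    cell = f * prev_cell + i * c_tilde   # sums of the two elementwise_mul terms
    hidden = o * np.tanh(cell)           # elementwise_mul(output_gate, tanh(cell))
    return hidden, cell

# Tiny smoke test with random weights (emb_dim = hidden_dim = 4 for brevity).
rng = np.random.RandomState(0)
weights = {name: (rng.randn(4, 4), rng.randn(4, 4), rng.randn(4))
           for name in ('forget', 'input', 'output', 'cell')}
h, c = lstm_step(rng.randn(4), np.zeros(4), np.zeros(4), weights)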
@@ -0,0 +1,220 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function

import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse
import functools

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
    '--skip_batch_num',
    type=int,
    default=5,
    help='The number of initial minibatches to skip, for a better performance test.')
parser.add_argument(
    '--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
    '--learning_rate',
    type=float,
    default=1e-3,
    help="Learning rate for training.")
parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
parser.add_argument(
    '--device',
    type=str,
    default='GPU',
    choices=['CPU', 'GPU'],
    help="The device type.")
parser.add_argument(
    '--data_format',
    type=str,
    default='NCHW',
    choices=['NCHW', 'NHWC'],
    help='The data order; currently only NCHW is supported.')
parser.add_argument(
    '--data_set',
    type=str,
    default='cifar10',
    choices=['cifar10', 'flowers'],
    help='Optional dataset for benchmark.')
parser.add_argument(
    '--with_test',
    action='store_true',
    help='If set, evaluate the test set during training.')
args = parser.parse_args()


def vgg16_bn_drop(input):
    def conv_block(input, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=input,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max')

    conv1 = conv_block(input, 64, 2, [0.3, 0])
    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])

    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
    return fc2


def main():
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    opts = optimizer.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

    # test
    def test(exe):
        test_accuracy = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])

            acc, weight = exe.run(inference_program,
                                  feed={"pixel": img_data,
                                        "label": y_data},
                                  fetch_list=[batch_acc, batch_size_tensor])
            test_accuracy.add(value=acc, weight=weight)
        return test_accuracy.eval()

    iters, num_samples, start_time = 0, 0, time.time()
    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])

            loss, acc, weight = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            accuracy.add(value=acc, weight=weight)
            iters += 1
            num_samples += len(data)
            print(
                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss, acc)
            )  # The accuracy is the accumulation of batches, but not the current batch.

        pass_train_acc = accuracy.eval()
        train_losses.append(loss)
        train_accs.append(acc)
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
        train_elapsed = time.time() - start_time
        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))


def print_arguments():
    print('----------- Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


if __name__ == "__main__":
    print_arguments()
    main()
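For readers checking the name: the conv_block calls in vgg16_bn_drop follow the standard VGG-16 configuration, thirteen convolutional layers plus three fully connected layers (fc1 and fc2 above, plus the classifier fc in main). A short sketch of that count, with the block list copied from the code:

# (num_filter, groups) for each conv_block call in vgg16_bn_drop above.
blocks = [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)]
conv_layers = sum(groups for _, groups in blocks)  # 13 convolutional layers
fc_layers = 2 + 1  # fc1, fc2 in vgg16_bn_drop plus the classifier fc in main()
print(conv_layers, conv_layers + fc_layers)        # 13, 16 -> "VGG16"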
@@ -0,0 +1,46 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Dataset package.
"""

import mnist
import imikolov
import imdb
import cifar
import movielens
import conll05
import uci_housing
import sentiment
import wmt14
import wmt16
import mq2007
import flowers
import voc2012

__all__ = [
    'mnist',
    'imikolov',
    'imdb',
    'cifar',
    'movielens',
    'conll05',
    'sentiment',
    'uci_housing',
    'wmt14',
    'wmt16',
    'mq2007',
    'flowers',
    'voc2012',
]
@@ -0,0 +1,139 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CIFAR dataset.

This module will download the dataset from
https://www.cs.toronto.edu/~kriz/cifar.html and parse the train/test set into
paddle reader creators.

The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
with 6000 images per class. There are 50000 training images and 10000 test
images.

The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
containing 600 images each. There are 500 training images and 100 testing
images per class.

"""

import cPickle
import itertools
import numpy
import paddle.v2.dataset.common
import tarfile

__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']

URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'


def reader_creator(filename, sub_name):
    def read_batch(batch):
        data = batch['data']
        labels = batch.get('labels', batch.get('fine_labels', None))
        assert labels is not None
        for sample, label in itertools.izip(data, labels):
            yield (sample / 255.0).astype(numpy.float32), int(label)

    def reader():
        with tarfile.open(filename, mode='r') as f:
            names = (each_item.name for each_item in f
                     if sub_name in each_item.name)

            for name in names:
                batch = cPickle.load(f.extractfile(name))
                for item in read_batch(batch):
                    yield item

    return reader


def train100():
    """
    CIFAR-100 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
        'train')


def test100():
    """
    CIFAR-100 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
        'test')


def train10():
    """
    CIFAR-10 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
        'data_batch')


def test10():
    """
    CIFAR-10 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
        'test_batch')


def fetch():
    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)


def convert(path):
    """
    Converts dataset to recordio format
    """
    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
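A short usage sketch of the reader creators defined above, following the same pattern the benchmark scripts use (the batch size is illustrative; the first call downloads CIFAR-10 into ~/.cache/paddle/dataset):

import paddle.v2 as paddle

# train10() returns a reader creator; each sample is a (pixels, label) pair
# with pixels as a flat 3*32*32 = 3072-element float32 vector scaled to
# [0, 1] and label in [0, 9].
train_reader = paddle.batch(paddle.dataset.cifar.train10(), batch_size=128)
for batch in train_reader():
    pixels, label = batch[0]
    print("%d %s %d" % (len(batch), pixels.shape, label))
    break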
@@ -0,0 +1,236 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
import hashlib
import os
import errno
import shutil
import sys
import importlib
import paddle.v2.dataset
import cPickle
import glob
import cPickle as pickle

__all__ = [
    'DATA_HOME',
    'download',
    'md5file',
    'split',
    'cluster_files_reader',
    'convert',
]

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')


# When running unit tests, there could be multiple processes trying to create
# the DATA_HOME directory simultaneously, so we cannot use an if condition to
# check for the existence of the directory; instead, we use the filesystem as
# the synchronization mechanism by catching the returned errors.
def must_mkdirs(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
        pass


must_mkdirs(DATA_HOME)


def md5file(fname):
    hash_md5 = hashlib.md5()
    f = open(fname, "rb")
    for chunk in iter(lambda: f.read(4096), b""):
        hash_md5.update(chunk)
    f.close()
    return hash_md5.hexdigest()


def download(url, module_name, md5sum, save_name=None):
    dirname = os.path.join(DATA_HOME, module_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    filename = os.path.join(dirname,
                            url.split('/')[-1]
                            if save_name is None else save_name)

    retry = 0
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
            print "file md5", md5file(filename), md5sum
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
        print "Cache file %s not found, downloading %s" % (filename, url)
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')

        if total_length is None:
            with open(filename, 'w') as f:
                shutil.copyfileobj(r.raw, f)
        else:
            with open(filename, 'w') as f:
                dl = 0
                total_length = int(total_length)
                for data in r.iter_content(chunk_size=4096):
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)
                    sys.stdout.write("\r[%s%s]" % ('=' * done,
                                                   ' ' * (50 - done)))
                    sys.stdout.flush()

    return filename


def fetch_all():
    for module_name in filter(lambda x: not x.startswith("__"),
                              dir(paddle.v2.dataset)):
        if "fetch" in dir(
                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
            getattr(
                importlib.import_module("paddle.v2.dataset.%s" % module_name),
                "fetch")()


def fetch_all_recordio(path):
    for module_name in filter(lambda x: not x.startswith("__"),
                              dir(paddle.v2.dataset)):
        if "convert" in dir(
                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
                not module_name == "common":
            ds_path = os.path.join(path, module_name)
            must_mkdirs(ds_path)
            getattr(
                importlib.import_module("paddle.v2.dataset.%s" % module_name),
                "convert")(ds_path)


def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
    """
    You can call the function as:

    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
        suffix="imikolov-train-%05d.pickle")

    The output files will be:

    |-imikolov-train-00000.pickle
    |-imikolov-train-00001.pickle
    |- ...
    |-imikolov-train-00480.pickle

    :param reader: a reader creator
    :param line_count: line count for each file
    :param suffix: the suffix for the output files; should contain "%d",
        which is replaced by the index of each file. Default is "%05d.pickle"
    :param dumper: a callable that dumps an object to a file; it is called
        as dumper(obj, f), where obj is the object to be dumped and f is a
        file object. Default is cPickle.dump.
    """
    if not callable(dumper):
        raise TypeError("dumper should be callable.")
    lines = []
    indx_f = 0
    for i, d in enumerate(reader()):
        lines.append(d)
        if i >= line_count and i % line_count == 0:
            with open(suffix % indx_f, "w") as f:
                dumper(lines, f)
            lines = []
            indx_f += 1
    if lines:
        with open(suffix % indx_f, "w") as f:
            dumper(lines, f)


def cluster_files_reader(files_pattern,
                         trainer_count,
                         trainer_id,
                         loader=cPickle.load):
    """
    Create a reader that yields elements from the given files, selecting
    a subset of the files according to the trainer count and trainer_id.

    :param files_pattern: the glob pattern of files generated by split(...)
    :param trainer_count: total trainer count
    :param trainer_id: the trainer rank id
    :param loader: a callable that loads an object from a file; it is called
        as loader(f), where f is a file object. Default is cPickle.load.
    """

    def reader():
        if not callable(loader):
            raise TypeError("loader should be callable.")
        file_list = glob.glob(files_pattern)
        file_list.sort()
        my_file_list = []
        for idx, fn in enumerate(file_list):
            if idx % trainer_count == trainer_id:
                print "append file: %s" % fn
                my_file_list.append(fn)
        for fn in my_file_list:
            with open(fn, "r") as f:
                lines = loader(f)
                for line in lines:
                    yield line

    return reader


def convert(output_path, reader, line_count, name_prefix):
    """
    Convert data from reader to recordio format files.

    :param output_path: directory in which output files will be saved.
    :param reader: a data reader, from which the convert program will read
        data instances.
    :param line_count: the number of data instances written to each output
        file.
    :param name_prefix: the name prefix of generated files.
    """
    import recordio

    assert line_count >= 1
    indx_f = 0

    def write_data(indx_f, lines):
        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
        writer = recordio.writer(filename)
        for l in lines:
            # FIXME(Yancey1989):
            # dumps with protocol: pickle.HIGHEST_PROTOCOL
            writer.write(cPickle.dumps(l))
        writer.close()

    lines = []
    for i, d in enumerate(reader()):
        lines.append(d)
        if i % line_count == 0 and i >= line_count:
            write_data(indx_f, lines)
            lines = []
            indx_f += 1
            continue

    write_data(indx_f, lines)
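A brief usage sketch tying split and cluster_files_reader together, following the example in the split docstring (the file pattern and trainer counts are illustrative):

import paddle.v2.dataset.cifar as cifar
import paddle.v2.dataset.common as common

# Shard the CIFAR-10 training set into pickle files of 1000 samples each,
# then let trainer 0 of a 4-trainer job read back only its share of the shards.
common.split(cifar.train10(), line_count=1000, suffix="cifar-train-%05d.pickle")

reader = common.cluster_files_reader(
    "cifar-train-*.pickle", trainer_count=4, trainer_id=0)
for pixels, label in reader():
    break  # each element is one (pixels, label) sample read from the shard files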