Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into gen_nccl_id_op

7 years ago · a529d790b6
parent 82c61dbde3 b708ec0ae1
commit a529d790b6
156 changed files with 4732 additions and 2146 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -16,34 +16,14 @@ env:
  - JOB=check_style
  - JOB=build_android
 addons:
  apt:
    packages:
      - gcc-4.8
      - g++-4.8
      - git
      - build-essential
      - python
      - python-pip
      - python2.7-dev
      - python-wheel
      - libboost-dev
      - curl
      - swig
      - graphviz
      - clang-format-3.8
      - automake
      - libtool
      - ccache
  ssh_known_hosts: 13.229.163.131
 before_install:
  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
    # 43min timeout
-    if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi;
+    paddle/scripts/paddle_docker_build.sh ${JOB}
    if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
  - |
    if [[ "$JOB" != "doc" ]]; then exit 0; fi;
--- a/README.md
+++ b/README.md
@ -75,19 +75,19 @@ We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/g
  You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
  You can run distributed training jobs on MPI clusters.
- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
   You can also run distributed training jobs on Kubernetes clusters.
- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
   Our new API enables much shorter programs.
- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
   We appreciate your contributions!
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ b/benchmark/cluster/vgg16/run_vgg_dist.sh
@ -0,0 +1,21 @@
 #!/bin/bash
 # Launches one parameter-server process and two trainer processes for the
 # VGG16 script on the local machine (all endpoints are 127.0.0.1:6174).
 # Every process is started in the background with `&`.
 # Update to point to the source file.
 VGG_SRC="vgg16_fluid.py"
 # --- Parameter server: environment read by the process started below ---
 export TRAINING_ROLE=PSERVER
 export TRAINERS=2
 export POD_IP=127.0.0.1
 export PADDLE_INIT_PORT=6174
 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
 # Need to wait for the ps to start first.
 # NOTE(review): a fixed 10 s sleep is used rather than probing the port.
 sleep 10
 echo "done start ps"
 # --- Trainers: same endpoints, role switched to TRAINER ---
 export TRAINING_ROLE=TRAINER
 export TRAINERS=2
 export POD_IP=127.0.0.1
 export PADDLE_INIT_PORT=6174
 # One trainer per GPU: devices 4 and 5 are hard-coded, task_index 0 and 1.
 CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
 CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@ -200,18 +200,19 @@ def main():
                num_samples += len(data)
                train_pass_acc.add(value=acc, weight=b_size)
                print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+                    "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                    % (pass_id, iters, loss, acc,
+                    "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
-                       len(data) / (time.time() - ts))
+                                             loss, acc,
                                             len(data) / (time.time() - ts))
                )  # The accuracy is the accumulation of batches, but not the current batch.
            pass_elapsed = time.time() - start_time
            pass_train_acc = train_pass_acc.eval()
            pass_test_acc = test(exe)
-            print(
+            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
-                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+                  "Train accuracy = %f, Test accuracy = %f\n" %
-                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+                  (args.task_index, pass_id, num_samples / pass_elapsed,
-                   pass_test_acc))
+                   pass_train_acc, pass_test_acc))
    if args.local:
        # Parameter initialization
@ -239,8 +240,6 @@ def main():
        t = fluid.DistributeTranspiler()
        t.transpile(
            optimize_ops,
            params_grads,
            trainer_id=args.task_index,
            pservers=args.ps_hosts,
            trainers=trainers)
--- a/contrib/float16/.gitignore
+++ b/contrib/float16/.gitignore
@ -0,0 +1 @@
 *.inference.model
--- a/contrib/float16/float16_benchmark.md
+++ b/contrib/float16/float16_benchmark.md
@ -0,0 +1,97 @@
 # float16 benchmark
 ## Description
 We want to compare the inference benchmark of float16 vs float32 on the "image_classification" example on Nvidia Tesla V100 GPU, where we can enable the tensor core computation for float16 mode. We test Vgg16 and Resnet50 on the imagenet data set, and Vgg16 and Resnet32 on the cifar10 data set. For completeness, we also add the inference benchmark of Vgg16 and Resnet50 on imagenet data set tested on Nvidia GeForce GTX 1080 Ti GPU.
 For more details about tensor core, please refer to https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
 ## Test environment
 - GPU: single Nvidia Tesla V100 or single Nvidia GeForce GTX 1080 Ti 
 - CUDNN: 7.1.1
 - CUDA: 9.0
 - Code: https://github.com/PaddlePaddle/Paddle/pull/10331 (Tensor core is enabled in float16 mode)
 ## Benchmark on V100
 All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia V100 GPU with respect to different mini-batch (mb) sizes.
 ### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]):
 Total inference time for one batch:
 |       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  |
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
 |float32| 14.01 | 9.70  | 22.99 | 28.26 | 53.87  | 84.42 | 178.95 | 
 |float16|  3.32 | 4.11  |  5.88 |  9.41 | 16.54  | 30.47 |  60.23 |
 |Speedup|  4.22 | 2.36  |  3.91 |  3.00 |  3.26  |  2.77 |   2.97 |
 Total time spent on conv op for one batch:
 |       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | 
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
 |float32| 11.95 | 6.96  | 18.65 | 21.42 | 41.35  | 60.58 | 130.11 |
 |float16|  1.78 | 2.10  |  2.93 |  4.55 |  7.99  | 14.63 |  28.67 |
 |Speedup|  6.71 | 3.31  |  6.37 |  4.71 |  5.18  |  4.14 |   4.54 |
 ### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]):
 Total inference time for one batch:
 |       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
 |float32| 7.03  | 7.41  | 9.16  | 12.55 | 21.13  | 38.27 | 67.93  | 127.02 | 
 |float16| 6.13  | 6.32  | 6.24  |  7.40 | 10.90  | 18.18 | 33.20  |  64.52 |
 |Speedup| 1.15  | 1.17  | 1.47  |  1.70 |  1.94  |  2.11 |  2.05  |   1.97 |
 Total time spent on conv op for one batch:
 |       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
 |float32| 5.43  | 5.46  | 6.50  | 8.36  | 13.80  | 24.45 | 41.21  | 73.44  |
 |float16| 4.19  | 4.30  | 3.96  | 4.21  |  5.63  |  8.77 | 15.24  | 28.40  |
 |Speedup| 1.30  | 1.27  | 1.64  | 1.99  |  2.45  |  2.79 |  2.70  |  2.59  |
 ### Vgg16 on cifar10 (image.shape = [3, 32, 32]):
 Total inference time for one batch:
 |       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
 |-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:| 
 |float32| 3.13 | 3.17 | 3.19 | 3.58 | 3.98  | 6.23  | 8.42  | 13.44  | 24.19  | 44.97  | 
 |float16| 2.72 | 2.77 | 2.76 | 2.88 | 2.96  | 3.24  | 4.01  |  5.78  |  9.65  | 17.37  |
 |Speedup| 1.15 | 1.14 | 1.16 | 1.24 | 1.34  | 1.92  | 2.10  |  2.33  |  2.51  |  2.59  |
 ### Resnet32 on cifar10 (image.shape = [3, 32, 32]):
 Total inference time for one batch:
 |       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
 |-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:|
 |float32| 3.11 | 3.14 | 2.99 | 3.04 | 3.10  | 3.28  | 4.47  | 6.86   | 11.63  | 21.16  |
 |float16| 3.70 | 3.81 | 3.75 | 3.83 | 3.77  | 3.97  | 3.92  | 4.15   |  6.41  | 11.02  | 
 |Speedup|      |      |      |      |       |       | 1.14  | 1.65   |  1.81  |  1.92  |
 ## Benchmark on 1080 Ti
 All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia GeForce GTX 1080 Ti GPU with respect to different mini-batch (mb) sizes.
 ### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]):
 Total inference time for one batch:
 |       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32  |
 |-------|-----: |-----: |-----: |-----: |------: |-------:|
 |float32| 5.60  | 9.38  | 15.86 | 29.79 | 57.60  | 117.73 |
 |float16| 4.99  | 7.79  | 13.47 | 26.02 | 52.30  | 102.34 |
 |Speedup| 1.12  | 1.20  |  1.18 |  1.15 |  1.10  |   1.15 |
 ### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]):
 Total inference time for one batch:
 |       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32  | mb=64  |
 |-------|-----: |-----: |-----: |-----: |------: |-------:|-------:|
 |float32| 5.63  | 6.23  | 8.85  | 14.71 | 26.07  | 52.86  | 108.95 |
 |float16| 5.89  | 6.44  | 7.94  | 12.57 | 22.03  | 45.06  |  92.68 |
 |Speedup|       |       | 1.12  |  1.17 |  1.18  |  1.17  |   1.18 |
--- a/contrib/float16/float16_inference_demo.py
+++ b/contrib/float16/float16_inference_demo.py
--- a/contrib/float16/float16_inference_report.md
+++ b/contrib/float16/float16_inference_report.md
@ -0,0 +1,163 @@
 ## Introduction
 Working with deep neural networks (DNN) is a two-stage process. First we train DNN using labeled examples of inputs and desired outputs to obtain the model parameters (weights), then we deploy DNN along with the trained weights to run inference on unknown inputs. Typically, these weights are in float data type and hence we run inference in float mode using these weights. This post focuses on the discussion of how to use low precision float16 data type to represent these trained weights and run inference in float16 mode as well as the advantages of float16 inference over its float counterpart by showing some experiment results. 
 ## What is float16?
 float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has high tolerance against the loss of precision and range when using float16 to represent the weights and the inference accuracy will only be minimally affected in most cases. This gives us the opportunity to use float16 data type to speedup the inference.
 Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
 ## Why float16?
 The trend in today's deep learning community is to use bigger and deeper model. This translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float are correspondingly three-fold:
 1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of float16 data type. This makes the whole memory footprint of float16 inference roughly about half of its float counterpart. This is especially useful when deploying inference on mobile devices with limited available memory. Also given the same available memory, the maximum batch size for float16 inference is about twice that for float inference.
 2. Because float16 occupies less memory than float, in theory hardware devices can achieve much higher floating point operations per second (FLOPS) for float16 data than float data. Right now, an outstanding example of hardware devices that actually deliver such advantages is Nvidia's latest Volta architecture GPUs, including Tesla V100 and Titan V. Moreover, float16 takes less time to read from or write to memory, and hence float16 can make inference more efficient, especially in memory-bound applications where the performance is largely affected by how fast it is to read and write data.
 3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less than its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers.
 ## Fluid implementation of float16 inference
 ### Overview
 Fluid uses [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of a computation graph to describe a neural network model and the optimization procedure. A Fluid program is a Python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
 ### Basic requirement
 When an operator is run by an executor, it uses a kernel to perform computations on tensors contained in the input variables, and then write the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs.
 This means that if we provide float input to the first operator in a program, then each operator will use its float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and give us a final output of float data type.
 The same principle applies if we want a program to run in float16 mode. We provide an input variable of float16 data type to the first operator, and every subsequent operator will invoke its float16 kernel until we get the final output in float16 data type. So the preliminary requirement for float16 inference is to add float16 kernels to the operators that are needed in a specific kind of neural network. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
 ### float16 transpiler
 Furthermore, we need a float16 transpiler to achieve the following usage code:
 ```python
 # Get the float32 inference program and load the associated float32 weights
 [inference_program, feed_target_names,
 fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 # Prepare the float input data
 batch_size = 1
 tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype(numpy.float32)
 # Running inference_program in float mode
 float_results = exe.run(inference_program,
                        feed={feed_target_names[0]: tensor_img},
                        fetch_list=fetch_targets)
 # Use float16 transpiler to speedup
 float16_inference_program = float_inference_program.clone()
 t = Float16Transpiler()
 t.transpile(float16_inference_program, GPUPlace)
 # Running float16_inference_program in float16 mode using the same input data
 float16_results = exe.run(float16_inference_program,
                          feed={feed_target_names[0]: tensor_img},
                          fetch_list=fetch_targets)
 # Do some tests to verify the correctness of float16 inference
 ...
 np.testing.assert_almost_equal(float_results, float16_results, ...)
 ...
 # Save the float16 inference program and float16 weights for future deployment
 fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
                              fetch_targets, exe,
                              float16_inference_program)
 ```
 In this scenario, we already have a float32 inference program and some associated float32 weights that can do float32 inference. We can easily use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights.
 We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that users can use the same float32 input data to run the inference program in either float32 or float16 mode and obtain output data of float32 data type in both cases. This requires us to add some cast operators in the program to convert between float16 tensors and float32 tensors.
 The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
 ### Experiment results
 We provide demo codes that can be used to reproduce the experiment results by doing:
 ```bash
 git clone https://github.com/PaddlePaddle/Paddle.git
 cd Paddle
 # This line will generate a paddle development docker image with cuda 8 and cudnn 7
 # If you want test on cuda 9 instead, change the line 5 in Paddle/Dockerfile 
 # from `FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`
 # to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
 nvidia-docker build -t paddle:float16 .
 # After running this, different results will be written to different log files in Paddle/contrib/float16/
 nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
 ```
 #### Correctness
 As mentioned before, DNN inference has been found to be tolerant of the loss of precision and range incurred by float16, and we want to see how good this tolerance is.
 We train a resnet32 model using cifar10 data set, save it when test set accuracy is above 60%, and then test the inference accuracy on the 10000 examples of the cifar10 test set in float16 and float32 mode, respectively.
 We repeat the test ten times and get the following results:
 |        | float16 | float32  |
 |--------|--------:|--------: |
 | # 1    | 62.75%  | 62.72%   |
 | # 2    | 61.27%  | 61.28%   |
 | # 3    | 62.24%  | 62.23%   |
 | # 4    | 64.16%  | 64.17%   |
 | # 5    | 60.75%  | 60.77%   |
 | # 6    | 63.25%  | 63.24%   |
 | # 7    | 62.15%  | 62.13%   |
 | # 8    | 62.05%  | 62.02%   |
 | # 9    | 65.19%  | 65.20%   |
 | #10    | 62.53%  | 62.48%   |
 | average| 62.63%  | 62.62%   |
 We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over 10 tests. 
 #### Performance benchmark
 Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart. 
 Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0.
 Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7 make float16 truly superior to float in certain deep learning applications.
 We thus benchmark the float16 inference performance on a single Nvidia Tesla V100 GPU (volta architecture and with tensor cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respect to different mini-batch (mb) sizes.
 Average inference time for one mini-batch on Vgg16 model tested on imagenet data set:
 | total | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  |
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
 |float32| 14.01 | 9.70  | 22.99 | 28.26 | 53.87  | 84.42 | 178.95 | 
 |float16|  3.32 | 4.11  |  5.88 |  9.41 | 16.54  | 30.47 |  60.23 |
 |Speedup|  4.22 | 2.36  |  3.91 |  3.00 |  3.26  |  2.77 |   2.97 |
 We can see that float16 inference provides 2x ~ 4x speedup on different batch sizes. 
 The convolution operation is usually the computational bottleneck of a CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
 |conv op| mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | 
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
 |float32| 11.95 | 6.96  | 18.65 | 21.42 | 41.35  | 60.58 | 130.11 |
 |float16|  1.78 | 2.10  |  2.93 |  4.55 |  7.99  | 14.63 |  28.67 |
 |Speedup|  6.71 | 3.31  |  6.37 |  4.71 |  5.18  |  4.14 |   4.54 |
 Fluid convolution operator uses cuDNN 7 to implement the kernel and we can see that with the help of tensor core, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better.
 Similarly, we also list the benchmark results of Resnet50 model tested on imagenet data set:
 | total | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
 |float32| 7.03  | 7.41  | 9.16  | 12.55 | 21.13  | 38.27 | 67.93  | 127.02 | 
 |float16| 6.13  | 6.32  | 6.24  |  7.40 | 10.90  | 18.18 | 33.20  |  64.52 |
 |Speedup| 1.15  | 1.17  | 1.47  |  1.70 |  1.94  |  2.11 |  2.05  |   1.97 |
 |conv op| mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
 |float32| 5.43  | 5.46  | 6.50  | 8.36  | 13.80  | 24.45 | 41.21  | 73.44  |
 |float16| 4.19  | 4.30  | 3.96  | 4.21  |  5.63  |  8.77 | 15.24  | 28.40  |
 |Speedup| 1.30  | 1.27  | 1.64  | 1.99  |  2.45  |  2.79 |  2.70  |  2.59  |
 We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increases to about 2x for larger batch sizes. A similar trend can be found for the time spent on the convolution operator. Note that right now the tensor core will only be utilized in the convolution operation when certain dimensional requirements are met for the input data and filter. The speedup by float16 inference for Resnet50 is smaller than the Vgg16 counterpart partially because the convolution operation in Resnet is much simpler than the Vgg counterpart and this makes the tensor core less utilized in Resnet than in Vgg.
 We also did the same benchmark on an Nvidia GeForce GTX 1080 Ti GPU that does not support tensor core. The results show that for Vgg16, float16 inference provides a consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that tensor core, which is specialized for float16 computations, is a critical component for high performance float16 inference.
 Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for comprehensive benchmark results.
 ### Summary
 1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode.
 2. The accuracy of float16 inference is verified to be almost identical to the float32 counterpart at least on CNNs.
 3. float16 inference provides significant speedup on the large and computationally intensive Vgg16 network on the imagenet data set. For the much smaller and simpler Resnet50, the speedup provided by float16 inference is less significant than on Vgg16 but still favorable, especially for large batch sizes.
 4. We cannot achieve the superior float16 inference performance without the help of the newly introduced tensor cores on the Nvidia Volta architecture GPUs.
--- a/python/paddle/fluid/inference_transpiler.py
+++ b/python/paddle/fluid/inference_transpiler.py
@ -13,115 +13,13 @@
 # limitations under the License.
 import numpy as np
-from framework import Program
+import paddle.fluid.core as core
-from executor import global_scope
+from paddle.fluid.framework import Program
-from . import core
+from paddle.fluid.executor import global_scope
-class InferenceTranspiler:
+class Float16Transpiler:
    def transpile(self, program, place, scope=None):
        '''
        Validate the arguments and transpile the program in place.
        The only transformation applied here is batch normalization fusion
        (see fuse_batch_norm).

        :param program: program to transpile
        :type program: Program
        :param place: inference place
        :type place: Place
        :param scope: inference scope; the global scope is used when None
        :type scope: Scope or None
        :raises TypeError: if program is not a Program, place is neither a
            CPUPlace nor a CUDAPlace, or scope is not a Scope
        '''
        if not isinstance(program, Program):
            raise TypeError("program should be as Program type")
        if not isinstance(place, core.CPUPlace) and not isinstance(
                place, core.CUDAPlace):
            raise TypeError("place should be as CPUPlace/CUDAPlace type")
        # Fall back to the global scope before type-checking the scope.
        if scope is None:
            scope = global_scope()
        if not isinstance(scope, core.Scope):
            raise TypeError("scope should be as Scope type or None")
        self.fuse_batch_norm(program, place, scope)
    def fuse_batch_norm(self, program, place, scope):
        '''
        Transpile the program by fusing batch normalization.
        A batch normalization that follows a convolution or fully connected
        layer can be folded into that layer. Doing so accelerates the forward
        pass, especially in environments like mobile or embedded.
        For input X:
        - Conv process:        X = input * W + bias
        - Batch norm process:  X' = (X - mean) / std
        - Scale Process:       Y = a * X' + b
        After fusing into one operation:
        Y = (input * W + bias - mean) / std * a + b
          = input * a * W / std + ((bias - mean) / std * a + b)
        The operator transformation is:
        - before:
          - conv->batch_norm->any_other_op (bias == 0)
          - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
        - after:
          - conv->elementwise_add->any_other_op
        The transpile stages are:
        1. insert elementwise_add op when bias == 0.
        2. fuse the batch_norm's parameters to conv and elementwise_add operators.
        3. remove batch_norm ops which are not used in any other ops.
        4. adjust the input of any_other_op to be the output of elementwise_add operator.
        5. remove unused variables.
        Only block 0 of the program is visited, and only conv2d operators
        are considered as fusion candidates.
        :param program: program to transpile
        :type program: Program
        :param place: inference place
        :type place: Place
        :param scope: inference scope
        :type scope: Scope
        '''
        self.scope = scope
        self.place = place
        self.block = program.block(0)
        self.input_map = {}  # stores the input names that should be adjusted
        i = 0
        while i < len(self.block.ops):
            current_op = self.block.ops[i]
            # TODO(luotao1): consider only conv2d now. fc will be dealt with later.
            if current_op.type in ['conv2d']:
                # TODO(luotao1): consider single chain network now.
                # For a branch network, we couldn't use block.ops[i + 1] as
                # the judgment condition.
                # NOTE(review): ops[i + 1] is read without a bounds check;
                # looks like this raises if conv2d is the last op -- confirm.
                next_op = self.block.ops[i + 1]
                # conv2d without bias
                if (next_op.type == 'batch_norm'):
                    # insert bias op at position i + 1
                    bias_op = self._insert_bias_op(i + 1, current_op, next_op)
                    # fuse batch_norm
                    self._fuse_param(current_op, next_op, bias_op, 0)
                    # remove batch_norm_op; presumably the insert above
                    # shifted it from i + 1 to i + 2 -- TODO confirm
                    self.block.remove_op(i + 2)
                    # skip over the elementwise_add that now follows conv2d
                    i = i + 1
                # conv2d with bias, the next_op.type is elementwise_add
                elif (next_op.type == 'elementwise_add'):
                    # NOTE(review): ops[i + 2] is also read without a
                    # bounds check.
                    next_next_op = self.block.ops[i + 2]
                    if (next_next_op.type == 'batch_norm'):
                        # fuse batch_norm
                        self._fuse_param(current_op, next_next_op, next_op, 1)
                        # remove batch_norm_op (it sits at i + 2)
                        self.block.remove_op(i + 2)
                        # skip over the existing elementwise_add
                        i = i + 1
            i = i + 1
        self._adjust_input()
        self._remove_unused_var()
        # TODO(luotao): use clone() method to flush the program.desc in force,
        # since some large program.desc will not be flushed immediately.
        # And a better solution will be considered later.
        # NOTE(review): this assignment only rebinds the local name `program`;
        # the clone itself is not returned to the caller.
        program = program.clone()
    def float16_transpile(self, program, place, scope=None):
        '''
        Transpile the program desc and cast the weights to float16 data type to
        enable float16 inference.
@ -155,8 +53,15 @@ class InferenceTranspiler:
        :param scope: inference scope 
        :type scope: Scope         
        '''
        if not isinstance(program, Program):
            raise TypeError("program should be as Program type")
        if not isinstance(place, core.CPUPlace) and not isinstance(
                place, core.CUDAPlace):
            raise TypeError("place should be as CPUPlace/CUDAPlace type")
        if scope is None:
            scope = global_scope()
        if not isinstance(scope, core.Scope):
            raise TypeError("scope should be as Scope type or None")
        self.scope = scope
        self.place = place
@ -174,101 +79,6 @@ class InferenceTranspiler:
        program = program.clone()
    # ====================== private transpiler functions =====================
    def _insert_bias_op(self, index, current_op, bn_op):
        '''
        Construct elementwise_add operator for adding bias 
        and insert it into program.
        :param index: insert location of bias_op
        :type index: Int
        :param current_op: current operator (conv or fc)
        :type current_op: Operator
        :param bn_op: batch norm operator
        :type bn_op: Operator
        :return: bias_op
        :rtype: Operator
        '''
        # The input of bias_op is current_op's output and Bias of bn_op
        # The output of bias_op is bn_op's output
        x_var = self.block.var(current_op.output("Output")[0])
        y_var = self.block.var(bn_op.input("Bias")[0])
        out_var = self.block.var(bn_op.output("Y")[0])
        bias_op = self.block.insert_op(
            index,
            type="elementwise_add",
            inputs={"X": x_var,
                    "Y": y_var},
            outputs={"Out": out_var},
            attrs={"axis": 1})  # dim_start=1
        return bias_op
    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
        '''
        fuse the batch_norm_op' parameters to current_op (conv or fc)
        :param current_op: current operator (conv or fc)
        :type current_op: Operator
        :param bn_op: batch norm operator
        :type bn_op: Operator
        :param bias_op: elementwise_add operator for adding bias
        :type bias_op: Operator
        :param with_bias: If current operator has bias, with_bias = 1; otherwise 0. 
        :type with_bias: Int
        '''
        def _update_param(op, old_param_name, new_param):
            # For the sake of remaining the original variables the same as before,
            # create new variables in scope to store the new parameters.
            old_param_name = old_param_name[0]
            old_var = self.block.vars[old_param_name]
            new_param_name = old_param_name + '_fuse_bn'
            new_var = self.block.create_parameter(
                name=new_param_name.encode('ascii'),
                type=old_var.type,
                dtype=old_var.dtype,
                shape=old_var.shape)
            op.rename_input(old_param_name, new_param_name)
            self.scope.var(new_param_name)
            tensor = self.scope.find_var(new_param_name).get_tensor()
            tensor.set(np.array(new_param), self.place)
        def _load_param(param_name):
            return np.array(self.scope.find_var(param_name[0]).get_tensor())
        bias_bn = _load_param(bn_op.input("Bias"))  #Bias
        scale_bn = _load_param(bn_op.input("Scale"))  #Scale
        mean_bn = _load_param(bn_op.input("Mean"))  #Mean
        var_bn = _load_param(bn_op.input("Variance"))  #Variance
        # TODO(luotao1): consider only conv2d now. fc would be delt later.
        current_param = _load_param(current_op.input("Filter"))
        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
        tmp = np.float32(np.divide(scale_bn, std_bn))
        # add bias of batch_norm_op to conv2d
        if with_bias:
            bias = _load_param(bias_op.input("Y"))
        else:
            bias = np.zeros(bias_bn.shape)
        bias = np.float32(
            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
        # re-compute weight of conv2d
        tmp = tmp.reshape(tmp.shape[0], -1)
        dst_param = current_param.reshape((tmp.shape[0], -1))
        dst_param = np.float32(np.multiply(dst_param, tmp))
        dst_param = dst_param.reshape(current_param.shape)
        # update parameters
        _update_param(current_op, current_op.input("Filter"), dst_param)
        _update_param(bias_op, bias_op.input("Y"), bias)
        # collect the renamed input
        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
    def _adjust_input(self, skip=False):
        '''
        Change the input variable name in operators.
--- a/contrib/float16/run_float16_demo.sh
+++ b/contrib/float16/run_float16_demo.sh
@ -0,0 +1,117 @@
 #!/bin/bash
 # Float16 inference demo: builds PaddlePaddle Fluid with GPU support, installs
 # the resulting wheel, runs the float16 accuracy test, and defines the
 # parameters used by the benchmark loop that follows.
 BUILD_PATH=/paddle/fp16_build
 WHEEL_PATH=$BUILD_PATH/python/dist
 INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
 DEMO_PATH=/paddle/contrib/float16
 # Use the single most powerful CUDA GPU on your machine
 export CUDA_VISIBLE_DEVICES=0
 # Build the PaddlePaddle Fluid wheel package and install it.
 # Abort if the build directory cannot be created or entered; otherwise
 # cmake/make would run in whatever directory the script was started from.
 mkdir -p "$BUILD_PATH" && cd "$BUILD_PATH" || exit 1
 cmake .. -DWITH_AVX=OFF \
         -DWITH_MKL=OFF \
         -DWITH_GPU=ON \
         -DWITH_TESTING=ON \
         -DWITH_TIMER=ON \
         -DWITH_PROFILER=ON \
         -DWITH_FLUID_ONLY=ON
 make -j "$(nproc)"
 pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
 # Abort if the demo directory is missing; otherwise the `rm -f *.log` below
 # would delete log files from the wrong directory.
 cd "$DEMO_PATH" || exit 1
 # Clear previous log results
 rm -f *.log
 # Test the float16 inference accuracy of resnet32 on cifar10 data set
 stdbuf -oL python float16_inference_demo.py \
       --data_set=cifar10 \
       --model=resnet \
       --threshold=0.6 \
       --repeat=10 \
       2>&1 | tee -a float16_inference_accuracy.log
 # Sleep to cool down the GPU for consistent benchmarking
 sleep 2m
 # benchmarking parameters
 REPEAT=1000
 MAXIMUM_BATCH_SIZE=512
 # Benchmark every model/data-set pair at batch sizes 1, 2, 4, ... up to
 # MAXIMUM_BATCH_SIZE. Each step first runs the Python demo script and then,
 # as a separate command, the C++ inference test binary, whose output is
 # appended to a per-model log. The two commands must not be joined by a
 # trailing backslash, or the binary path would be passed to Python as an
 # argument and the benchmark would never run.
 for ((batch_size = 1; batch_size <= MAXIMUM_BATCH_SIZE; batch_size *= 2));
 do
  # Test inference benchmark of vgg16 on imagenet
  stdbuf -oL python float16_inference_demo.py \
         --data_set=imagenet \
         --model=vgg \
         --threshold=0.001 \
         --repeat=1
  "$INFER_PATH"/test_inference_image_classification_vgg \
      --data_set=imagenet \
      --dirname="$DEMO_PATH"/image_classification_imagenet_vgg.inference.model \
      --fp16_dirname="$DEMO_PATH"/float16_image_classification_imagenet_vgg.inference.model \
      --repeat="$REPEAT" \
      --batch_size="$batch_size" \
      --skip_cpu=true \
      2>&1 | tee -a imagenet_vgg16_benchmark.log
  # Sleep to cool down the GPU between benchmarks
  sleep 2m
  # Test inference benchmark of resnet50 on imagenet
  stdbuf -oL python float16_inference_demo.py \
         --data_set=imagenet \
         --model=resnet \
         --threshold=0.001 \
         --repeat=1
  "$INFER_PATH"/test_inference_image_classification_resnet \
      --data_set=imagenet \
      --dirname="$DEMO_PATH"/image_classification_imagenet_resnet.inference.model \
      --fp16_dirname="$DEMO_PATH"/float16_image_classification_imagenet_resnet.inference.model \
      --repeat="$REPEAT" \
      --batch_size="$batch_size" \
      --skip_cpu=true \
      2>&1 | tee -a imagenet_resnet50_benchmark.log
  sleep 2m
  # Test inference benchmark of vgg16 on cifar10
  stdbuf -oL python float16_inference_demo.py \
         --data_set=cifar10 \
         --model=vgg \
         --threshold=0.001 \
         --repeat=1
  "$INFER_PATH"/test_inference_image_classification_vgg \
      --data_set=cifar10 \
      --dirname="$DEMO_PATH"/image_classification_cifar10_vgg.inference.model \
      --fp16_dirname="$DEMO_PATH"/float16_image_classification_cifar10_vgg.inference.model \
      --repeat="$REPEAT" \
      --batch_size="$batch_size" \
      --skip_cpu=true \
      2>&1 | tee -a cifar10_vgg16_benchmark.log
  sleep 1m
  # Test inference benchmark of resnet32 on cifar10
  stdbuf -oL python float16_inference_demo.py \
         --data_set=cifar10 \
         --model=resnet \
         --threshold=0.001 \
         --repeat=1
  # Use the resnet test binary here, matching the resnet50 step above.
  "$INFER_PATH"/test_inference_image_classification_resnet \
      --data_set=cifar10 \
      --dirname="$DEMO_PATH"/image_classification_cifar10_resnet.inference.model \
      --fp16_dirname="$DEMO_PATH"/float16_image_classification_cifar10_resnet.inference.model \
      --repeat="$REPEAT" \
      --batch_size="$batch_size" \
      --skip_cpu=true \
      2>&1 | tee -a cifar10_resnet32_benchmark.log
  sleep 1m
 done
--- a/doc/fluid/design/dist_train/distributed_traing_review.md
+++ b/doc/fluid/design/dist_train/distributed_traing_review.md
@ -0,0 +1,44 @@
 # Parallelism, Asynchronous,  Synchronous, Codistillation
 For valuable models, it’s worth using more hardware resources to reduce the training time and improve the final model quality. This doc discusses various solutions, their empirical results, and some of the latest research.
 # Model Parallelism
 In some situations, larger and more complex models can improve the model quality. Sometimes, such models cannot fit in one device. Sometimes, parts of the model can be executed in parallel to improve speed. Model Parallelism addresses these issues by partitioning a single model and placing the shards on several devices for execution.
 A common way of doing model parallelism is to partition the logic of “gradient application” to parameter servers, while leaving the forward and backward computation at training servers.
 More flexible model parallelism is challenging. For example, multi-level-single-direction LSTM can be partitioned by layers, while such a solution is not helpful for bi-directional LSTM. Different models can have quite different ways of partitioning, and the benefits also depend on the underlying hardware. A framework needs to provide flexible APIs for users to define customized partition schemes. For example, in TensorFlow, users can use tf.device() to specify the device placement. In MxNet, mx.AttrScope(ctx_group='dev1') does similar things. Recent research proposes to automatically find the optimal partition scheme with Reinforcement Learning, which is essentially a solution-space search algorithm that could cost a lot of extra hardware resources.
 # Data Parallelism
 Data Parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It’s more commonly used for a few reasons. It generally applies to common SGD mini-batch training. Compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API and speed up is more predictable.
 # Asynchronous Training
 Asynchronous training usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of the shared parameters, while each trainer holds its own copy of the model and trains it independently. Each trainer pulls parameters from the parameter servers and sends gradients to the parameter servers independently. Similarly, the parameter servers apply the gradients to the parameters as soon as the gradients are received and send parameters whenever they are requested.
 In theory, asynchronous training is neither safe nor stable. Each trainer is very likely using a stale copy of the parameters, and the parameters are also likely to be updated with stale gradients. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with the synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance due to network communication or other overhead. Besides, asynchronous training can make progress even in case of random trainer failure in the cluster.
 Many production models, such as [3], are trained with distributed asynchronous solutions due to its scalability and effectiveness in practice. However, asynchronous training has its limitations. Usually, it’s not as stable as synchronous training. A warm-up phase is sometimes needed. Learning rate is usually smaller compared with synchronous training and decay is also often needed. Normally, asynchronous training doesn’t scale beyond 100 trainers. In other words, when putting more trainers beyond that, the model cannot converge faster.
 # Synchronous Training
 Unlike asynchronous training, synchronous training requires step barriers. Parameter servers need to wait for gradients from all trainers before they are applied to the parameters, and trainers will always pull the latest parameters.
 An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it's more stable than asynchronous training. Learning rate can be set larger and for some vision tasks, the final accuracy can be slightly higher. (In my practical experience, for some models, it can actually be worse).
 Synchronous training usually faces scalability and performance issues, if not carefully implemented or deployed. In [2], native synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid slowness, discussed in [1] and [2], is to have backups. N+M replicas are scheduled while only the first N are needed for the training step to proceed.
 Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the model, increasing the number of trainers (effectively the batch size) beyond a certain point won’t deliver a faster convergence time or better final model quality.
 # Codistillation
 Codistillation is a technique that tries to scale the training further. A few training instances (each of which can itself be distributed) are run during the same period. Each training instance has extra losses that come from the predictions of the other training instances (like teacher and student). The training process converges faster and usually converges to a better model quality. [4]
 # Reference
 [1] Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks.
 [2] Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous SGD.
 [3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation.
 [4] LARGE SCALE DISTRIBUTED NEURAL NETWORK TRAINING THROUGH ONLINE DISTILLATION
--- a/doc/v2/howto/cluster/multi_cluster/index_en.rst
+++ b/doc/v2/howto/cluster/multi_cluster/index_en.rst
@ -8,28 +8,28 @@ The user's cluster environment is not the same. To facilitate everyone's deploym
 ..  toctree::
  :maxdepth: 1
-  k8s_cn.md
+  k8s_en.md
-  k8s_distributed_cn.md
+  k8s_distributed_en.md
 `OpenMPI <https://www.open-mpi.org>`_ is a mature high-performance parallel computing framework, which is widely used in the field of HPC. The following guide describes how to use OpenMPI to build PaddlePaddle's cluster training task:
 ..  toctree::
  :maxdepth: 1
-  openmpi_cn.md
+  openmpi_en.md
 `Fabric <http://www.fabfile.org>`_ is a convenient tool for program deployment and management. We provide a way to deploy and manage with Fabric. If you want to know more about it, please read the following guidelines:
 ..  toctree::
  :maxdepth: 1
-  fabric_cn.md
+  fabric_en.md
 We also support the deployment of PaddlePaddle on AWS. Learn more about:
 ..  toctree::
  :maxdepth: 1
-  k8s_aws_cn.md
+  k8s_aws_en.md
-The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
+The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@ -108,7 +108,7 @@ paddle_error paddle_matrix_get_row(paddle_matrix mat,
 paddle_error paddle_matrix_get_shape(paddle_matrix mat,
                                     uint64_t* height,
                                     uint64_t* width) {
-  if (mat == nullptr) return kPD_NULLPTR;
+  if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR;
  if (height != nullptr) {
    *height = cast(mat)->mat->getHeight();
  }
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifndef HL_BASE_H_
+#pragma once
 #define HL_BASE_H_
 #include <cstddef>
@ -207,8 +206,8 @@ typedef struct {
 #ifdef __NVCC__
-#include "cuda_runtime.h"
+#include <cuda_runtime.h>
-#include "hl_cuda.h"
+#include "paddle/cuda/include/hl_cuda.h"
 #include "paddle/utils/Logging.h"
 extern __thread bool g_sync_flag;
@ -230,6 +229,11 @@ extern __thread cudaStream_t default_stream;
 // __shfl has been deprecated as of CUDA 9.0.
 #if CUDA_VERSION < 9000
 // Compatibility shim compiled only when CUDA_VERSION < 9000 (see the
 // enclosing #if): provides the `*_sync` spelling on older toolkits by
 // forwarding to the legacy __shfl_down intrinsic. The unnamed first
 // parameter (presumably the lane mask passed by callers) is ignored.
 template <typename T>
 __forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
  return __shfl_down(val, delta);
 }
 template <typename T>
 __forceinline__ __device__ T
 __shfl_sync(unsigned, T val, int src_line, int width) {
@ -243,6 +247,4 @@ __shfl_sync(unsigned, T val, int src_line, int width) {
  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
 #endif
-#endif /* __NVCC__ */
+#endif  // __NVCC__
 #endif /* HL_BASE_H_ */
--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "hl_base.h"
+#include "paddle/cuda/include/hl_base.h"
-#include "hl_sparse.ph"
+#include "paddle/cuda/include/hl_sparse.ph"
-#include "hl_top_k.h"
+#include "paddle/cuda/include/hl_top_k.h"
 #include "paddle/utils/Logging.h"
 // using namespace hppl;
@ -244,8 +244,9 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
    if (--beamSize == 0) break;
    __syncthreads();
    // NOTE(zcd): temporary solution
    unsigned mask = 0u;
-    // CREATE_SHFL_MASK(mask, tid < len);
+    CREATE_SHFL_MASK(mask, true);
    if (tid == maxId[0]) {
      if (beam < maxLength) {
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@ -143,7 +143,7 @@ OpDesc *BlockDesc::InsertOp(size_t index) {
 }
 void BlockDesc::RemoveOp(size_t s, size_t e) {
-  if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
+  if (ops_.begin() + s >= ops_.end() || ops_.begin() + e > ops_.end()) {
    return;
  }
  need_update_ = true;
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@ -15,12 +15,14 @@ if(WITH_GPU)
            dynload_cuda)
    set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
    nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
 else()
    set(multi_devices_graph_builder_deps)
    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
    cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 endif()
 cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@ -19,14 +19,12 @@
 namespace paddle {
 namespace framework {
 namespace details {
 BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
    : local_scopes_(local_scopes), places_(places) {}
 void BroadcastOpHandle::RunImpl() {
-  // the input and output may have dummy var.
+  if (places_.size() == 1) return;
  VarHandle *in_var_handle;
  // The input and output may have dummy vars.
  VarHandle *in_var_handle;
  {
    auto in_var_handles = DynamicCast<VarHandle>(inputs_);
    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
@ -55,27 +53,97 @@ void BroadcastOpHandle::RunImpl() {
  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
-  for (auto *out : out_var_handles) {
+  // NOTE: The tensors' Place of input and output must be all on GPU or all on
-    if (*out == *in_var_handle) {
+  // CPU.
  for (auto *out_var_handle : out_var_handles) {
    if (out_var_handle->IsTheSameVar(*in_var_handle)) {
      continue;
    }
-
+    auto t_out_p = out_var_handle->place_;
-    auto &out_p = out->place_;
+    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
-    auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);
+                        ->FindVar(out_var_handle->name_);
    PADDLE_ENFORCE_NOT_NULL(out_var);
-    PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(),
+    if (platform::is_gpu_place(in_tensor.place())) {
-                      "Places must be all on CPU or all on CUDA.");
+      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-
+                     "Places of input and output must be all on GPU.");
    } else {
      t_out_p = platform::CPUPlace();
    }
    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
-    VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p,
+    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
                                                            in_tensor.type());
  }
  if (platform::is_cpu_place(in_tensor.place())) {
    for (auto *out_var_handle : out_var_handles) {
      if (out_var_handle->IsTheSameVar(*in_var_handle)) {
        continue;
      }
      auto &out_p = out_var_handle->place_;
      auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
                          ->FindVar(out_var_handle->name_);
      RunAndRecordEvent(out_p, [in_tensor, out_var] {
        paddle::framework::TensorCopy(
            in_tensor, platform::CPUPlace(),
            &VariableVisitor::GetMutableTensor(out_var));
      });
    }
  } else {
 #ifdef PADDLE_WITH_CUDA
    VarHandle *out_handle = nullptr;
    int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
    std::vector<std::function<void()>> broadcast_calls;
    for (auto out_var_handle : out_var_handles) {
      Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
                              ->FindVar(out_var_handle->name_);
      int dst_id =
          boost::get<platform::CUDAPlace>(out_var_handle->place_).device;
      auto &nccl_ctx = nccl_ctxs_->at(dst_id);
      void *send_recv_buffer = nullptr;
      if (root_id == dst_id) {
        send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
        out_handle = out_var_handle;
      } else {
        send_recv_buffer =
            VariableVisitor::GetMutableTensor(out_var).mutable_data(
                out_var_handle->place_);
      }
      int type = platform::ToNCCLDataType(in_tensor.type());
      size_t numel = static_cast<size_t>(in_tensor.numel());
      broadcast_calls.emplace_back(
          [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
            PADDLE_ENFORCE(platform::dynload::ncclBcast(
                send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
                root_id, nccl_ctx.comm_, nccl_ctx.stream()));
          });
    }
-    auto dev_ctx = dev_ctxes_.at(out_p);
+    this->RunAndRecordEvent([&] {
-    RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] {
+      {
-      paddle::framework::TensorCopy(
+        platform::NCCLGroupGuard guard;
-          in_tensor, out_p, *(dev_ctx),
+        for (auto &call : broadcast_calls) {
-          &VariableVisitor::GetMutableTensor(out_var));
+          call();
        }
      }
      if (!out_handle->IsTheSameVar(*in_var_handle)) {
        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
                           ->FindVar(out_var_handles[0]->name_);
        paddle::framework::TensorCopy(
            in_tensor, in_var_handle->place_,
            *(dev_ctxes_.at(in_var_handle->place_)),
            &VariableVisitor::GetMutableTensor(out_var));
      }
    });
 #else
    PADDLE_THROW("CUDA is not enabled.");
 #endif
  }
 }
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@ -24,14 +24,32 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 namespace paddle {
 namespace framework {
 namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
 public:
 #ifdef PADDLE_WITH_CUDA
  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *nccl_ctxs)
      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
      }
    }
  }
 #else
  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
-                    const std::vector<platform::Place> &places);
+                    const std::vector<platform::Place> &places)
      : local_scopes_(local_scopes), places_(places) {}
 #endif
  std::string Name() const override;
@ -44,6 +62,9 @@ struct BroadcastOpHandle : public OpHandleBase {
 private:
  const std::vector<Scope *> &local_scopes_;
  const std::vector<platform::Place> &places_;
 #ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;
 #endif
 };
 }  // namespace details
 }  // namespace framework
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@ -35,15 +35,25 @@ struct TestBroadcastOpHandle {
  std::unique_ptr<OpHandleBase> op_handle_;
  std::vector<std::unique_ptr<VarHandleBase>> vars_;
  std::vector<p::Place> gpu_list_;
  bool use_gpu_;
 #ifdef PADDLE_WITH_CUDA
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
  void WaitAll() {
    for (size_t j = 0; j < ctxs_.size(); ++j) {
      ctxs_[j]->Wait();
    }
 #ifdef PADDLE_WITH_CUDA
    if (nccl_ctxs_) {
      nccl_ctxs_->WaitAll();
    }
 #endif
  }
  void InitCtxOnGpu(bool use_gpu) {
-    if (use_gpu) {
+    use_gpu_ = use_gpu;
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {
@ -57,6 +67,7 @@ struct TestBroadcastOpHandle {
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CUDADeviceContext(p));
      }
      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
@ -67,6 +78,9 @@ struct TestBroadcastOpHandle {
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
 #ifdef PADDLE_WITH_CUDA
      nccl_ctxs_.reset(nullptr);
 #endif
    }
  }
@ -82,7 +96,21 @@ struct TestBroadcastOpHandle {
    }
    param_scopes_[input_scope_idx]->Var("input");
-    op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
      op_handle_.reset(
          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
      op_handle_.reset(
          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
 #else
      op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
 #endif
    }
    auto* in_var_handle =
        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
@ -97,7 +125,9 @@ struct TestBroadcastOpHandle {
    op_handle_->AddInput(dummy_var_handle);
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
+      if (!use_gpu_) {
        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      }
      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@ -25,6 +25,7 @@ GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
    : local_scopes_(local_scopes), places_(places) {}
 void GatherOpHandle::RunImpl() {
  if (places_.size() == 1) return;
  // the input and output may have dummy var.
  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
@ -35,7 +36,6 @@ void GatherOpHandle::RunImpl() {
  VarHandle *out_var_handle;
  {
    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
                      "The number of output should be one.");
    out_var_handle = out_var_handles.front();
@ -50,68 +50,62 @@ void GatherOpHandle::RunImpl() {
  auto pre_in_var =
      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
  PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
                 "Currently, gather_op only can gather SelectedRows.");
  auto pre_place = in_0_handle->place_;
  PADDLE_ENFORCE_EQ(out_var_handle->place_.which(), pre_place.which(),
                    "The place of input and output should be the same.");
  // Wait input done, this Wait is asynchronous operation
  WaitInputVarGenerated(in_var_handles);
  auto &pre_in_value = pre_in_var->Get<framework::SelectedRows>();
  std::vector<int64_t> out_rows;
  std::vector<Tensor> in_tensors;
  std::vector<platform::Place> in_places;
-  auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
+  // Gather the inputs
  // gather the inputs
  for (auto *in_handle : in_var_handles) {
    auto in_p = in_handle->place_;
    in_places.push_back(in_p);
    PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
                      "Places must be all on CPU or all on CUDA.");
    auto *in_var =
        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
-    auto &in_sr = in_var->Get<framework::SelectedRows>();
+    PADDLE_ENFORCE_NOT_NULL(in_var);
    VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
-    PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
+    auto &in_sr_value = in_var->Get<framework::SelectedRows>();
                      "The type of input is not consistent.");
    PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(),
                      "The height of inputs is not consistent.");
    PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(),
                      "The dims of inputs is not consistent.");
-    auto &in_sr_rows = in_sr.rows();
+    auto &in_sr_rows = in_sr_value.rows();
    out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
-
+    in_tensors.emplace_back(in_sr_value.value());
    in_tensors.emplace_back(in_sr.value());
  }
-  // write the output
+  // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
-  auto &out_place = out_var_handle->place_;
+  platform::Place t_out_p = out_var_handle->place_;
-  auto out_scope_idx = out_var_handle->scope_idx_;
+  if (platform::is_gpu_place(pre_in_value.place())) {
-  auto out_var = var_scopes.at(out_scope_idx)->FindVar(out_var_handle->name_);
+    PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
                   "Places of input and output must be all on GPU.");
  } else {
    t_out_p = platform::CPUPlace();
  }
-  auto out = out_var->GetMutable<framework::SelectedRows>();
+  auto out_var =
-  out->set_height(pre_in.height());
+      var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
-  out->set_rows(out_rows);
+  PADDLE_ENFORCE_NOT_NULL(out_var);
  auto out_value = out_var->GetMutable<framework::SelectedRows>();
  out_value->set_height(pre_in_value.height());
  out_value->set_rows(out_rows);
  size_t rows = out_rows.size();
-  DDim out_dim = pre_in.GetCompleteDims();
+  DDim out_dim = pre_in_value.GetCompleteDims();
  out_dim[0] = static_cast<int64_t>(rows);
-  out->mutable_value()->Resize(out_dim);
+  out_value->mutable_value()->Resize(out_dim).mutable_data(
-  out->mutable_value()->mutable_data(out_place, pre_in.value().type());
+      t_out_p, pre_in_value.value().type());
-  Tensor *out_tensor = out->mutable_value();
+  Tensor *out_tensor = out_value->mutable_value();
  // copy
-  auto dev_ctx = dev_ctxes_[out_place];
+  auto dev_ctx = dev_ctxes_[out_var_handle->place_];
-  RunAndRecordEvent(out_place, [in_tensors, out_tensor, dev_ctx, out_place] {
+  RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
                                             t_out_p] {
    int s = 0, e = 0;
    for (size_t j = 0; j < in_tensors.size(); ++j) {
      e += in_tensors[j].dims()[0];
      auto sub_out = out_tensor->Slice(s, e);
-      paddle::framework::TensorCopy(in_tensors[j], out_place, *(dev_ctx),
+      paddle::framework::TensorCopy(in_tensors[j], t_out_p, *dev_ctx, &sub_out);
                                    &sub_out);
      s = e;
    }
  });
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@ -11,9 +11,11 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include <utility>
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
 #include "paddle/fluid/framework/details/send_op_handle.h"
 #include "paddle/fluid/framework/scope.h"
@ -34,8 +36,8 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
    const std::vector<platform::Place> &places,
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
+    const std::vector<Scope *> &local_scopes,
-    platform::NCCLContextMap *nccl_ctxs)
+    platform::NCCLContextMap *nccl_ctxs, bool use_default_grad_scale)
    : loss_var_name_(loss_var_name),
      places_(places),
      local_scopes_(local_scopes),
@ -105,6 +107,11 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
 std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    const ProgramDesc &program) const {
  std::unordered_map<std::string, proto::VarType::Type> var_types;
  for (auto *var : program.Block(0).AllVars()) {
    var_types[var->Name()] = var->GetType();
  }
  auto graph = new SSAGraph();
  SSAGraph &result = *graph;
  std::unordered_set<std::string> og_has_been_broadcast;
@ -133,12 +140,17 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
      is_forwarding = false;
    } else {
      CreateComputationalOps(&result, *op, places_.size());
-      if (!is_forwarding) {
+      if (!is_forwarding && places_.size() > 1) {
        // Currently, we assume that once gradient is generated, it can be
        // broadcast, and each gradient is only broadcast once.
        for (auto &og : op->OutputArgumentNames()) {
          if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
-            InsertNCCLAllReduceOp(&result, og);
+            if (IsSparseGradient(var_types, og)) {
              CreateReduceOp(&result, og, 0);
              CreateBroadcastOp(&result, og, 0);
            } else {
              InsertNCCLAllReduceOp(&result, og);
            }
          }
        }
      }
@ -165,6 +177,50 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
  return std::unique_ptr<SSAGraph>(graph);
 }
 // Reports whether the gradient variable `og` is stored as SELECTED_ROWS
 // according to `var_types`. `og` must be present in `var_types`; the
 // PADDLE_ENFORCE below fails otherwise.
 bool MultiDevSSAGraphBuilder::IsSparseGradient(
     const std::unordered_map<std::string, proto::VarType::Type> &var_types,
     const std::string &og) const {
   // Look the name up once and reuse the iterator for both the presence check
   // and the type comparison.
   auto type_it = var_types.find(og);
   PADDLE_ENFORCE(type_it != var_types.end());
   return type_it->second == proto::VarType::SELECTED_ROWS;
 }
 // Appends a BroadcastOpHandle to `result`. Its single input is the newest
 // handle of `p_name` on device `src_dev_id`; one fresh output handle of
 // `p_name` is created for every entry of `places_`.
 //
 // result:     graph being built; `ops_` and `vars_` are extended in place.
 // p_name:     name of the variable to broadcast. It is looked up with `.at()`
 //             on every device, so it is expected to exist there already.
 // src_dev_id: index of the device whose latest `p_name` handle is the source.
 void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
                                                 const std::string &p_name,
                                                 size_t src_dev_id) const {
 #ifdef PADDLE_WITH_CUDA
   auto *op_handle = new BroadcastOpHandle(local_scopes_, places_, nccl_ctxs_);
 #else
   auto *op_handle = new BroadcastOpHandle(local_scopes_, places_);
 #endif
   // `result->ops_` stores the handle from here on.
   result->ops_.emplace_back(op_handle);
   auto *in = result->vars_.at(src_dev_id).at(p_name).back().get();
   op_handle->AddInput(in);
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &vars = result->vars_.at(i).at(p_name);
     auto &p = places_[i];
     // The new handle's version is its index in `vars`.
     auto *out_var = new VarHandle(vars.size(), i, p_name, p);
     vars.emplace_back(out_var);
     op_handle->AddOutput(out_var);
     // The guard used to read `ADDLE_WITH_CUDA`, a misspelling that is never
     // defined, so the default-pool context was installed in CUDA builds too.
     // It now matches the guard used by CreateReduceOp.
     // NOTE(review): assumes the NCCL-aware BroadcastOpHandle constructor sets
     // up its own device contexts in CUDA builds -- confirm.
 #ifndef PADDLE_WITH_CUDA
     op_handle->SetDeviceContext(p,
                                 platform::DeviceContextPool::Instance().Get(p));
 #endif
   }
 }
 // Adds one ComputationOpHandle for `op` on device `dev_id` to `result`, then
 // wires up its inputs and outputs through CreateOpHandleIOs.
 void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
                                                     const OpDesc &op,
                                                     int dev_id) const {
   // The handle runs `op` in the scope and on the place that belong to
   // `dev_id`.
   auto *compute_op =
       new ComputationOpHandle(op, local_scopes_[dev_id], places_[dev_id]);
   result->ops_.emplace_back(compute_op);
   CreateOpHandleIOs(result, op, dev_id);
 }
 OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc(
    const ProgramDesc &program) const {
  for (auto *op : program.Block(0).AllOps()) {
@ -174,7 +230,6 @@ OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc(
  }
  return nullptr;
 }
 void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
    SSAGraph *result, const std::string &og) const {
 #ifdef PADDLE_WITH_CUDA
@ -247,6 +302,36 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
  }
 }
 // Appends a ReduceOpHandle to `result`. The newest handle of `og` on every
 // device becomes an input; a single new handle of `og` on device `dst_dev_id`
 // is created as the output.
 //
 // result:     graph being built; `ops_` and `vars_` are extended in place.
 // og:         name of the gradient variable to reduce. Every device must
 //             already hold at least one handle for it (enforced below).
 // dst_dev_id: index of the device that receives the reduced result.
 //
 // Returns the newly created output handle, which is also stored in
 // `result->vars_`.
 VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
                                                    const std::string &og,
                                                    int dst_dev_id) const {
 #ifdef PADDLE_WITH_CUDA
   result->ops_.emplace_back(
       new ReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
 #else
   result->ops_.emplace_back(new ReduceOpHandle(local_scopes_, places_));
 #endif
   auto *op_handle = result->ops_.back().get();
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &vars = result->vars_[i][og];
     // Only non-CUDA builds install the default-pool device context here.
 #ifndef PADDLE_WITH_CUDA
     auto &p = places_[i];
     op_handle->SetDeviceContext(p,
                                 platform::DeviceContextPool::Instance().Get(p));
 #endif
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad.get());
   }
   auto &vars = result->vars_[dst_dev_id][og];
   // The output is a new version of `og`, so its version is the index it gets
   // once appended, i.e. `vars.size()`. The earlier `vars.size() - 1` repeated
   // the version of the handle currently at the back of `vars`.
   auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]);
   vars.emplace_back(var);
   op_handle->AddOutput(var);
   return var;
 }
 void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
                                           const OpDesc &op) const {
  auto &p = places_[0];
@ -263,6 +348,7 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
  return op.OutputArgumentNames().size() == 1 &&
         op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
 }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@ -13,8 +13,8 @@
 // limitations under the License.
 #pragma once
 #include <string>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
@ -27,6 +27,7 @@ class NCCLContextMap;
 namespace framework {
 class Scope;
 namespace details {
 class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 public:
 #ifdef PADDLE_WITH_CUDA
@ -34,8 +35,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                          const std::string &loss_var_name,
                          const std::unordered_set<std::string> &params,
                          const std::vector<Scope *> &local_scopes,
-                          bool skip_scale_loss,
+                          platform::NCCLContextMap *nccl_ctxs,
-                          platform::NCCLContextMap *nccl_ctxs);
+                          bool use_default_grad_scale);
 #else
  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
                          const std::string &loss_var_name,
@ -74,6 +75,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                              size_t num_places) const;
  void CreateScaleLossGradOp(SSAGraph *result) const;
  VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og,
                            int dst_dev_id) const;
  void CreateComputationalOp(SSAGraph *result, const OpDesc &op,
                             int dev_id) const;
  bool IsParameterGradientOnce(
      const std::string &og,
@ -81,11 +86,18 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
  void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
  void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
                         size_t src_dev_id) const;
  /**
   * Get send op in the global block of program.
   * nullptr if not found.
   */
  OpDesc *GetSendOpDesc(const ProgramDesc &program) const;
  bool IsSparseGradient(
      const std::unordered_map<std::string, proto::VarType::Type> &var_types,
      const std::string &og) const;
 };
 }  // namespace details
 }  // namespace framework
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@ -22,6 +22,7 @@ namespace framework {
 namespace details {
 void ReduceOpHandle::RunImpl() {
  if (places_.size() == 1) return;
  // the input and output may have dummy var.
  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
@ -51,44 +52,48 @@ void ReduceOpHandle::RunImpl() {
  // Wait input done, this Wait is asynchronous operation
  WaitInputVarGenerated(in_var_handles);
  auto pre_place = in_0_handle->place_;
  std::vector<platform::Place> in_places;
  auto pre_in_tensor = VariableVisitor::GetMutableTensor(pre_in_var);
  for (auto *in_handle : in_var_handles) {
    auto in_p = in_handle->place_;
    PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
                      "Places must be all on CPU or all on CUDA.");
    in_places.emplace_back(in_p);
  // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
  std::vector<platform::Place> in_places;  // used to get dev_ctx
  for (auto *in_handle : in_var_handles) {
    in_places.emplace_back(in_handle->place_);
    auto in_var =
        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
    PADDLE_ENFORCE_NOT_NULL(in_var);
-
+    VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
    auto in_tensor = VariableVisitor::GetMutableTensor(in_var);
    PADDLE_ENFORCE_EQ(in_tensor.type(), pre_in_tensor.type(),
                      "The type of input is not consistent.");
  }
  auto out_var =
      var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
  PADDLE_ENFORCE_NOT_NULL(out_var);
  // NOTE: The tensors' Place of input and output must be all on GPU or all on
  // CPU.
  auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
  platform::Place t_out_p;
  if (platform::is_gpu_place(in_p)) {
    PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place_),
                   "Places of input and output must be all on GPU.");
    t_out_p = out_var_handle->place_;
  } else {
    t_out_p = platform::CPUPlace();
  }
  if (pre_in_var->IsType<framework::SelectedRows>()) {
    std::vector<const SelectedRows *> in_selected_rows =
        GetInputValues<SelectedRows>(in_var_handles, var_scopes);
-    GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_,
+    GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
                       out_var_handle->place_,
                       out_var->GetMutable<framework::SelectedRows>());
  } else {
    std::vector<const LoDTensor *> lod_tensors =
        GetInputValues<LoDTensor>(in_var_handles, var_scopes);
-    if (paddle::platform::is_cpu_place(pre_place)) {
+    if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
      ReduceLoDTensor func(lod_tensors,
                           out_var->GetMutable<framework::LoDTensor>());
      VisitDataType(ToDataType(lod_tensors[0]->type()), func);
-    } else if (paddle::platform::is_gpu_place(pre_place)) {
+    } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
 #ifdef PADDLE_WITH_CUDA
      auto pre_in = pre_in_var->Get<framework::LoDTensor>();
      VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
@ -96,7 +101,7 @@ void ReduceOpHandle::RunImpl() {
          out_var_handle->place_, pre_in.type());
      auto out_p = out_var_handle->place_;
-      int root = boost::get<platform::CUDAPlace>(out_p).device;
+      int root_id = boost::get<platform::CUDAPlace>(out_p).device;
      std::vector<std::function<void()>> all_reduce_calls;
      for (size_t i = 0; i < var_scopes.size(); ++i) {
        auto &p = in_places[i];
@ -104,23 +109,23 @@ void ReduceOpHandle::RunImpl() {
        int dev_id = boost::get<platform::CUDAPlace>(p).device;
        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
        auto stream = nccl_ctx.stream();
        auto comm = nccl_ctx.comm_;
        void *buffer = const_cast<void *>(lod_tensor.data<void>());
        void *recvbuffer = nullptr;
-        if (root == dev_id) {
+        if (root_id == dev_id) {
          recvbuffer =
              out_var->GetMutable<framework::LoDTensor>()->mutable_data(
                  out_var_handle->place_);
        }
        int type = platform::ToNCCLDataType(lod_tensor.type());
-        all_reduce_calls.emplace_back([=] {
+        size_t numel = static_cast<size_t>(lod_tensor.numel());
-          PADDLE_ENFORCE(platform::dynload::ncclReduce(
+        all_reduce_calls.emplace_back(
-              buffer, recvbuffer, static_cast<size_t>(lod_tensor.numel()),
+            [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
-              static_cast<ncclDataType_t>(type), ncclSum, root, comm, stream));
+              PADDLE_ENFORCE(platform::dynload::ncclReduce(
-        });
+                  buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
                  ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
            });
      }
      this->RunAndRecordEvent([&] {
@ -130,7 +135,7 @@ void ReduceOpHandle::RunImpl() {
        }
      });
 #else
-      PADDLE_THROW("CUDA is not support.");
+      PADDLE_THROW("CUDA is not enabled.");
 #endif
    } else {
      PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@ -55,7 +55,7 @@ struct ReduceOpHandle : public OpHandleBase {
  std::string Name() const override;
-  bool IsMultiDeviceTransfer() override { return false; };
+  bool IsMultiDeviceTransfer() override { return true; };
 protected:
  void RunImpl() override;
--- a/Show More
+++ b/Show More