Merge branch 'develop' into tr_convert_init

simplify_fluid_api_recognize_digit
Luo Tao 7 years ago
commit 9945265f09

@ -12,7 +12,7 @@ services:
os:
- linux
env:
- JOB=build_doc
- JOB=doc
- JOB=check_style
- JOB=build_android
addons:
@ -36,21 +36,18 @@ addons:
- ccache
ssh_known_hosts: 13.229.163.131
before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks compatibility, so we specify the python
# protobuf version.
- sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
- sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
- sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- |
# 43min timeout
if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi;
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$JOB" != "doc" ]]; then exit 0; fi;
# For document only
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh

@ -2,12 +2,14 @@
|---|---|
| abhinavarora | Abhinav Arora |
| backyes | Yan-Fei Wang |
| baiyfbupt | Yi-Fan Bai |
| beckett1124 | Bin Qi |
| JiayiFeng | Jia-Yi Feng |
| chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng |
| dzhwinter | Zhi-Hong Dong |
| dragonwarrior | Long Wang |
| dyning | Yuning Du |
| emailweixu | Wei Xu |
| gangliao | Gang Liao |
| gongweibao | Wei-Bao Gong |
@ -16,6 +18,9 @@
| hedaoyuan | Dao-Yuan He |
| helinwang | He-Lin Wang |
| jacquesqiao | Long-Fei Qiao |
| jczaja | Jacek Czaja |
| JiayiFeng | Jia-Yi Feng |
| kbinias | Krzysztof Binias |
| kuke | Yi-Bing Liu |
| lcy-seso | Ying Cao |
| lipeng-unisound | Peng Li |
@ -24,15 +29,20 @@
| llxxxll | Yong-Feng Liu |
| luotao01 | Tao Luo |
| lzhao4ever | Liang Zhao |
| mozga-intel | Mateusz Ozga |
| NHZlX | Zhao-Long Xing |
| Noplz | Yuan Gao |
| pakchoi | Chuan-Jiang Song |
| panyx0718 | Xin Pan |
| pengli09 | Peng Li |
| pkuyym | Ya-Ming Yang |
| pzelazko-intel | Pawel Zelazko |
| QiJune | Jun Qi |
| qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu |
| Superjom | Chun-Wei Yan |
| tianbingsz | Tian-Bing Xu |
| tpatejko | Tomasz Patejko |
| typhoonzero | Yi Wu |
| wanghaoshuang | Hao-Shuang Wang |
| wangyang59 | Yang Wang |

@ -1,7 +1,6 @@
# An image for building paddle binaries
# Use cuda devel base image for both cpu and gpu environment
# When you modify it, please be aware of cudnn-runtime version
# When you modify it, please be aware of cudnn-runtime version
# and libcudnn.so.x in paddle/scripts/docker/build.sh
FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
@ -24,7 +23,7 @@ ENV HOME /root
COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y \
apt-get install -y --allow-downgrades \
git python-pip python-dev openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@ -33,7 +32,7 @@ RUN apt-get update && \
automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool && \
net-tools libtool ccache && \
apt-get clean -y
# Install Go and glide

@ -21,7 +21,7 @@ import argparse
import time
import distutils.util
import paddle.v2 as paddle
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework

@ -20,7 +20,7 @@ import numpy as np
import argparse
import time
import paddle.v2 as paddle
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

@ -23,7 +23,7 @@ import time
import cProfile, pstats, StringIO
import paddle.v2 as paddle
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler

@ -23,10 +23,10 @@ import random
import time
import numpy
import paddle.v2 as paddle
import paddle.v2.dataset.imdb as imdb
import paddle
import paddle.dataset.imdb as imdb
import paddle.fluid as fluid
from paddle.v2 import batch
import paddle.batch as batch
import paddle.fluid.profiler as profiler

@ -17,7 +17,7 @@ from __future__ import print_function
import sys
import time
import numpy as np
import paddle.v2 as paddle
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse

@ -172,6 +172,8 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
# in cuda9, suppress cuda warning on eigen
list(APPEND CUDA_NVCC_FLAGS "-w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")

@ -22,7 +22,9 @@ else()
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
# eigen on cuda9.1 is missing the header math_functions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""

@ -38,8 +38,7 @@ ENDIF()
ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git"
GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09
GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git"
PREFIX ${WARPCTC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

@ -56,11 +56,11 @@ DataFeeder
Reader
======
.. automodule:: paddle.v2.reader
.. automodule:: paddle.reader
:members:
:noindex:
.. automodule:: paddle.v2.reader.creator
.. automodule:: paddle.reader.creator
:members:
:noindex:

@ -479,6 +479,13 @@ label_smooth
.. autofunction:: paddle.fluid.layers.label_smooth
:noindex:
roi_pool
---------
.. autofunction:: paddle.fluid.layers.roi_pool
:noindex:
ops
===
@ -820,3 +827,5 @@ topk
.. autofunction:: paddle.fluid.layers.topk
:noindex:

@ -1,7 +1,7 @@
# Averaging Parameter in PaddlePaddle
## Why Averaging
In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can.
In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.
Polyak and Juditsky (1992) showed that the test performance of the simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values obtained by training the model over and over again on the training dataset.
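For reference, the simple average mentioned above can be written as follows (a sketch in standard notation, not taken from the paper, where \theta_t denotes the parameter vector after SGD step t):
```
\bar{\theta}_T = \frac{1}{T} \sum_{t=1}^{T} \theta_t
```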
@ -16,16 +16,16 @@ We propose averaging for any optimizer similar to how ASGD performs it, as menti
### How to perform Parameter Averaging in PaddlePaddle
Parameter Averaging in PaddlePaddle works in the following way during training:
1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer
1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
2. The optimizer itself is responsible for updating the parameters.
3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches.
2. However, saving all the N instances of the parameters in memory is not feasible.
1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
2. However, saving all N instances of the parameters in memory is not feasible.
3. Therefore, an approximation algorithm is used.
Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved (see the sketch after the steps below).
During the testing/ saving the model phase, we perform the following steps:
During the testing/saving the model phase, we perform the following steps:
1. Perform the delayed operations.
2. Save current values of the parameters to a temporary variable.
3. Replace the values of the parameters with the averaged values.
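The following minimal NumPy sketch illustrates the two-copies mechanism described above. All names here are hypothetical illustrations, not the actual ParameterAverageOptimizer API, and the incremental mean stands in for the approximation algorithm mentioned in step 3:
```Python
import numpy

class AveragedParam(object):
    """Illustration only: a live parameter copy for the optimizer and a
    running average for testing/saving. The real ParameterAverageOptimizer
    likewise avoids storing all N parameter instances."""

    def __init__(self, value):
        self.value = value.copy()    # used in back propagation
        self.average = value.copy()  # used during testing, and saved
        self.count = 1

    def sgd_step(self, grad, lr=0.01):
        self.value -= lr * grad
        self.count += 1
        # incremental mean: avg_t = avg_{t-1} + (value_t - avg_{t-1}) / t
        self.average += (self.value - self.average) / self.count

    def swap(self):
        # steps 2-3 of the testing/saving phase: stash the current values,
        # expose the averaged values
        self.value, self.average = self.average, self.value

p = AveragedParam(numpy.zeros(4, dtype=numpy.float32))
for _ in range(100):
    p.sgd_step(numpy.random.randn(4).astype(numpy.float32))
p.swap()   # test/save with the averaged weights
p.swap()   # swap back before resuming training
```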

@ -3,7 +3,7 @@
## Why float16
Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of the traditional 32-bit single precision format (float) and has lower precision and a smaller range.
When high precision computation is not required, using float16 data type could potentially
When high precision computation is not required (which is usually the case at least in the deep learning inference stage), using float16 data type could potentially
- reduce storage space, memory bandwidth, and power usages;
- increase the chance of data fitting into a smaller cache of lower latency;
@ -12,7 +12,7 @@ When high precision computation is not required, using float16 data type could p
## Survey of current float16 support
A brief survey of float16 support on different compilers, hardware, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernel. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier.
The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernels. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier.
### Compiler
- nvcc supports `__half` data type after CUDA 7.5.
@ -95,11 +95,89 @@ float half_to_float(float16 h);
```
which provide one-to-one conversion between float32 and float16. These two functions use different conversion routines depending on the current hardware: CUDA/ARM intrinsics are used when the corresponding hardware is available, and if the hardware or compiler does not support float32 to float16 conversion, software emulation performs the conversion.
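As a rough analogue of this conversion pair (using numpy rather than the Fluid functions themselves):
```Python
import numpy

f = numpy.float32(0.1)
h = numpy.float16(f)       # float -> float16: round to nearest, may lose precision
f2 = numpy.float32(h)      # float16 -> float: exact widening conversion
print(f, f2, abs(f - f2))  # small error from float16's 10-bit mantissa
```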
## To do
After float16 class is available, some of the future items are below:
## float16 inference
In Fluid, a neural network is represented as a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), whose Python wrapper is a [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program). The basic structure of a program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program desc by executing the sequence of operators in the entrance block of the program one by one.
- Update pybind/tensor_py.h to bind c++ float16 with numpy float16.
### Operator level requirement
Each operator has many kernels for different data types, devices, and library types. The operator will select the appropriate kernel to run based on, among other things, the data type of the input variables. By default, every Fluid operator has a float data type kernel that takes float variables as input and generates float output.
- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
This means that if we provide float input to the first operator in a program, then each operator will use the float kernel to compute a float output and send it as input to the next operator, triggering that operator's float kernel in turn. Overall, the program will run in float mode and give us a final output of float data type.
- Create a type-casting operator that can convert the data type in tensor between float16 and other types.
The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator, and then one by one, each operator in the program will run the float16 kernel (provided that each operator in this program has float16 kernels registered) until we finally obtain a float16 output variable.
So the preliminary requirement for float16 inference is to add float16 kernels to the operators needed by a specific kind of program. For example, float16 inference on an image classification neural network like Vgg or Resnet typically requires the following operators to have float16 kernels: convolution, pooling, multiplication, addition, batch norm, dropout, relu, and softmax. Please refer to [new_op_en](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) for details of how to add new kernels to an operator.
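For intuition, here is a small sketch, assuming the fluid Python API of this era, of how a program's entrance block and its operator sequence can be inspected; it also illustrates the float-mode propagation described above, since the float32 input makes every operator select its float kernel:
```Python
import paddle.fluid as fluid

prog = fluid.Program()
with fluid.program_guard(prog):
    # float32 input: every operator below selects its float kernel, so
    # the whole program runs in float mode from input to output.
    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    hidden = fluid.layers.fc(input=x, size=2)
    out = fluid.layers.softmax(input=hidden)

# The entrance block is a variable list plus an operator sequence that
# an executor runs one by one.
print([op.type for op in prog.global_block().ops])
# expected something like: ['mul', 'elementwise_add', 'softmax']
```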
### Variable level requirement
Operators including convolution and multiplication (used in fully-connected layers) take as input not only the variables generated by the preceding operators but also [parameter](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#parameter) variables, which contain the trained weights to apply to the input data. These weights are obtained in the Fluid training process and are by default of float data type.
When these operators are running in float16 mode, the float16 kernel requires those parameter variables to contain weights of Fluid float16 data type. Thus, we need a convenient way to convert the original float weights to float16 weights.
In Fluid, we use tensor to hold actual data for a variable on the c++ end. [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h) is used to bind c++ tensors of a certain data type with numpy arrays of the corresponding numpy data type on the Python end. Each common c++ built-in data type has a corresponding numpy data type of the same name. However, since there is no built-in float16 type in c++, we cannot directly bind the numpy float16 data type with the Fluid float16 class. Since both Fluid float16 and numpy float16 use uint16 as the internal data storage type, we use the c++ built-in type `uint16_t` and the corresponding numpy uint16 data type to bridge the gap via [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h).
The following code demonstrates how to do the tensor conversion.
```Python
# var is the variable of float weights
# tensor is a numpy array of data copied from the tensor data in var
# fp16_var is the variable that will contain float16 weights converted from var
tensor = numpy.array(var.get_tensor())
fp16_tensor = fp16_var.get_tensor()
# After the original tensor data is converted to numpy float16 data type,
# view(numpy.uint16) is used so that the internal memory of the numpy array
# will be reinterpreted to be of uint16 data type, which is bound to the
# Fluid float16 class via pybind with the help of uint16_t built-in c++ type
fp16_tensor.set(tensor.astype(numpy.float16).view(numpy.uint16), GPUPlace)
```
### Consistent API requirement
The basic inference in float16 mode requires users to feed input and obtain output both of float16 data type. However, in this way, the inference APIs are not consistent between float16 mode and float mode, and users may find it confusing and difficult to use float16 inference since they need to do extra steps to provide float16 input data and convert float16 output data back to float. To have a consistent API for different inference modes, we need to transpile the program desc in some way so that we can run float16 inference by feeding and fetching variables of float data type.
This problem can be solved by introducing a type-casting operator which takes an input variable of a certain data type, casts it to another specified data type, and puts the cast data into the output variable. Inserting cast operators where needed can make a program run internally in float16 mode.
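A minimal sketch of the idea using the existing `fluid.layers.cast` layer (illustrative only: the transpiler inserts equivalent cast operators into the program desc directly, and the float16 fc kernel is assumed to be registered):
```Python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[3, 32, 32], dtype='float32')
# cast the float input to float16 so downstream operators invoke
# their float16 kernels
x_fp16 = fluid.layers.cast(x=x, dtype='float16')
y_fp16 = fluid.layers.fc(input=x_fp16, size=10)
# cast the float16 result back so users still feed and fetch float data
y = fluid.layers.cast(x=y_fp16, dtype='float32')
```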
### float16 transpiler
With all the above requirements in mind, we designed a float16 inference transpiler that can transpile a float32 mode inference program desc to a float16 mode one.
Given a float inference program and the corresponding variables of float32 weights in the [scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md),
this transpiler mainly does the following modifications:
1. Insert cast operators at the beginning of the program so that the input float data will be converted to float16 data type before being fed to subsequent operators to invoke the float16 kernel.
2. Insert cast operators at the end of the program so that the output float16 data will be converted back to float data type before users obtain the result.
3. For each parameter variable of float weights, create in the scope a corresponding variable of float16 weights converted from the float weights, and add this new float16 variable to the program.
4. Update the operator information in the program so that each relevant operator uses the newly created float16 variable instead of its float counterpart.
Below is an example of usage:
```Python
# Get the float inference program
[float_inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# Prepare the float input data
tensor_img = numpy.random.rand(1, 3, 32, 32).astype(numpy.float32)
# Running inference_program in float mode
float_results = exe.run(float_inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
# Use float16 transpiler to speedup
float16_inference_program = float_inference_program.clone()
t = fluid.InferenceTranspiler()
t.float16_transpile(float16_inference_program, GPUPlace)
# Running
float16_results = exe.run(float16_inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
```
As we can see from the example above, users can simply use the `float16_transpile` method provided by the inference transpiler class on an existing float inference program to run inference in float16 mode.
### Speedup on GPU
Currently, Fluid inference in float16 mode is only supported on Nvidia GPU devices. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there, and float16 calculation will only be slower than its float counterpart.
Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0.
Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7.0 make float16 truly superior to float in certain deep learning applications. Please refer to this [benchmark report](https://github.com/kexinzhao/Paddle_benchmark/blob/master/float16_benchmark.md) for more details.

@ -56,11 +56,11 @@ DataFeeder
Reader
======
.. automodule:: paddle.v2.reader
.. automodule:: paddle.reader
:members:
:noindex:
.. automodule:: paddle.v2.reader.creator
.. automodule:: paddle.reader.creator
:members:
:noindex:

@ -1,82 +1,82 @@
Dataset
=======
.. automodule:: paddle.v2.dataset
.. automodule:: paddle.dataset
:members:
:noindex:
mnist
+++++
.. automodule:: paddle.v2.dataset.mnist
.. automodule:: paddle.dataset.mnist
:members:
:noindex:
cifar
+++++
.. automodule:: paddle.v2.dataset.cifar
.. automodule:: paddle.dataset.cifar
:members:
:noindex:
conll05
+++++++
.. automodule:: paddle.v2.dataset.conll05
.. automodule:: paddle.dataset.conll05
:members: get_dict,get_embedding,test
:noindex:
imdb
++++
.. automodule:: paddle.v2.dataset.imdb
.. automodule:: paddle.dataset.imdb
:members:
:noindex:
imikolov
++++++++
.. automodule:: paddle.v2.dataset.imikolov
.. automodule:: paddle.dataset.imikolov
:members:
:noindex:
movielens
+++++++++
.. automodule:: paddle.v2.dataset.movielens
.. automodule:: paddle.dataset.movielens
:members:
:noindex:
.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
.. autoclass:: paddle.dataset.movielens.MovieInfo
:noindex:
.. autoclass:: paddle.v2.dataset.movielens.UserInfo
.. autoclass:: paddle.dataset.movielens.UserInfo
:noindex:
sentiment
+++++++++
.. automodule:: paddle.v2.dataset.sentiment
.. automodule:: paddle.dataset.sentiment
:members:
:noindex:
uci_housing
+++++++++++
.. automodule:: paddle.v2.dataset.uci_housing
.. automodule:: paddle.dataset.uci_housing
:members:
:noindex:
wmt14
+++++
.. automodule:: paddle.v2.dataset.wmt14
.. automodule:: paddle.dataset.wmt14
:members:
:noindex:
wmt16
+++++
.. automodule:: paddle.v2.dataset.wmt16
.. automodule:: paddle.dataset.wmt16
:members:
:noindex:

@ -228,6 +228,21 @@ extern __thread cudaStream_t default_stream;
<< "CUDA error: " << hl_get_device_error_string((size_t)err); \
}
// __shfl has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T
__shfl_sync(unsigned, T val, int src_line, int width) {
return __shfl(val, src_line, width);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
#endif /* __NVCC__ */
#endif /* HL_BASE_H_ */

@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue,
}
__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
int addr = idx % 32;
const int warp_size = 32;
int addr = idx % warp_size;
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, addr < warp_size);
#pragma unroll
for (int k = 1; k < 32; k++) {
// rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
addr = __shfl(addr, (idx + 1) % 32, 32);
a[k] = __shfl(a[k], addr, 32);
// rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
a[k] = __shfl_sync(mask, a[k], addr, 32);
}
#pragma unroll
@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
}
addr = (32 - idx) % 32;
CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
#pragma unroll
for (int k = 0; k < 32; k++) {
a[k] = __shfl(a[k], addr, 32);
addr = __shfl(addr, (idx + 31) % 32, 32);
a[k] = __shfl_sync(mask, a[k], addr, 32);
addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
}
}

@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
if (--beamSize == 0) break;
__syncthreads();
unsigned mask = 0u;
// CREATE_SHFL_MASK(mask, tid < len);
if (tid == maxId[0]) {
if (beam < maxLength) {
shTopK[tid] = topK[beam];
}
}
if (maxId[0] / 32 == warp) {
if (__shfl(beam, (maxId[0]) % 32, 32) == maxLength) break;
if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
}
}
}

@ -139,7 +139,7 @@ struct TestBroadcastOpHandle {
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
f::TensorCopySync(out_tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t i = 0; i < f::product(kDims); ++i) {
@ -185,7 +185,7 @@ struct TestBroadcastOpHandle {
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
f::TensorCopySync(rt, cpu_place, &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t i = 0; i < f::product(kDims); ++i) {

@ -66,8 +66,7 @@ void FetchOpHandle::RunImpl() {
auto &t = var->Get<framework::LoDTensor>();
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i], true);
dev_ctxes_.at(t.place())->Wait();
TensorCopySync(t, cpu, &tensors_[i]);
#endif
} else {
tensors_[i].ShareDataWith(t);

@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes, bool skip_scale_loss,
const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
platform::NCCLContextMap *nccl_ctxs)
: loss_var_name_(loss_var_name),
places_(places),
@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes, bool skip_scale_loss)
const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
: loss_var_name_(loss_var_name),
places_(places),
local_scopes_(local_scopes) {
@ -53,28 +53,25 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
for (auto &p : params) {
grad_names_.insert(GradVarName(p));
}
skip_scale_loss_ = skip_scale_loss;
use_default_grad_scale_ = use_default_grad_scale;
}
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
const OpDesc &op,
const platform::Place &p,
const size_t &i) const {
size_t place_id) const {
auto p = places_[place_id];
auto *op_handle = result->ops_.back().get();
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
auto var_names = op.InputArgumentNames();
for (auto &each_var_name : var_names) {
VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
for (auto &each_var_name : op.InputArgumentNames()) {
VarHandle *var =
CreateOrGetLatestVarHandle(result, each_var_name, p, place_id);
op_handle->AddInput(var);
}
var_names = op.OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(result, op_handle, each_var_name, p, i);
for (auto &each_var_name : op.OutputArgumentNames()) {
CreateOpOutput(result, op_handle, each_var_name, p, place_id);
}
}
@ -84,17 +81,18 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
return false;
}
auto checker = [&](const std::vector<std::string> opvars,
const std::vector<std::string> sendvars) -> bool {
bool is_dist_train_op = false;
/**
* Check whether any of opvars contains `.block` and also appears in sendvars.
*/
auto checker = [](const std::vector<std::string> &opvars,
const std::vector<std::string> &sendvars) -> bool {
for (auto &var : opvars) {
if (var.find(".block") != std::string::npos &&
std::find(sendvars.begin(), sendvars.end(), var) != sendvars.end()) {
is_dist_train_op = true;
break;
return true;
}
}
return is_dist_train_op;
return false;
};
if (op.Type() == "split") {
@ -117,13 +115,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
places_.size());
// Find "send" op first for split is in front of send.
OpDesc *send_op = nullptr;
for (auto *op : program.Block(0).AllOps()) {
if (op->Type() == "send") {
send_op = op;
break;
}
}
OpDesc *send_op = GetSendOpDesc(program);
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
@ -134,7 +126,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
} else if (IsDistTrainOp(*op, send_op)) {
CreateComputationalOps(&result, *op, 1);
} else if (IsScaleLossOp(*op)) {
if (!skip_scale_loss_) {
// user can customize loss@grad if not use_default_grad_scale_
if (use_default_grad_scale_) {
CreateScaleLossGradOp(&result);
}
is_forwarding = false;
@ -142,10 +135,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
CreateComputationalOps(&result, *op, places_.size());
if (!is_forwarding) {
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once. But there are no
// other cases, for example, we need to adjust the gradient according to
// the input when we get the gradient, which is not considered at
// present.
// broadcast, and each gradient is only broadcast once.
for (auto &og : op->OutputArgumentNames()) {
if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
InsertNCCLAllReduceOp(&result, og);
@ -175,6 +165,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
return std::unique_ptr<SSAGraph>(graph);
}
OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc(
const ProgramDesc &program) const {
for (auto *op : program.Block(0).AllOps()) {
if (op->Type() == "send") {
return op;
}
}
return nullptr;
}
void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
SSAGraph *result, const std::string &og) const {
#ifdef PADDLE_WITH_CUDA
@ -243,7 +243,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
auto p = places_[scope_idx];
auto s = local_scopes_[scope_idx];
result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
CreateOpHandleIOs(result, op, p, scope_idx);
CreateOpHandleIOs(result, op, scope_idx);
}
}
@ -255,7 +255,7 @@ void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
result->ops_.emplace_back(new SendOpHandle(op, s, p));
// Create inputs for the output on the original place; no SSA output
// is created for the send op.
CreateOpHandleIOs(result, op, p, 0);
CreateOpHandleIOs(result, op, 0);
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {

@ -41,14 +41,14 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::string &loss_var_name,
const std::unordered_set<std::string> &params,
const std::vector<Scope *> &local_scopes,
bool skip_scale_loss);
bool use_default_grad_scale);
#endif
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
private:
void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
const platform::Place &p, const size_t &i) const;
size_t place_id) const;
private:
std::string loss_var_name_;
@ -59,12 +59,15 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
#ifdef PADDLE_WITH_CUDA
platform::NCCLContextMap *nccl_ctxs_;
#endif
bool skip_scale_loss_;
bool use_default_grad_scale_;
bool IsScaleLossOp(const OpDesc &op) const;
void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
/**
* Whether this operator is the end-point operator before/after the send operator.
*/
bool IsDistTrainOp(const OpDesc &op, OpDesc *send_op) const;
void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
@ -77,6 +80,12 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
std::unordered_set<std::string> *og_has_been_broadcast) const;
void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
/**
* Get the send op in the global block of the program.
* Returns nullptr if not found.
*/
OpDesc *GetSendOpDesc(const ProgramDesc &program) const;
};
} // namespace details
} // namespace framework

@ -194,7 +194,7 @@ struct TestReduceOpHandle {
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
f::TensorCopySync(rt, cpu_place, &result_tensor);
float *ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
@ -239,7 +239,7 @@ struct TestReduceOpHandle {
auto &rt = out_var->Get<f::LoDTensor>();
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
f::TensorCopySync(rt, cpu_place, &result_tensor);
float *ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
