Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_grpc_server_ready_condition

7 years ago · a131c73fcf
parent 008f6df9b2 211d56edd8
commit a131c73fcf
70 changed files with 1014 additions and 481 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -12,7 +12,7 @@ services:
 os:
  - linux
 env:
-  - JOB=build_doc
+  - JOB=doc
  - JOB=check_style
  - JOB=build_android
 addons:
@ -36,21 +36,18 @@ addons:
      - ccache
  ssh_known_hosts: 13.229.163.131
 before_install:
-  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
-  # protobuf version.
  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
-  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
+  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
    # 43min timeout
-    if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
-    else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
+    if [[ "$JOB" != "doc" ]]; then timeout 2580 paddle/scripts/paddle_docker_build.sh ${JOB}; else paddle/scripts/paddle_build.sh ${JOB}; fi;
+    if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
  - |
-    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
+    if [[ "$JOB" != "doc" ]]; then exit 0; fi;
+    # For document only
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
    if [[ "$TRAVIS_BRANCH" != "develop"  && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
--- a/AUTHORS.md
+++ b/AUTHORS.md
@ -2,12 +2,14 @@
 |---|---|
 | abhinavarora | Abhinav Arora |
 | backyes | Yan-Fei Wang |
+| baiyfbupt | Yi-Fan Bai |
 | beckett1124 | Bin Qi |
-| JiayiFeng | Jia-Yi Feng |
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |
 | dzhwinter | Zhi-Hong Dong |
+| dragonwarrior | Long Wang |
+| dyning | Yuning Du |
 | emailweixu | Wei Xu |
 | gangliao | Gang Liao |
 | gongweibao | Wei-Bao Gong |
@ -16,6 +18,9 @@
 | hedaoyuan | Dao-Yuan He |
 | helinwang | He-Lin Wang |
 | jacquesqiao | Long-Fei Qiao |
+| jczaja | Jacek Czaja |
+| JiayiFeng | Jia-Yi Feng |
+| kbinias | Krzysztof Binias |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |
@ -24,16 +29,20 @@
 | llxxxll | Yong-Feng Liu |
 | luotao01 | Tao Luo |
 | lzhao4ever | Liang Zhao |
+| mozga-intel | Mateusz Ozga |
 | NHZlX | Zhao-Long Xing |
+| Noplz | Yuan Gao |
 | pakchoi | Chuan-Jiang Song |
 | panyx0718 | Xin Pan |
 | pengli09 | Peng Li |
 | pkuyym | Ya-Ming Yang |
+| pzelazko-intel | Pawel Zelazko |
 | QiJune | Jun Qi |
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Superjom | Chun-Wei Yan |
 | tianbingsz | Tian-Bing Xu |
+| tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
 | wanghaoshuang | Hao-Shuang Wang |
 | wangyang59 | Yang Wang |
--- a/7
+++ b/7
@ -1,7 +1,6 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-
-# When you modify it, please be aware of cudnn-runtime version 
+# When you modify it, please be aware of cudnn-runtime version
 # and libcudnn.so.x in paddle/scripts/docker/build.sh
 FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
@ -24,7 +23,7 @@ ENV HOME /root
 COPY ./paddle/scripts/docker/root/ /root/

 RUN apt-get update && \
-    apt-get install -y \
+    apt-get install -y --allow-downgrades \
    git python-pip python-dev openssh-server bison \
    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@ -33,7 +32,7 @@ RUN apt-get update && \
    automake locales clang-format swig doxygen cmake  \
    liblapack-dev liblapacke-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools libtool && \
+    net-tools libtool ccache && \
    apt-get clean -y

 # Install Go and glide
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@ -172,6 +172,8 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
 list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# in cuda9, suppress cuda warning on eigen 
+list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")

--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@ -22,7 +22,9 @@ else()
        extern_eigen3
        ${EXTERNAL_PROJECT_LOG_ARGS}
        GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
-        GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
+        # eigen on cuda9.1 missing header of math_funtions.hpp
+        # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
+        GIT_TAG         917060c364181f33a735dc023818d5a54f60e54c
        PREFIX          ${EIGEN_SOURCE_DIR}
        UPDATE_COMMAND  ""
        CONFIGURE_COMMAND ""
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@ -38,8 +38,7 @@ ENDIF()
 ExternalProject_Add(
    extern_warpctc
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
-    GIT_TAG         b63a0644654a3e0ed624c85a1767bc8193aead09
+    GIT_REPOSITORY  "https://github.com/dzhwinter/warp-ctc.git"
    PREFIX          ${WARPCTC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@ -1,7 +1,7 @@
 # Averaging Parameter in PaddlePaddle

 ## Why Averaging
-In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can.
+In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.

 Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.

@ -16,16 +16,16 @@ We propose averaging for any optimizer similar to how ASGD performs it, as menti
 ### How to perform Parameter Averaging in PaddlePaddle

 Parameter Averaging in PaddlePaddle works in the following way during training :
-1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer
+1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
 2. The optimizer itself is responsible for updating the parameters.
 3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
-    1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches.
-    2. However, saving all the N instances of the parameters in memory is not feasible.
+    1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
+    2. However, saving all N instances of the parameters in memory is not feasible.
    3. Therefore, an approximation algorithm is used.

 Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.

-During the testing/ saving the model phase, we perform the following steps:
+During the testing/saving the model phase, we perform the following steps:
 1. Perform the delayed operations.
 2. Save current values of the parameters to a temporary variable.
 3. Replace the values of the parameters with the averaged values.
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@ -228,6 +228,21 @@ extern __thread cudaStream_t default_stream;
        << "CUDA error: " << hl_get_device_error_string((size_t)err); \
  }

+// __shfl has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T
+__shfl_sync(unsigned, T val, int src_line, int width) {
+  return __shfl(val, src_line, width);
+}
+
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
 #endif /* __NVCC__ */

 #endif /* HL_BASE_H_ */
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue,
 }

 __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
-  int addr = idx % 32;
+  const int warp_size = 32;
+  int addr = idx % warp_size;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, addr < warp_size);
 #pragma unroll
  for (int k = 1; k < 32; k++) {
-    // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
-    addr = __shfl(addr, (idx + 1) % 32, 32);
-    a[k] = __shfl(a[k], addr, 32);
+    // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
+    addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
  }

 #pragma unroll
@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
  }

  addr = (32 - idx) % 32;
+  CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
 #pragma unroll
  for (int k = 0; k < 32; k++) {
-    a[k] = __shfl(a[k], addr, 32);
-    addr = __shfl(addr, (idx + 31) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
+    addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
  }
 }

--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
    if (--beamSize == 0) break;
    __syncthreads();

+    unsigned mask = 0u;
+    // CREATE_SHFL_MASK(mask, tid < len);
+
    if (tid == maxId[0]) {
      if (beam < maxLength) {
        shTopK[tid] = topK[beam];
      }
    }
    if (maxId[0] / 32 == warp) {
-      if (__shfl(beam, (maxId[0]) % 32, 32) == maxLength) break;
+      if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
    }
  }
 }
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@ -34,7 +34,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
    const std::vector<platform::Place> &places,
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss,
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
    platform::NCCLContextMap *nccl_ctxs)
    : loss_var_name_(loss_var_name),
      places_(places),
@ -45,7 +45,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
    const std::vector<platform::Place> &places,
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool skip_scale_loss)
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
    : loss_var_name_(loss_var_name),
      places_(places),
      local_scopes_(local_scopes) {
@ -53,7 +53,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
  for (auto &p : params) {
    grad_names_.insert(GradVarName(p));
  }
-  skip_scale_loss_ = skip_scale_loss;
+  use_default_grad_scale_ = use_default_grad_scale;
 }

 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@ -126,8 +126,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    } else if (IsDistTrainOp(*op, send_op)) {
      CreateComputationalOps(&result, *op, 1);
    } else if (IsScaleLossOp(*op)) {
-      // user can customize loss@grad if skip_scale_loss_
-      if (!skip_scale_loss_) {
+      // user can customize loss@grad if not use_default_grad_scale_
+      if (use_default_grad_scale_) {
        CreateScaleLossGradOp(&result);
      }
      is_forwarding = false;
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@ -41,7 +41,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                          const std::string &loss_var_name,
                          const std::unordered_set<std::string> &params,
                          const std::vector<Scope *> &local_scopes,
-                          bool skip_scale_loss);
+                          bool use_default_grad_scale);
 #endif

  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
@ -59,7 +59,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
  platform::NCCLContextMap *nccl_ctxs_;
 #endif
-  bool skip_scale_loss_;
+  bool use_default_grad_scale_;

  bool IsScaleLossOp(const OpDesc &op) const;

--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@ -46,6 +46,7 @@ void ScaleLossGradOpHandle::RunImpl() {
              ->stream();
      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
+      VLOG(1) << place_ << "RUN Scale loss grad op";
    });
 #endif
  }
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -58,7 +58,7 @@ ParallelExecutor::ParallelExecutor(
    const std::unordered_set<std::string> &bcast_vars,
    const ProgramDesc &main_program, const std::string &loss_var_name,
    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
-    bool customize_scale_loss)
+    bool use_default_grad_scale)
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;

@ -93,11 +93,11 @@ ParallelExecutor::ParallelExecutor(
 #ifdef PADDLE_WITH_CUDA
  details::MultiDevSSAGraphBuilder builder(
      member_->places_, loss_var_name, params, member_->local_scopes_,
-      customize_scale_loss, member_->nccl_ctxs_.get());
+      use_default_grad_scale, member_->nccl_ctxs_.get());
 #else
  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
                                           params, member_->local_scopes_,
-                                           customize_scale_loss);
+                                           use_default_grad_scale);
 #endif
  auto graph = builder.Build(main_program);

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@ -40,7 +40,7 @@ class ParallelExecutor {
                            const ProgramDesc& main_program,
                            const std::string& loss_var_name, Scope* scope,
                            const std::vector<Scope*>& local_scopes,
-                            bool allow_op_delay, bool customize_scale_loss);
+                            bool allow_op_delay, bool use_default_grad_scale);

  ~ParallelExecutor();

--- a/paddle/fluid/operators/accuracy_op.cu
+++ b/paddle/fluid/operators/accuracy_op.cu
@ -15,7 +15,7 @@ limitations under the License. */
 #include <thrust/execution_policy.h>
 #include <thrust/reduce.h>
 #include "paddle/fluid/operators/accuracy_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {
--- a/paddle/fluid/operators/adagrad_op.cu
+++ b/paddle/fluid/operators/adagrad_op.cu
@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/adagrad_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@ -195,10 +195,9 @@ std::string ItemToString(const BeamSearch::Item &item) {
  return stream.str();
 }

-class BeamSearchProtoAndCheckerMaker
-    : public framework::OpProtoAndCheckerMaker {
+class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
+  BeamSearchOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // inputs and outputs stored in proto
    AddInput("pre_ids", "ids in previous step");
@ -222,20 +221,32 @@ class BeamSearchProtoAndCheckerMaker
  }
 };

-class BeamSearchInferShape : public framework::InferShapeBase {
+class BeamSearchOp : public framework::OperatorWithKernel {
 public:
-  void operator()(framework::InferShapeContext *context) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
    for (const std::string &arg :
         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
-      PADDLE_ENFORCE(context->HasInput(arg),
-                     "BeamSearch need input argument '%s'", arg);
+      PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
+                     arg);
    }
    for (const std::string &arg :
         std::vector<std::string>({"selected_ids", "selected_scores"})) {
-      PADDLE_ENFORCE(context->HasOutput(arg),
+      PADDLE_ENFORCE(ctx->HasOutput(arg),
                     "BeamSearch need output argument '%s'", arg);
    }
  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("pre_ids")->type()),
+        platform::CPUPlace());
+    return kt;
+  }
 };

 class BeamSearchInferVarType : public framework::VarTypeInference {
@ -254,8 +265,13 @@ class BeamSearchInferVarType : public framework::VarTypeInference {
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
-                  paddle::operators::BeamSearchProtoAndCheckerMaker,
-                  paddle::operators::BeamSearchInferShape,
-                  paddle::operators::BeamSearchInferVarType,
-                  paddle::framework::EmptyGradOpMaker);
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(beam_search, ops::BeamSearchOp, ops::BeamSearchOpMaker,
+                  ops::BeamSearchInferVarType);
+REGISTER_OP_CPU_KERNEL(
+    beam_search,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@ -192,49 +192,29 @@ std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);

 std::string ItemToString(const BeamSearch::Item& item);

-class BeamSearchOp : public framework::OperatorBase {
+template <typename DeviceContext, typename T>
+class BeamSearchOpKernel : public framework::OpKernel<T> {
 public:
-  BeamSearchOp(const std::string& type,
-               const framework::VariableNameMap& inputs,
-               const framework::VariableNameMap& outputs,
-               const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  BeamSearchOp(const BeamSearchOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    PADDLE_THROW("Not Implemented");
-  }
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    auto ids_var = scope.FindVar(Input("ids"));
-    auto scores_var = scope.FindVar(Input("scores"));
-    auto pre_ids_var = scope.FindVar(Input("pre_ids"));
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* ids_var = context.Input<framework::LoDTensor>("ids");
+    auto* scores_var = context.Input<framework::LoDTensor>("scores");
+    auto* pre_ids_var = context.Input<framework::LoDTensor>("pre_ids");
    PADDLE_ENFORCE_NOT_NULL(ids_var);
    PADDLE_ENFORCE_NOT_NULL(scores_var);
    PADDLE_ENFORCE_NOT_NULL(pre_ids_var);

-    auto& ids = ids_var->Get<framework::LoDTensor>();
-    auto& scores = scores_var->Get<framework::LoDTensor>();
-    auto& pre_ids = pre_ids_var->Get<framework::LoDTensor>();
-    size_t level = Attr<int>("level");
-    size_t beam_size = Attr<int>("beam_size");
-    int end_id = Attr<int>("end_id");
-    BeamSearch alg(ids, scores, level, beam_size, end_id);
-
-    auto selected_ids_var = scope.FindVar(Output("selected_ids"));
-    auto selected_scores_var = scope.FindVar(Output("selected_scores"));
+    size_t level = context.Attr<int>("level");
+    size_t beam_size = context.Attr<int>("beam_size");
+    int end_id = context.Attr<int>("end_id");
+    BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id);
+    auto selected_ids_var =
+        context.Output<framework::LoDTensor>("selected_ids");
+    auto selected_scores_var =
+        context.Output<framework::LoDTensor>("selected_scores");
    PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
    PADDLE_ENFORCE_NOT_NULL(selected_scores_var);
-    auto& selected_ids_tensor =
-        *selected_ids_var->GetMutable<framework::LoDTensor>();
-    auto& selected_scores_tensor =
-        *selected_scores_var->GetMutable<framework::LoDTensor>();
-    alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
+    alg(*pre_ids_var, selected_ids_var, selected_scores_var);
  }
 };
-
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/bilinear_interp_op.cu
+++ b/paddle/fluid/operators/bilinear_interp_op.cu
@ -10,7 +10,7 @@
   limitations under the License. */

 #include "paddle/fluid/operators/bilinear_interp_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
--- a/paddle/fluid/operators/box_coder_op.cu
+++ b/paddle/fluid/operators/box_coder_op.cu
@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/box_coder_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@ -87,7 +87,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
          concat_grad_functor;
-      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
+      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs);
    }
  }
 };
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@ -20,6 +20,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"

+DEFINE_bool(cudnn_algo_use_autotune, true,
+            "Whether allow using an autotuning algorithm for convolution "
+            "operator. The autotuning algorithm may be non-deterministic. If "
+            "false, the algorithm is deterministic.");
+
 namespace paddle {
 namespace operators {

@ -267,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
    if (input_grad) {
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-              handle, cudnn_filter_desc,
-              // dyDesc: Handle to the previously initialized input differential
-              // tensor descriptor.
-              cudnn_output_grad_desc, cudnn_conv_desc,
-              // dxDesc: Handle to the previously initialized output tensor
-              // descriptor.
-              cudnn_input_desc,
-              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &data_algo));
+      if (FLAGS_cudnn_algo_use_autotune) {
+        PADDLE_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+                handle, cudnn_filter_desc,
+                // dyDesc: Handle to the previously initialized input
+                // differential
+                // tensor descriptor.
+                cudnn_output_grad_desc, cudnn_conv_desc,
+                // dxDesc: Handle to the previously initialized output tensor
+                // descriptor.
+                cudnn_input_desc,
+                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &data_algo));
+      } else {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+      }
+
      PADDLE_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
              handle, cudnn_filter_desc, cudnn_output_grad_desc,
@ -286,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
    }

    if (filter_grad) {
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
-              cudnn_filter_desc,
-              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &filter_algo));
+      if (FLAGS_cudnn_algo_use_autotune) {
+        PADDLE_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+                handle, cudnn_input_desc, cudnn_output_grad_desc,
+                cudnn_conv_desc, cudnn_filter_desc,
+                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &filter_algo));
+      } else {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+      }

      PADDLE_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
--- a/paddle/fluid/operators/conv_shift_op.cu
+++ b/paddle/fluid/operators/conv_shift_op.cu
@ -14,7 +14,7 @@ limitations under the License. */

 #include "paddle/fluid/operators/conv_shift_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {
--- a/paddle/fluid/operators/edit_distance_op.cu
+++ b/paddle/fluid/operators/edit_distance_op.cu
@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/edit_distance_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/gpu_info.h"

 namespace paddle {
--- a/Show More
+++ b/Show More