Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into lstm_fix

dangqingqing 7 years ago
commit ed2d30b5f9

@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD
 are transformed into offsets of elements/words as follows:
 
 ```
-0       9     10     15
-        =     =      =
-        3+2+4 1+9    2+3+10
+0       3     4      6
+        =     =      =
+        3     3+1    4+2
 ```
-
-so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
-
-The complete offset representation is as follows:
-
-```
-0       9  10         15
-0  3  5 9  10  12     15
-   ||| || |||| |  || |||
-```
 
 ## Slicing of LoD Tensors
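The length-to-offset conversion illustrated above is a plain prefix sum: each offset is the running total of the lengths before it. A minimal self-contained sketch of the idea (the function name is ours, not Paddle's):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// offsets[0] = 0 and offsets[i+1] = offsets[i] + lengths[i], so sequence i
// spans [offsets[i], offsets[i+1]) in the underlying tensor.
std::vector<size_t> LengthsToOffsets(const std::vector<size_t>& lengths) {
  std::vector<size_t> offsets(lengths.size() + 1, 0);
  for (size_t i = 0; i < lengths.size(); ++i) {
    offsets[i + 1] = offsets[i] + lengths[i];
  }
  return offsets;
}

int main() {
  // Top-level lengths from the example: articles with 3, 1, and 2 sentences.
  for (size_t off : LengthsToOffsets({3, 1, 2})) std::cout << off << ' ';
  std::cout << '\n';  // prints: 0 3 4 6
}
```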

@@ -67,8 +67,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
                    out);
     in_var->SetLoDLevel(out_var->GetLodLevel());
   }
 
+  bool IsRuntime() const override;
+
+ protected:
+  VarDesc::VarType GetVarType(const std::string &name) const override;
+
  private:
   DDim GetDim(const std::string &name) const override;
   void SetDim(const std::string &name, const DDim &dim) override;
@@ -349,6 +352,9 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
     info.infer_var_type_(*this, block);
   } else {
     // all output type is LoDTensor by default
+    VLOG(10) << this->Type()
+             << " has not registered InferVarType. Set output variables to "
+                "LOD_TENSOR";
     for (auto &out_pair : this->outputs_) {
       for (auto &out_var_name : out_pair.second) {
         block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
@@ -448,6 +454,12 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
                                           const DDim &dim) {
   block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
 }
 
+bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
+
+VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
+    const std::string &name) const {
+  return block_.FindVarRecursive(name)->GetType();
+}
+
 }  // namespace framework
 }  // namespace paddle

@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/framework/operator.h"
 #include <algorithm>
 #include <atomic>
+#include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/shape_inference.h"
+#include "paddle/framework/var_type.h"
 
 namespace paddle {
 namespace framework {
@@ -365,7 +367,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
     out_tensor->set_lod(in_tensor.lod());
   }
 
- private:
+  bool IsRuntime() const override { return true; }
+
+ protected:
   DDim GetDim(const std::string& name) const override {
     Variable* var = scope_.FindVar(name);
     if (var->IsType<LoDTensor>()) {

@@ -388,6 +392,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
     }
   }
 
+  VarDesc::VarType GetVarType(const std::string& name) const override {
+    auto* var = scope_.FindVar(name);
+    return ToVarType(var->Type());
+  }
+
+ private:
   const OperatorBase& op_;
   const Scope& scope_;
 };

@@ -298,11 +298,10 @@ class ExecutionContext {
   }
 
 #ifdef PADDLE_WITH_CUDA
-  const platform::CUDADeviceContext& cuda_device_context() const {
+  const inline platform::CUDADeviceContext& cuda_device_context() const {
     PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
-    auto cuda_ctx =
-        reinterpret_cast<const platform::CUDADeviceContext*>(&device_context_);
-    return *cuda_ctx;
+    return *reinterpret_cast<const platform::CUDADeviceContext*>(
+        &device_context_);
   }
 #endif
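This helper centralizes a downcast that several GPU kernels below previously spelled out inline; after the change most call sites shrink to `ctx.cuda_device_context().stream()`. A self-contained analogue of the pattern, with the Paddle types replaced by toy stand-ins (the real method also checks `is_gpu_place` first, as shown above):

```cpp
#include <iostream>

// Simplified stand-ins for the Paddle types involved.
struct DeviceContext {
  virtual ~DeviceContext() = default;
};
struct CUDADeviceContext : DeviceContext {
  int stream() const { return 42; }  // stand-in for a cudaStream_t
};

struct ExecutionContext {
  explicit ExecutionContext(const DeviceContext& dc) : device_context_(dc) {}
  // One helper owns the downcast, so every call site becomes a one-liner.
  const CUDADeviceContext& cuda_device_context() const {
    return *reinterpret_cast<const CUDADeviceContext*>(&device_context_);
  }
  const DeviceContext& device_context_;
};

int main() {
  CUDADeviceContext cuda;
  ExecutionContext ctx(cuda);
  std::cout << ctx.cuda_device_context().stream() << '\n';  // prints: 42
}
```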

@@ -46,6 +46,23 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
     SetDim(names[i], dims[i]);
   }
 }
 
+std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Inputs(name));
+}
+
+std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Outputs(name));
+}
+
+std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
+    const std::vector<std::string> &names) const {
+  std::vector<VarDesc::VarType> retv;
+  retv.resize(names.size());
+  std::transform(names.begin(), names.end(), retv.begin(),
+                 std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
+                           std::placeholders::_1));
+  return retv;
+}
+
 }  // namespace framework
 }  // namespace paddle
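`GetVarTypes` above maps a member function over a list of names with the `std::transform` plus `std::bind(std::mem_fn(...))` idiom. The same idiom in isolation, with an illustrative `Lookup` type that is not part of Paddle:

```cpp
#include <algorithm>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Lookup {
  int Get(const std::string& key) const { return static_cast<int>(key.size()); }
};

int main() {
  Lookup table;
  std::vector<std::string> keys = {"a", "bb", "ccc"};
  std::vector<int> values(keys.size());
  // Bind the member function to one object, leaving the key argument open.
  std::transform(keys.begin(), keys.end(), values.begin(),
                 std::bind(std::mem_fn(&Lookup::Get), &table,
                           std::placeholders::_1));
  for (int v : values) std::cout << v << ' ';  // prints: 1 2 3
}
```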

@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/framework.pb.h"
 
 namespace paddle {
 namespace framework {
@@ -26,6 +27,10 @@ class InferShapeContext {
   virtual bool HasInput(const std::string &name) const = 0;
   virtual bool HasOutput(const std::string &name) const = 0;
 
+  std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
+  std::vector<VarDesc::VarType> GetOutputsVarType(
+      const std::string &name) const;
+
   virtual bool HasInputs(const std::string &name) const = 0;
   virtual bool HasOutputs(const std::string &name) const = 0;
@@ -46,6 +51,8 @@ class InferShapeContext {
   virtual void ShareLoD(const std::string &in, const std::string &out,
                         size_t i = 0, size_t j = 0) const = 0;
 
+  virtual bool IsRuntime() const = 0;
+
  protected:
   virtual framework::DDim GetDim(const std::string &name) const = 0;
   virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
@@ -55,6 +62,11 @@ class InferShapeContext {
   void SetDims(const std::vector<std::string> &names,
                const std::vector<framework::DDim> &dims);
 
+  std::vector<VarDesc::VarType> GetVarTypes(
+      const std::vector<std::string> &names) const;
+
+  virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
+
 };
 
 }  // namespace framework

@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
+
+namespace paddle {
+namespace framework {
+
+inline VarDesc::VarType ToVarType(std::type_index type) {
+  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
+    return VarDesc_VarType_LOD_TENSOR;
+  } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
+    return VarDesc_VarType_LOD_RANK_TABLE;
+  } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
+    return VarDesc_VarType_LOD_TENSOR_ARRAY;
+  } else {
+    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
+  }
+}
+}  // namespace framework
+}  // namespace paddle
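`ToVarType` dispatches on the `std::type_index` that `Variable::Type()` (added below) exposes. A standalone sketch of the same mechanism, using toy payload types and `type_index` equality in place of the `hash_code` comparison:

```cpp
#include <iostream>
#include <typeindex>
#include <typeinfo>

enum class VarType { LOD_TENSOR, LOD_RANK_TABLE, UNKNOWN };

struct LoDTensor {};     // toy stand-in
struct LoDRankTable {};  // toy stand-in

VarType ToVarTypeDemo(std::type_index type) {
  if (type == std::type_index(typeid(LoDTensor))) return VarType::LOD_TENSOR;
  if (type == std::type_index(typeid(LoDRankTable)))
    return VarType::LOD_RANK_TABLE;
  return VarType::UNKNOWN;  // the real helper throws via PADDLE_THROW here
}

int main() {
  std::cout << (ToVarTypeDemo(typeid(LoDTensor)) == VarType::LOD_TENSOR)
            << '\n';  // prints: 1
}
```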

@@ -48,6 +48,11 @@ class Variable {
   void Clear() { holder_.reset(); }
 
+  std::type_index Type() const {
+    PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
+    return holder_->Type();
+  }
+
  private:
   struct Placeholder {
     virtual ~Placeholder() {}

@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 
-ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_;
+ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
 
 ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
                                        ParameterPtr parameter,

@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) {
 }
 
 void *ConvBaseProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandle *> &convMem = *convMem_;
+  std::vector<MemoryHandlePtr> &convMem = *convMem_;
   if (convMem.empty()) {
     int numDevices = hl_get_device_count();
     convMem.resize(numDevices);
   }
 
   int devId = hl_get_device();
-  MemoryHandle **localMem = &(convMem[devId]);
-  if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
-    *localMem = new GpuMemoryHandle(size);
+  MemoryHandlePtr localMem = convMem[devId];
+  if (NULL == localMem || size > localMem->getAllocSize()) {
+    localMem = std::make_shared<GpuMemoryHandle>(size);
   }
-  return (*localMem)->getBuf();
+  return localMem->getBuf();
 }
 
 ConvBaseProjection::~ConvBaseProjection() {

@@ -105,7 +105,7 @@ protected:
   bool bias_;
   std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
+  static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
 };
 
 }  // namespace paddle

@@ -110,7 +110,7 @@ function(op_library TARGET)
     # It's enough to just adding one operator to pybind
     file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
   endif()
 
   # reduce_op contains several operators
   if ("${TARGET}" STREQUAL "reduce_op")
     set(pybind_flag 1)
@@ -118,6 +118,11 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
   endif()
 
+  if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
+    set(pybind_flag 1)
+    file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
+  endif()
+
   # pybind USE_NO_KERNEL_OP
   # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
   file(READ ${TARGET}.cc TARGET_CONTENT)
@@ -161,6 +166,7 @@ set(DEPS_OPS
     sequence_pool_op
     lod_rank_table_op
     lstm_op
+    tensor_array_read_write_op
     gru_op)
 
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
@@ -171,6 +177,7 @@ op_library(sum_op DEPS net_op selected_rows_functor)
 op_library(pool_op DEPS pooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
+op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
 if(WITH_GPU)
   op_library(nccl_op DEPS nccl_common)
 endif()

@@ -72,11 +72,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
     }
 
     AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
-        1, PADDLE_CUDA_NUM_THREADS, 0,
-        reinterpret_cast<const platform::CUDADeviceContext&>(
-            ctx.device_context())
-            .stream()>>>(num_samples, infer_width, indices_data, label_data,
-                         accuracy_data);
+        1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>(
+        num_samples, infer_width, indices_data, label_data, accuracy_data);
   }
 };

@@ -27,7 +27,6 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
-using CUDADeviceContext = platform::CUDADeviceContext;
 
 static constexpr size_t kConvCudnnWorkspaceLimitBytes = 1024 * 1024 * 1024;

@@ -27,7 +27,6 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
-using CUDADeviceContext = platform::CUDADeviceContext;
 
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;

@@ -130,9 +130,7 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     dim3 grid_dim(num_x_blocks, batch_size);
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
 
     conv_shift_forward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
         x_data, y_data, out_data, x_width, y_width, y_half_width, batch_size);

@@ -159,9 +157,7 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
     int y_width = Y->dims()[1];
     int y_half_width = (y_width - 1) / 2;
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
 
     const int x_per_block = 256;
     int num_x_blocks = div_up(x_width, x_per_block);

@@ -82,24 +82,19 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
 
     int block = 512;
     int grid = (batch_size * class_num + block - 1) / block;
+    auto stream = ctx.cuda_device_context().stream();
+
     if (ctx.Attr<bool>("soft_label")) {
       auto* label_data = label->data<T>();
-      SoftCrossEntropyGradientKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(dx_data, dy_data, x_data, label_data,
-                                           batch_size, class_num);
+      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
     } else {
       math::SetConstant<platform::GPUPlace, T> functor;
       functor(ctx.device_context(), dx, 0);
       auto* label_data = label->data<int64_t>();
       grid = (batch_size + block - 1) / block;
-      CrossEntropyGradientKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(dx_data, dy_data, x_data, label_data,
-                                           batch_size, class_num);
+      CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
     }
   }
 };

@@ -34,15 +34,18 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
     std::vector<int64_t> shape_int64(shape.size(), 0);
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                    [](int a) { return static_cast<int64_t>(a); });
-    auto dims = framework::make_ddim(shape_int64);
+    auto output_dim = framework::make_ddim(shape_int64);
 
-    int dim_idx = ctx->Attrs().Get<int>("dim_idx");
-    PADDLE_ENFORCE_GE(dim_idx, 0);
-    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), dim_idx);
-    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), dim_idx);
+    int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
+    PADDLE_ENFORCE_GE(input_dim_idx, 0);
+    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
 
-    dims[dim_idx] = ctx->GetInputDim("Input")[dim_idx];
-    ctx->SetOutputDim("Out", dims);
+    int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
+    PADDLE_ENFORCE_GE(output_dim_idx, 0);
+    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
+
+    output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
+    ctx->SetOutputDim("Out", output_dim);
   }
 
  protected:

@@ -69,8 +72,11 @@ class FillConstantBatchSizeLikeOpMaker
               "(Tensor) Tensor of specified shape will be filled "
               "with the specified value");
     AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
-    AddAttr<int>("dim_idx",
-                 "(int, default 0) The index of batch size dimension")
+    AddAttr<int>("input_dim_idx",
+                 "(int, default 0) the index of input's batch size dimension")
+        .SetDefault(0);
+    AddAttr<int>("output_dim_idx",
+                 "(int, default 0) the index of output's batch size dimension")
         .SetDefault(0);
     AddAttr<float>("value", "(float, default 0) The value to be filled")
         .SetDefault(0.0f);
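A worked example of the two new attributes, with made-up shapes: if `Input` has dims `[5, 10]`, the `shape` attribute is `[2, 3, 4]`, `input_dim_idx = 0`, and `output_dim_idx = 1`, then the batch size 5 is read from input dim 0 and written into output dim 1:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical values mirroring the InferShape logic above.
  std::vector<int64_t> output_dim = {2, 3, 4};  // the "shape" attribute
  std::vector<int64_t> input_dim = {5, 10};     // dims of Input
  int input_dim_idx = 0, output_dim_idx = 1;    // the two new attributes
  output_dim[output_dim_idx] = input_dim[input_dim_idx];
  for (int64_t d : output_dim) std::cout << d << ' ';  // prints: 2 5 4
}
```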
@@ -86,9 +92,10 @@ Fill up a variable with specified constant value.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fill_constant_batch_size_like,
-                             ops::FillConstantBatchSizeLikeOp,
-                             ops::FillConstantBatchSizeLikeOpMaker);
+REGISTER_OPERATOR(fill_constant_batch_size_like,
+                  ops::FillConstantBatchSizeLikeOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::FillConstantBatchSizeLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
     fill_constant_batch_size_like,
     ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,

@@ -35,7 +35,9 @@ class FillConstantOp : public framework::OperatorWithKernel {
  protected:
   framework::DataType IndicateDataType(
       const framework::ExecutionContext &ctx) const override {
-    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+    int data_type = ctx.Attr<int>("data_type");
+    VLOG(10) << " FillConstant data_type = " << data_type;
+    return static_cast<framework::DataType>(data_type);
   }
 };

@@ -71,4 +73,5 @@ REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
 REGISTER_OP_CPU_KERNEL(
     fill_constant, ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>,
     ops::FillConstantOpKernel<paddle::platform::CPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>);
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int>,
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, int64_t>);

@@ -20,4 +20,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
     fill_constant, ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>,
     ops::FillConstantOpKernel<paddle::platform::GPUPlace, double>,
-    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>);
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int>,
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, int64_t>);

@@ -31,7 +31,6 @@ class IncrementOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   IncrementOpMaker(framework::OpProto *proto,

@@ -39,10 +38,10 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) The input tensor of increment operator");
     AddOutput("Out", "(Tensor) The output tensor of increment operator.");
-    AddAttr<AttrType>("step",
-                      "(float, default 1.0) "
-                      "The step size by which the "
-                      "input tensor will be incremented.")
+    AddAttr<float>("step",
+                   "(float, default 1.0) "
+                   "The step size by which the "
+                   "input tensor will be incremented.")
         .SetDefault(1.0);
     AddComment(R"DOC(
 Increment Operator.

@@ -73,7 +72,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker<float>,
+REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
                   ops::IncrementGradOpMaker);
-REGISTER_OP_CPU_KERNEL(increment,
-                       ops::IncrementKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    increment, ops::IncrementKernel<paddle::platform::CPUPlace, float>,
+    ops::IncrementKernel<paddle::platform::CPUPlace, double>,
+    ops::IncrementKernel<paddle::platform::CPUPlace, int>,
+    ops::IncrementKernel<paddle::platform::CPUPlace, int64_t>);

@@ -16,4 +16,7 @@
 REGISTER_OP_GPU_KERNEL(
     increment,
-    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>);
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, double>,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int>,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, int64_t>);

@@ -19,7 +19,7 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T, typename AttrType = T>
+template <typename Place, typename T>
 class IncrementKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {

@@ -27,7 +27,7 @@ class IncrementKernel : public framework::OpKernel<T> {
     auto* in = context.Input<framework::Tensor>("X");
     tensor->mutable_data<T>(in->place());
 
-    auto step = static_cast<T>(context.Attr<AttrType>("step"));
+    auto step = static_cast<T>(context.Attr<float>("step"));
 
     auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
     auto eigen_in = framework::EigenVector<T>::Flatten(*in);

@@ -74,10 +74,9 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTable<T, 128, 8, 8><<<
-        grids, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                               context.device_context())
-                               .stream()>>>(output, table, ids, N, K, D);
+    LookupTable<T, 128, 8,
+                8><<<grids, threads, 0, context.device_context().stream()>>>(
+        output, table, ids, N, K, D);
   }
 };

@@ -95,9 +94,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     auto* ids_data = ids->data<int64_t>();
     auto ids_dim = ids->dims();
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
     // copy GPU memory to CPU pinned memory
     framework::Vector<int64_t> new_rows;
     new_rows.resize(ids_dim[0]);

@@ -136,11 +133,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTableGrad<T, 128, 8,
-                    8><<<grids, threads, 0,
-                         reinterpret_cast<const platform::CUDADeviceContext&>(
-                             context.device_context())
-                             .stream()>>>(d_table, d_output, ids, N, K, D);
+    LookupTableGrad<
+        T, 128, 8,
+        8><<<grids, threads, 0, context.device_context().stream()>>>(
+        d_table, d_output, ids, N, K, D);
   }
 }

@@ -1,3 +1 @@
-if(WITH_AVX)
-  cc_library(activation_functions SRCS avx_functions.cc)
-endif()
+cc_library(activation_functions SRCS avx_functions.cc)

@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifdef __AVX__
+
 #include <immintrin.h>
 #include "paddle/operators/math/detail/activation_functions.h"
 // TODO(qingqing) refine this dependence

@@ -84,3 +86,5 @@ __m256 Identity(const __m256 a, const __m256 b) { return a; }
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
+
+#endif

