Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into add-GRUStepOp

7 years ago · 9b4a6af251
parent 0bc5a122d4 487a13bbfa
commit 9b4a6af251
41 changed files with 1092 additions and 265 deletions
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@ -26,7 +26,7 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
 SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
 SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic")
 SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
    paddle_parameter
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -42,12 +42,14 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward ${GLOB_OP_LIB})
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward)
-#if(WITH_GPU)
+set(EXECUTOR_TEST_OP elementwise_add_op gaussian_random_op feed_op fetch_op
-#    nv_test(executor_test SRCS executor_test.cc DEPS executor)
+    mul_op sum_op squared_l2_distance_op fill_constant_op sgd_op)
-#else()
+if(WITH_GPU)
-#    cc_test(executor_test SRCS executor_test.cc DEPS executor)
+    nv_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP})
-#endif()
+else()
    cc_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP})
 endif()
 cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
 cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@ -27,6 +27,8 @@ extern std::unique_ptr<OperatorBase> Backward(
    const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars);
 // TODO(jiayi): Add target as parameter and generate backward op
 // according to target.
 void AppendBackward(ProgramDescBind& program_desc,
                    const std::unordered_set<std::string>& no_grad_vars);
--- a/paddle/framework/executor_test.cc
+++ b/paddle/framework/executor_test.cc
@ -25,6 +25,16 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 USE_OP(elementwise_add);
 USE_OP(gaussian_random);
 USE_OP(feed);
 USE_OP(fetch);
 USE_OP(mul);
 USE_OP(sum);
 USE_OP(squared_l2_distance);
 USE_OP(fill_constant);
 USE_OP(sgd);
 using namespace paddle::platform;
 using namespace paddle::framework;
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@ -211,6 +211,15 @@ static InferShapeFuncMap &InferShapeFuncs() {
  return *g_map;
 }
 // Validates attrs_ with the attribute checker registered for this operator's
 // type in OpInfoMap.
 //
 // Fails through PADDLE_ENFORCE when the type is still empty, and through
 // PADDLE_ENFORCE_NOT_NULL when the registered OpInfo carries no checker.
 void OpDescBind::CheckAttrs() {
  // The checker is looked up by type, so the type has to be known first.
  PADDLE_ENFORCE(!Type().empty(),
                 "CheckAttrs() can not be called before type is set.");
  const auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
  PADDLE_ENFORCE_NOT_NULL(checker, "Operator \"%s\" has no registered checker.",
                          Type());
  checker->Check(attrs_);
 }
 void OpDescBind::InferShape(const BlockDescBind &block) const {
  auto &funcs = InferShapeFuncs();
  auto it = funcs.find(this->Type());
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@ -100,6 +100,8 @@ class OpDescBind {
    return &this->attrs_;
  }
  void CheckAttrs();
  void InferShape(const BlockDescBind &block) const;
 private:
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -289,6 +289,15 @@ class ExecutionContext {
    return device_context_;
  }
 #ifdef PADDLE_WITH_CUDA
  // Returns device_context_ reinterpreted as a CUDADeviceContext.
  // Enforces beforehand that the context's place is a GPU place.
  const platform::CUDADeviceContext& cuda_device_context() const {
    const auto& ctx_place = device_context_.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
    return reinterpret_cast<const platform::CUDADeviceContext&>(
        device_context_);
  }
 #endif
 private:
  const OperatorBase& op_;
  const Scope& scope_;
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@ -88,25 +88,30 @@ class Tensor {
   * @brief   Copy the content of external tensor to a new place.
   *
   * @param[in] src        The external tensor.
-   * @param[in] ctx   The device context contains place where to store.
+   * @param[in] dst_place  The dst place.
   * @param[in] ctx        The device context contains device resources.
   *
   * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
   */
  // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
  // Remove `CopyFrom` and `CopyFromVector` from Tensor interface
  // and make them global functions
  template <typename T>
-  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place);
+  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
                       const platform::DeviceContext& ctx);
  /**
   * @brief   Copy the content of an external vector to a tensor.
   *
-   * @param[in] src   The external vector.
+   * @param[in] src        The external vector.
-   * @param[in] ctx   The device context contains place where to store.
+   * @param[in] ctx        The device context contains device resources.
   *
   * @note    CopyFromVector assumes that the tensor has been resized
   *            before invoking.
   */
  template <typename T>
  inline void CopyFromVector(const std::vector<T>& src,
-                             const platform::Place& dst_place);
+                             const platform::DeviceContext& ctx);
  /**
   * @brief   Return the slice of the tensor.
--- a/paddle/framework/tensor_array.cc
+++ b/paddle/framework/tensor_array.cc
@ -95,7 +95,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) {
  values_[index].Resize(value.dims());
  values_[index].mutable_data<value_type>(platform::CPUPlace());
-  values_[index].CopyFrom<value_type>(value, platform::CPUPlace());
+  values_[index].CopyFrom<value_type>(value, platform::CPUPlace(),
                                      platform::CPUDeviceContext());
 }
 void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
@ -151,7 +152,8 @@ LoDTensor TensorArray::Stack() const {
  for (size_t idx = 0; idx < size(); idx++) {
    result.Slice<value_type>(idx, idx + 1)
-        .CopyFrom<value_type>(Read(idx), platform::CPUPlace());
+        .CopyFrom<value_type>(Read(idx), platform::CPUPlace(),
                              platform::CPUDeviceContext());
  }
  return result;
 }
@ -182,7 +184,8 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
      // copy
      value.Resize(value_dims);
      value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
-                                 platform::CPUPlace());
+                                 platform::CPUPlace(),
                                 platform::CPUDeviceContext());
    }
  }
 }
@ -236,7 +239,8 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
    auto target = result.Slice<value_type>(i, i + 1);
    auto source_ = source->Slice<value_type>(index, index + 1);
-    target.CopyFrom<value_type>(source_, platform::CPUPlace());
+    target.CopyFrom<value_type>(source_, platform::CPUPlace(),
                                platform::CPUDeviceContext());
  }
  return result;
@ -269,7 +273,8 @@ LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
      if (index >= seq_meta.end) break;
      auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
      auto target = result.Slice<float>(index, index + 1);
-      target.CopyFrom<float>(source_, platform::CPUPlace());
+      target.CopyFrom<float>(source_, platform::CPUPlace(),
                             platform::CPUDeviceContext());
    }
  }
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@ -88,7 +88,8 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
 template <typename T>
 inline void Tensor::CopyFrom(const Tensor& src,
-                             const platform::Place& dst_place) {
+                             const platform::Place& dst_place,
                             const platform::DeviceContext& ctx) {
  src.check_memory_size<T>();
  Resize(src.dims());
@ -106,26 +107,45 @@ inline void Tensor::CopyFrom(const Tensor& src,
 #ifdef PADDLE_WITH_CUDA
  else if (platform::is_gpu_place(src_place) &&
           platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
    memory::Copy(
        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
  } else if (platform::is_cpu_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size, 0);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
    memory::Copy(
        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
  } else if (platform::is_gpu_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
-                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
    memory::Copy(
        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
  }
  PADDLE_ENFORCE(cudaStreamSynchronize(0),
                 "cudaStreamSynchronize failed in Tensor CopyFrom");
 #endif
 }
 template <typename T>
 inline void Tensor::CopyFromVector(const std::vector<T>& src,
-                                   const platform::Place& dst_place) {
+                                   const platform::DeviceContext& ctx) {
  auto dst_place = ctx.GetPlace();
  auto src_ptr = static_cast<const void*>(src.data());
  platform::CPUPlace src_place;
  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
@ -137,12 +157,11 @@ inline void Tensor::CopyFromVector(const std::vector<T>& src,
  }
 #ifdef PADDLE_WITH_CUDA
  else if (platform::is_gpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place,
+    memory::Copy(
-                 src_ptr, size, 0);
+        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
        size,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
  }
  PADDLE_ENFORCE(cudaStreamSynchronize(0),
                 "cudaStreamSynchronize failed in Tensor CopyFromVector");
 #endif
 }
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@ -194,6 +194,7 @@ TEST(Tensor, CopyFrom) {
  {
    Tensor src_tensor;
    Tensor dst_tensor;
    CPUDeviceContext cpu_ctx((CPUPlace()));
    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
@ -201,7 +202,7 @@ TEST(Tensor, CopyFrom) {
    memcpy(src_ptr, arr, 9 * sizeof(int));
    auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom<int>(src_tensor, *cpu_place);
+    dst_tensor.CopyFrom<int>(src_tensor, *cpu_place, cpu_ctx);
    const int* dst_ptr = dst_tensor.data<int>();
    ASSERT_NE(src_ptr, dst_ptr);
@ -210,7 +211,7 @@ TEST(Tensor, CopyFrom) {
    }
    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
-    dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place);
+    dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place, cpu_ctx);
    const int* slice_ptr = slice_tensor.data<int>();
    dst_ptr = dst_tensor.data<int>();
    ASSERT_NE(dst_ptr, slice_ptr);
@ -231,13 +232,15 @@ TEST(Tensor, CopyFrom) {
    // CPU Tensor to GPU Tensor
    auto gpu_place = new paddle::platform::GPUPlace(0);
-    gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place);
+    CUDADeviceContext gpu_ctx(*gpu_place);
    gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place, gpu_ctx);
    // GPU Tensor to CPU Tensor
    auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
-    // Compare Tensors
+    // Sync before Compare Tensors
    gpu_ctx.Wait();
    const int* dst_ptr = dst_tensor.data<int>();
    ASSERT_NE(src_ptr, dst_ptr);
    for (size_t i = 0; i < 9; ++i) {
@ -247,12 +250,13 @@ TEST(Tensor, CopyFrom) {
    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
    // CPU Slice Tensor to GPU Tensor
-    gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place);
+    gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place, gpu_ctx);
    // GPU Tensor to CPU Tensor
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
-    // Compare Slice Tensors
+    // Sync before Compare Slice Tensors
    gpu_ctx.Wait();
    const int* slice_ptr = slice_tensor.data<int>();
    dst_ptr = dst_tensor.data<int>();
    ASSERT_NE(dst_ptr, slice_ptr);
@ -273,7 +277,8 @@ TEST(Tensor, CopyFromVector) {
    // Copy to CPU Tensor
    cpu_tensor.Resize(make_ddim({3, 3}));
    auto cpu_place = new paddle::platform::CPUPlace();
-    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    CPUDeviceContext cpu_ctx(*cpu_place);
    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
    // Compare Tensors
    const int* cpu_ptr = cpu_tensor.data<int>();
@ -285,7 +290,7 @@ TEST(Tensor, CopyFromVector) {
    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
    cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
    cpu_ptr = cpu_tensor.data<int>();
    src_ptr = src_vec.data();
    ASSERT_NE(src_ptr, cpu_ptr);
@ -306,16 +311,19 @@ TEST(Tensor, CopyFromVector) {
    // Copy to CPU Tensor
    cpu_tensor.Resize(make_ddim({3, 3}));
    auto cpu_place = new paddle::platform::CPUPlace();
-    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    CPUDeviceContext cpu_ctx(*cpu_place);
    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
    // Copy to GPUTensor
    gpu_tensor.Resize(make_ddim({3, 3}));
    auto gpu_place = new paddle::platform::GPUPlace();
-    gpu_tensor.CopyFromVector<int>(src_vec, *gpu_place);
+    CUDADeviceContext gpu_ctx(*gpu_place);
    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
    // Copy from GPU to CPU tensor for comparison
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
-    // Compare Tensors
+    // Sync before Compare Tensors
    gpu_ctx.Wait();
    const int* src_ptr = src_vec.data();
    const int* cpu_ptr = cpu_tensor.data<int>();
    const int* dst_ptr = dst_tensor.data<int>();
@ -329,11 +337,13 @@ TEST(Tensor, CopyFromVector) {
    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
    cpu_tensor.Resize(make_ddim({2, 2}));
-    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
    gpu_tensor.Resize(make_ddim({2, 2}));
-    gpu_tensor.CopyFromVector<int>(src_vec, *gpu_place);
+    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
    // Sync before Compare Tensors
    gpu_ctx.Wait();
    src_ptr = src_vec.data();
    cpu_ptr = cpu_tensor.data<int>();
    dst_ptr = dst_tensor.data<int>();
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@ -321,6 +321,23 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
  }
 };
 // Proto and attribute-checker maker for the ThresholdedRelu activation.
 // Declares input "X", output "Y" and a "threshold" attribute of type
 // AttrType that defaults to 1.0.
 template <typename AttrType>
 class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  ThresholdedReluOpMaker(framework::OpProto *proto,
                         framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // Tensor slots.
    AddInput("X", "Input of ThresholdedRelu operator");
    AddOutput("Y", "Output of ThresholdedRelu operator");

    // Operator-level description.
    AddComment(
        "ThresholdedRelu activation operator, thresholded_relu = x "
        "for x > threshold, thresholded_relu = 0 otherwise.");

    // Threshold attribute and its default value.
    const AttrType default_threshold = static_cast<AttrType>(1.0);
    AddAttr<AttrType>("threshold", "The threshold location of activation")
        .SetDefault(default_threshold);
  }
 };
 }  // namespace operators
 }  // namespace paddle
@ -392,6 +409,10 @@ REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
 REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker<float>,
            hard_shrink_grad, ops::ActivationOpGrad);
 REGISTER_OP(thresholded_relu, ops::ActivationOp,
            ops::ThresholdedReluOpMaker<float>, thresholded_relu_grad,
            ops::ActivationOpGrad);
 #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)        \
  REGISTER_OP_CPU_KERNEL(                                                      \
      act_type,                                                                \
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@ -590,6 +590,32 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
  }
 };
 // Forward functor of ThresholdedRelu: out = in where in > threshold, and 0
 // elsewhere. The comparison result is cast to T and used as a 0/1 mask.
 template <typename T>
 struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
  float threshold;

  // Exposes the "threshold" attribute by name together with its storage.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }

  template <typename Device, typename X, typename Y>
  void operator()(Device device, X in, Y out) const {
    const T cutoff = static_cast<T>(threshold);
    out.device(device) = (in > cutoff).template cast<T>() * in;
  }
 };
 // Backward functor of ThresholdedRelu: the incoming gradient is passed
 // through where the forward input exceeds threshold and zeroed elsewhere.
 // The forward output argument is accepted but not read.
 template <typename T>
 struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
  float threshold;

  // Exposes the "threshold" attribute by name together with its storage.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }

  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device device, X in, Y out, dY out_grad, dX in_grad) const {
    const T cutoff = static_cast<T>(threshold);
    in_grad.device(device) = out_grad * (in > cutoff).template cast<T>();
  }
 };
 }  // namespace operators
 }  // namespace paddle
@ -615,4 +641,5 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);    \
  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
  __macro(elu, ELUFunctor, ELUGradFunctor);                       \
-  __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor)
+  __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \
  __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor);
--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
@ -12,22 +12,12 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
-#include "paddle/operators/gemm_conv2d_op.h"
+#include "paddle/operators/conv2d_op.h"
 namespace paddle {
 namespace operators {
-int outputSize(int input_size, int filter_size, int padding, int stride) {
+void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
  return output_size;
 }
 class Conv2DOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {
  PADDLE_ENFORCE(ctx->HasInput("Input"),
                 "Input(Input) of Conv2DOp should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("Filter"),
@ -53,25 +43,22 @@ class Conv2DOp : public framework::OperatorWithKernel {
      "The number of output channels should be divided by groups.");
  auto output_height =
-        outputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
+      OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
  auto output_width =
-        outputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
+      OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
-    ctx->SetOutputDim(
+  ctx->SetOutputDim("Output",
-        "Output", {in_dims[0], filter_dims[0], output_height, output_width});
+                    {in_dims[0], filter_dims[0], output_height, output_width});
 }
 };
-class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
+Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
- public:
+                             framework::OpAttrChecker* op_checker)
  Conv2DOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "Input",
      "The input tensor of convolution operator. "
      "The format of input tensor is NCHW. Where N is batch size, C is the "
      "number of channels, H and W is the height and width of image.");
-    AddInput(
+  AddInput("Filter",
        "Filter",
           "The filter tensor of convolution operator."
           "The format of the filter tensor is MCHW, where M is the number of "
           "output image channels, C is the number of input image channels, "
@ -99,14 +86,8 @@ and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
 )DOC");
 }
 };
 class Conv2DOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
- protected:
+void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
  void InferShape(framework::InferShapeContext* ctx) const override {
  auto in_dims = ctx->GetInputDim("Input");
  auto filter_dims = ctx->GetInputDim("Filter");
  if (ctx->HasOutput(framework::GradVarName("Input"))) {
@ -116,7 +97,6 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
  }
 }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/conv2d_op.cu
+++ b/paddle/operators/conv2d_op.cu
@ -12,7 +12,7 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
-#include "paddle/operators/gemm_conv2d_op.h"
+#include "paddle/operators/conv2d_op.h"
 namespace ops = paddle::operators;
--- a/paddle/operators/gemm_conv2d_op.h
+++ b/paddle/operators/gemm_conv2d_op.h
@ -24,6 +24,38 @@ namespace operators {
 using Tensor = framework::Tensor;
 // Base convolution operator definitions, shared so that other conv-like
 // operators can reuse the implementation.
 // Returns the output extent of a convolution along one spatial dimension:
 //   (input_size - filter_size + 2 * padding) / stride + 1
 // The division is plain integer division; stride must not be zero.
 inline int OutputSize(int input_size, int filter_size, int padding,
                       int stride) {
  const int padded_span = input_size - filter_size + 2 * padding;
  return padded_span / stride + 1;
 }
 // Define Op classes in .h file so that other conv
 // operator implementations can reuse the code.
 // Proto/checker maker for the 2-D convolution operator. Only the constructor
 // is declared here; its body is not part of this header.
 class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  // proto and op_checker are supplied by the caller; the constructor is
  // defined out of line.
  Conv2DOpMaker(framework::OpProto* proto,
                framework::OpAttrChecker* op_checker);
 };
 // Forward 2-D convolution operator. Inherits the constructors of
 // OperatorWithKernel and overrides InferShape, which is defined out of line.
 class Conv2DOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
  // Shape inference for the forward op; declaration only.
  void InferShape(framework::InferShapeContext* ctx) const override;
 };
 // Gradient operator paired with Conv2DOp. Inherits the constructors of
 // OperatorWithKernel and overrides InferShape, which is defined out of line.
 class Conv2DOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
  // Shape inference for the gradient op; declaration only.
  void InferShape(framework::InferShapeContext* ctx) const override;
 };
 template <typename Place, typename T>
 class GemmConv2DKernel : public framework::OpKernel<T> {
 public:
@ -74,7 +106,6 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
    framework::DDim output_matrix_shape = {output_channels,
                                           output_height * output_width};
    // convolution operator: im2col + gemm
    int in_step = input_channels / groups;
    int out_step = output_channels / groups;
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@ -0,0 +1,47 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
 #include "paddle/operators/conv2d_op.h"
 namespace paddle {
 namespace operators {
 // Maker for the conv_cudnn operator. Everything declared by Conv2DOpMaker is
 // inherited; this class adds two attributes on top of it:
 //   - "dilations"         (std::vector<int>, default {1, 1})
 //   - "workspace_size_MB" (int, default 4096)
 class CudnnConvOpMaker : public Conv2DOpMaker {
 public:
  CudnnConvOpMaker(framework::OpProto* proto,
                   framework::OpAttrChecker* op_checker)
      : Conv2DOpMaker(proto, op_checker) {
    const std::vector<int> default_dilations{1, 1};
    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
        .SetDefault(default_dilations);

    const int default_workspace_mb = 4096;
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, workspace is a section of "
                 "GPU memory which will be allocated/freed each time the "
                 "operator runs, larger workspace size can increase "
                 "performance but also requires better hardward. This size "
                 "should be carefully setted.")
        .SetDefault(default_workspace_mb);
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 // Register conv_cudnn together with conv_cudnn_grad. The op reuses the
 // Conv2DOp / Conv2DOpGrad classes and pairs them with CudnnConvOpMaker, which
 // adds the "dilations" and "workspace_size_MB" attributes.
 REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
            ops::Conv2DOpGrad);
 // On CPUPlace both ops are bound to the GemmConv2D kernels (float only).
 REGISTER_OP_CPU_KERNEL(
    conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
    conv_cudnn_grad,
    ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv_cudnn_op.cu
+++ b/paddle/operators/conv_cudnn_op.cu
--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
@ -0,0 +1,96 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/operators/decayed_adagrad_op.h"
 namespace paddle {
 namespace operators {
 // Operator class for decayed_adagrad. InferShape verifies that every input
 // and output slot is present, that LearningRate holds exactly one element and
 // that Grad and Moment have the same dimensions as Param; both outputs are
 // then given Param's dimensions.
 class DecayedAdagradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
 protected:
  void InferShape(framework::InferShapeContextBase *ctx) const override {
    // Required inputs.
    PADDLE_ENFORCE(ctx->HasInput("Param"),
                   "Input(Param) of DecayedAdagradOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Grad"),
                   "Input(Grad) of DecayedAdagradOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Moment"),
                   "Input(Moment) of DecayedAdagradOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("LearningRate"),
        "Input(LearningRate) of DecayedAdagradOp should not be null.");

    // Required outputs.
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(ParamOut) of DecayedAdagradOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
                   "Output(MomentOut) of DecayedAdagradOp should not be null.");

    // The learning rate must contain a single element.
    const auto learning_rate_dims = ctx->GetInputDim("LearningRate");
    PADDLE_ENFORCE_EQ(framework::product(learning_rate_dims), 1,
                      "LearningRate should have one element");

    // Grad and Moment must match Param's dimensions.
    const auto expected_dims = ctx->GetInputDim("Param");
    const auto grad_dims = ctx->GetInputDim("Grad");
    PADDLE_ENFORCE_EQ(expected_dims, grad_dims,
                      "Param and Grad input of DecayedAdagradOp should have "
                      "the same dimension.");
    const auto moment_dims = ctx->GetInputDim("Moment");
    PADDLE_ENFORCE_EQ(expected_dims, moment_dims,
                      "Param and Moment input of DecayedAdagradOp should have "
                      "the same dimension.");

    // Outputs take Param's dimensions.
    ctx->SetOutputDim("ParamOut", expected_dims);
    ctx->SetOutputDim("MomentOut", expected_dims);
  }
 };
 // Proto and attribute-checker maker for the decayed_adagrad operator.
 //
 // Inputs:     Param, Grad, Moment, LearningRate
 // Outputs:    ParamOut, MomentOut
 // Attributes: decay (float, default 0.95), epsilon (float, default 1.0e-6)
 class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  DecayedAdagradOpMaker(framework::OpProto *proto,
                        framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");
    AddInput("Moment", "(Tensor) Second moment");
    AddInput("LearningRate", "(Tensor) Learning rate");
    AddOutput("ParamOut", "(Tensor) Output parameter");
    AddOutput("MomentOut", "(Tensor) Output second moment");
    // Both defaults are float literals so that neither relies on an implicit
    // double -> float conversion.
    AddAttr<float>("decay",
                   "(float, default 0.95) "
                   "Discounting factor for coming gradient")
        .SetDefault(0.95f);
    AddAttr<float>("epsilon",
                   "(float, default 1.0e-6) "
                   "Constant for numerical stability")
        .SetDefault(1.0e-6f);
    AddComment(R"DOC(
 Decayed Adagrad
 moment_out = decay * moment + (1 - decay) * grad * grad
 param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
 )DOC");
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 // decayed_adagrad is registered through REGISTER_OP_WITHOUT_GRADIENT, i.e.
 // no gradient operator is named for it here.
 REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp,
                             ops::DecayedAdagradOpMaker);
 // CPU kernel, instantiated for float only.
 REGISTER_OP_CPU_KERNEL(
    decayed_adagrad,
    ops::DecayedAdagradOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/decayed_adagrad_op.cu
+++ b/paddle/operators/decayed_adagrad_op.cu
@ -0,0 +1,21 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/operators/decayed_adagrad_op.h"
 namespace ops = paddle::operators;
 // GPU kernel for decayed_adagrad: the same DecayedAdagradOpKernel template,
 // instantiated for GPUPlace and float.
 REGISTER_OP_GPU_KERNEL(
    decayed_adagrad,
    ops::DecayedAdagradOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/decayed_adagrad_op.h
+++ b/paddle/operators/decayed_adagrad_op.h
@ -0,0 +1,56 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {
 // Kernel for the decayed_adagrad operator.
 //
 // Reads the inputs Param, Grad, Moment and LearningRate plus the float
 // attributes "decay" and "epsilon", and writes ParamOut and MomentOut:
 //   moment_out = decay * moment + (1 - decay) * grad * grad
 //   param_out  = param - lr * grad / (sqrt(moment_out) + epsilon)
 // All tensors are processed as flattened 1-D Eigen vectors on the Eigen
 // device obtained for `Place`.
 template <typename Place, typename T>
 class DecayedAdagradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    using Tensor = framework::Tensor;
    using FlatVec = framework::EigenVector<T>;

    // Look up both outputs and allocate them on the execution place before
    // any Eigen view is taken over their storage.
    auto new_param_t = ctx.Output<Tensor>("ParamOut");
    auto new_moment_t = ctx.Output<Tensor>("MomentOut");
    new_param_t->mutable_data<T>(ctx.GetPlace());
    new_moment_t->mutable_data<T>(ctx.GetPlace());

    // Hyper-parameters of the update rule.
    float decay = ctx.Attr<float>("decay");
    float epsilon = ctx.Attr<float>("epsilon");

    // Flattened views over the inputs.
    auto p = FlatVec::Flatten(*ctx.Input<Tensor>("Param"));
    auto g = FlatVec::Flatten(*ctx.Input<Tensor>("Grad"));
    auto m = FlatVec::Flatten(*ctx.Input<Tensor>("Moment"));
    auto rate = FlatVec::Flatten(*ctx.Input<Tensor>("LearningRate"));

    // Flattened views over the freshly allocated outputs.
    auto p_new = FlatVec::Flatten(*new_param_t);
    auto m_new = FlatVec::Flatten(*new_moment_t);

    auto dev = ctx.GetEigenDevice<Place>();

    // The learning rate is broadcast by the element count of MomentOut so it
    // can take part in the element-wise expression below.
    // NOTE(review): this presumes LearningRate holds a single element and
    // that all other tensors share MomentOut's element count -- confirm
    // against the op's shape inference.
    Eigen::DSizes<int, 1> bcast_extent(new_moment_t->numel());

    // Step 1: decayed running average of the squared gradient.
    m_new.device(dev) = decay * m + (1 - decay) * g * g;

    // Step 2: parameter step scaled by the root of the updated moment;
    // epsilon is added to the denominator.
    p_new.device(dev) =
        p - rate.broadcast(bcast_extent) * g / (m_new.sqrt() + epsilon);
  }
 };
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/feed_op.h
+++ b/paddle/operators/feed_op.h
@ -34,7 +34,7 @@ class FeedKernel : public framework::OpKernel<T> {
    // TODO(qijun):
    //   check tensors[col].dims() with attribute,
    //   except the first dimension.
-    out->CopyFrom<T>(tensors[col], ctx.GetPlace());
+    out->CopyFrom<T>(tensors[col], ctx.GetPlace(), ctx.device_context());
  }
 };
--- a/paddle/operators/fetch_op.h
+++ b/paddle/operators/fetch_op.h
@ -35,7 +35,8 @@ class FetchKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_GT(tensors->size(), static_cast<size_t>(col));
    (*tensors)[col].Resize(input->dims());
    (*tensors)[col].mutable_data<T>(platform::CPUPlace());
-    (*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace());
+    (*tensors)[col].CopyFrom<T>(*input, platform::CPUPlace(),
                                ctx.device_context());
    // TODO(qijun): need to handle LodTensor later
  }
 };
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@ -49,10 +49,22 @@ void testIm2col() {
  memcpy(input_ptr, arr, 6 * sizeof(float));
  auto* place = new Place();
  paddle::platform::DeviceContext* context;
  if (paddle::platform::is_cpu_place(*place)) {
    context =
        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
  } else {
 #ifdef PADDLE_WITH_CUDA
    context =
        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
 #else
    PADDLE_THROW("no GPU support");
 #endif  // PADDLE_WITH_CUDA
  }
  if (paddle::platform::is_cpu_place(*place)) {
    input = input_tmp;
  } else {
-    input.CopyFrom<float>(input_tmp, *place);
+    input.CopyFrom<float>(input_tmp, *place, *context);
  }
  output_cfo.mutable_data<float>(
      {1, filter_size, filter_size, output_height, output_width}, *place);
@ -66,18 +78,6 @@ void testIm2col() {
      paddle::operators::math::ColFormat::kOCF, Place, float>
      im2col_ocf;
  paddle::platform::DeviceContext* context;
  if (paddle::platform::is_cpu_place(*place)) {
    context =
        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
  } else {
 #ifdef PADDLE_WITH_CUDA
    context =
        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
 #else
    PADDLE_THROW("no GPU support");
 #endif  // PADDLE_WITH_CUDA
  }
  im2col(*context, input, output_cfo, stride, stride, padding, padding);
  im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding);
@ -85,7 +85,8 @@ void testIm2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    out_cfo_ptr = output_cfo.data<float>();
  } else {
-    output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace());
+    output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace(),
                               *context);
    out_cfo_ptr = output_tmp.data<float>();
  }
  EXPECT_EQ(out_cfo_ptr[0], 0);
@ -101,7 +102,8 @@ void testIm2col() {
  if (paddle::platform::is_cpu_place(*place)) {
    out_ocf_ptr = output_ocf.data<float>();
  } else {
-    output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace());
+    output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace(),
                               *context);
    out_ocf_ptr = output_tmp.data<float>();
  }
  EXPECT_EQ(out_ocf_ptr[0], 0);
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@ -17,17 +17,18 @@ TEST(math_function, notrans_mul_trans) {
  auto* gpu_place = new paddle::platform::GPUPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
-  input2_gpu.CopyFrom<float>(input1, *gpu_place);
+  input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
  out_gpu.mutable_data<float>({2, 2}, *gpu_place);
  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
-  out.CopyFrom<float>(out_gpu, *cpu_place);
+  out.CopyFrom<float>(out_gpu, *cpu_place, context);
  float* out_ptr = out.data<float>();
  context.Wait();
  EXPECT_EQ(out_ptr[0], 5);
  EXPECT_EQ(out_ptr[1], 14);
  EXPECT_EQ(out_ptr[2], 14);
@ -50,17 +51,18 @@ TEST(math_function, trans_mul_notrans) {
  auto* gpu_place = new paddle::platform::GPUPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
-  input2_gpu.CopyFrom<float>(input1, *gpu_place);
+  input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
  out_gpu.mutable_data<float>({3, 3}, *gpu_place);
  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
-  out.CopyFrom<float>(out_gpu, *cpu_place);
+  out.CopyFrom<float>(out_gpu, *cpu_place, context);
  float* out_ptr = out.data<float>();
  context.Wait();
  EXPECT_EQ(out_ptr[0], 9);
  EXPECT_EQ(out_ptr[1], 12);
  EXPECT_EQ(out_ptr[2], 15);
@ -98,9 +100,9 @@ TEST(math_function, gemm_notrans_cublas) {
  auto* gpu_place = new paddle::platform::GPUPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
-  input2_gpu.CopyFrom<float>(input2, *gpu_place);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
-  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
  float* a = input1_gpu.data<float>();
  float* b = input2_gpu.data<float>();
  float* c = input3_gpu.mutable_data<float>(*gpu_place);
@ -108,7 +110,7 @@ TEST(math_function, gemm_notrans_cublas) {
  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
-  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+  input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
  // numpy code:
  // a = np.arange(6).reshape(2, 3)
@ -116,6 +118,7 @@ TEST(math_function, gemm_notrans_cublas) {
  // c = np.arange(8).reshape(2, 4)[:, 1:]
  // out = np.arange(8).reshape(2, 4)
  // out[:, 1:] = np.dot(a, b) + c
  context.Wait();
  EXPECT_EQ(input3_ptr[0], 0);
  EXPECT_EQ(input3_ptr[1], 24);
  EXPECT_EQ(input3_ptr[2], 28);
@ -152,9 +155,9 @@ TEST(math_function, gemm_trans_cublas) {
  auto* gpu_place = new paddle::platform::GPUPlace(0);
  paddle::platform::CUDADeviceContext context(*gpu_place);
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
+  input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
-  input2_gpu.CopyFrom<float>(input2, *gpu_place);
+  input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
-  input3_gpu.CopyFrom<float>(input3, *gpu_place);
+  input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
  float* a = input1_gpu.data<float>();
  float* b = input2_gpu.data<float>();
  float* c = input3_gpu.mutable_data<float>(*gpu_place);
@ -162,7 +165,8 @@ TEST(math_function, gemm_trans_cublas) {
  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
-  input3.CopyFrom<float>(input3_gpu, *cpu_place);
+  input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
  context.Wait();
  EXPECT_EQ(input3_ptr[0], 0);
  EXPECT_EQ(input3_ptr[1], 24);
--- a/Show More
+++ b/Show More