Double backward of conv2d. (#17211)

* Add conv2d_grad_grad_op
* Extract the cuDNN conv algorithm-searching code into conv_cudnn_helper.h.
    - Now used by conv2d_grad_grad.
    - Will simplify the searching code in conv2d and conv2d_grad in the next PR.
* Enhance gradient_checker and fix bugs in its unit tests.
* Support fetching empty variables; return None in Python.
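
For intuition: conv2d is bilinear in its input and filter, so the double-backward output DDOutput is the directional derivative of the forward output along (ddI, ddW). Below is a minimal NumPy sketch of that identity (stride 1, no padding, single channel); it models the math only, not the cuDNN kernel added in this commit:

```python
import numpy as np

def corr2d(x, w):
    # Plain valid cross-correlation; x: (H, W), w: (kH, kW).
    kH, kW = w.shape
    out = np.zeros((x.shape[0] - kH + 1, x.shape[1] - kW + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(x[i:i + kH, j:j + kW] * w)
    return out

rng = np.random.RandomState(0)
x, w = rng.randn(6, 6), rng.randn(3, 3)
ddx, ddw = rng.randn(6, 6), rng.randn(3, 3)

# DDOutput = corr(ddx, w) + corr(x, ddw), by bilinearity of corr2d.
dd_out = corr2d(ddx, w) + corr2d(x, ddw)

# Check against a finite difference along the (ddx, ddw) direction.
eps = 1e-6
fd = (corr2d(x + eps * ddx, w + eps * ddw) - corr2d(x, w)) / eps
assert np.allclose(dd_out, fd, atol=1e-4)  # residual is eps * corr(ddx, ddw)
```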

@@ -386,9 +386,10 @@ class ExecutionContext {
   template <typename T>
   T& GetKernelConfig(int idx) const {
-    PADDLE_ENFORCE(kernel_configs_ && kernel_configs_->size() > idx,
-                   "%s selected kernel doesn't have kernel config %lu <= %d",
-                   op_.Type().c_str(), kernel_configs_->size(), idx);
+    PADDLE_ENFORCE(
+        kernel_configs_ && kernel_configs_->size() > static_cast<size_t>(idx),
+        "%s selected kernel doesn't have kernel config %lu <= %d",
+        op_.Type().c_str(), kernel_configs_->size(), idx);
     return *boost::get<std::shared_ptr<T>>(kernel_configs_->at(idx));
   }

@@ -644,6 +644,7 @@ class LeakyReluDoubleGrad : public framework::OperatorWithKernel {
 //
 // ReluGrad: dx = dy if y >= 0 else 0
 // ReluGradGrad: ddy = ddx if y >= 0 else 0
+//               dy = 0
 //
 class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker {
  public:
@@ -655,11 +656,12 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker {
     op->SetType("relu_grad_grad");
     // input1: Out
     op->SetInput("Out", Input("Out"));
-    // X@GRAD@GRAD: ddx
+    // input2: ddx
     op->SetInput("DDX", OutputGrad(framework::GradVarName("X")));
     op->SetAttrMap(Attrs());
-    // Out@GRAD@GRAD: ddy
+    // output1: dy
     op->SetOutput("DOut", InputGrad("Out"));
+    // output2: ddy
     op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out")));
     return std::unique_ptr<::paddle::framework::OpDesc>(op);
   }
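
The formulas in the comment block above are small enough to state directly. A hedged NumPy sketch of relu_grad_grad (a model of the math, not the CUDA kernel; it uses y > 0, treating the measure-zero point y == 0 by the usual subgradient convention):

```python
import numpy as np

def relu_grad_grad(y, ddx):
    # ddy = ddx wherever relu was active in the forward pass, and dy = 0
    # because relu's second derivative vanishes almost everywhere.
    ddy = np.where(y > 0, ddx, 0.0)
    dy = np.zeros_like(y)
    return dy, ddy

y = np.array([0.0, 0.5, 0.0, 2.0])     # forward outputs, y = relu(x)
ddx = np.array([1.0, -2.0, 3.0, 0.5])  # gradient flowing into dx
print(relu_grad_grad(y, ddx))
```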

@@ -54,7 +54,13 @@ class FetchOp : public framework::OperatorBase {
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
-    TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
+    if (src_item.IsInitialized() && src_item.numel() > 0) {
+      TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
+    } else {
+      // Do not copy if the src tensor is empty.
+      dst_item.clear();
+      dst_item.Resize({0});
+    }
     dst_item.set_lod(src_item.lod());

     VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -506,13 +506,100 @@ class Conv3DGradMaker : public framework::SingleGradOpDescMaker {
   }
 };

+/*
+ * Inputs:  I, W, dO, ddI, ddW
+ * Outputs: ddO, dW, dI
+ */
+class Conv2DDoubleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType(this->ForwardOpType() + "_grad");
+    // I, W, dO, ddI, ddW
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("Filter", Input("Filter"));
+    op->SetInput("DOutput", Input(framework::GradVarName("Output")));
+    op->SetInput("DDInput", OutputGrad(framework::GradVarName("Input")));
+    op->SetInput("DDFilter", OutputGrad(framework::GradVarName("Filter")));
+
+    // ddO, dI, dW
+    // Unlike the grad op, the double grad op does not use name@GRAD@GRAD
+    // as the key of its inputs and outputs.
+    op->SetOutput("DDOutput", InputGrad(framework::GradVarName("Output")));
+    op->SetOutput("DFilter", InputGrad("Filter"));
+    op->SetOutput("DInput", InputGrad("Input"));
+    op->SetAttrMap(Attrs());
+
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
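
The three outputs named above have closed forms in the single-channel, stride-1, no-padding case. A hedged SciPy sketch mapping each output to its formula (assuming scipy.signal is available; this models the math, not the cuDNN implementation — DDInput and DDFilter are the gradients flowing into the first-order outputs dI and dW):

```python
import numpy as np
from scipy.signal import convolve2d, correlate2d

def conv2d_grad_grad(I, W, dO, ddI, ddW):
    # DDOutput: corr(I, W) is bilinear, so perturbing (I, W) by (ddI, ddW)
    # perturbs the forward output by corr(ddI, W) + corr(I, ddW).
    ddO = correlate2d(ddI, W, mode='valid') + correlate2d(I, ddW, mode='valid')
    # DFilter: adjoint of W -> dI = conv_full(dO, W), applied to ddI.
    dW = correlate2d(ddI, dO, mode='valid')
    # DInput: adjoint of I -> dW = corr_valid(I, dO), applied to ddW.
    dI = convolve2d(dO, ddW, mode='full')
    return ddO, dW, dI

rng = np.random.RandomState(0)
I, W = rng.randn(8, 8), rng.randn(3, 3)
dO = rng.randn(6, 6)                     # same shape as the forward output
ddI, ddW = rng.randn(8, 8), rng.randn(3, 3)
ddO, dW, dI = conv2d_grad_grad(I, W, dO, ddI, ddW)
assert ddO.shape == (6, 6) and dW.shape == W.shape and dI.shape == I.shape
```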
+
+void ConvOpDoubleGrad::InferShape(framework::InferShapeContext* ctx) const {
+  auto x_dims = ctx->GetInputDim("Input");
+  auto w_dims = ctx->GetInputDim("Filter");
+  auto do_dims = ctx->GetInputDim("DOutput");
+
+  if (ctx->HasOutput("DDOutput")) {
+    ctx->SetOutputDim("DDOutput", do_dims);
+  }
+  if (ctx->HasOutput("DFilter")) {
+    ctx->SetOutputDim("DFilter", w_dims);
+  }
+  if (ctx->HasOutput("DInput")) {
+    ctx->SetOutputDim("DInput", x_dims);
+  }
+}
+
+framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  int customized_type_value =
+      framework::OpKernelType::kDefaultCustomizedTypeValue;
+  framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
+#ifdef PADDLE_WITH_CUDA
+  if (platform::CanCUDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kCUDNN;
+  } else {
+    PADDLE_THROW("Now ConvDoubleGrad only supports cuDNN.");
+  }
+#endif
+  auto type = framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                      ctx.GetPlace(), layout_, library_,
+                                      customized_type_value);
+#ifdef PADDLE_WITH_CUDA
+  if (library_ == framework::LibraryType::kCUDNN) {
+    std::vector<framework::KernelConfig>& configs = kernel_configs_map_[type];
+    if (configs.empty()) {
+      std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>> p0(
+          new framework::AlgorithmsCache<cudnnConvolutionFwdAlgo_t>());
+      configs.push_back(p0);
+
+      std::shared_ptr<
+          framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>
+          p1(new framework::AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>());
+      configs.push_back(p1);
+
+      std::shared_ptr<framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>
+          p2(new framework::AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>());
+      configs.push_back(p2);
+    }
+  }
+#endif
+  return type;
+}
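
The three caches pushed above line up with the three cuDNN calls the double-grad kernel makes (forward, backward-filter, backward-data) and are later fetched by index via ctx.GetKernelConfig<T>(idx). A hedged Python sketch of the pattern (illustrative names, not Paddle's API):

```python
class AlgorithmsCache:
    """Memoizes the result of an expensive algorithm search per config key."""

    def __init__(self):
        self._cache = {}

    def get(self, key, search_fn):
        # `key` would encode shapes, strides, dilations, etc.; the expensive
        # search runs only on a cache miss.
        if key not in self._cache:
            self._cache[key] = search_fn()
        return self._cache[key]


# One entry per kernel type; index 0 holds the forward-algo cache, 1 the
# backward-filter cache, 2 the backward-data cache, matching the push order
# above and the idx later passed to GetKernelConfig<T>(idx).
kernel_configs_map = {}

def get_kernel_configs(kernel_type):
    if kernel_type not in kernel_configs_map:
        kernel_configs_map[kernel_type] = [AlgorithmsCache() for _ in range(3)]
    return kernel_configs_map[kernel_type]
```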
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                   ops::ConvOpInferVarType, ops::Conv2DGradMaker);
-REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
+REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad, ops::Conv2DDoubleGradMaker);
+REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad);

 // depthwise convolution op
 REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,

@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <string>
 #include <unordered_map>
+#include <vector>

 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -107,6 +108,16 @@ class ConvOpGrad : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override;
 };

+class ConvOpDoubleGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
 template <typename DeviceContext, typename T>
 class GemmConvKernel : public framework::OpKernel<T> {
  public:

@@ -29,13 +29,14 @@ namespace platform {
 using framework::Tensor;

 template <typename T>
-cudnnDataType_t ToCudnnDataType(const T& t) {
+inline cudnnDataType_t ToCudnnDataType(const T& t) {
   auto type = framework::ToDataType(t);
   return ToCudnnDataType(type);
 }

 template <>
-cudnnDataType_t ToCudnnDataType(const framework::proto::VarType::Type& t) {
+inline cudnnDataType_t ToCudnnDataType(
+    const framework::proto::VarType::Type& t) {
   cudnnDataType_t type = CUDNN_DATA_FLOAT;
   switch (t) {
     case framework::proto::VarType::FP16:
@@ -59,14 +60,14 @@ class ActivationDescriptor {
   struct Deleter {
     void operator()(T* t) {
       if (t != nullptr) {
-        PADDLE_ENFORCE(dynload::cudnnDestroyActivationDescriptor(t));
+        CUDNN_ENFORCE(dynload::cudnnDestroyActivationDescriptor(t));
         t = nullptr;
       }
     }
   };
   ActivationDescriptor() {
     T* raw_ptr;
-    PADDLE_ENFORCE(dynload::cudnnCreateActivationDescriptor(&raw_ptr));
+    CUDNN_ENFORCE(dynload::cudnnCreateActivationDescriptor(&raw_ptr));
     desc_.reset(raw_ptr);
   }
   template <typename T>
@@ -88,14 +89,14 @@ class TensorDescriptor {
   struct Deleter {
     void operator()(T* t) {
       if (t != nullptr) {
-        PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(t));
+        CUDNN_ENFORCE(dynload::cudnnDestroyTensorDescriptor(t));
         t = nullptr;
       }
     }
   };
   TensorDescriptor() {
     T* raw_ptr;
-    PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
+    CUDNN_ENFORCE(dynload::cudnnCreateTensorDescriptor(&raw_ptr));
     desc_.reset(raw_ptr);
   }
   T* desc() { return desc_.get(); }
@@ -111,7 +112,7 @@ class TensorDescriptor {
     if (groups > 1) {
       dims_with_group[1] = dims_with_group[1] / groups;
     }
-    PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
+    CUDNN_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
         desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(),
         dims_with_group.data(), strides.data()));
   }
@@ -120,5 +121,83 @@ class TensorDescriptor {
   std::unique_ptr<T, Deleter> desc_;
 };

+class FilterDescriptor {
+ public:
+  using T = cudnnFilterStruct;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        CUDNN_ENFORCE(dynload::cudnnDestroyFilterDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
+  FilterDescriptor() {
+    T* raw_ptr;
+    CUDNN_ENFORCE(dynload::cudnnCreateFilterDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
+  }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
+
+  void set(const Tensor& tensor, const cudnnTensorFormat_t format,
+           const int groups = 1) {
+    auto dims = framework::vectorize2int(tensor.dims());
+    if (groups > 1) {
+      dims[1] = dims[1] / groups;
+    }
+    CUDNN_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
+        desc_.get(), ToCudnnDataType(tensor.type()), format, dims.size(),
+        dims.data()));
+  }
+
+ private:
+  std::unique_ptr<T, Deleter> desc_;
+};
+
+class ConvolutionDescriptor {
+ public:
+  using T = cudnnConvolutionStruct;
+  struct Deleter {
+    void operator()(T* t) {
+      if (t != nullptr) {
+        CUDNN_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(t));
+        t = nullptr;
+      }
+    }
+  };
+  ConvolutionDescriptor() {
+    T* raw_ptr;
+    CUDNN_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&raw_ptr));
+    desc_.reset(raw_ptr);
+  }
+  T* desc() { return desc_.get(); }
+  T* desc() const { return desc_.get(); }
+
+  void set(cudnnDataType_t dtype, const std::vector<int>& pads,
+           const std::vector<int>& strides, const std::vector<int>& dilations,
+           const int groups = 1) {
+    cudnnDataType_t compute_type =
+        (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
+    T* desc = desc_.get();
+    CUDNN_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
+        desc, pads.size(), pads.data(), strides.data(), dilations.data(),
+        CUDNN_CROSS_CORRELATION, compute_type));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+        desc, CUDNN_DEFAULT_MATH));
+#if CUDNN_VERSION_MIN(7, 0, 1)
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnSetConvolutionGroupCount(desc, groups));
+    if (dtype == CUDNN_DATA_HALF) {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          desc, CUDNN_TENSOR_OP_MATH));
+    }
+#endif
+  }
+
+ private:
+  std::unique_ptr<T, Deleter> desc_;
+};
+
 }  // namespace platform
 }  // namespace paddle
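
Each wrapper above pairs a cudnnCreate*/cudnnDestroy* call through a unique_ptr with a custom Deleter, so a descriptor cannot leak even if setup throws. A rough Python analog of that ownership pattern (generic sketch, not tied to any cuDNN binding):

```python
import contextlib

@contextlib.contextmanager
def managed_descriptor(create, destroy):
    # `create` and `destroy` stand in for a cudnnCreate*/cudnnDestroy* pair;
    # the finally-block plays the role of Deleter::operator().
    handle = create()
    try:
        yield handle
    finally:
        destroy(handle)

# Usage sketch (hypothetical functions):
# with managed_descriptor(create_filter_desc, destroy_filter_desc) as desc:
#     set_filter_nd(desc, dtype, layout, dims)
```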

@@ -472,6 +472,9 @@ inline std::string TensorDTypeToPyDTypeStr(
 }  // namespace details

 inline py::array TensorToPyArray(const framework::Tensor &tensor) {
+  if (!tensor.IsInitialized()) {
+    return py::array();
+  }
   bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
   const auto &tensor_dims = tensor.dims();
   auto tensor_dtype = tensor.type();

@@ -119,7 +119,10 @@ def as_numpy(tensor):
             They can not be completely cast to Python ndarray. \
             Please set the parameter 'return_numpy' as 'False' to \
             return LoDTensor itself directly.")
-    return np.array(tensor)
+    if tensor._is_initialized():
+        return np.array(tensor)
+    else:
+        return None


 def has_feed_operators(block, feed_targets, feed_holder_name):
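
Together with the FetchOp and TensorToPyArray changes above, a fetched variable whose tensor was never written (for example, a gradient that does not reach some input) now comes back as None rather than failing during the copy. A hedged usage sketch, where exe, prog, feed, and fetch_vars are placeholders for an executor, program, feed dict, and fetch targets built elsewhere:

```python
results = exe.run(prog, feed=feed, fetch_list=fetch_vars)

# Callers must now be None-aware, exactly as _compute_analytical_jacobian
# is below: a None entry means "this value is absent / identically zero".
arrays = [r.flatten() if r is not None else None for r in results]
```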

@@ -82,6 +82,10 @@ def set_var_in_scope(scope, place, name, value, recursive_seq_len=None):
     return t


+def var_to_np_array_in_scope(scope, place, name):
+    return np.array(scope.var(name).get_tensor())
+
+
 def make_jacobian(x, y_size, np_dtype):
     if isinstance(x, fluid.framework.Variable):
         return np.zeros((_product(x.shape), y_size), dtype=np_dtype)
@@ -192,14 +196,18 @@ def _compute_analytical_jacobian(program, x, y, place, scope):
     x = _as_list(x)
     jacobian = make_jacobian(x, y_size, np_type)

     dx = _as_list(dx)
     for i in six.moves.xrange(y_size):
         _set_item(dy_t, i, 1, np_type)

         dx_res = exe.run(program, scope=scope, fetch_list=dx)

         for j in six.moves.xrange(len(x)):
-            jacobian[j][:, i] = dx_res[j].flatten()
+            if dx_res[j] is not None:
+                jacobian[j][:, i] = dx_res[j].flatten()
+            else:
+                jacobian[j][:, i] = np.zeros(
+                    dx[j].shape, dtype=np_type).flatten()

         _set_item(dy_t, i, 0, np_type)

     return jacobian
@@ -242,6 +250,7 @@ def grad_check(x,
     # check input arguments
     x = _as_list(x)
     y = _as_list(y)
+
     for v in x:
         v.stop_gradient = False
         v.persistable = True
@@ -274,9 +283,24 @@ def grad_check(x,
     ]

     # [y_idx, x_idx]
-    analytical = [
-        _compute_analytical_jacobian(program, x, yi, place, scope) for yi in y
-    ]
+    analytical = []
+    for yi in y:
+        prog = program.clone()
+        clone_x = []
+        clone_y = None
+        for b in prog.blocks:
+            if b.has_var(yi.name):
+                clone_y = b.var(yi.name)
+                break
+        for xi in x:
+            for b in prog.blocks:
+                if b.has_var(xi.name):
+                    clone_x.append(b.var(xi.name))
+                    break
+        analytical.append(
+            _compute_analytical_jacobian(prog, clone_x, clone_y, place, scope))

     for i, (x_idx,
             y_idx) in enumerate(product(*[range(len(x)), range(len(y))])):
@@ -334,6 +358,7 @@ def double_grad_check(x,
     if y_grads is None:
         scope = fluid.executor.global_scope()
         y_grads = []
+        y_grads_init = []
         for yi in y:
             dyi_name = _append_grad_suffix_(yi.name)
             np_type = dtype_to_np_dtype(yi.dtype)
@@ -343,9 +368,20 @@ def double_grad_check(x,
             v = np.random.random(size=yi.shape).astype(np_type)
             set_var_in_scope(scope, place, dyi_name, v)
             y_grads.append(dy)
+            y_grads_init.append(v)
     else:
         y_grads = _as_list(y_grads)
+        y_grads_init = [
+            var_to_np_array_in_scope(scope, place, v.name) for v in y_grads
+        ]

     # append first order grads
     target_grads = calc_gradient(y, x, y_grads)
+
+    # y_grads are inputs of the first-order backward,
+    # so they are also inputs of the second-order backward.
+    x += y_grads
+    x_init = _as_list(x_init)
+    x_init += y_grads_init
+
     grad_check(x, target_grads, x_init, place, program, eps, atol, rtol)
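
The reason for x += y_grads: the first-order gradients dx depend linearly on the injected dy, so the second-order check must perturb dy like any other input. A scalar model of this bookkeeping, with y = x**2 (illustrative only, not the checker's code):

```python
import numpy as np

x0, dy0 = 1.5, 0.7
dx = lambda x, dy: 2.0 * x * dy  # first-order backward of y = x**2

# double_grad_check effectively differentiates dx w.r.t. both x and dy:
eps = 1e-6
d_dx_dx = (dx(x0 + eps, dy0) - dx(x0, dy0)) / eps   # analytically 2 * dy0
d_dx_ddy = (dx(x0, dy0 + eps) - dx(x0, dy0)) / eps  # analytically 2 * x0
assert abs(d_dx_dx - 2 * dy0) < 1e-4
assert abs(d_dx_ddy - 2 * x0) < 1e-4
```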

@@ -46,7 +46,6 @@ class TestMulGradCheck(unittest.TestCase):
 class TestReluDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
         shape = [2, 8]
         eps = 0.005
         dtype = np.float64
@@ -71,7 +70,6 @@ class TestReluDoubleGradCheck(unittest.TestCase):
 class TestLeakyReluDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
         shape = [3, 7]
         eps = 0.005
         alpha = 0.2
@@ -79,6 +77,7 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase):
         x = layers.data('x', shape, False, dtype)
         x.persistable = True
         y = layers.leaky_relu(x, alpha=alpha)
+
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         x_arr[np.abs(x_arr) < 0.005] = 0.02
@@ -90,8 +89,30 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
             self.func(p)


+class TestConvDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        shape = [2, 4, 14, 16]
+        eps = 0.005
+        dtype = np.float64
+        x = layers.data('x', shape, False, dtype)
+        y = layers.conv2d(x, 4, 1, bias_attr=False)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+
+        w = fluid.default_main_program().global_block().all_parameters()
+        w_arr = []
+        for p in w:
+            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
+        gradient_checker.double_grad_check(
+            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        if core.is_compiled_with_cuda():
+            places = [fluid.CUDAPlace(0)]
+            for p in places:
+                self.func(p)
+
+
 if __name__ == "__main__":
