[Dy2Stat]Support LoDTensorArray for slice op (#23091)

* Support LoDTensorArray as the input of the slice op.
* Support reading elements of a list in dygraph_to_static.
* Fix InferShape and add a test for InferShape.
* Support Tensor for Attr(starts) and Attr(ends).
* Use the new interfaces in VarTypeInference.
5 years ago · 803559499d
parent 84cf5db854
commit 803559499d
4 changed files with 304 additions and 32 deletions
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@ -33,13 +33,32 @@ class SliceOp : public framework::OperatorWithKernel {

    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
                      "Output (Out) of slice op should not be null.");
-
+    auto x_var_type = ctx->GetInputsVarType("Input")[0];
+    auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
+      PADDLE_ENFORCE_EQ(axes.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "The size of axes must be 1 when the Input of "
+                            "SliceOp is LoDTensorArray, "
+                            "but received %d.",
+                            axes.size()));
+      if (ctx->IsRuntime()) {
+        // When the var type of input is LOD_TENSOR_ARRAY, the output shape
+        // is determined by SliceKernel::Compute at runtime.
+        return;
+      } else {
+        // NOTE: A better way is needed to get the accurate dims of a tensor
+        // array. The dim returned by GetInputDim("Input") is the dim of the
+        // last item written into TensorArray "Input". Maybe it's a bug to fix.
+        ctx->SetOutputDim("Out", ctx->GetInputDim("Input"));
+        return;
+      }
+    }
    auto in_dims = ctx->GetInputDim("Input");
    PADDLE_ENFORCE_LT(in_dims.size(), 7,
                      "The rank of input should be less than 7.");
    framework::DDim out_dims(in_dims);

-    auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
    auto starts = ctx->Attrs().Get<std::vector<int>>("starts");
    auto ends = ctx->Attrs().Get<std::vector<int>>("ends");
    auto infer_flags = ctx->Attrs().Get<std::vector<int>>("infer_flags");
@ -146,6 +165,25 @@ class SliceOp : public framework::OperatorWithKernel {
  }
 };

+class SliceOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto x_name = "Input";
+    auto out_name = "Out";
+    auto decrease_axis = ctx->GetAttr("decrease_axis");
+    auto not_decrease = boost::get<std::vector<int>>(decrease_axis).size() == 0;
+    if (not_decrease) {
+      // Out defaults to LoDTensor. When decrease_axis is empty (no axis is
+      // dropped), Out must instead mirror Input: both its variable type and
+      // its data type are copied from Input here. For example, slicing a
+      // LoDTensorArray without decreasing any axis yields a LoDTensorArray,
+      // whereas a non-empty decrease_axis leaves the default type untouched.
+      ctx->SetOutputType(out_name, ctx->GetInputType(x_name));
+      ctx->SetOutputDataType(out_name, ctx->GetInputDataType(x_name));
+    }
+  }
+};
+
 class SliceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
@ -236,6 +274,14 @@ class SliceOpGrad : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, "Input should not be null");
    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
                      "Input(Out@GRAD) should not be null");
+    auto x_var_type = ctx->GetInputsVarType("Input")[0];
+    if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
+      // When the var type of input is LOD_TENSOR_ARRAY, the output shape
+      // is determined by SliceGradKernel::Compute at runtime.
+      if (ctx->IsRuntime()) {
+        return;
+      }
+    }
    auto x_dims = ctx->GetInputDim("Input");
    auto x_grad_name = framework::GradVarName("Input");
    if (ctx->HasOutput(x_grad_name)) {
@ -262,6 +308,21 @@ class SliceOpGrad : public framework::OperatorWithKernel {
  }
 };

+class SliceOpGradVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto x = "Input";
+    auto d_out = framework::GradVarName("Out");
+    auto out = framework::GradVarName("Input");
+    // The gradient of Input must have the same variable type as Input itself.
+    // Outputs default to LoDTensor, but Input may be either a LoDTensor or a
+    // LoDTensorArray, so the variable type is copied from Input explicitly.
+    // The element data type, however, is taken from the gradient of Out.
+    ctx->SetOutputType(out, ctx->GetInputType(x));
+    ctx->SetOutputDataType(out, ctx->GetInputDataType(d_out));
+  }
+};
+
 template <typename T>
 class SliceOpGradMaker : public framework::SingleGradOpMaker<T> {
 public:
@ -324,11 +385,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SliceOpGradNoNeedBufferVarsInference,
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker,
                  ops::SliceOpGradMaker<paddle::framework::OpDesc>,
-                  ops::SliceOpGradMaker<paddle::imperative::OpBase>);
+                  ops::SliceOpGradMaker<paddle::imperative::OpBase>,
+                  ops::SliceOpVarTypeInference);
 REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad,
                  ops::SliceDoubleOpGradMaker<paddle::framework::OpDesc>,
                  ops::SliceDoubleOpGradMaker<paddle::imperative::OpBase>,
-                  ops::SliceOpGradNoNeedBufferVarsInference);
+                  ops::SliceOpGradNoNeedBufferVarsInference,
+                  ops::SliceOpGradVarTypeInference);

 REGISTER_OP_CPU_KERNEL(
    slice, ops::SliceKernel<paddle::platform::CPUDeviceContext, int>,
--- a/paddle/fluid/operators/slice_op.h
+++ b/paddle/fluid/operators/slice_op.h
@ -17,6 +17,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"

 namespace paddle {
 namespace operators {
@ -58,7 +59,12 @@ template <typename DeviceContext, typename T>
 class SliceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    int rank = ctx.Input<framework::Tensor>("Input")->dims().size();
+    const framework::Variable* input_var = ctx.InputVar("Input");
+    bool is_tensor_array = input_var->IsType<framework::LoDTensorArray>();
+    int rank = is_tensor_array
+                   ? 1
+                   : ctx.Input<framework::Tensor>("Input")->dims().size();
+
    switch (rank) {
      case 1:
        SliceCompute<1>(ctx);
@ -86,17 +92,17 @@ class SliceKernel : public framework::OpKernel<T> {
  void SliceCompute(const framework::ExecutionContext& context) const {
    auto& place =
        *context.template device_context<DeviceContext>().eigen_device();
-    auto in = context.Input<framework::Tensor>("Input");
-    auto out = context.Output<framework::Tensor>("Out");
-    auto out_dims = out->dims();
-    auto in_dims = in->dims();
+    const framework::Variable* input_var = context.InputVar("Input");
+    framework::Variable* out_var = context.OutputVar("Out");
+    bool input_is_tensor_array = input_var->IsType<framework::LoDTensorArray>();
+    bool out_is_tensor_array = out_var->IsType<framework::LoDTensorArray>();

    auto axes = context.Attr<std::vector<int>>("axes");
    auto starts = context.Attr<std::vector<int>>("starts");
+
    auto ends = context.Attr<std::vector<int>>("ends");
    auto decrease_axis = context.Attr<std::vector<int>>("decrease_axis");
    auto infer_flags = context.Attr<std::vector<int>>("infer_flags");
-
    auto list_new_ends_tensor =
        context.MultiInput<framework::Tensor>("EndsTensorList");
    auto list_new_starts_tensor =
@ -109,7 +115,6 @@ class SliceKernel : public framework::OpKernel<T> {
    if (list_new_starts_tensor.size() > 0 || list_new_ends_tensor.size() > 0) {
      need_infer = true;
    }
-
    if (need_infer) {
      if (context.HasInput("StartsTensor")) {
        auto* starts_tensor = context.Input<framework::Tensor>("StartsTensor");
@ -117,17 +122,70 @@ class SliceKernel : public framework::OpKernel<T> {
      } else if (list_new_starts_tensor.size() > 0) {
        starts = get_new_data_from_tensorlist(list_new_starts_tensor);
      }
-      PADDLE_ENFORCE_EQ(
-          starts.size(), axes.size(),
-          "The size of starts must be equal to the size of axes.");
      if (context.HasInput("EndsTensor")) {
        auto* ends_tensor = context.Input<framework::Tensor>("EndsTensor");
        ends = get_new_data_from_tensor(ends_tensor);
      } else if (list_new_ends_tensor.size() > 0) {
        ends = get_new_data_from_tensorlist(list_new_ends_tensor);
      }
-      PADDLE_ENFORCE_EQ(ends.size(), axes.size(),
-                        "The size of ends must be equal to the size of axes.");
+    }
+    PADDLE_ENFORCE_EQ(
+        starts.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of starts must be equal to the size of axes."));
+    PADDLE_ENFORCE_EQ(
+        ends.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of ends must be equal to the size of axes."));
+    if (input_is_tensor_array) {
+      auto in_array = context.Input<framework::LoDTensorArray>("Input");
+      // If the input is LoDTensorArray, the rank of input is 1.
+      int in_size = in_array->size();
+      int start = starts[0] < 0 ? (starts[0] + in_size) : starts[0];
+      int end = ends[0] < 0 ? (ends[0] + in_size) : ends[0];
+      start = std::max(start, 0);
+      end = std::max(end, 0);
+      end = std::min(end, in_size);
+
+      PADDLE_ENFORCE_GT(end, start,
+                        platform::errors::InvalidArgument(
+                            "Attr(ends) should be greater than attr(starts) in "
+                            "slice op. But received ends = %d, starts = %d.",
+                            end, start));
+      int out_size = end - start;
+
+      if (out_is_tensor_array) {
+        auto out_array = context.Output<framework::LoDTensorArray>("Out");
+        out_array->resize(out_size);
+
+        for (int i = 0; i < out_size; ++i) {
+          auto* out_tensor = &out_array->at(i);
+          auto in_tensor = in_array->at(i + start);
+          out_tensor->set_lod(in_tensor.lod());
+          if (in_tensor.memory_size() > 0) {
+            TensorCopy(in_tensor, context.GetPlace(), out_tensor);
+          } else {
+            VLOG(10)
+                << "WARNING: The input tensor 'x_tensor' holds no memory, so "
+                   "nothing has been written to output array["
+                << i << "].";
+          }
+        }
+      } else {
+        auto out = context.Output<framework::Tensor>("Out");
+        auto in_tensor = in_array->at(start);
+        TensorCopy(in_tensor, context.GetPlace(), out);
+      }
+
+      return;
+    }
+
+    auto in = context.Input<framework::Tensor>("Input");
+    auto out = context.Output<framework::Tensor>("Out");
+
+    auto out_dims = out->dims();
+    auto in_dims = in->dims();
+    if (need_infer) {
      out_dims = in_dims;
      int dim_value, start, end;
      for (size_t i = 0; i < axes.size(); ++i) {
@ -233,7 +291,12 @@ template <typename DeviceContext, typename T>
 class SliceGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    size_t rank = ctx.Input<framework::Tensor>("Input")->dims().size();
+    const framework::Variable* input_var = ctx.InputVar("Input");
+    bool is_tensor_array = input_var->IsType<framework::LoDTensorArray>();
+    size_t rank = is_tensor_array
+                      ? 1
+                      : ctx.Input<framework::Tensor>("Input")->dims().size();
+
    switch (rank) {
      case 1:
        SliceCompute<1>(ctx);
@ -261,17 +324,9 @@ class SliceGradKernel : public framework::OpKernel<T> {
  void SliceCompute(const framework::ExecutionContext& context) const {
    auto& place =
        *context.template device_context<DeviceContext>().eigen_device();
-    auto* d_out =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_input =
-        context.Output<framework::Tensor>(framework::GradVarName("Input"));
-    d_input->mutable_data<T>(context.GetPlace());
-    auto out_dims = d_out->dims();
-    auto in_dims = d_input->dims();
    auto axes = context.Attr<std::vector<int>>("axes");
    auto starts = context.Attr<std::vector<int>>("starts");
    auto ends = context.Attr<std::vector<int>>("ends");
-
    auto list_new_ends_tensor =
        context.MultiInput<framework::Tensor>("EndsTensorList");
    auto list_new_starts_tensor =
@ -290,6 +345,66 @@ class SliceGradKernel : public framework::OpKernel<T> {
      auto* ends_tensor = context.Input<framework::Tensor>("EndsTensor");
      ends = get_new_data_from_tensor(ends_tensor);
    }
+    framework::Variable* d_input_var =
+        context.OutputVar(framework::GradVarName("Input"));
+    const framework::Variable* d_out_var =
+        context.InputVar(framework::GradVarName("Out"));
+    bool d_input_is_tensor_array =
+        d_input_var->IsType<framework::LoDTensorArray>();
+    bool d_out_is_tensor_array = d_out_var->IsType<framework::LoDTensorArray>();
+
+    if (d_input_is_tensor_array) {
+      auto* input_array = context.Input<framework::LoDTensorArray>("Input");
+      auto* d_input_array = context.Output<framework::LoDTensorArray>(
+          framework::GradVarName("Input"));
+
+      int d_in_size = input_array->size();
+      d_input_array->resize(d_in_size);
+      // If the input is LoDTensorArray, the rank of input is 1.
+      // So only use the 0th element of starts.
+      int start = starts[0] < 0 ? (starts[0] + d_in_size) : starts[0];
+      start = std::max(start, 0);
+      // set zero
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      auto& dev_ctx = *pool.Get(context.GetPlace());
+      T value = 0.0;
+      math::SetConstant<DeviceContext, T> functor;
+      for (int i = 0; i < d_in_size; ++i) {
+        auto dim = input_array->at(i).dims();
+        d_input_array->at(i).Resize(dim);
+        d_input_array->at(i).mutable_data<T>(context.GetPlace());
+        functor(reinterpret_cast<const DeviceContext&>(dev_ctx),
+                &d_input_array->at(i), static_cast<T>(value));
+      }
+
+      if (d_out_is_tensor_array) {
+        auto* d_out_array = context.Input<framework::LoDTensorArray>(
+            framework::GradVarName("Out"));
+        int d_out_size = d_out_array->size();
+        for (int i = 0; i < d_out_size; ++i) {
+          TensorCopy(d_out_array->at(i), context.GetPlace(),
+                     &(d_input_array->at(start + i)));
+        }
+
+      } else {
+        auto* d_out =
+            context.Input<framework::Tensor>(framework::GradVarName("Out"));
+        TensorCopy(*d_out, context.GetPlace(), &(d_input_array->at(start)));
+      }
+      return;
+    }
+
+    auto* d_out =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+
+    auto* d_input =
+        context.Output<framework::Tensor>(framework::GradVarName("Input"));
+
+    d_input->mutable_data<T>(context.GetPlace());
+
+    auto out_dims = d_out->dims();
+    auto in_dims = d_input->dims();

    auto decrease_axis = context.Attr<std::vector<int>>("decrease_axis");
    if (decrease_axis.size() > 0) {
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py
@ -42,28 +42,29 @@ def test_slice_in_if(x):
                shape=[1, 2], value=9, dtype="int64"))
    if x.numpy()[0] > 0:
        a[0] = x
-    return a
+    out = a[0:]
+    return out


 def test_slice_in_while_loop(x, iter_num):
    x = fluid.dygraph.to_variable(x)
-    iter_num = fluid.layers.fill_constant(
+    iter_num_var = fluid.layers.fill_constant(
        shape=[1], value=iter_num, dtype="int32")
    a = []
    i = 0
    # Note: `i < iter_num_var` can't be supported in dygraph mode now, but PR22892
    # (https://github.com/PaddlePaddle/Paddle/pull/22892) is fixing it. Once it is
    # merged, change `i < iter_num_var.numpy()[0]` to `i < iter_num_var`.
-    while i < iter_num.numpy()[0]:
+    while i < iter_num_var.numpy()[0]:
        a.append(x)
        i += 1

    i = 0
-    while i < iter_num.numpy()[0]:
+    while i < iter_num_var.numpy()[0]:
        a[i] = fluid.layers.fill_constant(shape=[2], value=2, dtype="float32")
        i += 1
-
-    return a
+    out = a[0:iter_num]
+    return out


 def test_slice_in_for_loop(x, iter_num):
@ -79,7 +80,8 @@ def test_slice_in_for_loop(x, iter_num):

    for i in range(iter_num):
        a[i] = x
-    return a
+    out = a[2]
+    return out


 class TestSliceWithoutControlFlow(unittest.TestCase):
@ -148,6 +150,8 @@ class TestSliceInWhileLoop(TestSliceWithoutControlFlow):
    def run_dygraph_mode(self):
        with fluid.dygraph.guard():
            var_res = self.dygraph_func(self.input, self.iter_num)
+            if not isinstance(var_res, list):
+                var_res = [var_res]
            numpy_res = [ele.numpy() for ele in var_res]
            return numpy_res

@ -173,6 +177,15 @@ class TestSliceInForLoop(TestSliceInWhileLoop):
    def init_dygraph_func(self):
        self.dygraph_func = test_slice_in_for_loop

+    def run_static_mode(self):
+        """Convert self.dygraph_func with dygraph_to_static_func and run it.
+
+        The converted function is called inside a fresh Program, which is then
+        executed by an Executor on self.place. Returns whatever exe.run
+        fetches for the converted function's output.
+        """
+        # NOTE(review): presumably overridden because the for-loop test
+        # function now returns a single element (`a[2]`) instead of a list;
+        # confirm against the parent implementation, which is not shown here.
+        main_program = fluid.Program()
+        with fluid.program_guard(main_program):
+            static_out = dygraph_to_static_func(self.dygraph_func)(
+                self.input, self.iter_num)
+        exe = fluid.Executor(self.place)
+        numpy_res = exe.run(main_program, fetch_list=static_out)
+        return numpy_res
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@ -19,6 +19,7 @@ import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle.fluid.layers as layers


 # Situation 1: starts(list, no tensor), ends(list, no tensor)
@ -528,5 +529,85 @@ class TestSliceAPI(unittest.TestCase):
        assert np.array_equal(res_7, input[-1, 0:100, :, 2:-1])


+class TestSliceApiWithLoDTensorArray(unittest.TestCase):
+    """Checks indexing and slicing of a tensor array built with array_write.
+
+    Both the fetched forward output and the gradients of the three fed
+    inputs are compared against numpy references.
+    """
+
+    def setUp(self):
+        """Create the random input, the slice parameters and the executor."""
+        self.shape = (3, 4)
+        self.data = np.random.random(size=self.shape).astype('float32')
+        # NOTE(review): idx and end are not read by the test code visible
+        # here (case 2 derives its end from array_length) - confirm if needed.
+        self.idx = 0
+        self.start = 0
+        self.end = 2
+        self.axis = 1
+
+        # Run on GPU 0 when Paddle is compiled with CUDA, otherwise on CPU.
+        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        self.exe = fluid.Executor(self.place)
+
+    def set_program_and_run(self, main_program, case_num):
+        """Build the network for the given case in main_program and run it.
+
+        case_num == 1 reads a single element (`arr[0]`); case_num == 2 takes
+        the slice `arr[self.start:end]` and stacks it into one tensor.
+        The sliced variable is kept in self.sliced_arr; the fetched output
+        and the gradients of x0, x1, x2 are stored in self.out, self.g_x0,
+        self.g_x1 and self.g_x2.
+        """
+        with fluid.program_guard(main_program):
+            x = [
+                fluid.data(
+                    name='x0', shape=self.shape, dtype="float32"), fluid.data(
+                        name='x1', shape=self.shape, dtype="float32"),
+                fluid.data(
+                    name='x2', shape=self.shape, dtype="float32")
+            ]
+
+            # Gradients of the inputs are fetched below, so enable them.
+            for each_x in x:
+                each_x.stop_gradient = False
+
+            # Append x0, x1, x2 to the array, always writing at its current
+            # length.
+            arr = layers.create_array(dtype="float32")
+            for i in range(3):
+                idx = layers.array_length(arr)
+                arr = layers.array_write(x=x[i], i=idx, array=arr)
+
+            if case_num == 1:
+                self.sliced_arr = output = arr[0]
+
+            elif case_num == 2:
+                # The end index is a variable: array length minus one, cast
+                # to int32.
+                end = fluid.layers.array_length(arr) - 1
+                end = fluid.layers.cast(end, "int32")
+                self.sliced_arr = slice_arr = arr[self.start:end]
+                output, _ = fluid.layers.tensor_array_to_tensor(
+                    slice_arr, axis=self.axis, use_stack=True)
+
+            loss = fluid.layers.reduce_sum(output)
+            fluid.backward.append_backward(loss)
+            # Look up the gradient variables by the "<name>@GRAD" convention.
+            g_vars = list(
+                map(main_program.global_block().var,
+                    [each_x.name + "@GRAD" for each_x in x]))
+            # The same data is fed to all three inputs.
+            self.out, self.g_x0, self.g_x1, self.g_x2 = \
+                self.exe.run(main_program,
+                             feed = {'x0': self.data,
+                                     'x1': self.data,
+                                     'x2': self.data},
+                             fetch_list=[output] + g_vars)
+
+    def test_case_1(self):
+        """`arr[0]` is a LoDTensor; only x0 receives a gradient of ones."""
+        main_program = fluid.Program()
+        self.set_program_and_run(main_program, 1)
+
+        self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR)
+        self.assertEqual(self.sliced_arr.shape, self.shape)
+        self.assertTrue(np.array_equal(self.out, self.data))
+        self.assertTrue(np.array_equal(self.g_x0, np.ones_like(self.data)))
+        self.assertTrue(np.array_equal(self.g_x1, np.zeros_like(self.data)))
+        self.assertTrue(np.array_equal(self.g_x2, np.zeros_like(self.data)))
+
+    def test_case_2(self):
+        """The slice stays a LoDTensorArray; x0 and x1 get gradients of ones."""
+        main_program = fluid.Program()
+        self.set_program_and_run(main_program, 2)
+
+        self.assertTrue(
+            self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        self.assertEqual(self.sliced_arr.shape, self.shape)
+        # The expected output is the first two elements stacked along
+        # self.axis.
+        self.assertTrue(
+            np.array_equal(
+                self.out, np.stack(
+                    [self.data, self.data], axis=self.axis)))
+        self.assertTrue(np.array_equal(self.g_x0, np.ones_like(self.data)))
+        self.assertTrue(np.array_equal(self.g_x1, np.ones_like(self.data)))
+        self.assertTrue(np.array_equal(self.g_x2, np.zeros_like(self.data)))
+
+
 if __name__ == '__main__':
    unittest.main()