kunlun add op (#27890)

* add stack pool2d roi_align xpu op,test=kunlun * error message opt, test=kunlun * add xpu unittest,test=kunlun * skip check grad,test=kunlun * fix boostget , test=kunlun
4 years ago · b0edda4d99
parent 82eb486edd
commit b0edda4d99
6 changed files with 939 additions and 0 deletions
--- a/paddle/fluid/operators/pool_op_xpu.cc
+++ b/paddle/fluid/operators/pool_op_xpu.cc
@ -0,0 +1,165 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/pool_op.h"
+#include <unordered_map>
+
+#ifdef PADDLE_WITH_XPU
+namespace paddle {
+namespace operators {
+
+xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive,
+                              bool is_test) {
+  if (pooltype == "max") {
+    return xpu::Pooling_t::MAX_WITHOUT_INDEX;
+  } else if (pooltype == "avg") {
+    if (exclusive) {
+      return xpu::Pooling_t::AVG_WITHOUT_PAD;
+    } else {
+      return xpu::Pooling_t::AVG_WITH_PAD;
+    }
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Pool op only supports 2D and 3D input."));
+  }
+}
+template <typename DeviceContext, typename T>
+class PoolXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    bool exclusive = context.Attr<bool>("exclusive");
+    bool is_test = context.Attr<bool>("is_test");
+    bool adaptive = context.Attr<bool>("adaptive");
+    PADDLE_ENFORCE_EQ(!adaptive, true,
+                      platform::errors::InvalidArgument(
+                          "XPU does not support adaptive == true!"));
+    PADDLE_ENFORCE_EQ(ksize.size(), 2,
+                      platform::errors::InvalidArgument(
+                          "XPU only support 2 dimension pooling!"));
+    int* index_data = nullptr;
+    if (context.Attr<bool>("global_pooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[i] = 0;
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      }
+    }
+    const int c = in_x->dims()[0] * in_x->dims()[1];
+    const int in_h = in_x->dims()[2];
+    const int in_w = in_x->dims()[3];
+    const int out_h = out->dims()[2];
+    const int out_w = out->dims()[3];
+    const int win_h = ksize[0];
+    const int win_w = ksize[1];
+    const int stride_h = strides[0];
+    const int stride_w = strides[1];
+    const int pad_up = paddings[0];
+    const int pad_down = paddings[0];
+    const int pad_left = paddings[1];
+    const int pad_right = paddings[1];
+    const float* input = in_x->data<float>();
+    out->mutable_data<T>(context.GetPlace());
+    float* output = out->data<float>();
+    xpu::Pooling_t pool_type = XPUPoolingType(pooling_type, exclusive, is_test);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int r = xpu::pooling_forward<float, float>(
+        dev_ctx.x_context(), input, output, index_data, pool_type, c, in_h,
+        in_w, pad_left, pad_right, pad_up, pad_down, win_h, win_w, stride_h,
+        stride_w, out_h, out_w);
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::InvalidArgument("pool2d XPU kernel error!"));
+  }
+};
+template <typename DeviceContext, typename T>
+class PoolGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    const Tensor* out = context.Input<Tensor>("Out");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    bool exclusive = context.Attr<bool>("exclusive");
+    bool adaptive = context.Attr<bool>("adaptive");
+    const int* index_data = nullptr;
+    PADDLE_ENFORCE_EQ(!adaptive, true,
+                      platform::errors::InvalidArgument(
+                          "XPU does not support adaptive == true!"));
+    PADDLE_ENFORCE_EQ(ksize.size(), 2,
+                      platform::errors::InvalidArgument(
+                          "XPU only support 2 dimension pooling!"));
+    if (context.Attr<bool>("global_pooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[i] = 0;
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      }
+    }
+    if (!in_x_grad) {
+      return;
+    }
+    const int c = in_x->dims()[0] * in_x->dims()[1];
+    const int in_h = in_x->dims()[2];
+    const int in_w = in_x->dims()[3];
+    const int out_h = out->dims()[2];
+    const int out_w = out->dims()[3];
+    const int win_h = ksize[0];
+    const int win_w = ksize[1];
+    const int stride_h = strides[0];
+    const int stride_w = strides[1];
+    const int pad_up = paddings[0];
+    const int pad_down = paddings[0];
+    const int pad_left = paddings[1];
+    const int pad_right = paddings[1];
+    const float* input = in_x->data<float>();
+    const float* output = out->data<float>();
+    const float* output_grad = out_grad->data<float>();
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    float* input_grad = in_x_grad->data<float>();
+    xpu::Pooling_t pool_type = XPUPoolingType(pooling_type, exclusive, false);
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    // Need to init memory in the first place
+    const int zero = 0;
+    int r =
+        xpu::memset(dev_ctx.x_context(), reinterpret_cast<void**>(input_grad),
+                    zero, in_x_grad->numel() * sizeof(float));
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::InvalidArgument(
+                          "There are pool2d grad XPU kernel error raised!"));
+    r = xpu::pooling_backward(dev_ctx.x_context(), input, output, index_data,
+                              output_grad, input_grad, pool_type, c, in_h, in_w,
+                              pad_left, pad_right, pad_up, pad_down, win_h,
+                              win_w, stride_h, stride_w, out_h, out_w);
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::InvalidArgument(
+                          "There are pool2d grad XPU kernel error raised!"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    pool2d, ops::PoolXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    pool2d_grad,
+    ops::PoolGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
--- a/paddle/fluid/operators/roi_align_op_xpu.cc
+++ b/paddle/fluid/operators/roi_align_op_xpu.cc
@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/roi_align_op.h"
+#include <memory>
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class XPUROIAlignOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int rois_num = rois->dims()[0];
+    const T* input_data = in->data<T>();
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch_size,
+        platform::errors::InvalidArgument(
+            "The rois_batch_size and imgs batch_size must be the same."));
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
+                      platform::errors::InvalidArgument(
+                          "The rois_num from input and lod must be the same."));
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    const T* rois_data = rois->data<T>();
+    for (int n = 0; n < rois_batch_size; n++) {
+      int cur_batch_rois_num = rois_lod[n + 1] - rois_lod[n];
+      if (cur_batch_rois_num != 0) {
+        int r = xpu::roi_align(
+            dev_ctx.x_context(), input_data + n * channels * height * width,
+            rois_data + rois_lod[n] * 4, cur_batch_rois_num, channels, height,
+            width, pooled_height, pooled_width, sampling_ratio, spatial_scale,
+            output_data +
+                rois_lod[n] * channels * pooled_height * pooled_width);
+        PADDLE_ENFORCE_EQ(
+            r, xpu::Error_t::SUCCESS,
+            platform::errors::InvalidArgument("roi_align XPU kernel error!"));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    roi_align,
+    ops::XPUROIAlignOpKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
--- a/paddle/fluid/operators/stack_op_xpu.cc
+++ b/paddle/fluid/operators/stack_op_xpu.cc
@ -0,0 +1,75 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/stack_op.h"
+#ifdef PADDLE_WITH_XPU
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+template <typename DeviceContext, typename T>
+class StackXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto x = ctx.MultiInput<Tensor>("X");
+    auto* y = ctx.Output<Tensor>("Y");
+    int axis = ctx.Attr<int>("axis");
+    if (axis < 0) {
+      axis += (x[0]->dims().size() + 1);
+    }
+    int n = static_cast<int>(x.size());
+    PADDLE_ENFORCE_LE(n, 24,
+                      platform::errors::InvalidArgument(
+                          "XPU only surpport at most 24 tensors for now"));
+    auto* y_data = y->mutable_data<T>(ctx.GetPlace());
+    int pre = 1, post = 1;
+    auto& dim = x[0]->dims();
+    for (auto i = 0; i < axis; ++i) {
+      pre *= dim[i];
+    }
+    for (auto i = axis; i < dim.size(); ++i) {
+      post *= dim[i];
+    }
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    void* x_datas_host = std::malloc(n * sizeof(void*));
+    void* x_datas_device = nullptr;
+    PADDLE_ENFORCE(xpu_malloc(reinterpret_cast<void**>(&x_datas_device),
+                              n * sizeof(void*)) == XPU_SUCCESS);
+    for (auto i = 0; i < n; ++i) {
+      ((const void**)x_datas_host)[i] = x[i]->data<T>();
+    }
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
+                 x_datas_device, platform::CPUPlace(), x_datas_host,
+                 n * sizeof(void*));
+    int r = xpu::stack_forward<float>(dev_ctx.x_context(), pre, post, n,
+                                      x_datas_device, y_data);
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::InvalidArgument(
+                          "There are stack XPU kernel error raised!"));
+    dev_ctx.Wait();
+    std::free(x_datas_host);
+    xpu_free(x_datas_device);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace plat = paddle::platform;
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(stack,
+                       ops::StackXPUKernel<plat::XPUDeviceContext, float>);
+#endif
--- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
--- a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py
@ -0,0 +1,183 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import sys
+sys.path.append("..")
+import unittest
+import math
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+@skip_check_grad_ci(reason="There is no grad kernel for roi_align_xpu kernel.")
+class TestROIAlignOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.calc_roi_align()
+
+        self.inputs = {
+            'X': self.x,
+            'ROIs': (self.rois[:, 1:5], self.rois_lod),
+        }
+        self.attrs = {
+            'spatial_scale': self.spatial_scale,
+            'pooled_height': self.pooled_height,
+            'pooled_width': self.pooled_width,
+            'sampling_ratio': self.sampling_ratio
+        }
+
+        self.outputs = {'Out': self.out_data}
+
+    def init_test_case(self):
+        self.batch_size = 3
+        self.channels = 3
+        self.height = 8
+        self.width = 6
+
+        # n, c, h, w
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
+
+        self.spatial_scale = 1.0 / 2.0
+        self.pooled_height = 2
+        self.pooled_width = 2
+        self.sampling_ratio = -1
+
+        self.x = np.random.random(self.x_dim).astype('float64')
+
+    def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w,
+                 bin_size_h, bin_size_w):
+        count = roi_bin_grid_h * roi_bin_grid_w
+        bilinear_pos = np.zeros(
+            [self.channels, self.pooled_height, self.pooled_width, count, 4],
+            np.float64)
+        bilinear_w = np.zeros(
+            [self.pooled_height, self.pooled_width, count, 4], np.float64)
+        for ph in range(self.pooled_width):
+            for pw in range(self.pooled_height):
+                c = 0
+                for iy in range(roi_bin_grid_h):
+                    y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \
+                        bin_size_h / roi_bin_grid_h
+                    for ix in range(roi_bin_grid_w):
+                        x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \
+                            bin_size_w / roi_bin_grid_w
+                        if y < -1.0 or y > self.height or \
+                               x < -1.0 or x > self.width:
+                            continue
+                        if y <= 0:
+                            y = 0
+                        if x <= 0:
+                            x = 0
+                        y_low = int(y)
+                        x_low = int(x)
+                        if y_low >= self.height - 1:
+                            y = y_high = y_low = self.height - 1
+                        else:
+                            y_high = y_low + 1
+                        if x_low >= self.width - 1:
+                            x = x_high = x_low = self.width - 1
+                        else:
+                            x_high = x_low + 1
+                        ly = y - y_low
+                        lx = x - x_low
+                        hy = 1 - ly
+                        hx = 1 - lx
+                        for ch in range(self.channels):
+                            bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low,
+                                                                 x_low]
+                            bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low,
+                                                                 x_high]
+                            bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high,
+                                                                 x_low]
+                            bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high,
+                                                                 x_high]
+                        bilinear_w[ph, pw, c, 0] = hy * hx
+                        bilinear_w[ph, pw, c, 1] = hy * lx
+                        bilinear_w[ph, pw, c, 2] = ly * hx
+                        bilinear_w[ph, pw, c, 3] = ly * lx
+                        c = c + 1
+        return bilinear_pos, bilinear_w
+
+    def calc_roi_align(self):
+        self.out_data = np.zeros(
+            (self.rois_num, self.channels, self.pooled_height,
+             self.pooled_width)).astype('float64')
+
+        for i in range(self.rois_num):
+            roi = self.rois[i]
+            roi_batch_id = int(roi[0])
+            x_i = self.x[roi_batch_id]
+            roi_xmin = roi[1] * self.spatial_scale
+            roi_ymin = roi[2] * self.spatial_scale
+            roi_xmax = roi[3] * self.spatial_scale
+            roi_ymax = roi[4] * self.spatial_scale
+            roi_width = max(roi_xmax - roi_xmin, 1)
+            roi_height = max(roi_ymax - roi_ymin, 1)
+            bin_size_h = float(roi_height) / float(self.pooled_height)
+            bin_size_w = float(roi_width) / float(self.pooled_width)
+            roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \
+                                 math.ceil(roi_height / self.pooled_height)
+            roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \
+                                 math.ceil(roi_width / self.pooled_width)
+            count = int(roi_bin_grid_h * roi_bin_grid_w)
+            pre_size = count * self.pooled_width * self.pooled_height
+            bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin,
+                                                     int(roi_bin_grid_h),
+                                                     int(roi_bin_grid_w),
+                                                     bin_size_h, bin_size_w)
+            for ch in range(self.channels):
+                align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1)
+                output_val = align_per_bin.mean(axis=-1)
+                self.out_data[i, ch, :, :] = output_val
+
+    def make_rois(self):
+        rois = []
+        self.rois_lod = [[]]
+        for bno in range(self.batch_size):
+            self.rois_lod[0].append(bno + 1)
+            for i in range(bno + 1):
+                x1 = np.random.random_integers(
+                    0, self.width // self.spatial_scale - self.pooled_width)
+                y1 = np.random.random_integers(
+                    0, self.height // self.spatial_scale - self.pooled_height)
+
+                x2 = np.random.random_integers(x1 + self.pooled_width,
+                                               self.width // self.spatial_scale)
+                y2 = np.random.random_integers(
+                    y1 + self.pooled_height, self.height // self.spatial_scale)
+
+                roi = [bno, x1, y1, x2, y2]
+                rois.append(roi)
+        self.rois_num = len(rois)
+        self.rois = np.array(rois).astype("float64")
+
+    def setUp(self):
+        self.op_type = "roi_align"
+        self.set_data()
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
@ -0,0 +1,100 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import sys
+sys.path.append("..")
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+@skip_check_grad_ci(reason="There is no grad kernel for stack_xpu op.")
+class TestStackOpBase(OpTest):
+    def initDefaultParameters(self):
+        self.num_inputs = 4
+        self.input_dim = (5, 6, 7)
+        self.axis = 0
+        self.dtype = 'float64'
+
+    def initParameters(self):
+        pass
+
+    def get_x_names(self):
+        x_names = []
+        for i in range(self.num_inputs):
+            x_names.append('x{}'.format(i))
+        return x_names
+
+    def setUp(self):
+        self.initDefaultParameters()
+        self.initParameters()
+        self.op_type = 'stack'
+        self.x = []
+        for i in range(self.num_inputs):
+            self.x.append(
+                np.random.random(size=self.input_dim).astype(self.dtype))
+
+        tmp = []
+        x_names = self.get_x_names()
+        for i in range(self.num_inputs):
+            tmp.append((x_names[i], self.x[i]))
+
+        self.inputs = {'X': tmp}
+        self.outputs = {'Y': np.stack(self.x, axis=self.axis)}
+        self.attrs = {'axis': self.axis}
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            paddle.enable_static()
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+
+class TestStackOp1(TestStackOpBase):
+    def initParameters(self):
+        self.num_inputs = 16
+
+
+class TestStackOp2(TestStackOpBase):
+    def initParameters(self):
+        self.num_inputs = 20
+
+
+class TestStackOp3(TestStackOpBase):
+    def initParameters(self):
+        self.axis = -1
+
+
+class TestStackOp4(TestStackOpBase):
+    def initParameters(self):
+        self.axis = -4
+
+
+class TestStackOp5(TestStackOpBase):
+    def initParameters(self):
+        self.axis = 1
+
+
+class TestStackOp6(TestStackOpBase):
+    def initParameters(self):
+        self.axis = 3
+
+
+if __name__ == '__main__':
+    unittest.main()