@@ -27,10 +27,6 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
     // that avoids modifying the variable in the Scope.
     Tensor filter = *context.Input<Tensor>("Filter");
     Tensor* output = context.Output<Tensor>("Output");
-    // Tensor* max_input = context.Output<Tensor>("MaxInput");
-    // Tensor* max_filter = context.Output<Tensor>("MaxFilter");
-    // max_input->mutable_data<T>(context.GetPlace());
-    // max_filter->mutable_data<T>(context.GetPlace());
     output->mutable_data<T>(context.GetPlace());
     int groups = context.Attr<int>("groups");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
@@ -43,52 +39,18 @@ class GemmConvXPUKernel : public framework::OpKernel<T> {
     const int f = static_cast<int>(filter.dims()[0]);
     const int win_h = static_cast<int>(filter.dims()[2]);
     const int win_w = static_cast<int>(filter.dims()[3]);
-    PADDLE_ENFORCE_EQ(
-        dilations[0] == 1 && dilations[1] == 1, true,
-        platform::errors::InvalidArgument("XPU only support dilation == 1."));
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    // PADDLE_ENFORCE_EQ(
-    //     xpu::findmax(dev_ctx.x_context(), input->data<T>(), input->numel(),
-    //                  max_input->data<T>()) == xpu::Error_t::SUCCESS,
-    //     true, platform::errors::InvalidArgument(
-    //               "XPU conv kernel error,can not finde max_input,please "
-    //               "check whether Baidu Kunlun "
-    //               "Card is properly installed."));
-    // PADDLE_ENFORCE_EQ(
-    //     xpu::findmax(dev_ctx.x_context(), filter.data<T>(), filter.numel(),
-    //                  max_filter->data<T>()) == xpu::Error_t::SUCCESS,
-    //     true, platform::errors::InvalidArgument(
-    //               "XPU conv kernel error,can not find max_filter,please "
-    //               "check whether Baidu Kunlun "
-    //               "Card is properly installed."));
-    if (groups == 1) {
-      int r = xpu::conv2d_forward_int16<float, float, float, float>(
-          dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
-          strides[0], strides[1], paddings[0], paddings[1], dilations[0],
-          dilations[1], groups, input->data<float>(), filter.data<float>(),
-          output->data<float>(), nullptr, nullptr, xpu::Activation_t::LINEAR,
-          nullptr, nullptr);
-      // max_input->data<float>(), max_filter->data<float>());
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU conv kernel return wrong value[%d], "
-                                     "please check whether Baidu Kunlun Card "
-                                     "is properly installed.",
-                                     r));
-    } else {
-      int r = xpu::conv2d_int16_with_group<float, float, float>(
-          dev_ctx.x_context(), input->data<float>(), filter.data<float>(),
-          output->data<float>(), batch_size, img_c, img_h, img_w, f, win_h,
-          win_w, groups, strides[0], strides[1], paddings[0], paddings[1],
-          nullptr, nullptr);
-      // max_input->data<float>(), max_filter->data<float>());
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU conv kernel return wrong value[%d], "
-                                     "please check whether Baidu Kunlun Card "
-                                     "is properly installed.",
-                                     r));
-    }
+    std::vector<int> k_size;
+    k_size.push_back(win_h);
+    k_size.push_back(win_w);
+    int r = xpu::conv2d<float, float, float, int16_t>(
+        dev_ctx.x_context(), input->data<float>(), filter.data<float>(),
+        output->data<float>(), batch_size, img_c, img_h, img_w, f, k_size,
+        strides, paddings, dilations, groups, nullptr, nullptr, nullptr, true);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU conv kernel return wrong value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
   }
 };
 template <typename DeviceContext, typename T>
@@ -96,9 +58,6 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
-    // const Tensor* max_input = context.Input<Tensor>("MaxInput");
-    // const Tensor* max_filter = context.Input<Tensor>("MaxFilter");
-    // Tensor* max_output_grad = context.Output<Tensor>("MaxOutputGrad");
     const Tensor* output_grad =
         context.Input<Tensor>(framework::GradVarName("Output"));
     Tensor* input_grad =
@@ -115,11 +74,6 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
     const int batch_size = static_cast<int>(input->dims()[0]);
-    PADDLE_ENFORCE_EQ(groups == 1, true, platform::errors::InvalidArgument(
-                                             "XPU only support groups == 1."));
-    PADDLE_ENFORCE_EQ(
-        dilations[0] == 1 && dilations[1] == 1, true,
-        platform::errors::InvalidArgument("XPU only support dilation == 1."));
     const int img_c = static_cast<int>(input->dims()[1]);
     const int img_h = static_cast<int>(input->dims()[2]);
     const int img_w = static_cast<int>(input->dims()[3]);
@@ -133,52 +87,24 @@ class GemmConvGradXPUKernel : public framework::OpKernel<T> {
       filter_grad->mutable_data<T>(context.GetPlace());
     }
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    // max_output_grad->Resize({4});
-    // max_output_grad->mutable_data<T>(context.GetPlace());
-    // PADDLE_ENFORCE_EQ(
-    //     xpu::findmax(dev_ctx.x_context(), output_grad->data<T>(),
-    //                  output_grad->numel(),
-    //                  max_output_grad->data<T>()) == xpu::Error_t::SUCCESS,
-    //     true,
-    //     platform::errors::External(
-    //         "XPU conv kernel error, can not find max_output_grad, please
-    //         check "
-    //         "whether Baidu Kunlun Card is "
-    //         "properly installed."));
-    if (input_grad) {
-      int r = xpu::conv2d_backward_int16(
-          dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
-          strides[0], strides[1], paddings[0], paddings[1], dilations[0],
-          dilations[1], groups, output_grad->data<float>(),
-          filter.data<float>(), input_grad->data<float>(), nullptr, nullptr);
-      // max_output_grad->data<float>(), max_filter->data<float>());
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU conv kernel return wrong value[%d], "
-                                     "please check whether Baidu Kunlun Card "
-                                     "is properly installed.",
-                                     r));
-    }
-    if (filter_grad) {
-      int r = xpu::conv2d_backward_weight_int16(
-          dev_ctx.x_context(), batch_size, img_c, img_h, img_w, f, win_h, win_w,
-          strides[0], strides[1], paddings[0], paddings[1], dilations[0],
-          dilations[1], groups, output_grad->data<float>(),
-          input->data<float>(), filter_grad->data<float>(), nullptr, nullptr);
-      // max_output_grad->data<float>(), max_input->data<float>());
-      PADDLE_ENFORCE_EQ(
-          r, XPU_SUCCESS,
-          platform::errors::External("XPU conv kernel return wrong value[%d], "
-                                     "please check whether Baidu Kunlun Card "
-                                     "is properly installed.",
-                                     r));
-    }
+    std::vector<int> k_size;
+    k_size.push_back(win_h);
+    k_size.push_back(win_w);
+    int r = xpu::conv2d_grad<float, float, float, int16_t>(
+        dev_ctx.x_context(), input->data<T>(), filter.data<T>(),
+        output_grad->data<T>(), input_grad ? input_grad->data<T>() : nullptr,
+        filter_grad ? filter_grad->data<T>() : nullptr, batch_size, img_c,
+        img_h, img_w, f, k_size, strides, paddings, dilations, groups, nullptr,
+        nullptr, nullptr, nullptr, nullptr, true);
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU conv kernel return wrong value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
   }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-// TODO(xingzhaolong): neon kernel for mobile
 REGISTER_OP_XPU_KERNEL(
     depthwise_conv2d,
     ops::GemmConvXPUKernel<paddle::platform::XPUDeviceContext, float>);
@@ -187,4 +113,7 @@ REGISTER_OP_XPU_KERNEL(
 REGISTER_OP_XPU_KERNEL(
     conv2d_grad,
     ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    depthwise_conv2d_grad,
+    ops::GemmConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
 #endif