Rewrite Matmul, make code cleaner

8 years ago · c6a6d87f96
parent 0285a2b95d
commit c6a6d87f96
6 changed files with 258 additions and 418 deletions
--- a/paddle/fluid/operators/math/blas.cc
+++ b/paddle/fluid/operators/math/blas.cc
@ -13,10 +13,47 @@
 // limitations under the License.

 #include "paddle/fluid/operators/math/blas.h"
+
+#include <utility>
 namespace paddle {
 namespace operators {
 namespace math {
-// Do nothing. Blas is a header only library.
+MatDim GetMatDim(const framework::DDim& dim, int num_flatten_cols, bool trans) {
+  MatDim retv;
+  if (num_flatten_cols > 1) {
+    auto flatten_dim = framework::flatten_to_2d(dim, num_flatten_cols);
+    retv.height_ = flatten_dim[0];
+    retv.width_ = flatten_dim[1];
+  } else {
+    if (dim.size() == 1) {
+      retv.height_ = 1;
+      retv.width_ = dim[0];
+    } else if (dim.size() == 2) {
+      retv.height_ = dim[0];
+      retv.width_ = dim[1];
+    } else {
+      if (dim.size() == 3) {
+        retv.batch_size_ = dim[0];
+        retv.height_ = dim[1];
+        retv.width_ = dim[2];
+      } else {
+        auto dim_vec = framework::vectorize(dim);
+        retv.batch_size_ = 1;
+        for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
+          retv.batch_size_ *= dim_vec[i];
+          retv.height_ = dim_vec[dim_vec.size() - 2];
+          retv.width_ = dim_vec[dim_vec.size() - 1];
+        }
+      }
+      retv.stride_ = retv.height_ * retv.width_;
+    }
+  }
+  if (trans) {
+    std::swap(retv.width_, retv.height_);
+  }
+  retv.trans_ = trans;
+  return retv;
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@ -46,6 +46,17 @@ namespace paddle {
 namespace operators {
 namespace math {

+struct MatDim {
+  int64_t height_;
+  int64_t width_;
+  int64_t stride_{0};
+  int64_t batch_size_{0};
+  bool trans_;
+};
+
+extern MatDim GetMatDim(const framework::DDim& tensor, int num_flatten_cols,
+                        bool trans);
+
 template <typename DeviceContext>
 class Blas {
 public:
@ -90,6 +101,28 @@ class Blas {
                   int K, T alpha, const T* A, const T* B, T beta, T* C,
                   int batchCount, int64_t strideA, int64_t strideB) const;

+  template <typename T>
+  void MatMul(const framework::Tensor& mat_a, const MatDim& dim_a,
+              const framework::Tensor& mat_b, const MatDim& dim_b, T alpha,
+              framework::Tensor* mat_out, T beta) const {
+    PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
+    CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
+    CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
+    if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
+      this->template GEMM<T>(transA, transB, dim_a.height_, dim_b.width_,
+                             dim_a.width_, alpha, mat_a.data<T>(),
+                             mat_b.data<T>(), beta, mat_out->data<T>());
+    } else {
+      PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
+                     dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
+      this->template BatchedGEMM<T>(
+          transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
+          mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
+          dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
+          dim_a.stride_, dim_b.stride_);
+    }
+  }
+
 private:
  const DeviceContext& context_;
 };
--- a/paddle/fluid/operators/math/matmul.h
+++ b/paddle/fluid/operators/math/matmul.h
@ -1,149 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-// Implements the logic of numpy matmul:
-// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
-//
-// but allowing also for a, b to be transposed
-//
-// Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported
-// yet.
-template <typename DeviceContext, typename T>
-class MatMulFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& a,
-                  bool trans_a, const framework::Tensor& b, bool trans_b,
-                  T alpha, framework::Tensor* out, T beta) {
-    auto dim_a = a.dims();
-    auto dim_b = b.dims();
-
-    PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(),
-                   "Tensors must all be in the same place.");
-    PADDLE_ENFORCE_GE(dim_a.size(), 1,
-                      "Input tensor a must be at least 1-dimensional.");
-    PADDLE_ENFORCE_GE(dim_b.size(), 1,
-                      "Input tensor b must be at least 1-dimensional.");
-
-    std::vector<int64_t> out_dim;
-    int64_t batch_count = 1;
-    if (dim_a.size() > 3) {
-      PADDLE_ENFORCE(dim_b.size() == dim_a.size(),
-                     "The dimensions of X and Y must be the same, and both of "
-                     "them should be %d-dimensional.",
-                     dim_b.size());
-      // The first rank-2 dimensions are accumulated on the batch_count, and the
-      // last two dimensions are used for matrix multiplication.
-      for (int j = 0; j < dim_a.size() - 2; ++j) {
-        PADDLE_ENFORCE_EQ(dim_b[j], dim_a[j],
-                          "The %d-th dimension of X and Y must be the same.",
-                          j);
-        out_dim.push_back(dim_a[j]);
-        batch_count *= dim_a[j];
-      }
-    }
-
-    int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0,
-        strideA = 0, strideB = 0;
-
-    switch (dim_a.size()) {
-      case 1:
-        // similar to np.matmul:
-        // prepend dimension 1 (no transpose) or append dimension 1 (transpose)
-        M = trans_a ? dim_a[0] : 1;
-        kA = trans_a ? 1 : dim_a[0];
-        break;
-      case 2:
-        M = trans_a ? dim_a[1] : dim_a[0];
-        kA = trans_a ? dim_a[0] : dim_a[1];
-        break;
-      case 3:
-        batchCountA = dim_a[0];
-        M = trans_a ? dim_a[2] : dim_a[1];
-        kA = trans_a ? dim_a[1] : dim_a[2];
-        strideA = M * kA;
-        break;
-      default:
-        batchCountA = batch_count;
-        size_t mat_s = dim_a.size() - 2;
-        M = trans_a ? dim_a[mat_s + 1] : dim_a[mat_s];
-        kA = trans_a ? dim_a[mat_s] : dim_a[mat_s + 1];
-        strideA = M * kA;
-    }
-
-    switch (dim_b.size()) {
-      case 1:
-        // similar to np.matmul:
-        // append dimension 1 (no transpose) or prepend dimension 1 (transpose)
-        kB = trans_b ? 1 : dim_b[0];
-        N = trans_b ? dim_b[0] : 1;
-        break;
-      case 2:
-        kB = trans_b ? dim_b[1] : dim_b[0];
-        N = trans_b ? dim_b[0] : dim_b[1];
-        break;
-      case 3:
-        batchCountB = dim_b[0];
-        kB = trans_b ? dim_b[2] : dim_b[1];
-        N = trans_b ? dim_b[1] : dim_b[2];
-        strideB = kB * N;
-        break;
-      default:
-        batchCountB = batch_count;
-        size_t mat_s = dim_b.size() - 2;
-        kB = trans_b ? dim_b[mat_s + 1] : dim_b[mat_s];
-        N = trans_b ? dim_b[mat_s] : dim_b[mat_s + 1];
-        strideB = kB * N;
-    }
-
-    PADDLE_ENFORCE_EQ(
-        kA, kB,
-        "First matrix's width must be equal with second matrix's height.");
-    if (batchCountA && batchCountB) {
-      PADDLE_ENFORCE_EQ(
-          batchCountA, batchCountB,
-          "When input tensors a and b are both batched, they must have the "
-          "same batch dimension.");
-    }
-    int batchCount = std::max(batchCountA, batchCountB);
-
-    CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-    CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-    auto blas = GetBlas<DeviceContext, T>(context);
-
-    if (!batchCount) {
-      // regular matrix multiplication
-      blas.GEMM(transA, transB, M, N, kA, alpha, a.data<T>(), b.data<T>(), beta,
-                out->data<T>());
-    } else {
-      // batched matrix multiplication
-      blas.BatchedGEMM(transA, transB, M, N, kA, alpha, a.data<T>(),
-                       b.data<T>(), beta, out->data<T>(), batchCount, strideA,
-                       strideB);
-    }
-  }
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@ -36,121 +36,39 @@ class MatMulOp : public framework::OperatorWithKernel {

    auto dim_x = context->GetInputDim("X");
    auto dim_y = context->GetInputDim("Y");
-    bool transpose_x = context->Attrs().Get<bool>("transpose_X");
-    bool transpose_y = context->Attrs().Get<bool>("transpose_Y");
-
-    PADDLE_ENFORCE_GE(dim_x.size(), 1,
-                      "Input tensor X must be at least 1-dimensional.");
-    PADDLE_ENFORCE_GE(dim_y.size(), 1,
-                      "Input tensor Y must be at least 1-dimensional.");
-
-    std::vector<int64_t> out_dim;
-    int64_t batch_count = 1;
-    if (dim_x.size() > 3) {
-      PADDLE_ENFORCE_EQ(
-          dim_y.size(), dim_x.size(),
-          "The dimensions of X and Y must be the same, and both of "
-          "them should be %d-dimensional.",
-          dim_x.size());
-
-      // The first rank-2 dimensions are accumulated on the batch_count, and the
-      // last two dimensions are used for matrix multiplication.
-      for (int j = 0; j < dim_x.size() - 2; ++j) {
-        PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j],
-                          "The %d-th dimension of X and Y must be the same.",
-                          j);
-        out_dim.push_back(dim_x[j]);
-        batch_count *= dim_x[j];
-      }
-    }

-    int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0;
-    bool remove_initial_dim = false, remove_final_dim = false;
-
-    switch (dim_x.size()) {
-      case 1:
-        if (transpose_x) {
-          M = dim_x[0];
-          KX = 1;
-        } else {
-          M = 1;
-          KX = dim_x[0];
-          remove_initial_dim = true;
-        }
-        break;
-      case 2:
-        M = transpose_x ? dim_x[1] : dim_x[0];
-        KX = transpose_x ? dim_x[0] : dim_x[1];
-        break;
-      case 3:
-        batchCountX = dim_x[0];
-        M = transpose_x ? dim_x[2] : dim_x[1];
-        KX = transpose_x ? dim_x[1] : dim_x[2];
-        break;
-      default:
-        batchCountX = batch_count;
-        size_t mat_s = dim_x.size() - 2;
-        M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s];
-        KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1];
-        break;
-    }
+    auto mat_dim_x = math::GetMatDim(GetXDim(dim_x), 0,
+                                     context->Attrs().Get<bool>("transpose_X"));
+    auto mat_dim_y = math::GetMatDim(GetYDim(dim_y), 0,
+                                     context->Attrs().Get<bool>("transpose_Y"));

-    switch (dim_y.size()) {
-      case 1:
-        if (transpose_y) {
-          N = dim_y[0];
-          KY = 1;
-        } else {
-          N = 1;
-          KY = dim_y[0];
-          remove_final_dim = true;
-        }
-        break;
-      case 2:
-        KY = transpose_y ? dim_y[1] : dim_y[0];
-        N = transpose_y ? dim_y[0] : dim_y[1];
-        break;
-      case 3:
-        batchCountY = dim_y[0];
-        KY = transpose_y ? dim_y[2] : dim_y[1];
-        N = transpose_y ? dim_y[1] : dim_y[2];
-        break;
-      default:
-        batchCountY = batch_count;
-        size_t mat_s = dim_y.size() - 2;
-        KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s];
-        N = transpose_y ? dim_y[mat_s] : dim_y[mat_s + 1];
+    PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
+    PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
+                   mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
+    std::vector<int64_t> dim_out;
+    if (mat_dim_x.batch_size_ != 0) {
+      dim_out = framework::vectorize(dim_x);
+      dim_out[dim_out.size() - 2] = mat_dim_x.height_;
+      dim_out[dim_out.size() - 1] = mat_dim_y.width_;
+    } else if (mat_dim_y.batch_size_ != 0) {
+      dim_out = framework::vectorize(dim_y);
+      dim_out[dim_out.size() - 2] = mat_dim_x.height_;
+      dim_out[dim_out.size() - 1] = mat_dim_y.width_;
+    } else {
+      dim_out = {mat_dim_x.height_, mat_dim_y.width_};
    }

-    PADDLE_ENFORCE_EQ(
-        KX, KY,
-        "First matrix's width must be equal with second matrix's height.");
-    if (batchCountX && batchCountY) {
-      PADDLE_ENFORCE_EQ(
-          batchCountX, batchCountY,
-          "When Input(X) and Input(Y) are both three dimensional, they "
-          "must have the same batch dimension.");
+    if (dim_x.size() == 1 && dim_out[dim_out.size() - 2] == 1) {
+      std::swap(dim_out[dim_out.size() - 2], dim_out[dim_out.size() - 1]);
+      dim_out.resize(dim_out.size() - 1);
    }
-    int batchCount = std::max(batchCountX, batchCountY);

-    std::vector<int64_t> dim_out;
-    if (batchCount) {
-      if (dim_x.size() > 3) {
-        dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end());
-      } else {
-        dim_out.push_back(batchCount);
-      }
+    if (dim_y.size() == 1 && dim_out[dim_out.size() - 1] == 1) {
+      dim_out.resize(dim_out.size() - 1);
    }
-    if (!remove_initial_dim) {
-      dim_out.push_back(M);
-    }
-    if (!remove_final_dim) {
-      dim_out.push_back(N);
-    }
-    if (dim_out.size() == 0) {
-      // We don't support 0-dimensional Tensors (scalars), so instead
-      // treat the output as a Tensor of shape (1, ) in this case.
-      dim_out.push_back(1);
+
+    if (dim_out.empty()) {
+      dim_out = {1};
    }
    context->SetOutputDim("Out", framework::make_ddim(dim_out));
    context->ShareLoD("X", /*->*/ "Out");
--- a/paddle/fluid/operators/matmul_op.h
+++ b/paddle/fluid/operators/matmul_op.h
--- a/python/paddle/fluid/tests/unittests/test_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py
@ -111,21 +111,24 @@ class Generator(object):


 # Generate test cases for all possibilities
-for dim_X in [1, 2, 3]:
-    for dim_Y in [1, 2, 3]:
-        for transpose_X in [False, True]:
-            for transpose_Y in [False, True]:
-                test_name = (
-                    'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
-                        dim_X, dim_Y, transpose_X, transpose_Y))
-                shape_X, shape_Y = generate_compatible_shapes(
-                    dim_X, dim_Y, transpose_X, transpose_Y)
-                globals()[test_name] = type(test_name, (Generator, OpTest), {
-                    'shape_X': shape_X,
-                    'shape_Y': shape_Y,
-                    'transpose_X': transpose_X,
-                    'transpose_Y': transpose_Y,
-                })
+def inject_test(dim_x, dim_y, trans_x, trans_y):
+    test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
+        dim_x, dim_y, trans_x, trans_y))
+    shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
+                                                  trans_y)
+    globals()[test_name] = type(test_name, (Generator, OpTest), {
+        'shape_X': shape_x,
+        'shape_Y': shape_y,
+        'transpose_X': trans_x,
+        'transpose_Y': trans_y,
+    })
+
+
+for dim_X in (1, 2, 3):
+    for dim_Y in (1, 2, 3):
+        for transose_x in (False, True):
+            for transose_y in (False, True):
+                inject_test(dim_X, dim_Y, transose_x, transose_y)


 # Test case n-dim
@ -149,7 +152,7 @@ def generate_compatible_shapes(dim, transpose_X, transpose_Y):
    return shape_X, shape_Y


-# Test case n-dim
+# # Test case n-dim
 for dim in [4]:
    for transpose_X in [False, True]:
        for transpose_Y in [False, True]: