Rewrite Matmul, make code cleaner

testDrivenImageClassification
Yu Yang 7 years ago
parent 0285a2b95d
commit c6a6d87f96

@ -13,10 +13,47 @@
// limitations under the License.
#include "paddle/fluid/operators/math/blas.h"
#include <utility>
namespace paddle {
namespace operators {
namespace math {
// Do nothing. Blas is a header only library.
MatDim GetMatDim(const framework::DDim& dim, int num_flatten_cols, bool trans) {
MatDim retv;
if (num_flatten_cols > 1) {
auto flatten_dim = framework::flatten_to_2d(dim, num_flatten_cols);
retv.height_ = flatten_dim[0];
retv.width_ = flatten_dim[1];
} else {
if (dim.size() == 1) {
retv.height_ = 1;
retv.width_ = dim[0];
} else if (dim.size() == 2) {
retv.height_ = dim[0];
retv.width_ = dim[1];
} else {
if (dim.size() == 3) {
retv.batch_size_ = dim[0];
retv.height_ = dim[1];
retv.width_ = dim[2];
} else {
auto dim_vec = framework::vectorize(dim);
retv.batch_size_ = 1;
for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
retv.batch_size_ *= dim_vec[i];
retv.height_ = dim_vec[dim_vec.size() - 2];
retv.width_ = dim_vec[dim_vec.size() - 1];
}
}
retv.stride_ = retv.height_ * retv.width_;
}
}
if (trans) {
std::swap(retv.width_, retv.height_);
}
retv.trans_ = trans;
return retv;
}
} // namespace math
} // namespace operators
} // namespace paddle

@ -46,6 +46,17 @@ namespace paddle {
namespace operators {
namespace math {
struct MatDim {
int64_t height_;
int64_t width_;
int64_t stride_{0};
int64_t batch_size_{0};
bool trans_;
};
extern MatDim GetMatDim(const framework::DDim& tensor, int num_flatten_cols,
bool trans);
template <typename DeviceContext>
class Blas {
public:
@ -90,6 +101,28 @@ class Blas {
int K, T alpha, const T* A, const T* B, T beta, T* C,
int batchCount, int64_t strideA, int64_t strideB) const;
template <typename T>
void MatMul(const framework::Tensor& mat_a, const MatDim& dim_a,
const framework::Tensor& mat_b, const MatDim& dim_b, T alpha,
framework::Tensor* mat_out, T beta) const {
PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
this->template GEMM<T>(transA, transB, dim_a.height_, dim_b.width_,
dim_a.width_, alpha, mat_a.data<T>(),
mat_b.data<T>(), beta, mat_out->data<T>());
} else {
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
this->template BatchedGEMM<T>(
transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
dim_a.stride_, dim_b.stride_);
}
}
private:
const DeviceContext& context_;
};

@ -1,149 +0,0 @@
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
namespace math {
// Implements the logic of numpy matmul:
// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
//
// but allowing also for a, b to be transposed
//
// Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported
// yet.
template <typename DeviceContext, typename T>
class MatMulFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& a,
bool trans_a, const framework::Tensor& b, bool trans_b,
T alpha, framework::Tensor* out, T beta) {
auto dim_a = a.dims();
auto dim_b = b.dims();
PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(),
"Tensors must all be in the same place.");
PADDLE_ENFORCE_GE(dim_a.size(), 1,
"Input tensor a must be at least 1-dimensional.");
PADDLE_ENFORCE_GE(dim_b.size(), 1,
"Input tensor b must be at least 1-dimensional.");
std::vector<int64_t> out_dim;
int64_t batch_count = 1;
if (dim_a.size() > 3) {
PADDLE_ENFORCE(dim_b.size() == dim_a.size(),
"The dimensions of X and Y must be the same, and both of "
"them should be %d-dimensional.",
dim_b.size());
// The first rank-2 dimensions are accumulated on the batch_count, and the
// last two dimensions are used for matrix multiplication.
for (int j = 0; j < dim_a.size() - 2; ++j) {
PADDLE_ENFORCE_EQ(dim_b[j], dim_a[j],
"The %d-th dimension of X and Y must be the same.",
j);
out_dim.push_back(dim_a[j]);
batch_count *= dim_a[j];
}
}
int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0,
strideA = 0, strideB = 0;
switch (dim_a.size()) {
case 1:
// similar to np.matmul:
// prepend dimension 1 (no transpose) or append dimension 1 (transpose)
M = trans_a ? dim_a[0] : 1;
kA = trans_a ? 1 : dim_a[0];
break;
case 2:
M = trans_a ? dim_a[1] : dim_a[0];
kA = trans_a ? dim_a[0] : dim_a[1];
break;
case 3:
batchCountA = dim_a[0];
M = trans_a ? dim_a[2] : dim_a[1];
kA = trans_a ? dim_a[1] : dim_a[2];
strideA = M * kA;
break;
default:
batchCountA = batch_count;
size_t mat_s = dim_a.size() - 2;
M = trans_a ? dim_a[mat_s + 1] : dim_a[mat_s];
kA = trans_a ? dim_a[mat_s] : dim_a[mat_s + 1];
strideA = M * kA;
}
switch (dim_b.size()) {
case 1:
// similar to np.matmul:
// append dimension 1 (no transpose) or prepend dimension 1 (transpose)
kB = trans_b ? 1 : dim_b[0];
N = trans_b ? dim_b[0] : 1;
break;
case 2:
kB = trans_b ? dim_b[1] : dim_b[0];
N = trans_b ? dim_b[0] : dim_b[1];
break;
case 3:
batchCountB = dim_b[0];
kB = trans_b ? dim_b[2] : dim_b[1];
N = trans_b ? dim_b[1] : dim_b[2];
strideB = kB * N;
break;
default:
batchCountB = batch_count;
size_t mat_s = dim_b.size() - 2;
kB = trans_b ? dim_b[mat_s + 1] : dim_b[mat_s];
N = trans_b ? dim_b[mat_s] : dim_b[mat_s + 1];
strideB = kB * N;
}
PADDLE_ENFORCE_EQ(
kA, kB,
"First matrix's width must be equal with second matrix's height.");
if (batchCountA && batchCountB) {
PADDLE_ENFORCE_EQ(
batchCountA, batchCountB,
"When input tensors a and b are both batched, they must have the "
"same batch dimension.");
}
int batchCount = std::max(batchCountA, batchCountB);
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
auto blas = GetBlas<DeviceContext, T>(context);
if (!batchCount) {
// regular matrix multiplication
blas.GEMM(transA, transB, M, N, kA, alpha, a.data<T>(), b.data<T>(), beta,
out->data<T>());
} else {
// batched matrix multiplication
blas.BatchedGEMM(transA, transB, M, N, kA, alpha, a.data<T>(),
b.data<T>(), beta, out->data<T>(), batchCount, strideA,
strideB);
}
}
};
} // namespace math
} // namespace operators
} // namespace paddle

@ -36,121 +36,39 @@ class MatMulOp : public framework::OperatorWithKernel {
auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Y");
bool transpose_x = context->Attrs().Get<bool>("transpose_X");
bool transpose_y = context->Attrs().Get<bool>("transpose_Y");
PADDLE_ENFORCE_GE(dim_x.size(), 1,
"Input tensor X must be at least 1-dimensional.");
PADDLE_ENFORCE_GE(dim_y.size(), 1,
"Input tensor Y must be at least 1-dimensional.");
std::vector<int64_t> out_dim;
int64_t batch_count = 1;
if (dim_x.size() > 3) {
PADDLE_ENFORCE_EQ(
dim_y.size(), dim_x.size(),
"The dimensions of X and Y must be the same, and both of "
"them should be %d-dimensional.",
dim_x.size());
// The first rank-2 dimensions are accumulated on the batch_count, and the
// last two dimensions are used for matrix multiplication.
for (int j = 0; j < dim_x.size() - 2; ++j) {
PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j],
"The %d-th dimension of X and Y must be the same.",
j);
out_dim.push_back(dim_x[j]);
batch_count *= dim_x[j];
}
}
int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0;
bool remove_initial_dim = false, remove_final_dim = false;
switch (dim_x.size()) {
case 1:
if (transpose_x) {
M = dim_x[0];
KX = 1;
} else {
M = 1;
KX = dim_x[0];
remove_initial_dim = true;
}
break;
case 2:
M = transpose_x ? dim_x[1] : dim_x[0];
KX = transpose_x ? dim_x[0] : dim_x[1];
break;
case 3:
batchCountX = dim_x[0];
M = transpose_x ? dim_x[2] : dim_x[1];
KX = transpose_x ? dim_x[1] : dim_x[2];
break;
default:
batchCountX = batch_count;
size_t mat_s = dim_x.size() - 2;
M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s];
KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1];
break;
}
auto mat_dim_x = math::GetMatDim(GetXDim(dim_x), 0,
context->Attrs().Get<bool>("transpose_X"));
auto mat_dim_y = math::GetMatDim(GetYDim(dim_y), 0,
context->Attrs().Get<bool>("transpose_Y"));
switch (dim_y.size()) {
case 1:
if (transpose_y) {
N = dim_y[0];
KY = 1;
} else {
N = 1;
KY = dim_y[0];
remove_final_dim = true;
}
break;
case 2:
KY = transpose_y ? dim_y[1] : dim_y[0];
N = transpose_y ? dim_y[0] : dim_y[1];
break;
case 3:
batchCountY = dim_y[0];
KY = transpose_y ? dim_y[2] : dim_y[1];
N = transpose_y ? dim_y[1] : dim_y[2];
break;
default:
batchCountY = batch_count;
size_t mat_s = dim_y.size() - 2;
KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s];
N = transpose_y ? dim_y[mat_s] : dim_y[mat_s + 1];
PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
std::vector<int64_t> dim_out;
if (mat_dim_x.batch_size_ != 0) {
dim_out = framework::vectorize(dim_x);
dim_out[dim_out.size() - 2] = mat_dim_x.height_;
dim_out[dim_out.size() - 1] = mat_dim_y.width_;
} else if (mat_dim_y.batch_size_ != 0) {
dim_out = framework::vectorize(dim_y);
dim_out[dim_out.size() - 2] = mat_dim_x.height_;
dim_out[dim_out.size() - 1] = mat_dim_y.width_;
} else {
dim_out = {mat_dim_x.height_, mat_dim_y.width_};
}
PADDLE_ENFORCE_EQ(
KX, KY,
"First matrix's width must be equal with second matrix's height.");
if (batchCountX && batchCountY) {
PADDLE_ENFORCE_EQ(
batchCountX, batchCountY,
"When Input(X) and Input(Y) are both three dimensional, they "
"must have the same batch dimension.");
if (dim_x.size() == 1 && dim_out[dim_out.size() - 2] == 1) {
std::swap(dim_out[dim_out.size() - 2], dim_out[dim_out.size() - 1]);
dim_out.resize(dim_out.size() - 1);
}
int batchCount = std::max(batchCountX, batchCountY);
std::vector<int64_t> dim_out;
if (batchCount) {
if (dim_x.size() > 3) {
dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end());
} else {
dim_out.push_back(batchCount);
}
if (dim_y.size() == 1 && dim_out[dim_out.size() - 1] == 1) {
dim_out.resize(dim_out.size() - 1);
}
if (!remove_initial_dim) {
dim_out.push_back(M);
}
if (!remove_final_dim) {
dim_out.push_back(N);
}
if (dim_out.size() == 0) {
// We don't support 0-dimensional Tensors (scalars), so instead
// treat the output as a Tensor of shape (1, ) in this case.
dim_out.push_back(1);
if (dim_out.empty()) {
dim_out = {1};
}
context->SetOutputDim("Out", framework::make_ddim(dim_out));
context->ShareLoD("X", /*->*/ "Out");

File diff suppressed because it is too large Load Diff

@ -111,21 +111,24 @@ class Generator(object):
# Generate test cases for all possibilities
for dim_X in [1, 2, 3]:
for dim_Y in [1, 2, 3]:
for transpose_X in [False, True]:
for transpose_Y in [False, True]:
test_name = (
'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_X, dim_Y, transpose_X, transpose_Y))
shape_X, shape_Y = generate_compatible_shapes(
dim_X, dim_Y, transpose_X, transpose_Y)
globals()[test_name] = type(test_name, (Generator, OpTest), {
'shape_X': shape_X,
'shape_Y': shape_Y,
'transpose_X': transpose_X,
'transpose_Y': transpose_Y,
})
def inject_test(dim_x, dim_y, trans_x, trans_y):
test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_x, dim_y, trans_x, trans_y))
shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
trans_y)
globals()[test_name] = type(test_name, (Generator, OpTest), {
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': trans_x,
'transpose_Y': trans_y,
})
for dim_X in (1, 2, 3):
for dim_Y in (1, 2, 3):
for transose_x in (False, True):
for transose_y in (False, True):
inject_test(dim_X, dim_Y, transose_x, transose_y)
# Test case n-dim
@ -149,7 +152,7 @@ def generate_compatible_shapes(dim, transpose_X, transpose_Y):
return shape_X, shape_Y
# Test case n-dim
# # Test case n-dim
for dim in [4]:
for transpose_X in [False, True]:
for transpose_Y in [False, True]:

Loading…
Cancel
Save