add GpuMatrix::mul, CpuMatrix::mul operators

avx_docs
xutianbing 8 years ago
parent 936301f157
commit 1f0cbcf350

@@ -167,7 +167,7 @@ public:
  ValueType valueType() const { return valueType_; }
  BufferType bufferType() const { return bufferType_; }
  const TensorShape& shape() const { return shape_; }
-  bool isSparse() const { return TENSOR_SPARSE == bufferType_; }
+  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
  const SequenceArg& sequence() const;

File diff suppressed because it is too large

@@ -19,6 +19,40 @@ limitations under the License. */
#include "paddle/math/SparseMatrix.h"
namespace paddle {
template <DeviceType DType>
void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
           const CpuSparseMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuSparseMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(CpuSparseMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
@@ -27,4 +61,11 @@ void MulOp(GpuMatrix& out,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuSparseMatrix& b,
           real scaleAB,
           real scaleT);
} // namespace paddle
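The CPU-side definitions behind these declarations presumably live in the large file whose diff is suppressed above, so the snippet below is only a minimal sketch of how the new overloads are intended to be called directly. The header paths, the DEVICE_TYPE_CPU tag, and the helper name are assumptions, not part of this commit.

// Sketch only: direct call of the dense CPU overload declared above.
// Assumed includes; the actual paths may differ in the tree.
#include "paddle/function/MulOp.h"
#include "paddle/math/Matrix.h"

using namespace paddle;  // NOLINT

void mulOpCpuSketch() {
  const int M = 4, K = 3, N = 5;
  CpuMatrix a(M, K);
  CpuMatrix b(K, N);
  CpuMatrix out(M, N);
  a.randomizeUniform();
  b.randomizeUniform();
  out.zeroMem();
  // out = scaleT * out + scaleAB * (a * b); with scaleT = 0 this overwrites out with a * b.
  MulOp<DEVICE_TYPE_CPU>(out, a, b, /*scaleAB=*/1.0, /*scaleT=*/0.0);
}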

@@ -20,6 +20,65 @@ limitations under the License. */
namespace paddle {
/**
 * out = scale_t * out + scale_ab * (a * b)
 * out : output matrix, M * N
 */
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
                            const GpuMatrix& a,
                            const GpuMatrix& b,
                            real scale_ab,
                            real scale_t) {
  CHECK(!out.isTransposed()) << "Not supported";
  if (!a.isTransposed() && !b.isTransposed()) {
    /// a : M * K, b: K * N
    CHECK_EQ(out.width_, b.width_);
    CHECK_EQ(out.height_, a.height_);
    CHECK_EQ(a.width_, b.height_);
  } else if (a.isTransposed() && !b.isTransposed()) {
    /// a : K * M, b : K * N
    CHECK_EQ(out.width_, b.width_);
    CHECK_EQ(out.height_, a.width_);
    CHECK_EQ(a.height_, b.height_);
  } else if (!a.isTransposed() && b.isTransposed()) {
    /// a: M * K, b : N * K
    CHECK_EQ(out.width_, b.height_);
    CHECK_EQ(out.height_, a.height_);
    CHECK_EQ(a.width_, b.width_);
  } else {
    LOG(FATAL) << "Is not supported";
  }
  real* a_data = a.data_;
  real* b_data = b.data_;
  real* out_data = out.data_;
  int dim_m = out.getHeight();
  int dim_n = out.getWidth();
  int dim_k = !a.isTransposed() ? a.width_ : a.height_;
  int lda = a.getStride();
  int ldb = b.getStride();
  int ldc = out.getStride();
  hl_trans_op_t trans_a = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
  hl_trans_op_t trans_b = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
  hl_matrix_mul(a_data,
                trans_a,
                b_data,
                trans_b,
                out_data,
                dim_m,
                dim_n,
                dim_k,
                scale_ab,
                scale_t,
                lda,
                ldb,
                ldc);
}
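To pin down the scale_t / scale_ab convention documented above, here is a small plain-C++ reference (not part of the commit) that computes the same formula on row-major arrays; the CPU/GPU check left as a todo in the test below could compare against something of this shape.

// out[i*N + j] = scaleT * out[i*N + j] + scaleAB * sum_k a[i*K + k] * b[k*N + j]
#include <vector>

void referenceMul(std::vector<float>& out,      // M x N, row-major
                  const std::vector<float>& a,  // M x K, row-major
                  const std::vector<float>& b,  // K x N, row-major
                  int M, int N, int K, float scaleAB, float scaleT) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float dot = 0;
      for (int k = 0; k < K; ++k) {
        dot += a[i * K + k] * b[k * N + j];
      }
      // scaleT == 0 overwrites the output, scaleT == 1 accumulates into it.
      out[i * N + j] = scaleT * out[i * N + j] + scaleAB * dot;
    }
  }
}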
/**
 * out = scale_t * out + scale_ab * (a * b)
 * out : M * N
 */
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
@@ -32,12 +91,15 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
  CHECK(b.useGpu_ == true) << "Matrix type are not equal";
  CHECK(!out.trans_ && !b.trans_) << "not supported";
  if (!a.trans_) {
    /// a: M * K, b: K * N
    CHECK(out.width_ == b.width_ && out.height_ == a.height_
        && a.width_ == b.height_) << "Matrix dimensions are not equal";
  } else {
    /// a: K * M, transpose, b: K * N
    CHECK(out.width_ == b.width_ && out.height_ == a.width_
        && a.height_ == b.height_) << "Matrix dimensions are not equal";
  }
  hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
  hl_sparse_matrix_s a_data = a.sMatrix_.get();
  real* b_data = b.data_;
@@ -54,4 +116,58 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
                          scale_t);
}
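A sketch (not in the commit) of exercising this sparse-A overload directly. The populate-on-CPU-then-copy pattern mirrors the updated test further down; the sizes and the nnz count are arbitrary.

// Assumes the same paddle headers as the test below.
void sparseDenseMulSketch() {
  const int M = 128, K = 1024, N = 100;
  const int nnz = M * K / 20;

  CpuSparseMatrix cpuA(M, K, nnz);
  CpuMatrix cpuB(K, N);
  cpuA.randomizeUniform();
  cpuB.randomizeUniform();

  GpuSparseMatrix gpuA(M, K, nnz);
  GpuMatrix gpuB(K, N);
  GpuMatrix out(M, N);
  hl_stream_t stream(HPPL_STREAM_3);
  gpuA.copyFrom(cpuA, stream);
  gpuB.copyFrom(cpuB, stream);
  hl_stream_synchronize(stream);

  out.zeroMem();
  // out = 0 * out + 1 * (gpuA * gpuB), i.e. a plain sparse-dense product.
  MulOp<DEVICE_TYPE_GPU>(out, gpuA, gpuB, /*scale_ab=*/1.0, /*scale_t=*/0.0);
}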
/**
 * out = scale_t * out + scale_ab * (a * b)
 * out : M * N
 */
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
                            const GpuMatrix& a,
                            const GpuSparseMatrix& b,
                            real scale_ab,
                            real scale_t) {
  CHECK(out.isContiguous());
  CHECK(a.isContiguous());
  CHECK(a.useGpu_ == true) << "Matrix type are not equal";
  hl_sparse_matrix_s b_data = b.sMatrix_.get();
  real* a_data = a.data_;
  real* out_data = out.data_;
  hl_trans_op_t trans_b = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
  if (!b.trans_) {
    /// a : M * K, b : K * N
    CHECK(out.width_ == b.width_ &&
          out.height_ == a.height_ && a.width_ == b.height_)
        << "Matrix dimensions are not equal";
  } else {
    /// a : M * K, b : N * K, transpose
    CHECK(out.width_ == b.height_ &&
          out.height_ == a.height_ && a.width_ == b.width_)
        << "Matrix dimensions are not equal";
  }
  if (b.format_ == SPARSE_CSC) {
    hl_matrix_dense_mul_csc(a_data,
                            HPPL_OP_N,
                            b_data,
                            trans_b,
                            out_data,
                            out.height_,
                            out.width_,
                            a.width_,
                            scale_ab,
                            scale_t);
  } else {
    hl_matrix_dense_mul_csr(a_data,
                            HPPL_OP_N,
                            b_data,
                            trans_b,
                            out_data,
                            out.height_,
                            out.width_,
                            a.width_,
                            scale_ab,
                            scale_t);
  }
}
} // namespace paddle
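And the mirror-image case, dense a times sparse b, as a hedged sketch (again not part of the commit). b uses the same three-argument GpuSparseMatrix constructor as the test below, and whichever format b.format_ ends up carrying decides whether the CSC or CSR kernel above runs; the two calls at the end illustrate the overwrite-then-accumulate use of scale_t.

// Assumes the same paddle headers as the test below; sizes and nnz are illustrative.
void denseSparseMulSketch() {
  const int M = 64, K = 256, N = 100;
  const int nnz = K * N / 20;

  CpuMatrix cpuA(M, K);
  CpuSparseMatrix cpuB(K, N, nnz);
  cpuA.randomizeUniform();
  cpuB.randomizeUniform();

  GpuMatrix a(M, K);
  GpuSparseMatrix b(K, N, nnz);
  GpuMatrix out(M, N);
  hl_stream_t stream(HPPL_STREAM_3);
  a.copyFrom(cpuA, stream);
  b.copyFrom(cpuB, stream);
  hl_stream_synchronize(stream);

  out.zeroMem();
  // First call overwrites out, second call accumulates: out ends up as 2 * (a * b).
  MulOp<DEVICE_TYPE_GPU>(out, a, b, /*scale_ab=*/1.0, /*scale_t=*/0.0);
  MulOp<DEVICE_TYPE_GPU>(out, a, b, /*scale_ab=*/1.0, /*scale_t=*/1.0);
}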

@@ -22,31 +22,41 @@ using namespace paddle; // NOLINT
void testSpMatrixMul(int M, int N, int K, real rate, real scale1, real scale2) {
  /// todo(tianbing) check CPU/GPU
-  const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOP-GPU");
+  const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU");
  gpuFunc->init(FuncConfig().set("scaleAB", scale1).set("scaleT", scale2));
-  int nnz = M * K * rate;
-  auto gpuA = std::make_shared<GpuSparseMatrix>(M, K, nnz);
-  const auto gpuB = std::make_shared<GpuMatrix>(K, N);
-  const auto gpuOut = std::make_shared<GpuMatrix>(M, N);
+  int nnz = M * N * rate;
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
+  MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
-  gpuA->randomizeUniform();
-  gpuB->randomizeUniform();
-  gpuOut->randomizeUniform();
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
+  MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuC->randomizeUniform();
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  gpuC->copyFrom(*cpuC, stream);
+  hl_stream_synchronize(stream);
  BufferArgs inputs;
  BufferArgs outputs;
-  inputs.addArg(*gpuA);
-  inputs.addArg(*gpuB);
-  outputs.addArg(*gpuOut);
+  inputs.addArg(*gpuA->getTranspose());
+  inputs.addArg(*gpuB->getTranspose());
+  outputs.addArg(*gpuC, ASSIGN_TO);
  gpuFunc->calc(inputs, outputs);
}
TEST(SMatrix, sMatrixMul) {
  for (auto M : {1, 40, 128, 200}) {
-    for (auto N : {100, 2000, 20480}) {
-      for (auto K : {100, 512, 1024}) {
+    for (auto N : {100}) {
+      for (auto K : {100}) {
        /// todo(tianbing), add scaleAB and scaleT
        VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
        testSpMatrixMul(M, N, K, 0.05, 1, 1);
