add GpuMatrix::mul, CpuMatrix::mul operators

avx_docs
xutianbing 8 years ago
parent 936301f157
commit 1f0cbcf350

@@ -167,7 +167,7 @@ public:
  ValueType valueType() const { return valueType_; }
  BufferType bufferType() const { return bufferType_; }
  const TensorShape& shape() const { return shape_; }
-  bool isSparse() const { return TENSOR_SPARSE == bufferType_; }
+  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
  const SequenceArg& sequence() const;

File diff suppressed because it is too large

@@ -19,6 +19,40 @@ limitations under the License. */
#include "paddle/math/SparseMatrix.h"
namespace paddle {
template <DeviceType DType>
void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
           const CpuSparseMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuSparseMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(CpuSparseMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
@@ -27,4 +61,11 @@ void MulOp(GpuMatrix& out,
           real scaleAB,
           real scaleT);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuSparseMatrix& b,
           real scaleAB,
           real scaleT);
} // namespace paddle
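The CPU-side definitions behind these declarations presumably live in the large file whose diff is suppressed above, so the snippet below is only a minimal sketch of how the new overloads are intended to be called directly. The header paths, the DEVICE_TYPE_CPU tag, and the helper name are assumptions, not part of this commit.

// Sketch only: direct call of the dense CPU overload declared above.
// Assumed includes; the actual paths may differ in the tree.
#include "paddle/function/MulOp.h"
#include "paddle/math/Matrix.h"

using namespace paddle;  // NOLINT

void mulOpCpuSketch() {
  const int M = 4, K = 3, N = 5;
  CpuMatrix a(M, K);
  CpuMatrix b(K, N);
  CpuMatrix out(M, N);
  a.randomizeUniform();
  b.randomizeUniform();
  out.zeroMem();
  // out = scaleT * out + scaleAB * (a * b); with scaleT = 0 this overwrites out with a * b.
  MulOp<DEVICE_TYPE_CPU>(out, a, b, /*scaleAB=*/1.0, /*scaleT=*/0.0);
}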

@@ -20,6 +20,65 @@ limitations under the License. */
namespace paddle {
/**
 * out = scale_t * out + scale_ab * (a * b)
 * out : output matrix, M * N
 */
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
                            const GpuMatrix& a,
                            const GpuMatrix& b,
                            real scale_ab,
                            real scale_t) {
  CHECK(!out.isTransposed()) << "Not supported";
  if (!a.isTransposed() && !b.isTransposed()) {
    /// a : M * K, b: K * N
    CHECK_EQ(out.width_, b.width_);
    CHECK_EQ(out.height_, a.height_);
    CHECK_EQ(a.width_, b.height_);
  } else if (a.isTransposed() && !b.isTransposed()) {
    /// a : K * M, b : K * N
    CHECK_EQ(out.width_, b.width_);
    CHECK_EQ(out.height_, a.width_);
    CHECK_EQ(a.height_, b.height_);
  } else if (!a.isTransposed() && b.isTransposed()) {
    /// a: M * K, b : N * K
    CHECK_EQ(out.width_, b.height_);
    CHECK_EQ(out.height_, a.height_);
    CHECK_EQ(a.width_, b.width_);
  } else {
    LOG(FATAL) << "Is not supported";
  }
  real* a_data = a.data_;
  real* b_data = b.data_;
  real* out_data = out.data_;
  int dim_m = out.getHeight();
  int dim_n = out.getWidth();
  int dim_k = !a.isTransposed() ? a.width_ : a.height_;
  int lda = a.getStride();
  int ldb = b.getStride();
  int ldc = out.getStride();
  hl_trans_op_t trans_a = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
  hl_trans_op_t trans_b = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
  hl_matrix_mul(a_data,
                trans_a,
                b_data,
                trans_b,
                out_data,
                dim_m,
                dim_n,
                dim_k,
                scale_ab,
                scale_t,
                lda,
                ldb,
                ldc);
}
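To pin down the scale_t / scale_ab convention documented above, here is a small plain-C++ reference (not part of the commit) that computes the same formula on row-major arrays; the CPU/GPU check left as a todo in the test below could compare against something of this shape.

// out[i*N + j] = scaleT * out[i*N + j] + scaleAB * sum_k a[i*K + k] * b[k*N + j]
#include <vector>

void referenceMul(std::vector<float>& out,      // M x N, row-major
                  const std::vector<float>& a,  // M x K, row-major
                  const std::vector<float>& b,  // K x N, row-major
                  int M, int N, int K, float scaleAB, float scaleT) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float dot = 0;
      for (int k = 0; k < K; ++k) {
        dot += a[i * K + k] * b[k * N + j];
      }
      // scaleT == 0 overwrites the output, scaleT == 1 accumulates into it.
      out[i * N + j] = scaleT * out[i * N + j] + scaleAB * dot;
    }
  }
}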
/**
 * out = scale_t * out + scale_ab * (a * b)
 * out : M * N
 */
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
@@ -32,12 +91,15 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
  CHECK(b.useGpu_ == true) << "Matrix type are not equal";
  CHECK(!out.trans_ && !b.trans_) << "not supported";
  if (!a.trans_) {
    /// a: M * K, b: K * N
    CHECK(out.width_ == b.width_ && out.height_ == a.height_
        && a.width_ == b.height_) << "Matrix dimensions are not equal";
  } else {
    /// a: K * M, transpose, b: K * N
    CHECK(out.width_ == b.width_ && out.height_ == a.width_
        && a.height_ == b.height_) << "Matrix dimensions are not equal";
  }
  hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
  hl_sparse_matrix_s a_data = a.sMatrix_.get();
  real* b_data = b.data_;
@@ -54,4 +116,58 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
                          scale_t);
}
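A sketch (not in the commit) of exercising this sparse-A overload directly. The populate-on-CPU-then-copy pattern mirrors the updated test further down; the sizes and the nnz count are arbitrary.

// Assumes the same paddle headers as the test below.
void sparseDenseMulSketch() {
  const int M = 128, K = 1024, N = 100;
  const int nnz = M * K / 20;

  CpuSparseMatrix cpuA(M, K, nnz);
  CpuMatrix cpuB(K, N);
  cpuA.randomizeUniform();
  cpuB.randomizeUniform();

  GpuSparseMatrix gpuA(M, K, nnz);
  GpuMatrix gpuB(K, N);
  GpuMatrix out(M, N);
  hl_stream_t stream(HPPL_STREAM_3);
  gpuA.copyFrom(cpuA, stream);
  gpuB.copyFrom(cpuB, stream);
  hl_stream_synchronize(stream);

  out.zeroMem();
  // out = 0 * out + 1 * (gpuA * gpuB), i.e. a plain sparse-dense product.
  MulOp<DEVICE_TYPE_GPU>(out, gpuA, gpuB, /*scale_ab=*/1.0, /*scale_t=*/0.0);
}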
/**
 * out = scale_t * out + scale_ab * (a * b)
 * out : M * N
 */
template <>
void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
                            const GpuMatrix& a,
                            const GpuSparseMatrix& b,
                            real scale_ab,
                            real scale_t) {
  CHECK(out.isContiguous());
  CHECK(a.isContiguous());
  CHECK(a.useGpu_ == true) << "Matrix type are not equal";
  hl_sparse_matrix_s b_data = b.sMatrix_.get();
  real* a_data = a.data_;
  real* out_data = out.data_;
  hl_trans_op_t trans_b = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
  if (!b.trans_) {
    /// a : M * K, b : K * N
    CHECK(out.width_ == b.width_ &&
          out.height_ == a.height_ && a.width_ == b.height_)
        << "Matrix dimensions are not equal";
  } else {
    /// a : M * K, b : N * K, transpose
    CHECK(out.width_ == b.height_ &&
          out.height_ == a.height_ && a.width_ == b.width_)
        << "Matrix dimensions are not equal";
  }
  if (b.format_ == SPARSE_CSC) {
    hl_matrix_dense_mul_csc(a_data,
                            HPPL_OP_N,
                            b_data,
                            trans_b,
                            out_data,
                            out.height_,
                            out.width_,
                            a.width_,
                            scale_ab,
                            scale_t);
  } else {
    hl_matrix_dense_mul_csr(a_data,
                            HPPL_OP_N,
                            b_data,
                            trans_b,
                            out_data,
                            out.height_,
                            out.width_,
                            a.width_,
                            scale_ab,
                            scale_t);
  }
}
} // namespace paddle
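And the mirror-image case, dense a times sparse b, as a hedged sketch (again not part of the commit). b uses the same three-argument GpuSparseMatrix constructor as the test below, and whichever format b.format_ ends up carrying decides whether the CSC or CSR kernel above runs; the two calls at the end illustrate the overwrite-then-accumulate use of scale_t.

// Assumes the same paddle headers as the test below; sizes and nnz are illustrative.
void denseSparseMulSketch() {
  const int M = 64, K = 256, N = 100;
  const int nnz = K * N / 20;

  CpuMatrix cpuA(M, K);
  CpuSparseMatrix cpuB(K, N, nnz);
  cpuA.randomizeUniform();
  cpuB.randomizeUniform();

  GpuMatrix a(M, K);
  GpuSparseMatrix b(K, N, nnz);
  GpuMatrix out(M, N);
  hl_stream_t stream(HPPL_STREAM_3);
  a.copyFrom(cpuA, stream);
  b.copyFrom(cpuB, stream);
  hl_stream_synchronize(stream);

  out.zeroMem();
  // First call overwrites out, second call accumulates: out ends up as 2 * (a * b).
  MulOp<DEVICE_TYPE_GPU>(out, a, b, /*scale_ab=*/1.0, /*scale_t=*/0.0);
  MulOp<DEVICE_TYPE_GPU>(out, a, b, /*scale_ab=*/1.0, /*scale_t=*/1.0);
}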

@@ -22,31 +22,41 @@ using namespace paddle; // NOLINT
void testSpMatrixMul(int M, int N, int K, real rate, real scale1, real scale2) {
  /// todo(tianbing) check CPU/GPU
-  const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOP-GPU");
+  const auto gpuFunc = FunctionBase::funcRegistrar_.createByType("MulOp-GPU");
  gpuFunc->init(FuncConfig().set("scaleAB", scale1).set("scaleT", scale2));
-  int nnz = M * K * rate;
-  auto gpuA = std::make_shared<GpuSparseMatrix>(M, K, nnz);
-  const auto gpuB = std::make_shared<GpuMatrix>(K, N);
-  const auto gpuOut = std::make_shared<GpuMatrix>(M, N);
+  int nnz = M * N * rate;
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
+  MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
-  gpuA->randomizeUniform();
-  gpuB->randomizeUniform();
-  gpuOut->randomizeUniform();
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
+  MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuC->randomizeUniform();
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  gpuC->copyFrom(*cpuC, stream);
+  hl_stream_synchronize(stream);
  BufferArgs inputs;
  BufferArgs outputs;
-  inputs.addArg(*gpuA);
-  inputs.addArg(*gpuB);
-  outputs.addArg(*gpuOut);
+  inputs.addArg(*gpuA->getTranspose());
+  inputs.addArg(*gpuB->getTranspose());
+  outputs.addArg(*gpuC, ASSIGN_TO);
  gpuFunc->calc(inputs, outputs);
}
TEST(SMatrix, sMatrixMul) {
  for (auto M : {1, 40, 128, 200}) {
-    for (auto N : {100, 2000, 20480}) {
-      for (auto K : {100, 512, 1024}) {
+    for (auto N : {100}) {
+      for (auto K : {100}) {
        /// todo(tianbing), add scaleAB and scaleT
        VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
        testSpMatrixMul(M, N, K, 0.05, 1, 1);
