Daoyuan's comments.

avx_docs
xutianbing 8 years ago
parent bc5d7bb6d2
commit b3be735807

@@ -71,24 +71,17 @@ public:
public:
BufferArg(ValueType valueType,
const TensorShape& shape,
ArgType argType = UNSPECIFIED,
bool trans = false)
ArgType argType = UNSPECIFIED)
: buf_(nullptr),
valueType_(valueType),
shape_(shape),
argType_(argType),
trans_(trans) {}
argType_(argType) {}
BufferArg(void* buf,
ValueType valueType,
const TensorShape& shape,
ArgType argType = UNSPECIFIED,
bool trans = false)
: buf_(buf),
valueType_(valueType),
shape_(shape),
argType_(argType),
trans_(trans) {}
ArgType argType = UNSPECIFIED)
: buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
BufferArg(void* buf, ValueType valueType)
: buf_(buf), valueType_(valueType) {}
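Note: with the trans flag removed from every constructor, a BufferArg now carries only a buffer, value type, shape, and argument type. A minimal usage sketch (the buffer, shape, and argument type below are illustrative, not from this commit):

// Hedged sketch: constructing the simplified BufferArg.
float data[6] = {0};
TensorShape shape{2, 3};  // a 2 x 3 dense matrix
BufferArg arg(data, VALUE_TYPE_FLOAT, shape, ASSIGN_TO);
// Whether the matrix is treated as transposed is no longer a property of
// the argument; it is passed per operation (see the aTrans/bTrans flags below).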
@@ -98,8 +91,7 @@ public:
const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
valueType_(DataType<real>::value),
shape_(2),
argType_(argType),
trans_(matrix.isTransposed()) {
argType_(argType) {
bufferType_ = TENSOR_NORMAL;
shape_.setDim(0, matrix.getHeight());
shape_.setDim(1, matrix.getWidth());
@@ -112,8 +104,7 @@ public:
const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
valueType_(DataType<real>::value),
shape_(shape),
argType_(argType),
trans_(matrix.isTransposed()) {
argType_(argType) {
bufferType_ = TENSOR_NORMAL;
CHECK_EQ(matrix.getElementCnt(), shape.getElements());
}
@@ -145,7 +136,7 @@ public:
// CHECK(deviceType_ == DType);
CHECK_EQ((size_t)2, shape_.ndims());
return typename Tensor<real, DType>::Matrix(
reinterpret_cast<real*>(buf_), shape_[0], shape_[1], trans_);
reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
}
template <typename VType, DeviceType DType>
@@ -169,7 +160,6 @@ public:
ValueType valueType() const { return valueType_; }
BufferType bufferType() const { return bufferType_; }
const TensorShape& shape() const { return shape_; }
bool isTransposed() const { return trans_; }
bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
virtual size_t numElements() const { return shape_.getElements(); }
@@ -183,7 +173,6 @@ protected:
TensorShape shape_;
BufferType bufferType_{TENSOR_UNKNOWN};
ArgType argType_{UNSPECIFIED};
bool trans_{false};
// todo(tianbing), add deviceType_
// leading dimensions. The size is dims_.size()
// Dims lds_;
@@ -277,9 +266,8 @@ public:
size_t nnz,
SparseFormat format,
SparseValueType type,
ArgType argType = UNSPECIFIED,
bool trans = false)
: BufferArg(buf, valueType, shape, argType, trans),
ArgType argType = UNSPECIFIED)
: BufferArg(buf, valueType, shape, argType),
row_(row),
col_(col),
nnz_(nnz),
@@ -302,9 +290,8 @@ public:
size_t nnz,
SparseFormat format,
SparseValueType type,
ArgType argType = UNSPECIFIED,
bool trans = false)
: BufferArg(valueType, shape, argType, trans),
ArgType argType = UNSPECIFIED)
: BufferArg(valueType, shape, argType),
/// len of row_ : height + 1 (CSR), buf_ == nullptr
row_(format == SPARSE_CSR
? BufferArg(VALUE_TYPE_INT32, TensorShape{shape[0] + 1})
@@ -343,7 +330,7 @@ public:
nnz_,
type_,
format_,
trans_);
false);
}
~SparseMatrixArg() {}
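Likewise, SparseMatrixArg loses its trans parameter. A sketch of constructing one after this change (the sizes are hypothetical; SPARSE_CSR and FLOAT_VALUE are the constants used elsewhere in this commit):

// Hedged sketch: SparseMatrixArg without a trans flag.
size_t dimM = 10, dimK = 8, nnz = 16;  // hypothetical sizes
SparseMatrixArg sparse(VALUE_TYPE_FLOAT,
                       TensorShape{dimM, dimK},
                       nnz,
                       SPARSE_CSR,
                       FLOAT_VALUE);
// SparseMatrix() now always builds a non-transposed view (the literal false
// above); transposition is requested per operation instead.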

@@ -64,22 +64,14 @@ public:
cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuInputs_.emplace_back(
std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
input.valueType(),
input.shape(),
UNSPECIFIED,
input.isTransposed()));
gpuInputs_.emplace_back(
std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
input.valueType(),
input.shape(),
UNSPECIFIED,
input.isTransposed()));
cpuInputs_.emplace_back(std::make_shared<BufferArg>(
cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
gpuInputs_.emplace_back(std::make_shared<BufferArg>(
gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
}
// The output only needs to contain the shape; it does not contain data.
void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
void addOutputs(const BufferArg& output, ArgType argType = ADD_TO) {
size_t size =
output.shape().getElements() * sizeOfValuType(output.valueType());
cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
@@ -89,16 +81,14 @@ public:
cpuMemory_.back()->getBuf(),
output.valueType(),
output.shape(),
// todo(tianbing), argType = output.getArgType(), but default ASSIGN_TO
argType,
output.isTransposed()));
// todo(tianbing), argType = output.getArgType(), but default ADD_TO
argType));
gpuOutputs_.emplace_back(std::make_shared<BufferArg>(
gpuMemory_.back()->getBuf(),
output.valueType(),
output.shape(),
// todo(tianbing), argType = output.getArgType(), but default ASSIGN_TO
argType,
output.isTransposed()));
// todo(tianbing), argType = output.getArgType(), but default ADD_TO
argType));
}
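The default argType here changes from ASSIGN_TO to ADD_TO, matching the updated todo comments. For MulOp the distinction is realized through scaleT: scaleT = 1 makes the kernel accumulate (out += scaleAB * a * b), while scaleT = 0 makes it overwrite out. A sketch of the mapping the tests below apply, assuming argType is derived from beta:

// Hedged sketch: relating scaleT (beta) to the output ArgType.
// beta == 1.0 -> the kernel accumulates, so the output must be ADD_TO;
// otherwise   -> the kernel overwrites, so ASSIGN_TO is correct.
ArgType argTypeFor(real beta) { return beta == 1.0 ? ADD_TO : ASSIGN_TO; }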
/// add and init output sparse matrix
@@ -107,15 +97,13 @@ public:
output.shape()[1],
output.nnz(),
output.dataType(),
output.dataFormat(),
output.isTransposed());
output.dataFormat());
gpuSparse_ = std::make_shared<GpuSparseMatrix>(output.shape()[0],
output.shape()[1],
output.nnz(),
output.dataType(),
output.dataFormat(),
output.isTransposed());
output.dataFormat());
/// init sparse matrix
hl_stream_t stream(HPPL_STREAM_1);
@@ -154,15 +142,13 @@ public:
input.shape()[1],
input.nnz(),
input.dataType(),
input.dataFormat(),
input.isTransposed());
input.dataFormat());
gpuSparse_ = std::make_shared<GpuSparseMatrix>(input.shape()[0],
input.shape()[1],
input.nnz(),
input.dataType(),
input.dataFormat(),
input.isTransposed());
input.dataFormat());
/// init sparse matrix
hl_stream_t stream(HPPL_STREAM_1);

(File diff suppressed because it is too large.)

@@ -26,55 +26,79 @@ void MulOp(CpuMatrix& out,
const CpuMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
const CpuSparseMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
const CpuMatrix& a,
const CpuSparseMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(CpuSparseMatrix& out,
const CpuMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
const GpuSparseMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
const GpuMatrix& a,
const GpuSparseMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(GpuSparseMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
} // namespace paddle
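Every overload now receives aTrans/bTrans/cTrans explicitly instead of reading isTransposed() from the matrices. The Function wrapper that dispatches to these overloads sits in the suppressed diff above; presumably it pulls the flags out of FuncConfig in init() and forwards them, roughly as in this sketch (class and member names are illustrative):

// Hedged sketch of a MulOp Function wrapper; the real one is in the
// suppressed diff, and these member names are assumptions.
template <DeviceType Device>
class MulFuncSketch : public FunctionBase {
public:
  void init(const FuncConfig& config) override {
    scaleAB_ = config.get<real>("scaleAB");
    scaleT_ = config.get<real>("scaleT");
    aTrans_ = config.get<bool>("aTrans");
    bTrans_ = config.get<bool>("bTrans");
    cTrans_ = config.get<bool>("cTrans");
  }
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    auto outMat = outputs[0].matrix<Device>();
    MulOp<Device>(outMat,
                  inputs[0].matrix<Device>(),
                  inputs[1].matrix<Device>(),
                  scaleAB_,
                  scaleT_,
                  aTrans_,
                  bTrans_,
                  cTrans_);
  }

private:
  real scaleAB_, scaleT_;
  bool aTrans_, bTrans_, cTrans_;
};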

@@ -27,38 +27,22 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT) {
CHECK(!out.isTransposed()) << "Transpose not supported for out matrix";
if (!a.isTransposed() && !b.isTransposed()) {
/// a : M * K, b: K * N
CHECK(out.getWidth() == b.getWidth() &&
out.getHeight() == a.getHeight() &&
a.getWidth() == b.getHeight());
} else if (a.isTransposed() && !b.isTransposed()) {
/// a : K * M, b : K * N
CHECK(out.getWidth() == b.getWidth() &&
out.getHeight() == a.getWidth() &&
a.getHeight() == b.getHeight());
} else if (!a.isTransposed() && b.isTransposed()) {
/// a: M * K, b : N * K
CHECK(out.getWidth() == b.getHeight() &&
out.getHeight() == a.getHeight() &&
a.getWidth() == b.getWidth());
} else {
LOG(FATAL) << "Not support for both a and b are Transposed Matrices";
}
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans) {
CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
real* aData = const_cast<real*>(a.getData());
real* bData = const_cast<real*>(b.getData());
real* outData = const_cast<real*>(out.getData());
hl_matrix_mul(aData,
!a.isTransposed() ? HPPL_OP_N : HPPL_OP_T,
!aTrans ? HPPL_OP_N : HPPL_OP_T,
bData,
!b.isTransposed() ? HPPL_OP_N : HPPL_OP_T,
!bTrans ? HPPL_OP_N : HPPL_OP_T,
outData,
out.getHeight(),
out.getWidth(),
!a.isTransposed() ? a.getWidth() : a.getHeight(),
!aTrans ? a.getWidth() : a.getHeight(),
scaleAB,
scaleT,
a.getStride(),
@@ -75,27 +59,19 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuSparseMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT) {
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans) {
CHECK(out.isContiguous());
CHECK(b.isContiguous());
CHECK(b.useGpu_) << "Matrix type are not equal";
CHECK(!out.isTransposed() && !b.isTransposed()) << "not supported";
if (!a.isTransposed()) {
/// a: M * K, b: K * N
CHECK(out.getWidth() == b.getWidth() && out.getHeight() == a.getHeight()
&& a.getWidth() == b.getHeight()) << "Matrix dimensions are not equal";
} else {
/// a: K * M, transpose, b: K * N
CHECK(out.getWidth() == b.getWidth() && out.getHeight() == a.getWidth()
&& a.getHeight() == b.getHeight()) << "Matrix dimensions are not equal";
}
CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
hl_trans_op_t aTrans = a.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
hl_sparse_matrix_s aData = a.sMatrix_.get();
real* bData = const_cast<real*>(b.getData());
real* outData = const_cast<real*>(out.getData());
hl_matrix_csr_mul_dense(aData,
aTrans,
aTrans ? HPPL_OP_T : HPPL_OP_N,
bData,
HPPL_OP_N,
outData,
@@ -115,25 +91,14 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuMatrix& a,
const GpuSparseMatrix& b,
real scaleAB,
real scaleT) {
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans) {
CHECK(out.isContiguous());
CHECK(a.isContiguous());
CHECK(a.useGpu_) << "Matrix type are not equal";
if (!b.isTransposed()) {
/// a : M * K, b : K * N
CHECK(out.getWidth() == b.getWidth() &&
out.getHeight() == a.getHeight() &&
a.getWidth() == b.getHeight())
<< "Matrix dimensions are not equal";
} else {
/// a : M * K, b : N * K, transpose
CHECK(out.getWidth() == b.getHeight() &&
out.getHeight() == a.getHeight() &&
a.getWidth() == b.getWidth())
<< "Matrix dimensions are not equal";
}
CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
hl_trans_op_t bTrans = b.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
hl_sparse_matrix_s bData = b.sMatrix_.get();
real* aData = const_cast<real*>(a.getData());
real* outData = const_cast<real*>(out.getData());
@@ -142,7 +107,7 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
hl_matrix_dense_mul_csc(aData,
HPPL_OP_N,
bData,
bTrans,
bTrans ? HPPL_OP_T : HPPL_OP_N,
outData,
out.getHeight(),
out.getWidth(),
@@ -153,7 +118,7 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
hl_matrix_dense_mul_csr(aData,
HPPL_OP_N,
bData,
bTrans,
bTrans ? HPPL_OP_T : HPPL_OP_N,
outData,
out.getHeight(),
out.getWidth(),
@@ -168,35 +133,26 @@ void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT) {
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans) {
CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
CHECK(!out.isTransposed()) << "Transpose is not supported for out matrix";
if (!a.isTransposed() && !b.isTransposed()) {
CHECK(out.getHeight() == a.getHeight() &&
out.getWidth() == b.getWidth() &&
a.getWidth() == b.getHeight());
} else if (a.isTransposed() && !b.isTransposed()) {
CHECK(out.getHeight() == a.getWidth() &&
out.getWidth() == b.getWidth() &&
a.getHeight() == b.getHeight());
} else if (!a.isTransposed() && b.isTransposed()) {
CHECK(out.getHeight() == a.getHeight() &&
out.getWidth() == b.getHeight() &&
a.getWidth() == b.getWidth());
} else {
LOG(FATAL) << "Not support for both a and b are Transposed Matrices";
}
hl_trans_op_t aTrans = a.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
hl_trans_op_t bTrans = b.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
int dimK = !b.isTransposed() ? b.getHeight() : b.getWidth();
real* aData = const_cast<real*>(a.getData());
real* bData = const_cast<real*>(b.getData());
hl_sparse_matrix_s outData = out.sMatrix_.get();
hl_sparse_matrix_mul(aData, aTrans, bData, bTrans, outData,
out.getHeight(), out.getWidth(), dimK, scaleAB, scaleT);
hl_sparse_matrix_mul(aData,
aTrans ? HPPL_OP_T : HPPL_OP_N,
bData,
bTrans ? HPPL_OP_T : HPPL_OP_N,
outData,
out.getHeight(),
out.getWidth(),
!bTrans ? b.getHeight() : b.getWidth(),
scaleAB,
scaleT);
}
} // namespace paddle
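Also note the per-overload shape CHECKs (and the FATAL for the doubly-transposed case) are deleted above; presumably they are centralized in the shared Function layer whose diff is suppressed. For reference, the check they performed, restated against the new flags as a sketch:

// Hedged sketch: the shape validation the removed CHECKs expressed, written
// with aTrans/bTrans. For out = a * b: a is M x K, b is K x N, out is M x N.
template <typename OutT, typename AT, typename BT>
void checkMulShapes(
    const OutT& out, const AT& a, const BT& b, bool aTrans, bool bTrans) {
  size_t m = aTrans ? a.getWidth() : a.getHeight();
  size_t k = aTrans ? a.getHeight() : a.getWidth();
  size_t kb = bTrans ? b.getWidth() : b.getHeight();
  size_t n = bTrans ? b.getHeight() : b.getWidth();
  CHECK_EQ(k, kb) << "inner dimensions of a and b do not match";
  CHECK_EQ(out.getHeight(), m);
  CHECK_EQ(out.getWidth(), n);
}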

@@ -39,18 +39,21 @@ void testFuncDDDMatrix(
size_t widthC = dimN;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
FuncConfig()
.set("scaleAB", alpha)
.set("scaleT", beta)
.set("aTrans", transa)
.set("bTrans", transb)
.set("cTrans", false));
// prepare input arguments
/// matrix A : HA * WA
test.addInputs(BufferArg(
VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}, UNSPECIFIED, transa));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
/// matrix B: HB * WB
test.addInputs(BufferArg(
VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}, UNSPECIFIED, transb));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
/// output matrix C: HC * WC
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
ADD_TO);
beta == 1.0 ? ADD_TO : ASSIGN_TO);
// run Function
test.run();
}
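Since the flags now live in FuncConfig, the transpose cases are driven from the test call sites, presumably by a sweep like the one below; the helper's full signature is cut off by the hunk header above, so the call shown is an assumption:

// Hedged sketch: sweeping transpose combinations through the helper,
// skipping the doubly-transposed case the kernels still reject.
for (const auto transa : {false, true}) {
  for (const auto transb : {false, true}) {
    if (transa && transb) continue;  // both transposed: unsupported
    testFuncDDDMatrix(transa, transb, /*dimM=*/10, /*dimN=*/10, /*dimK=*/16);
  }
}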
@@ -88,21 +91,22 @@ void testFuncDSparseDMatrix(
real beta = 1.0;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
FuncConfig()
.set("scaleAB", alpha)
.set("scaleT", beta)
.set("aTrans", false)
.set("bTrans", false)
.set("cTrans", false));
// prepare input arguments
/// sparse matrix A : M * K
test.addInputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
TensorShape{dimM, dimK},
nnz,
FORMAT,
FLOAT_VALUE,
UNSPECIFIED,
false));
test.addInputs(SparseMatrixArg(
VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
/// matrix B: K * N
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
/// output matrix C: M * N
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), ADD_TO);
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
beta == 1.0 ? ADD_TO : ASSIGN_TO);
// run Function
test.run();
}
@@ -138,22 +142,23 @@ void testFuncDDSparseMatrix(
real beta = 1.0;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
FuncConfig()
.set("scaleAB", alpha)
.set("scaleT", beta)
.set("aTrans", false)
.set("bTrans", false)
.set("cTrans", false));
// prepare input arguments
/// matrix A : M * K
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
/// matrix B: K * N
test.addInputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
TensorShape{dimK, dimN},
nnz,
FORMAT,
FLOAT_VALUE,
UNSPECIFIED,
false));
test.addInputs(SparseMatrixArg(
VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
/// output matrix C: M * N
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), ADD_TO);
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
beta == 1.0 ? ADD_TO : ASSIGN_TO);
// run Function
test.run();
}
@@ -189,7 +194,12 @@ void testFuncSparseDDMatrix(
real beta = 1.0;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
FuncConfig()
.set("scaleAB", alpha)
.set("scaleT", beta)
.set("aTrans", false)
.set("bTrans", false)
.set("cTrans", false));
// prepare input arguments
/// matrix A : M * K
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
@@ -198,14 +208,10 @@ void testFuncSparseDDMatrix(
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
/// output sparse matrix C: M * N
test.addOutputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
TensorShape{dimM, dimN},
nnz,
FORMAT,
FLOAT_VALUE,
UNSPECIFIED,
false),
ADD_TO);
test.addOutputs(
SparseMatrixArg(
VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
beta == 1.0 ? ADD_TO : ASSIGN_TO);
// run Function
test.run();
}
