Daoyuan's comments.

avx_docs
xutianbing 8 years ago
parent bc5d7bb6d2
commit b3be735807

@@ -71,24 +71,17 @@ public:
public:
BufferArg(ValueType valueType,
const TensorShape& shape,
ArgType argType = UNSPECIFIED,
bool trans = false)
ArgType argType = UNSPECIFIED)
: buf_(nullptr),
valueType_(valueType),
shape_(shape),
argType_(argType),
trans_(trans) {}
argType_(argType) {}
BufferArg(void* buf,
ValueType valueType,
const TensorShape& shape,
ArgType argType = UNSPECIFIED,
bool trans = false)
: buf_(buf),
valueType_(valueType),
shape_(shape),
argType_(argType),
trans_(trans) {}
ArgType argType = UNSPECIFIED)
: buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
BufferArg(void* buf, ValueType valueType)
: buf_(buf), valueType_(valueType) {}
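Note: with the trans flag removed from every constructor, a BufferArg now carries only a buffer, value type, shape, and argument type. A minimal usage sketch (the buffer, shape, and argument type below are illustrative, not from this commit):

// Hedged sketch: constructing the simplified BufferArg.
float data[6] = {0};
TensorShape shape{2, 3};  // a 2 x 3 dense matrix
BufferArg arg(data, VALUE_TYPE_FLOAT, shape, ASSIGN_TO);
// Whether the matrix is treated as transposed is no longer a property of
// the argument; it is passed per operation (see the aTrans/bTrans flags below).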
@@ -98,8 +91,7 @@ public:
const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
valueType_(DataType<real>::value),
shape_(2),
argType_(argType),
trans_(matrix.isTransposed()) {
argType_(argType) {
bufferType_ = TENSOR_NORMAL;
shape_.setDim(0, matrix.getHeight());
shape_.setDim(1, matrix.getWidth());
@@ -112,8 +104,7 @@ public:
const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
valueType_(DataType<real>::value),
shape_(shape),
argType_(argType),
trans_(matrix.isTransposed()) {
argType_(argType) {
bufferType_ = TENSOR_NORMAL;
CHECK_EQ(matrix.getElementCnt(), shape.getElements());
}
@@ -145,7 +136,7 @@ public:
// CHECK(deviceType_ == DType);
CHECK_EQ((size_t)2, shape_.ndims());
return typename Tensor<real, DType>::Matrix(
reinterpret_cast<real*>(buf_), shape_[0], shape_[1], trans_);
reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
}
template <typename VType, DeviceType DType>
@@ -169,7 +160,6 @@ public:
ValueType valueType() const { return valueType_; }
BufferType bufferType() const { return bufferType_; }
const TensorShape& shape() const { return shape_; }
bool isTransposed() const { return trans_; }
bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
virtual size_t numElements() const { return shape_.getElements(); }
@@ -183,7 +173,6 @@ protected:
TensorShape shape_;
BufferType bufferType_{TENSOR_UNKNOWN};
ArgType argType_{UNSPECIFIED};
bool trans_{false};
// todo(tianbing), add deviceType_
// leading dimensions. The size is dims_.size()
// Dims lds_;
@@ -277,9 +266,8 @@ public:
size_t nnz,
SparseFormat format,
SparseValueType type,
ArgType argType = UNSPECIFIED,
bool trans = false)
: BufferArg(buf, valueType, shape, argType, trans),
ArgType argType = UNSPECIFIED)
: BufferArg(buf, valueType, shape, argType),
row_(row),
col_(col),
nnz_(nnz),
@@ -302,9 +290,8 @@ public:
size_t nnz,
SparseFormat format,
SparseValueType type,
ArgType argType = UNSPECIFIED,
bool trans = false)
: BufferArg(valueType, shape, argType, trans),
ArgType argType = UNSPECIFIED)
: BufferArg(valueType, shape, argType),
/// len of row_ : height + 1 (CSR), buf_ == nullptr
row_(format == SPARSE_CSR
? BufferArg(VALUE_TYPE_INT32, TensorShape{shape[0] + 1})
@@ -343,7 +330,7 @@ public:
nnz_,
type_,
format_,
trans_);
false);
}
~SparseMatrixArg() {}
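Likewise, SparseMatrixArg loses its trans parameter. A sketch of constructing one after this change (the sizes are hypothetical; SPARSE_CSR and FLOAT_VALUE are the constants used elsewhere in this commit):

// Hedged sketch: SparseMatrixArg without a trans flag.
size_t dimM = 10, dimK = 8, nnz = 16;  // hypothetical sizes
SparseMatrixArg sparse(VALUE_TYPE_FLOAT,
                       TensorShape{dimM, dimK},
                       nnz,
                       SPARSE_CSR,
                       FLOAT_VALUE);
// SparseMatrix() now always builds a non-transposed view (the literal false
// above); transposition is requested per operation instead.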

@@ -64,22 +64,14 @@ public:
cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuInputs_.emplace_back(
std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
input.valueType(),
input.shape(),
UNSPECIFIED,
input.isTransposed()));
gpuInputs_.emplace_back(
std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
input.valueType(),
input.shape(),
UNSPECIFIED,
input.isTransposed()));
cpuInputs_.emplace_back(std::make_shared<BufferArg>(
cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
gpuInputs_.emplace_back(std::make_shared<BufferArg>(
gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
}
// The output only needs to contain the shape; it does not contain data.
void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
void addOutputs(const BufferArg& output, ArgType argType = ADD_TO) {
size_t size =
output.shape().getElements() * sizeOfValuType(output.valueType());
cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
@@ -89,16 +81,14 @@ public:
cpuMemory_.back()->getBuf(),
output.valueType(),
output.shape(),
// todo(tianbing), argType = output.getArgType(), but default ASSIGN_TO
argType,
output.isTransposed()));
// todo(tianbing), argType = output.getArgType(), but default ADD_TO
argType));
gpuOutputs_.emplace_back(std::make_shared<BufferArg>(
gpuMemory_.back()->getBuf(),
output.valueType(),
output.shape(),
// todo(tianbing), argType = output.getArgType(), but default ASSIGN_TO
argType,
output.isTransposed()));
// todo(tianbing), argType = output.getArgType(), but default ADD_TO
argType));
}
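The default argType here changes from ASSIGN_TO to ADD_TO, matching the updated todo comments. For MulOp the distinction is realized through scaleT: scaleT = 1 makes the kernel accumulate (out += scaleAB * a * b), while scaleT = 0 makes it overwrite out. A sketch of the mapping the tests below apply, assuming argType is derived from beta:

// Hedged sketch: relating scaleT (beta) to the output ArgType.
// beta == 1.0 -> the kernel accumulates, so the output must be ADD_TO;
// otherwise   -> the kernel overwrites, so ASSIGN_TO is correct.
ArgType argTypeFor(real beta) { return beta == 1.0 ? ADD_TO : ASSIGN_TO; }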
/// add and init output sparse matrix
@@ -107,15 +97,13 @@ public:
output.shape()[1],
output.nnz(),
output.dataType(),
output.dataFormat(),
output.isTransposed());
output.dataFormat());
gpuSparse_ = std::make_shared<GpuSparseMatrix>(output.shape()[0],
output.shape()[1],
output.nnz(),
output.dataType(),
output.dataFormat(),
output.isTransposed());
output.dataFormat());
/// init sparse matrix
hl_stream_t stream(HPPL_STREAM_1);
@@ -154,15 +142,13 @@ public:
input.shape()[1],
input.nnz(),
input.dataType(),
input.dataFormat(),
input.isTransposed());
input.dataFormat());
gpuSparse_ = std::make_shared<GpuSparseMatrix>(input.shape()[0],
input.shape()[1],
input.nnz(),
input.dataType(),
input.dataFormat(),
input.isTransposed());
input.dataFormat());
/// init sparse matrix
hl_stream_t stream(HPPL_STREAM_1);

(File diff suppressed because it is too large.)

@@ -26,55 +26,79 @@ void MulOp(CpuMatrix& out,
const CpuMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
const CpuSparseMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(CpuMatrix& out,
const CpuMatrix& a,
const CpuSparseMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(CpuSparseMatrix& out,
const CpuMatrix& a,
const CpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
const GpuSparseMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(GpuMatrix& out,
const GpuMatrix& a,
const GpuSparseMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
template <DeviceType DType>
void MulOp(GpuSparseMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT);
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans);
} // namespace paddle
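Every overload now receives aTrans/bTrans/cTrans explicitly instead of reading isTransposed() from the matrices. The Function wrapper that dispatches to these overloads sits in the suppressed diff above; presumably it pulls the flags out of FuncConfig in init() and forwards them, roughly as in this sketch (class and member names are illustrative):

// Hedged sketch of a MulOp Function wrapper; the real one is in the
// suppressed diff, and these member names are assumptions.
template <DeviceType Device>
class MulFuncSketch : public FunctionBase {
public:
  void init(const FuncConfig& config) override {
    scaleAB_ = config.get<real>("scaleAB");
    scaleT_ = config.get<real>("scaleT");
    aTrans_ = config.get<bool>("aTrans");
    bTrans_ = config.get<bool>("bTrans");
    cTrans_ = config.get<bool>("cTrans");
  }
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    auto outMat = outputs[0].matrix<Device>();
    MulOp<Device>(outMat,
                  inputs[0].matrix<Device>(),
                  inputs[1].matrix<Device>(),
                  scaleAB_,
                  scaleT_,
                  aTrans_,
                  bTrans_,
                  cTrans_);
  }

private:
  real scaleAB_, scaleT_;
  bool aTrans_, bTrans_, cTrans_;
};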

@@ -27,38 +27,22 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT) {
CHECK(!out.isTransposed()) << "Transpose not supported for out matrix";
if (!a.isTransposed() && !b.isTransposed()) {
/// a : M * K, b: K * N
CHECK(out.getWidth() == b.getWidth() &&
out.getHeight() == a.getHeight() &&
a.getWidth() == b.getHeight());
} else if (a.isTransposed() && !b.isTransposed()) {
/// a : K * M, b : K * N
CHECK(out.getWidth() == b.getWidth() &&
out.getHeight() == a.getWidth() &&
a.getHeight() == b.getHeight());
} else if (!a.isTransposed() && b.isTransposed()) {
/// a: M * K, b : N * K
CHECK(out.getWidth() == b.getHeight() &&
out.getHeight() == a.getHeight() &&
a.getWidth() == b.getWidth());
} else {
LOG(FATAL) << "Not support for both a and b are Transposed Matrices";
}
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans) {
CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
real* aData = const_cast<real*>(a.getData());
real* bData = const_cast<real*>(b.getData());
real* outData = const_cast<real*>(out.getData());
hl_matrix_mul(aData,
!a.isTransposed() ? HPPL_OP_N : HPPL_OP_T,
!aTrans ? HPPL_OP_N : HPPL_OP_T,
bData,
!b.isTransposed() ? HPPL_OP_N : HPPL_OP_T,
!bTrans ? HPPL_OP_N : HPPL_OP_T,
outData,
out.getHeight(),
out.getWidth(),
!a.isTransposed() ? a.getWidth() : a.getHeight(),
!aTrans ? a.getWidth() : a.getHeight(),
scaleAB,
scaleT,
a.getStride(),
@@ -75,27 +59,19 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuSparseMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT) {
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans) {
CHECK(out.isContiguous());
CHECK(b.isContiguous());
CHECK(b.useGpu_) << "Matrix type are not equal";
CHECK(!out.isTransposed() && !b.isTransposed()) << "not supported";
if (!a.isTransposed()) {
/// a: M * K, b: K * N
CHECK(out.getWidth() == b.getWidth() && out.getHeight() == a.getHeight()
&& a.getWidth() == b.getHeight()) << "Matrix dimensions are not equal";
} else {
/// a: K * M, transpose, b: K * N
CHECK(out.getWidth() == b.getWidth() && out.getHeight() == a.getWidth()
&& a.getHeight() == b.getHeight()) << "Matrix dimensions are not equal";
}
CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
hl_trans_op_t aTrans = a.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
hl_sparse_matrix_s aData = a.sMatrix_.get();
real* bData = const_cast<real*>(b.getData());
real* outData = const_cast<real*>(out.getData());
hl_matrix_csr_mul_dense(aData,
aTrans,
aTrans ? HPPL_OP_T : HPPL_OP_N,
bData,
HPPL_OP_N,
outData,
@@ -115,25 +91,14 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
const GpuMatrix& a,
const GpuSparseMatrix& b,
real scaleAB,
real scaleT) {
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans) {
CHECK(out.isContiguous());
CHECK(a.isContiguous());
CHECK(a.useGpu_) << "Matrix type are not equal";
if (!b.isTransposed()) {
/// a : M * K, b : K * N
CHECK(out.getWidth() == b.getWidth() &&
out.getHeight() == a.getHeight() &&
a.getWidth() == b.getHeight())
<< "Matrix dimensions are not equal";
} else {
/// a : M * K, b : N * K, transpose
CHECK(out.getWidth() == b.getHeight() &&
out.getHeight() == a.getHeight() &&
a.getWidth() == b.getWidth())
<< "Matrix dimensions are not equal";
}
CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
hl_trans_op_t bTrans = b.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
hl_sparse_matrix_s bData = b.sMatrix_.get();
real* aData = const_cast<real*>(a.getData());
real* outData = const_cast<real*>(out.getData());
@@ -142,7 +107,7 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
hl_matrix_dense_mul_csc(aData,
HPPL_OP_N,
bData,
bTrans,
bTrans ? HPPL_OP_T : HPPL_OP_N,
outData,
out.getHeight(),
out.getWidth(),
@@ -153,7 +118,7 @@ void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
hl_matrix_dense_mul_csr(aData,
HPPL_OP_N,
bData,
bTrans,
bTrans ? HPPL_OP_T : HPPL_OP_N,
outData,
out.getHeight(),
out.getWidth(),
@@ -168,35 +133,26 @@ void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
const GpuMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT) {
real scaleT,
bool aTrans,
bool bTrans,
bool cTrans) {
CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
CHECK(!out.isTransposed()) << "Transpose is not supported for out matrix";
if (!a.isTransposed() && !b.isTransposed()) {
CHECK(out.getHeight() == a.getHeight() &&
out.getWidth() == b.getWidth() &&
a.getWidth() == b.getHeight());
} else if (a.isTransposed() && !b.isTransposed()) {
CHECK(out.getHeight() == a.getWidth() &&
out.getWidth() == b.getWidth() &&
a.getHeight() == b.getHeight());
} else if (!a.isTransposed() && b.isTransposed()) {
CHECK(out.getHeight() == a.getHeight() &&
out.getWidth() == b.getHeight() &&
a.getWidth() == b.getWidth());
} else {
LOG(FATAL) << "Not support for both a and b are Transposed Matrices";
}
hl_trans_op_t aTrans = a.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
hl_trans_op_t bTrans = b.isTransposed() ? HPPL_OP_T : HPPL_OP_N;
int dimK = !b.isTransposed() ? b.getHeight() : b.getWidth();
real* aData = const_cast<real*>(a.getData());
real* bData = const_cast<real*>(b.getData());
hl_sparse_matrix_s outData = out.sMatrix_.get();
hl_sparse_matrix_mul(aData, aTrans, bData, bTrans, outData,
out.getHeight(), out.getWidth(), dimK, scaleAB, scaleT);
hl_sparse_matrix_mul(aData,
aTrans ? HPPL_OP_T : HPPL_OP_N,
bData,
bTrans ? HPPL_OP_T : HPPL_OP_N,
outData,
out.getHeight(),
out.getWidth(),
!bTrans ? b.getHeight() : b.getWidth(),
scaleAB,
scaleT);
}
} // namespace paddle
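Also note the per-overload shape CHECKs (and the FATAL for the doubly-transposed case) are deleted above; presumably they are centralized in the shared Function layer whose diff is suppressed. For reference, the check they performed, restated against the new flags as a sketch:

// Hedged sketch: the shape validation the removed CHECKs expressed, written
// with aTrans/bTrans. For out = a * b: a is M x K, b is K x N, out is M x N.
template <typename OutT, typename AT, typename BT>
void checkMulShapes(
    const OutT& out, const AT& a, const BT& b, bool aTrans, bool bTrans) {
  size_t m = aTrans ? a.getWidth() : a.getHeight();
  size_t k = aTrans ? a.getHeight() : a.getWidth();
  size_t kb = bTrans ? b.getWidth() : b.getHeight();
  size_t n = bTrans ? b.getHeight() : b.getWidth();
  CHECK_EQ(k, kb) << "inner dimensions of a and b do not match";
  CHECK_EQ(out.getHeight(), m);
  CHECK_EQ(out.getWidth(), n);
}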

@@ -39,18 +39,21 @@ void testFuncDDDMatrix(
size_t widthC = dimN;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
FuncConfig()
.set("scaleAB", alpha)
.set("scaleT", beta)
.set("aTrans", transa)
.set("bTrans", transb)
.set("cTrans", false));
// prepare input arguments
/// matrix A : HA * WA
test.addInputs(BufferArg(
VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}, UNSPECIFIED, transa));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
/// matrix B: HB * WB
test.addInputs(BufferArg(
VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}, UNSPECIFIED, transb));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
/// output matrix C: HC * WC
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
ADD_TO);
beta == 1.0 ? ADD_TO : ASSIGN_TO);
// run Function
test.run();
}
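Since the flags now live in FuncConfig, the transpose cases are driven from the test call sites, presumably by a sweep like the one below; the helper's full signature is cut off by the hunk header above, so the call shown is an assumption:

// Hedged sketch: sweeping transpose combinations through the helper,
// skipping the doubly-transposed case the kernels still reject.
for (const auto transa : {false, true}) {
  for (const auto transb : {false, true}) {
    if (transa && transb) continue;  // both transposed: unsupported
    testFuncDDDMatrix(transa, transb, /*dimM=*/10, /*dimN=*/10, /*dimK=*/16);
  }
}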
@@ -88,21 +91,22 @@ void testFuncDSparseDMatrix(
real beta = 1.0;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
FuncConfig()
.set("scaleAB", alpha)
.set("scaleT", beta)
.set("aTrans", false)
.set("bTrans", false)
.set("cTrans", false));
// prepare input arguments
/// sparse matrix A : M * K
test.addInputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
TensorShape{dimM, dimK},
nnz,
FORMAT,
FLOAT_VALUE,
UNSPECIFIED,
false));
test.addInputs(SparseMatrixArg(
VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
/// matrix B: K * N
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
/// output matrix C: M * N
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), ADD_TO);
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
beta == 1.0 ? ADD_TO : ASSIGN_TO);
// run Function
test.run();
}
@@ -138,22 +142,23 @@ void testFuncDDSparseMatrix(
real beta = 1.0;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
FuncConfig()
.set("scaleAB", alpha)
.set("scaleT", beta)
.set("aTrans", false)
.set("bTrans", false)
.set("cTrans", false));
// prepare input arguments
/// matrix A : M * K
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
/// matrix B: K * N
test.addInputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
TensorShape{dimK, dimN},
nnz,
FORMAT,
FLOAT_VALUE,
UNSPECIFIED,
false));
test.addInputs(SparseMatrixArg(
VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
/// output matrix C: M * N
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), ADD_TO);
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
beta == 1.0 ? ADD_TO : ASSIGN_TO);
// run Function
test.run();
}
@@ -189,7 +194,12 @@ void testFuncSparseDDMatrix(
real beta = 1.0;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("scaleAB", alpha).set("scaleT", beta));
FuncConfig()
.set("scaleAB", alpha)
.set("scaleT", beta)
.set("aTrans", false)
.set("bTrans", false)
.set("cTrans", false));
// prepare input arguments
/// matrix A : M * K
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
@@ -198,14 +208,10 @@ void testFuncSparseDDMatrix(
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
/// output sparse matrix C: M * N
test.addOutputs(SparseMatrixArg(VALUE_TYPE_FLOAT,
TensorShape{dimM, dimN},
nnz,
FORMAT,
FLOAT_VALUE,
UNSPECIFIED,
false),
ADD_TO);
test.addOutputs(
SparseMatrixArg(
VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
beta == 1.0 ? ADD_TO : ASSIGN_TO);
// run Function
test.run();
}
