|
|
@ -15,6 +15,7 @@ limitations under the License. */
|
|
|
|
|
|
|
|
|
|
|
|
#include <sys/time.h>
|
|
|
|
#include <sys/time.h>
|
|
|
|
#include <mutex>
|
|
|
|
#include <mutex>
|
|
|
|
|
|
|
|
#include "hl_cuda.h"
|
|
|
|
#include "hl_cuda_cublas.h"
|
|
|
|
#include "hl_cuda_cublas.h"
|
|
|
|
#include "hl_thread.ph"
|
|
|
|
#include "hl_thread.ph"
|
|
|
|
#include "hl_dso_loader.h"
|
|
|
|
#include "hl_dso_loader.h"
|
|
|
@ -75,6 +76,8 @@ DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
|
|
|
|
DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
|
|
|
|
DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
|
|
|
|
DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
|
|
|
|
DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
|
|
|
|
DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
|
|
|
|
DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
|
|
|
|
|
|
|
|
DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
|
|
|
|
|
|
|
|
DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
|
|
|
|
CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
|
|
|
|
CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
|
|
|
|
|
|
|
|
|
|
|
|
#undef DYNAMIC_LOAD_CUBLAS_WRAP
|
|
|
|
#undef DYNAMIC_LOAD_CUBLAS_WRAP
|
|
|
@ -88,10 +91,14 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
|
|
|
|
#define CUBLAS_GEAM dynload::cublasSgeam
|
|
|
|
#define CUBLAS_GEAM dynload::cublasSgeam
|
|
|
|
#define CUBLAS_GEMV dynload::cublasSgemv
|
|
|
|
#define CUBLAS_GEMV dynload::cublasSgemv
|
|
|
|
#define CUBLAS_GEMM dynload::cublasSgemm
|
|
|
|
#define CUBLAS_GEMM dynload::cublasSgemm
|
|
|
|
|
|
|
|
#define CUBLAS_GETRF dynload::cublasSgetrfBatched
|
|
|
|
|
|
|
|
#define CUBLAS_GETRI dynload::cublasSgetriBatched
|
|
|
|
#else
|
|
|
|
#else
|
|
|
|
#define CUBLAS_GEAM dynload::cublasDgeam
|
|
|
|
#define CUBLAS_GEAM dynload::cublasDgeam
|
|
|
|
#define CUBLAS_GEMV dynload::cublasDgemv
|
|
|
|
#define CUBLAS_GEMV dynload::cublasDgemv
|
|
|
|
#define CUBLAS_GEMM dynload::cublasDgemm
|
|
|
|
#define CUBLAS_GEMM dynload::cublasDgemm
|
|
|
|
|
|
|
|
#define CUBLAS_GETRF dynload::cublasDgetrfBatched
|
|
|
|
|
|
|
|
#define CUBLAS_GETRI dynload::cublasDgetriBatched
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
const char* hl_cublas_get_error_string(cublasStatus_t status) {
|
|
|
|
const char* hl_cublas_get_error_string(cublasStatus_t status) {
|
|
|
@ -162,6 +169,54 @@ void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {
|
|
|
|
hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM);
|
|
|
|
hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
|
|
|
|
|
|
|
|
/* Solve Ax = I */
|
|
|
|
|
|
|
|
CHECK_NOTNULL(A_d);
|
|
|
|
|
|
|
|
CHECK_NOTNULL(C_d);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Step 1: Compute the LU decomposition of matrix A */
|
|
|
|
|
|
|
|
real **inout_h = &A_d;
|
|
|
|
|
|
|
|
real **inout_d = (real **)hl_malloc_device(sizeof(real *));
|
|
|
|
|
|
|
|
hl_memcpy(inout_d, inout_h, sizeof(real *));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));
|
|
|
|
|
|
|
|
int *info_d = (int *)t_resource.gpu_mem;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Note: cublasSgetrfBatched is used to calculate a number of
|
|
|
|
|
|
|
|
small-sized matrices. There may be a better way to reconstruct
|
|
|
|
|
|
|
|
the API for better performance.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
|
|
|
|
|
|
|
|
dimN, inout_d, lda, pivot_d,
|
|
|
|
|
|
|
|
info_d, 1));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int info_h;
|
|
|
|
|
|
|
|
hl_memcpy(&info_h, info_d, sizeof(int));
|
|
|
|
|
|
|
|
if (info_h != 0) {
|
|
|
|
|
|
|
|
LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Step 2: Compute the inverse of the matrix given its LU decomposition */
|
|
|
|
|
|
|
|
real **out_h = &C_d;
|
|
|
|
|
|
|
|
real **out_d = (real **)hl_malloc_device(sizeof(real *));
|
|
|
|
|
|
|
|
hl_memcpy(out_d, out_h, sizeof(real *));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
|
|
|
|
|
|
|
|
dimN, (const real **)inout_d, lda, pivot_d,
|
|
|
|
|
|
|
|
out_d, ldc, info_d, 1));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hl_memcpy(&info_h, info_d, sizeof(int));
|
|
|
|
|
|
|
|
if (info_h != 0) {
|
|
|
|
|
|
|
|
LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hl_free_mem_device(inout_d);
|
|
|
|
|
|
|
|
hl_free_mem_device(pivot_d);
|
|
|
|
|
|
|
|
hl_free_mem_device(out_d);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CHECK_SYNC("hl_matrix_inverse failed");
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
|
|
|
|
void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
|
|
|
|
real *B_d, hl_trans_op_t transb,
|
|
|
|
real *B_d, hl_trans_op_t transb,
|
|
|
|
real *C_d,
|
|
|
|
real *C_d,
|
|
|
|