19namespace cuSolverConnector {
23void trtri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n,
T* A,
const int& lda)
25 size_t d_lwork = 0, h_lwork = 0;
28 void* d_work =
nullptr, *h_work =
nullptr;
31 h_work = malloc(h_lwork);
32 if (h_work ==
nullptr) {
33 throw std::bad_alloc();
37 int* d_info =
nullptr;
40 cusolverErrcheck(cusolverDnXtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n,
GetTypeCuda<T>::cuda_data_type,
reinterpret_cast<Type*
>(A), n, d_work, d_lwork, h_work, h_lwork, d_info));
41 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
43 throw std::runtime_error(
"trtri: failed to invert matrix");
51void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n,
float * A,
const int& lda)
54 cusolverErrcheck(cusolverDnSpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
56 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(
float)));
58 cusolverErrcheck(cusolverDnSpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork,
nullptr));
62void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n,
double * A,
const int& lda)
65 cusolverErrcheck(cusolverDnDpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
67 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(
double)));
69 cusolverErrcheck(cusolverDnDpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork,
nullptr));
73void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n, std::complex<float> * A,
const int& lda)
76 cusolverErrcheck(cusolverDnCpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex *
>(A), n, &lwork));
78 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(cuComplex)));
80 cusolverErrcheck(cusolverDnCpotri(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex *
>(A), n, work, lwork,
nullptr));
84void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n, std::complex<double> * A,
const int& lda)
87 cusolverErrcheck(cusolverDnZpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex *
>(A), n, &lwork));
88 cuDoubleComplex* work;
89 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(cuDoubleComplex)));
91 cusolverErrcheck(cusolverDnZpotri(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex *
>(A), n, work, lwork,
nullptr));
97void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n,
float * A,
const int& lda)
101 cudaErrcheck(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
102 cusolverErrcheck(cusolverDnSpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
104 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(
float)));
106 cusolverErrcheck(cusolverDnSpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
111void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n,
double * A,
const int& lda)
115 cudaErrcheck(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
116 cusolverErrcheck(cusolverDnDpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
118 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(
double)));
120 cusolverErrcheck(cusolverDnDpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
125void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n, std::complex<float> * A,
const int& lda)
129 cudaErrcheck(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
130 cusolverErrcheck(cusolverDnCpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex*
>(A), lda, &lwork));
132 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(cuComplex)));
134 cusolverErrcheck(cusolverDnCpotrf(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex*
>(A), lda, work, lwork, info));
139void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n, std::complex<double> * A,
const int& lda)
143 cudaErrcheck(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
144 cusolverErrcheck(cusolverDnZpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, &lwork));
145 cuDoubleComplex* work;
146 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(cuDoubleComplex)));
148 cusolverErrcheck(cusolverDnZpotrf(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, work, lwork, info));
155void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n,
float* A,
const int& lda,
float * W)
160 int* d_info =
nullptr;
161 float* d_work =
nullptr;
165 cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
166 n, A, lda, W, &lwork));
168 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
170 cusolverErrcheck(cusolverDnSsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
171 n, A, lda, W, d_work, lwork, d_info));
173 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
175 throw std::runtime_error(
"heevd: failed to invert matrix");
181void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n,
double* A,
const int& lda,
double * W)
186 int* d_info =
nullptr;
187 double* d_work =
nullptr;
191 cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
192 n, A, lda, W, &lwork));
194 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
196 cusolverErrcheck(cusolverDnDsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
197 n, A, lda, W, d_work, lwork, d_info));
199 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
201 throw std::runtime_error(
"heevd: failed to invert matrix");
207void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n, std::complex<float>* A,
const int& lda,
float * W)
212 int* d_info =
nullptr;
213 cuComplex* d_work =
nullptr;
217 cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
218 n,
reinterpret_cast<cuComplex*
>(A), lda, W, &lwork));
220 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
222 cusolverErrcheck(cusolverDnCheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
223 n,
reinterpret_cast<cuComplex*
>(A), lda, W, d_work, lwork, d_info));
225 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
227 throw std::runtime_error(
"heevd: failed to invert matrix");
233void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n, std::complex<double>* A,
const int& lda,
double* W)
238 int* d_info =
nullptr;
239 cuDoubleComplex* d_work =
nullptr;
243 cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
244 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, W, &lwork));
246 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
248 cusolverErrcheck(cusolverDnZheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
249 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, W, d_work, lwork, d_info));
251 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
253 throw std::runtime_error(
"heevd: failed to invert matrix");
264void heevdx(cusolverDnHandle_t& cusolver_handle,
271 const int il,
const int iu,
272 const float vl,
const float vu,
277 int* d_info =
nullptr;
278 float* d_work =
nullptr;
282 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
283 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
284 cusolverEigRange_t range_t = cublas_eig_range(range);
288 jobz_t, range_t, uplo_t,
296 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
301 jobz_t, range_t, uplo_t,
312 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
314 cudaFree(d_info); cudaFree(d_work);
315 throw std::runtime_error(
"heevdx (float) failed with info = " + std::to_string(h_info));
324void heevdx(cusolverDnHandle_t& cusolver_handle,
331 const int il,
const int iu,
332 const double vl,
const double vu,
337 int* d_info =
nullptr;
338 double* d_work =
nullptr;
342 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
343 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
344 cusolverEigRange_t range_t = cublas_eig_range(range);
348 jobz_t, range_t, uplo_t,
356 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
360 jobz_t, range_t, uplo_t,
371 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
373 cudaFree(d_info); cudaFree(d_work);
374 throw std::runtime_error(
"heevdx (double) failed with info = " + std::to_string(h_info));
383void heevdx(cusolverDnHandle_t& cusolver_handle,
386 std::complex<float>* d_A,
390 const int il,
const int iu,
391 const float vl,
const float vu,
396 int* d_info =
nullptr;
397 cuComplex* d_work =
nullptr;
401 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
402 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
403 cusolverEigRange_t range_t = cublas_eig_range(range);
407 jobz_t, range_t, uplo_t,
409 reinterpret_cast<cuComplex*
>(d_A), lda,
416 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
420 jobz_t, range_t, uplo_t,
422 reinterpret_cast<cuComplex*
>(d_A), lda,
431 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
433 cudaFree(d_info); cudaFree(d_work);
434 throw std::runtime_error(
"heevdx (complex<float>) failed with info = " + std::to_string(h_info));
443void heevdx(cusolverDnHandle_t& cusolver_handle,
446 std::complex<double>* d_A,
450 const int il,
const int iu,
451 const double vl,
const double vu,
456 int* d_info =
nullptr;
457 cuDoubleComplex* d_work =
nullptr;
461 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
462 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
463 cusolverEigRange_t range_t = cublas_eig_range(range);
467 jobz_t, range_t, uplo_t,
469 reinterpret_cast<cuDoubleComplex*
>(d_A), lda,
476 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
480 jobz_t, range_t, uplo_t,
482 reinterpret_cast<cuDoubleComplex*
>(d_A), lda,
491 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
493 cudaFree(d_info); cudaFree(d_work);
494 throw std::runtime_error(
"heevdx (complex<double>) failed with info = " + std::to_string(h_info));
502void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n,
float* A,
const int& lda,
float* B,
const int& ldb,
float * W)
507 int* d_info =
nullptr;
508 float* d_work =
nullptr;
512 cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
513 n, A, lda, B, ldb, W, &lwork));
515 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
517 cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
518 n, A, lda, B, ldb, W, d_work, lwork, d_info));
520 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
522 throw std::runtime_error(
"heevd: failed to invert matrix");
528void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n,
double* A,
const int& lda,
double* B,
const int& ldb,
double * W)
533 int* d_info =
nullptr;
534 double* d_work =
nullptr;
538 cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
539 n, A, lda, B, ldb, W, &lwork));
541 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
543 cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
544 n, A, lda, B, ldb, W, d_work, lwork, d_info));
546 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
548 throw std::runtime_error(
"heevd: failed to invert matrix");
554void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n, std::complex<float>* A,
const int& lda, std::complex<float>* B,
const int& ldb,
float* W)
559 int* d_info =
nullptr;
560 cuComplex* d_work =
nullptr;
564 cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
565 n,
reinterpret_cast<cuComplex*
>(A), lda,
reinterpret_cast<cuComplex*
>(B), ldb, W, &lwork));
567 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
569 cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
570 n,
reinterpret_cast<cuComplex*
>(A), lda,
reinterpret_cast<cuComplex*
>(B), ldb, W, d_work, lwork, d_info));
572 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
574 throw std::runtime_error(
"heevd: failed to invert matrix");
580void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n, std::complex<double>* A,
const int& lda, std::complex<double>* B,
const int& ldb,
double* W)
585 int* d_info =
nullptr;
586 cuDoubleComplex* d_work =
nullptr;
590 cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
591 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda,
reinterpret_cast<cuDoubleComplex*
>(B), ldb, W, &lwork));
593 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
595 cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
596 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda,
reinterpret_cast<cuDoubleComplex*
>(B), ldb, W, d_work, lwork, d_info));
598 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
600 throw std::runtime_error(
"heevd: failed to invert matrix");
614 cusolverDnHandle_t& cusolver_handle,
632 int *d_info =
nullptr;
633 float *d_work =
nullptr;
639 float *d_A_copy =
nullptr, *d_B_copy =
nullptr;
640 cudaErrcheck(cudaMalloc((
void**)&d_A_copy,
sizeof(
float) * n * lda));
641 cudaErrcheck(cudaMalloc((
void**)&d_B_copy,
sizeof(
float) * n * lda));
642 cudaErrcheck(cudaMemcpy(d_A_copy, d_A,
sizeof(
float) * n * lda, cudaMemcpyDeviceToDevice));
643 cudaErrcheck(cudaMemcpy(d_B_copy, d_B,
sizeof(
float) * n * lda, cudaMemcpyDeviceToDevice));
646 cusolverEigType_t itype_t = cublas_eig_type(itype);
647 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
648 cusolverEigRange_t range_t = cublas_eig_range(range);
649 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
654 itype_t, jobz_t, range_t, uplo_t,
665 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
670 itype_t, jobz_t, range_t, uplo_t,
683 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
685 throw std::runtime_error(
"hegvdx (float): illegal argument #" + std::to_string(-h_info));
686 }
else if (h_info > 0) {
689 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
690 throw std::runtime_error(
"hegvdx (float): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
691 }
else if (h_info > n) {
692 throw std::runtime_error(
"hegvdx (float): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
698 const int m = (*h_meig);
699 cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy,
sizeof(
float) * n * m, cudaMemcpyDeviceToDevice));
713 cusolverDnHandle_t& cusolver_handle,
731 int *d_info =
nullptr;
732 double *d_work =
nullptr;
736 double *d_A_copy =
nullptr, *d_B_copy =
nullptr;
737 cudaErrcheck(cudaMalloc((
void**)&d_A_copy,
sizeof(
double) * n * lda));
738 cudaErrcheck(cudaMalloc((
void**)&d_B_copy,
sizeof(
double) * n * lda));
739 cudaErrcheck(cudaMemcpy(d_A_copy, d_A,
sizeof(
double) * n * lda, cudaMemcpyDeviceToDevice));
740 cudaErrcheck(cudaMemcpy(d_B_copy, d_B,
sizeof(
double) * n * lda, cudaMemcpyDeviceToDevice));
742 cusolverEigType_t itype_t = cublas_eig_type(itype);
743 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
744 cusolverEigRange_t range_t = cublas_eig_range(range);
745 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
749 itype_t, jobz_t, range_t, uplo_t,
759 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
763 itype_t, jobz_t, range_t, uplo_t,
775 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
777 throw std::runtime_error(
"hegvdx (double): illegal argument #" + std::to_string(-h_info));
778 }
else if (h_info > 0) {
779 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
780 throw std::runtime_error(
"hegvdx (double): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
781 }
else if (h_info > n) {
782 throw std::runtime_error(
"hegvdx (double): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
787 const int m = (*h_meig);
788 cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy,
sizeof(
double) * n * m, cudaMemcpyDeviceToDevice));
801 cusolverDnHandle_t& cusolver_handle,
808 std::complex<float>* d_A,
809 std::complex<float>* d_B,
816 std::complex<float>* d_eigen_vec
819 int *d_info =
nullptr;
820 cuComplex *d_work =
nullptr;
824 cuComplex *d_A_copy =
nullptr, *d_B_copy =
nullptr;
825 cudaErrcheck(cudaMalloc((
void**)&d_A_copy,
sizeof(cuComplex) * n * lda));
826 cudaErrcheck(cudaMalloc((
void**)&d_B_copy,
sizeof(cuComplex) * n * lda));
827 cudaErrcheck(cudaMemcpy(d_A_copy,
reinterpret_cast<cuComplex*
>(d_A),
sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
828 cudaErrcheck(cudaMemcpy(d_B_copy,
reinterpret_cast<cuComplex*
>(d_B),
sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
830 cusolverEigType_t itype_t = cublas_eig_type(itype);
831 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
832 cusolverEigRange_t range_t = cublas_eig_range(range);
833 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
837 itype_t, jobz_t, range_t, uplo_t,
847 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
851 itype_t, jobz_t, range_t, uplo_t,
863 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
865 throw std::runtime_error(
"hegvdx (complex<float>): illegal argument #" + std::to_string(-h_info));
866 }
else if (h_info > 0) {
867 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
868 throw std::runtime_error(
"hegvdx (complex<float>): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
869 }
else if (h_info > n) {
870 throw std::runtime_error(
"hegvdx (complex<float>): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
875 const int m = (*h_meig);
876 cudaErrcheck(cudaMemcpy(
reinterpret_cast<cuComplex*
>(d_eigen_vec), d_A_copy,
sizeof(cuComplex) * n * m, cudaMemcpyDeviceToDevice));
889 cusolverDnHandle_t& cusolver_handle,
896 std::complex<double>* d_A,
897 std::complex<double>* d_B,
904 std::complex<double>* d_eigen_vec
907 int *d_info =
nullptr;
908 cuDoubleComplex *d_work =
nullptr;
912 cuDoubleComplex *d_A_copy =
nullptr, *d_B_copy =
nullptr;
913 cudaErrcheck(cudaMalloc((
void**)&d_A_copy,
sizeof(cuDoubleComplex) * n * lda));
914 cudaErrcheck(cudaMalloc((
void**)&d_B_copy,
sizeof(cuDoubleComplex) * n * lda));
915 cudaErrcheck(cudaMemcpy(d_A_copy,
reinterpret_cast<cuDoubleComplex*
>(d_A),
sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
916 cudaErrcheck(cudaMemcpy(d_B_copy,
reinterpret_cast<cuDoubleComplex*
>(d_B),
sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
918 cusolverEigType_t itype_t = cublas_eig_type(itype);
919 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
920 cusolverEigRange_t range_t = cublas_eig_range(range);
921 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
925 itype_t, jobz_t, range_t, uplo_t,
935 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
939 itype_t, jobz_t, range_t, uplo_t,
951 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
953 throw std::runtime_error(
"hegvdx (complex<double>): illegal argument #" + std::to_string(-h_info));
954 }
else if (h_info > 0) {
955 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
956 throw std::runtime_error(
"hegvdx (complex<double>): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
957 }
else if (h_info > n) {
958 throw std::runtime_error(
"hegvdx (complex<double>): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
963 const int m = (*h_meig);
964 cudaErrcheck(cudaMemcpy(
reinterpret_cast<cuDoubleComplex*
>(d_eigen_vec), d_A_copy,
sizeof(cuDoubleComplex) * n * m, cudaMemcpyDeviceToDevice));
976void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n,
float* A,
const int& lda,
int* ipiv)
981 int* d_info =
nullptr;
982 float* d_work =
nullptr;
986 cusolverErrcheck(cusolverDnSgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
989 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
992 cusolverErrcheck(cusolverDnSgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
994 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
996 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1003void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n,
double* A,
const int& lda,
int* ipiv)
1008 int* d_info =
nullptr;
1009 double* d_work =
nullptr;
1010 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1013 cusolverErrcheck(cusolverDnDgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
1016 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
1019 cusolverErrcheck(cusolverDnDgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
1021 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1023 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1030void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n, std::complex<float>* A,
const int& lda,
int* ipiv)
1035 int* d_info =
nullptr;
1036 cuComplex* d_work =
nullptr;
1037 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1040 cusolverErrcheck(cusolverDnCgetrf_bufferSize(cusolver_handle, m, n,
reinterpret_cast<cuComplex*
>(A), lda, &lwork));
1043 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
1046 cusolverErrcheck(cusolverDnCgetrf(cusolver_handle, m, n,
reinterpret_cast<cuComplex*
>(A), lda, d_work, ipiv, d_info));
1048 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1050 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1057void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n, std::complex<double>* A,
const int& lda,
int* ipiv)
1062 int* d_info =
nullptr;
1063 cuDoubleComplex* d_work =
nullptr;
1064 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1067 cusolverErrcheck(cusolverDnZgetrf_bufferSize(cusolver_handle, m, n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, &lwork));
1070 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
1073 cusolverErrcheck(cusolverDnZgetrf(cusolver_handle, m, n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, d_work, ipiv, d_info));
1075 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1077 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1085void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs,
float* A,
const int& lda,
const int* ipiv,
float* B,
const int& ldb)
1088 int* d_info =
nullptr;
1089 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1091 cusolverErrcheck(cusolverDnSgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1093 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1095 throw std::runtime_error(
"getrs: failed to solve the linear system");
1101void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs,
double* A,
const int& lda,
const int* ipiv,
double* B,
const int& ldb)
1104 int* d_info =
nullptr;
1105 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1107 cusolverErrcheck(cusolverDnDgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1109 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1111 throw std::runtime_error(
"getrs: failed to solve the linear system");
1117void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs, std::complex<float>* A,
const int& lda,
const int* ipiv, std::complex<float>* B,
const int& ldb)
1120 int* d_info =
nullptr;
1121 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1123 cusolverErrcheck(cusolverDnCgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs,
reinterpret_cast<cuComplex*
>(A), lda, ipiv,
reinterpret_cast<cuComplex*
>(B), ldb, d_info));
1125 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1127 throw std::runtime_error(
"getrs: failed to solve the linear system");
1133void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs, std::complex<double>* A,
const int& lda,
const int* ipiv, std::complex<double>* B,
const int& ldb)
1136 int* d_info =
nullptr;
1137 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1139 cusolverErrcheck(cusolverDnZgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs,
reinterpret_cast<cuDoubleComplex*
>(A), lda, ipiv,
reinterpret_cast<cuDoubleComplex*
>(B), ldb, d_info));
1141 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1143 throw std::runtime_error(
"getrs: failed to solve the linear system");
1242static inline void geqrf(
1243 cusolverDnHandle_t& cusolver_handle,
1252 cusolver_handle, m, n, d_A, lda, &lwork));
1254 float* d_work =
nullptr;
1255 int* d_info =
nullptr;
1258 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
float) * lwork));
1260 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1263 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1266 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1268 std::cout <<
"geqrf (S): info = " << h_info << std::endl;
1271 throw std::runtime_error(
"geqrf (S): QR factorization failed");
1279static inline void geqrf(
1280 cusolverDnHandle_t& cusolver_handle,
1289 cusolver_handle, m, n, d_A, lda, &lwork));
1291 double* d_work =
nullptr;
1292 int* d_info =
nullptr;
1295 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
double) * lwork));
1297 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1300 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1303 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1305 std::cout <<
"geqrf (D): info = " << h_info << std::endl;
1308 throw std::runtime_error(
"geqrf (D): QR factorization failed");
1316static inline void geqrf(
1317 cusolverDnHandle_t& cusolver_handle,
1320 std::complex<float>* d_A,
1322 std::complex<float>* d_tau
1326 cusolver_handle, m, n,
1327 reinterpret_cast<cuComplex*
>(d_A),
1332 cuComplex* d_work =
nullptr;
1333 int* d_info =
nullptr;
1336 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuComplex) * lwork));
1338 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1341 cusolver_handle, m, n,
1342 reinterpret_cast<cuComplex*
>(d_A),
1344 reinterpret_cast<cuComplex*
>(d_tau),
1345 d_work, lwork, d_info));
1348 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1350 std::cout <<
"geqrf (C): info = " << h_info << std::endl;
1353 throw std::runtime_error(
"geqrf (C): QR factorization failed");
1361static inline void geqrf(
1362 cusolverDnHandle_t& cusolver_handle,
1365 std::complex<double>* d_A,
1367 std::complex<double>* d_tau
1371 cusolver_handle, m, n,
1372 reinterpret_cast<cuDoubleComplex*
>(d_A),
1377 cuDoubleComplex* d_work =
nullptr;
1378 int* d_info =
nullptr;
1381 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuDoubleComplex) * lwork));
1383 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1386 cusolver_handle, m, n,
1387 reinterpret_cast<cuDoubleComplex*
>(d_A),
1389 reinterpret_cast<cuDoubleComplex*
>(d_tau),
1390 d_work, lwork, d_info));
1393 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1395 std::cout <<
"geqrf (Z): info = " << h_info << std::endl;
1398 throw std::runtime_error(
"geqrf (Z): QR factorization failed");
1407static inline void orgqr(
1408 cusolverDnHandle_t& cusolver_handle,
1418 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1420 float* d_work =
nullptr;
1421 int* d_info =
nullptr;
1424 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
float) * lwork));
1426 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1429 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1432 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1434 std::cout <<
"orgqr (S): info = " << h_info <<
" (failure at parameter " << -h_info <<
")" << std::endl;
1437 throw std::runtime_error(
"orgqr (S): failed to generate Q matrix");
1446static inline void orgqr(
1447 cusolverDnHandle_t& cusolver_handle,
1457 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1459 double* d_work =
nullptr;
1460 int* d_info =
nullptr;
1463 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
double) * lwork));
1465 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1468 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1471 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1473 std::cout <<
"orgqr (D): info = " << h_info << std::endl;
1476 throw std::runtime_error(
"orgqr (D): failed to generate Q matrix");
1484static inline void orgqr(
1485 cusolverDnHandle_t& cusolver_handle,
1489 std::complex<float>* d_A,
1491 std::complex<float>* d_tau
1495 cusolver_handle, m, n, k,
1496 reinterpret_cast<cuComplex*
>(d_A),
1498 reinterpret_cast<cuComplex*
>(d_tau),
1501 cuComplex* d_work =
nullptr;
1502 int* d_info =
nullptr;
1505 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuComplex) * lwork));
1507 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1510 cusolver_handle, m, n, k,
1511 reinterpret_cast<cuComplex*
>(d_A),
1513 reinterpret_cast<cuComplex*
>(d_tau),
1514 d_work, lwork, d_info));
1517 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1519 std::cout <<
"orgqr (C): info = " << h_info << std::endl;
1522 throw std::runtime_error(
"orgqr (C): failed to generate Q matrix");
1530static inline void orgqr(
1531 cusolverDnHandle_t& cusolver_handle,
1535 std::complex<double>* d_A,
1537 std::complex<double>* d_tau
1541 cusolver_handle, m, n, k,
1542 reinterpret_cast<cuDoubleComplex*
>(d_A),
1544 reinterpret_cast<cuDoubleComplex*
>(d_tau),
1547 cuDoubleComplex* d_work =
nullptr;
1548 int* d_info =
nullptr;
1551 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuDoubleComplex) * lwork));
1553 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1556 cusolver_handle, m, n, k,
1557 reinterpret_cast<cuDoubleComplex*
>(d_A),
1559 reinterpret_cast<cuDoubleComplex*
>(d_tau),
1560 d_work, lwork, d_info));
1563 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1565 std::cout <<
"orgqr (Z): info = " << h_info << std::endl;
1568 throw std::runtime_error(
"orgqr (Z): failed to generate Q matrix");