20namespace cuSolverConnector {
24void trtri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n,
T* A,
const int& lda)
26 size_t d_lwork = 0, h_lwork = 0;
29 void* d_work =
nullptr, *h_work =
nullptr;
32 h_work = malloc(h_lwork);
33 if (h_work ==
nullptr) {
34 throw std::bad_alloc();
38 int* d_info =
nullptr;
41 cusolverErrcheck(cusolverDnXtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n,
GetTypeCuda<T>::cuda_data_type,
reinterpret_cast<Type*
>(A), n, d_work, d_lwork, h_work, h_lwork, d_info));
42 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
44 throw std::runtime_error(
"trtri: failed to invert matrix");
52void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n,
float * A,
const int& lda)
55 cusolverErrcheck(cusolverDnSpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
57 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(
float)));
59 cusolverErrcheck(cusolverDnSpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork,
nullptr));
63void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n,
double * A,
const int& lda)
66 cusolverErrcheck(cusolverDnDpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
68 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(
double)));
70 cusolverErrcheck(cusolverDnDpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork,
nullptr));
74void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n, std::complex<float> * A,
const int& lda)
77 cusolverErrcheck(cusolverDnCpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex *
>(A), n, &lwork));
79 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(cuComplex)));
81 cusolverErrcheck(cusolverDnCpotri(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex *
>(A), n, work, lwork,
nullptr));
85void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n, std::complex<double> * A,
const int& lda)
88 cusolverErrcheck(cusolverDnZpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex *
>(A), n, &lwork));
89 cuDoubleComplex* work;
90 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(cuDoubleComplex)));
92 cusolverErrcheck(cusolverDnZpotri(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex *
>(A), n, work, lwork,
nullptr));
98void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n,
float * A,
const int& lda)
102 cudaErrcheck(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
103 cusolverErrcheck(cusolverDnSpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
105 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(
float)));
107 cusolverErrcheck(cusolverDnSpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
112void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n,
double * A,
const int& lda)
116 cudaErrcheck(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
117 cusolverErrcheck(cusolverDnDpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
119 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(
double)));
121 cusolverErrcheck(cusolverDnDpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
126void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n, std::complex<float> * A,
const int& lda)
130 cudaErrcheck(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
131 cusolverErrcheck(cusolverDnCpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex*
>(A), lda, &lwork));
133 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(cuComplex)));
135 cusolverErrcheck(cusolverDnCpotrf(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex*
>(A), lda, work, lwork, info));
140void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n, std::complex<double> * A,
const int& lda)
144 cudaErrcheck(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
145 cusolverErrcheck(cusolverDnZpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, &lwork));
146 cuDoubleComplex* work;
147 cudaErrcheck(cudaMalloc((
void**)&work, lwork *
sizeof(cuDoubleComplex)));
149 cusolverErrcheck(cusolverDnZpotrf(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, work, lwork, info));
156void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n,
float* A,
const int& lda,
float * W)
161 int* d_info =
nullptr;
162 float* d_work =
nullptr;
166 cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
167 n, A, lda, W, &lwork));
169 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
171 cusolverErrcheck(cusolverDnSsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
172 n, A, lda, W, d_work, lwork, d_info));
174 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
176 throw std::runtime_error(
"heevd: failed to invert matrix");
182void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n,
double* A,
const int& lda,
double * W)
187 int* d_info =
nullptr;
188 double* d_work =
nullptr;
192 cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
193 n, A, lda, W, &lwork));
195 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
197 cusolverErrcheck(cusolverDnDsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
198 n, A, lda, W, d_work, lwork, d_info));
200 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
202 throw std::runtime_error(
"heevd: failed to invert matrix");
208void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n, std::complex<float>* A,
const int& lda,
float * W)
213 int* d_info =
nullptr;
214 cuComplex* d_work =
nullptr;
218 cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
219 n,
reinterpret_cast<cuComplex*
>(A), lda, W, &lwork));
221 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
223 cusolverErrcheck(cusolverDnCheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
224 n,
reinterpret_cast<cuComplex*
>(A), lda, W, d_work, lwork, d_info));
226 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
228 throw std::runtime_error(
"heevd: failed to invert matrix");
234void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n, std::complex<double>* A,
const int& lda,
double* W)
239 int* d_info =
nullptr;
240 cuDoubleComplex* d_work =
nullptr;
244 cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
245 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, W, &lwork));
247 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
249 cusolverErrcheck(cusolverDnZheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
250 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, W, d_work, lwork, d_info));
252 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
254 throw std::runtime_error(
"heevd: failed to invert matrix");
265void heevdx(cusolverDnHandle_t& cusolver_handle,
272 const int il,
const int iu,
273 const float vl,
const float vu,
278 int* d_info =
nullptr;
279 float* d_work =
nullptr;
283 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
284 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
285 cusolverEigRange_t range_t = cublas_eig_range(range);
289 jobz_t, range_t, uplo_t,
297 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
302 jobz_t, range_t, uplo_t,
313 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
315 cudaFree(d_info); cudaFree(d_work);
316 throw std::runtime_error(
"heevdx (float) failed with info = " + std::to_string(h_info));
325void heevdx(cusolverDnHandle_t& cusolver_handle,
332 const int il,
const int iu,
333 const double vl,
const double vu,
338 int* d_info =
nullptr;
339 double* d_work =
nullptr;
343 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
344 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
345 cusolverEigRange_t range_t = cublas_eig_range(range);
349 jobz_t, range_t, uplo_t,
357 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
361 jobz_t, range_t, uplo_t,
372 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
374 cudaFree(d_info); cudaFree(d_work);
375 throw std::runtime_error(
"heevdx (double) failed with info = " + std::to_string(h_info));
384void heevdx(cusolverDnHandle_t& cusolver_handle,
387 std::complex<float>* d_A,
391 const int il,
const int iu,
392 const float vl,
const float vu,
397 int* d_info =
nullptr;
398 cuComplex* d_work =
nullptr;
402 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
403 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
404 cusolverEigRange_t range_t = cublas_eig_range(range);
408 jobz_t, range_t, uplo_t,
410 reinterpret_cast<cuComplex*
>(d_A), lda,
417 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
421 jobz_t, range_t, uplo_t,
423 reinterpret_cast<cuComplex*
>(d_A), lda,
432 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
434 cudaFree(d_info); cudaFree(d_work);
435 throw std::runtime_error(
"heevdx (complex<float>) failed with info = " + std::to_string(h_info));
444void heevdx(cusolverDnHandle_t& cusolver_handle,
447 std::complex<double>* d_A,
451 const int il,
const int iu,
452 const double vl,
const double vu,
457 int* d_info =
nullptr;
458 cuDoubleComplex* d_work =
nullptr;
462 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
463 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
464 cusolverEigRange_t range_t = cublas_eig_range(range);
468 jobz_t, range_t, uplo_t,
470 reinterpret_cast<cuDoubleComplex*
>(d_A), lda,
477 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
481 jobz_t, range_t, uplo_t,
483 reinterpret_cast<cuDoubleComplex*
>(d_A), lda,
492 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
494 cudaFree(d_info); cudaFree(d_work);
495 throw std::runtime_error(
"heevdx (complex<double>) failed with info = " + std::to_string(h_info));
503void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n,
float* A,
const int& lda,
float* B,
const int& ldb,
float * W)
508 int* d_info =
nullptr;
509 float* d_work =
nullptr;
513 cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
514 n, A, lda, B, ldb, W, &lwork));
516 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
518 cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
519 n, A, lda, B, ldb, W, d_work, lwork, d_info));
521 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
523 throw std::runtime_error(
"heevd: failed to invert matrix");
529void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n,
double* A,
const int& lda,
double* B,
const int& ldb,
double * W)
534 int* d_info =
nullptr;
535 double* d_work =
nullptr;
539 cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
540 n, A, lda, B, ldb, W, &lwork));
542 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
544 cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
545 n, A, lda, B, ldb, W, d_work, lwork, d_info));
547 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
549 throw std::runtime_error(
"heevd: failed to invert matrix");
555void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n, std::complex<float>* A,
const int& lda, std::complex<float>* B,
const int& ldb,
float* W)
560 int* d_info =
nullptr;
561 cuComplex* d_work =
nullptr;
565 cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
566 n,
reinterpret_cast<cuComplex*
>(A), lda,
reinterpret_cast<cuComplex*
>(B), ldb, W, &lwork));
568 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
570 cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
571 n,
reinterpret_cast<cuComplex*
>(A), lda,
reinterpret_cast<cuComplex*
>(B), ldb, W, d_work, lwork, d_info));
573 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
575 throw std::runtime_error(
"heevd: failed to invert matrix");
581void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n, std::complex<double>* A,
const int& lda, std::complex<double>* B,
const int& ldb,
double* W)
586 int* d_info =
nullptr;
587 cuDoubleComplex* d_work =
nullptr;
591 cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
592 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda,
reinterpret_cast<cuDoubleComplex*
>(B), ldb, W, &lwork));
594 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
596 cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
597 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda,
reinterpret_cast<cuDoubleComplex*
>(B), ldb, W, d_work, lwork, d_info));
599 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
601 throw std::runtime_error(
"heevd: failed to invert matrix");
615 cusolverDnHandle_t& cusolver_handle,
633 int *d_info =
nullptr;
634 float *d_work =
nullptr;
640 float *d_A_copy =
nullptr, *d_B_copy =
nullptr;
641 cudaErrcheck(cudaMalloc((
void**)&d_A_copy,
sizeof(
float) * n * lda));
642 cudaErrcheck(cudaMalloc((
void**)&d_B_copy,
sizeof(
float) * n * lda));
643 cudaErrcheck(cudaMemcpy(d_A_copy, d_A,
sizeof(
float) * n * lda, cudaMemcpyDeviceToDevice));
644 cudaErrcheck(cudaMemcpy(d_B_copy, d_B,
sizeof(
float) * n * lda, cudaMemcpyDeviceToDevice));
647 cusolverEigType_t itype_t = cublas_eig_type(itype);
648 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
649 cusolverEigRange_t range_t = cublas_eig_range(range);
650 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
655 itype_t, jobz_t, range_t, uplo_t,
666 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
671 itype_t, jobz_t, range_t, uplo_t,
684 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
686 throw std::runtime_error(
"hegvdx (float): illegal argument #" + std::to_string(-h_info));
687 }
else if (h_info > 0) {
690 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
691 throw std::runtime_error(
"hegvdx (float): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
692 }
else if (h_info > n) {
693 throw std::runtime_error(
"hegvdx (float): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
699 const int m = (*h_meig);
700 cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy,
sizeof(
float) * n * m, cudaMemcpyDeviceToDevice));
714 cusolverDnHandle_t& cusolver_handle,
732 int *d_info =
nullptr;
733 double *d_work =
nullptr;
737 double *d_A_copy =
nullptr, *d_B_copy =
nullptr;
738 cudaErrcheck(cudaMalloc((
void**)&d_A_copy,
sizeof(
double) * n * lda));
739 cudaErrcheck(cudaMalloc((
void**)&d_B_copy,
sizeof(
double) * n * lda));
740 cudaErrcheck(cudaMemcpy(d_A_copy, d_A,
sizeof(
double) * n * lda, cudaMemcpyDeviceToDevice));
741 cudaErrcheck(cudaMemcpy(d_B_copy, d_B,
sizeof(
double) * n * lda, cudaMemcpyDeviceToDevice));
743 cusolverEigType_t itype_t = cublas_eig_type(itype);
744 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
745 cusolverEigRange_t range_t = cublas_eig_range(range);
746 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
750 itype_t, jobz_t, range_t, uplo_t,
760 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
764 itype_t, jobz_t, range_t, uplo_t,
776 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
778 throw std::runtime_error(
"hegvdx (double): illegal argument #" + std::to_string(-h_info));
779 }
else if (h_info > 0) {
780 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
781 throw std::runtime_error(
"hegvdx (double): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
782 }
else if (h_info > n) {
783 throw std::runtime_error(
"hegvdx (double): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
788 const int m = (*h_meig);
789 cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy,
sizeof(
double) * n * m, cudaMemcpyDeviceToDevice));
802 cusolverDnHandle_t& cusolver_handle,
809 std::complex<float>* d_A,
810 std::complex<float>* d_B,
817 std::complex<float>* d_eigen_vec
820 int *d_info =
nullptr;
821 cuComplex *d_work =
nullptr;
825 cuComplex *d_A_copy =
nullptr, *d_B_copy =
nullptr;
826 cudaErrcheck(cudaMalloc((
void**)&d_A_copy,
sizeof(cuComplex) * n * lda));
827 cudaErrcheck(cudaMalloc((
void**)&d_B_copy,
sizeof(cuComplex) * n * lda));
828 cudaErrcheck(cudaMemcpy(d_A_copy,
reinterpret_cast<cuComplex*
>(d_A),
sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
829 cudaErrcheck(cudaMemcpy(d_B_copy,
reinterpret_cast<cuComplex*
>(d_B),
sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
831 cusolverEigType_t itype_t = cublas_eig_type(itype);
832 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
833 cusolverEigRange_t range_t = cublas_eig_range(range);
834 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
838 itype_t, jobz_t, range_t, uplo_t,
848 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
852 itype_t, jobz_t, range_t, uplo_t,
864 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
866 throw std::runtime_error(
"hegvdx (complex<float>): illegal argument #" + std::to_string(-h_info));
867 }
else if (h_info > 0) {
868 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
869 throw std::runtime_error(
"hegvdx (complex<float>): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
870 }
else if (h_info > n) {
871 throw std::runtime_error(
"hegvdx (complex<float>): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
876 const int m = (*h_meig);
877 cudaErrcheck(cudaMemcpy(
reinterpret_cast<cuComplex*
>(d_eigen_vec), d_A_copy,
sizeof(cuComplex) * n * m, cudaMemcpyDeviceToDevice));
890 cusolverDnHandle_t& cusolver_handle,
897 std::complex<double>* d_A,
898 std::complex<double>* d_B,
905 std::complex<double>* d_eigen_vec
908 int *d_info =
nullptr;
909 cuDoubleComplex *d_work =
nullptr;
913 cuDoubleComplex *d_A_copy =
nullptr, *d_B_copy =
nullptr;
914 cudaErrcheck(cudaMalloc((
void**)&d_A_copy,
sizeof(cuDoubleComplex) * n * lda));
915 cudaErrcheck(cudaMalloc((
void**)&d_B_copy,
sizeof(cuDoubleComplex) * n * lda));
916 cudaErrcheck(cudaMemcpy(d_A_copy,
reinterpret_cast<cuDoubleComplex*
>(d_A),
sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
917 cudaErrcheck(cudaMemcpy(d_B_copy,
reinterpret_cast<cuDoubleComplex*
>(d_B),
sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
919 cusolverEigType_t itype_t = cublas_eig_type(itype);
920 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
921 cusolverEigRange_t range_t = cublas_eig_range(range);
922 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
926 itype_t, jobz_t, range_t, uplo_t,
936 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
940 itype_t, jobz_t, range_t, uplo_t,
952 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
954 throw std::runtime_error(
"hegvdx (complex<double>): illegal argument #" + std::to_string(-h_info));
955 }
else if (h_info > 0) {
956 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
957 throw std::runtime_error(
"hegvdx (complex<double>): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
958 }
else if (h_info > n) {
959 throw std::runtime_error(
"hegvdx (complex<double>): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
964 const int m = (*h_meig);
965 cudaErrcheck(cudaMemcpy(
reinterpret_cast<cuDoubleComplex*
>(d_eigen_vec), d_A_copy,
sizeof(cuDoubleComplex) * n * m, cudaMemcpyDeviceToDevice));
977void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n,
float* A,
const int& lda,
int* ipiv)
982 int* d_info =
nullptr;
983 float* d_work =
nullptr;
987 cusolverErrcheck(cusolverDnSgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
990 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
993 cusolverErrcheck(cusolverDnSgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
995 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
997 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1004void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n,
double* A,
const int& lda,
int* ipiv)
1009 int* d_info =
nullptr;
1010 double* d_work =
nullptr;
1011 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1014 cusolverErrcheck(cusolverDnDgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
1017 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
1020 cusolverErrcheck(cusolverDnDgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
1022 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1024 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1031void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n, std::complex<float>* A,
const int& lda,
int* ipiv)
1036 int* d_info =
nullptr;
1037 cuComplex* d_work =
nullptr;
1038 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1041 cusolverErrcheck(cusolverDnCgetrf_bufferSize(cusolver_handle, m, n,
reinterpret_cast<cuComplex*
>(A), lda, &lwork));
1044 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
1047 cusolverErrcheck(cusolverDnCgetrf(cusolver_handle, m, n,
reinterpret_cast<cuComplex*
>(A), lda, d_work, ipiv, d_info));
1049 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1051 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1058void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n, std::complex<double>* A,
const int& lda,
int* ipiv)
1063 int* d_info =
nullptr;
1064 cuDoubleComplex* d_work =
nullptr;
1065 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1068 cusolverErrcheck(cusolverDnZgetrf_bufferSize(cusolver_handle, m, n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, &lwork));
1071 cudaErrcheck(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
1074 cusolverErrcheck(cusolverDnZgetrf(cusolver_handle, m, n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, d_work, ipiv, d_info));
1076 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1078 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1086void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs,
float* A,
const int& lda,
const int* ipiv,
float* B,
const int& ldb)
1089 int* d_info =
nullptr;
1090 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1092 cusolverErrcheck(cusolverDnSgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1094 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1096 throw std::runtime_error(
"getrs: failed to solve the linear system");
1102void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs,
double* A,
const int& lda,
const int* ipiv,
double* B,
const int& ldb)
1105 int* d_info =
nullptr;
1106 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1108 cusolverErrcheck(cusolverDnDgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1110 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1112 throw std::runtime_error(
"getrs: failed to solve the linear system");
1118void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs, std::complex<float>* A,
const int& lda,
const int* ipiv, std::complex<float>* B,
const int& ldb)
1121 int* d_info =
nullptr;
1122 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1124 cusolverErrcheck(cusolverDnCgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs,
reinterpret_cast<cuComplex*
>(A), lda, ipiv,
reinterpret_cast<cuComplex*
>(B), ldb, d_info));
1126 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1128 throw std::runtime_error(
"getrs: failed to solve the linear system");
1134void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs, std::complex<double>* A,
const int& lda,
const int* ipiv, std::complex<double>* B,
const int& ldb)
1137 int* d_info =
nullptr;
1138 cudaErrcheck(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1140 cusolverErrcheck(cusolverDnZgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs,
reinterpret_cast<cuDoubleComplex*
>(A), lda, ipiv,
reinterpret_cast<cuDoubleComplex*
>(B), ldb, d_info));
1142 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1144 throw std::runtime_error(
"getrs: failed to solve the linear system");
1243static inline void geqrf(
1244 cusolverDnHandle_t& cusolver_handle,
1253 cusolver_handle, m, n, d_A, lda, &lwork));
1255 float* d_work =
nullptr;
1256 int* d_info =
nullptr;
1259 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
float) * lwork));
1261 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1264 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1267 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1269 std::cout <<
"geqrf (S): info = " << h_info << std::endl;
1272 throw std::runtime_error(
"geqrf (S): QR factorization failed");
1280static inline void geqrf(
1281 cusolverDnHandle_t& cusolver_handle,
1290 cusolver_handle, m, n, d_A, lda, &lwork));
1292 double* d_work =
nullptr;
1293 int* d_info =
nullptr;
1296 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
double) * lwork));
1298 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1301 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1304 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1306 std::cout <<
"geqrf (D): info = " << h_info << std::endl;
1309 throw std::runtime_error(
"geqrf (D): QR factorization failed");
1317static inline void geqrf(
1318 cusolverDnHandle_t& cusolver_handle,
1321 std::complex<float>* d_A,
1323 std::complex<float>* d_tau
1327 cusolver_handle, m, n,
1328 reinterpret_cast<cuComplex*
>(d_A),
1333 cuComplex* d_work =
nullptr;
1334 int* d_info =
nullptr;
1337 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuComplex) * lwork));
1339 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1342 cusolver_handle, m, n,
1343 reinterpret_cast<cuComplex*
>(d_A),
1345 reinterpret_cast<cuComplex*
>(d_tau),
1346 d_work, lwork, d_info));
1349 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1351 std::cout <<
"geqrf (C): info = " << h_info << std::endl;
1354 throw std::runtime_error(
"geqrf (C): QR factorization failed");
1362static inline void geqrf(
1363 cusolverDnHandle_t& cusolver_handle,
1366 std::complex<double>* d_A,
1368 std::complex<double>* d_tau
1372 cusolver_handle, m, n,
1373 reinterpret_cast<cuDoubleComplex*
>(d_A),
1378 cuDoubleComplex* d_work =
nullptr;
1379 int* d_info =
nullptr;
1382 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuDoubleComplex) * lwork));
1384 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1387 cusolver_handle, m, n,
1388 reinterpret_cast<cuDoubleComplex*
>(d_A),
1390 reinterpret_cast<cuDoubleComplex*
>(d_tau),
1391 d_work, lwork, d_info));
1394 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1396 std::cout <<
"geqrf (Z): info = " << h_info << std::endl;
1399 throw std::runtime_error(
"geqrf (Z): QR factorization failed");
1408static inline void orgqr(
1409 cusolverDnHandle_t& cusolver_handle,
1419 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1421 float* d_work =
nullptr;
1422 int* d_info =
nullptr;
1425 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
float) * lwork));
1427 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1430 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1433 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1435 std::cout <<
"orgqr (S): info = " << h_info <<
" (failure at parameter " << -h_info <<
")" << std::endl;
1438 throw std::runtime_error(
"orgqr (S): failed to generate Q matrix");
1447static inline void orgqr(
1448 cusolverDnHandle_t& cusolver_handle,
1458 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1460 double* d_work =
nullptr;
1461 int* d_info =
nullptr;
1464 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
double) * lwork));
1466 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1469 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1472 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1474 std::cout <<
"orgqr (D): info = " << h_info << std::endl;
1477 throw std::runtime_error(
"orgqr (D): failed to generate Q matrix");
1485static inline void orgqr(
1486 cusolverDnHandle_t& cusolver_handle,
1490 std::complex<float>* d_A,
1492 std::complex<float>* d_tau
1496 cusolver_handle, m, n, k,
1497 reinterpret_cast<cuComplex*
>(d_A),
1499 reinterpret_cast<cuComplex*
>(d_tau),
1502 cuComplex* d_work =
nullptr;
1503 int* d_info =
nullptr;
1506 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuComplex) * lwork));
1508 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1511 cusolver_handle, m, n, k,
1512 reinterpret_cast<cuComplex*
>(d_A),
1514 reinterpret_cast<cuComplex*
>(d_tau),
1515 d_work, lwork, d_info));
1518 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1520 std::cout <<
"orgqr (C): info = " << h_info << std::endl;
1523 throw std::runtime_error(
"orgqr (C): failed to generate Q matrix");
1531static inline void orgqr(
1532 cusolverDnHandle_t& cusolver_handle,
1536 std::complex<double>* d_A,
1538 std::complex<double>* d_tau
1542 cusolver_handle, m, n, k,
1543 reinterpret_cast<cuDoubleComplex*
>(d_A),
1545 reinterpret_cast<cuDoubleComplex*
>(d_tau),
1548 cuDoubleComplex* d_work =
nullptr;
1549 int* d_info =
nullptr;
1552 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuDoubleComplex) * lwork));
1554 cudaErrcheck(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1557 cusolver_handle, m, n, k,
1558 reinterpret_cast<cuDoubleComplex*
>(d_A),
1560 reinterpret_cast<cuDoubleComplex*
>(d_tau),
1561 d_work, lwork, d_info));
1564 cudaErrcheck(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1566 std::cout <<
"orgqr (Z): info = " << h_info << std::endl;
1569 throw std::runtime_error(
"orgqr (Z): failed to generate Q matrix");