20namespace cuSolverConnector {
24void trtri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n,
T* A,
const int& lda)
26 size_t d_lwork = 0, h_lwork = 0;
28 CHECK_CUSOLVER(cusolverDnXtrtri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n,
GetTypeCuda<T>::cuda_data_type,
reinterpret_cast<Type*
>(A), lda, &d_lwork, &h_lwork));
29 void* d_work =
nullptr, *h_work =
nullptr;
30 CHECK_CUDA(cudaMalloc((
void**)&d_work, d_lwork));
32 h_work = malloc(h_lwork);
33 if (h_work ==
nullptr) {
34 throw std::bad_alloc();
38 int* d_info =
nullptr;
39 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
41 CHECK_CUSOLVER(cusolverDnXtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n,
GetTypeCuda<T>::cuda_data_type,
reinterpret_cast<Type*
>(A), n, d_work, d_lwork, h_work, h_lwork, d_info));
42 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
44 throw std::runtime_error(
"trtri: failed to invert matrix");
47 CHECK_CUDA(cudaFree(d_work));
48 CHECK_CUDA(cudaFree(d_info));
52void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n,
float * A,
const int& lda)
55 CHECK_CUSOLVER(cusolverDnSpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
56 float* work =
nullptr;
57 CHECK_CUDA(cudaMalloc((
void**)&work, lwork *
sizeof(
float)));
59 CHECK_CUSOLVER(cusolverDnSpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork,
nullptr));
60 CHECK_CUDA(cudaFree(work));
63void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n,
double * A,
const int& lda)
66 CHECK_CUSOLVER(cusolverDnDpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
67 double* work =
nullptr;
68 CHECK_CUDA(cudaMalloc((
void**)&work, lwork *
sizeof(
double)));
70 CHECK_CUSOLVER(cusolverDnDpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork,
nullptr));
71 CHECK_CUDA(cudaFree(work));
74void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n, std::complex<float> * A,
const int& lda)
77 CHECK_CUSOLVER(cusolverDnCpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex *
>(A), n, &lwork));
78 cuComplex* work =
nullptr;
79 CHECK_CUDA(cudaMalloc((
void**)&work, lwork *
sizeof(cuComplex)));
81 CHECK_CUSOLVER(cusolverDnCpotri(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex *
>(A), n, work, lwork,
nullptr));
82 CHECK_CUDA(cudaFree(work));
85void potri (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const char& diag,
const int& n, std::complex<double> * A,
const int& lda)
88 CHECK_CUSOLVER(cusolverDnZpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex *
>(A), n, &lwork));
89 cuDoubleComplex* work =
nullptr;
90 CHECK_CUDA(cudaMalloc((
void**)&work, lwork *
sizeof(cuDoubleComplex)));
92 CHECK_CUSOLVER(cusolverDnZpotri(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex *
>(A), n, work, lwork,
nullptr));
93 CHECK_CUDA(cudaFree(work));
98void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n,
float * A,
const int& lda)
102 CHECK_CUDA(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
103 CHECK_CUSOLVER(cusolverDnSpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
104 float* work =
nullptr;
105 CHECK_CUDA(cudaMalloc((
void**)&work, lwork *
sizeof(
float)));
107 CHECK_CUSOLVER(cusolverDnSpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
108 CHECK_CUDA(cudaFree(work));
109 CHECK_CUDA(cudaFree(info));
112void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n,
double * A,
const int& lda)
116 CHECK_CUDA(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
117 CHECK_CUSOLVER(cusolverDnDpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
118 double* work =
nullptr;
119 CHECK_CUDA(cudaMalloc((
void**)&work, lwork *
sizeof(
double)));
121 CHECK_CUSOLVER(cusolverDnDpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
122 CHECK_CUDA(cudaFree(work));
123 CHECK_CUDA(cudaFree(info));
126void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n, std::complex<float> * A,
const int& lda)
130 CHECK_CUDA(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
131 CHECK_CUSOLVER(cusolverDnCpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex*
>(A), lda, &lwork));
132 cuComplex* work =
nullptr;
133 CHECK_CUDA(cudaMalloc((
void**)&work, lwork *
sizeof(cuComplex)));
135 CHECK_CUSOLVER(cusolverDnCpotrf(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuComplex*
>(A), lda, work, lwork, info));
136 CHECK_CUDA(cudaFree(work));
137 CHECK_CUDA(cudaFree(info));
140void potrf (cusolverDnHandle_t& cusolver_handle,
const char& uplo,
const int& n, std::complex<double> * A,
const int& lda)
144 CHECK_CUDA(cudaMalloc((
void**)&info, 1 *
sizeof(
int)));
145 CHECK_CUSOLVER(cusolverDnZpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, &lwork));
146 cuDoubleComplex* work =
nullptr;
147 CHECK_CUDA(cudaMalloc((
void**)&work, lwork *
sizeof(cuDoubleComplex)));
149 CHECK_CUSOLVER(cusolverDnZpotrf(cusolver_handle, cublas_fill_mode(uplo), n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, work, lwork, info));
150 CHECK_CUDA(cudaFree(work));
151 CHECK_CUDA(cudaFree(info));
156void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n,
float* A,
const int& lda,
float * W)
161 int* d_info =
nullptr;
162 float* d_work =
nullptr;
163 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
166 CHECK_CUSOLVER(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
167 n, A, lda, W, &lwork));
169 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
171 CHECK_CUSOLVER(cusolverDnSsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
172 n, A, lda, W, d_work, lwork, d_info));
174 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
176 throw std::runtime_error(
"heevd: failed to invert matrix");
178 CHECK_CUDA(cudaFree(d_info));
179 CHECK_CUDA(cudaFree(d_work));
182void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n,
double* A,
const int& lda,
double * W)
187 int* d_info =
nullptr;
188 double* d_work =
nullptr;
189 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
192 CHECK_CUSOLVER(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
193 n, A, lda, W, &lwork));
195 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
197 CHECK_CUSOLVER(cusolverDnDsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
198 n, A, lda, W, d_work, lwork, d_info));
200 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
202 throw std::runtime_error(
"heevd: failed to invert matrix");
204 CHECK_CUDA(cudaFree(d_info));
205 CHECK_CUDA(cudaFree(d_work));
208void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n, std::complex<float>* A,
const int& lda,
float * W)
213 int* d_info =
nullptr;
214 cuComplex* d_work =
nullptr;
215 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
218 CHECK_CUSOLVER(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
219 n,
reinterpret_cast<cuComplex*
>(A), lda, W, &lwork));
221 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
223 CHECK_CUSOLVER(cusolverDnCheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
224 n,
reinterpret_cast<cuComplex*
>(A), lda, W, d_work, lwork, d_info));
226 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
228 throw std::runtime_error(
"heevd: failed to invert matrix");
230 CHECK_CUDA(cudaFree(d_info));
231 CHECK_CUDA(cudaFree(d_work));
234void heevd (cusolverDnHandle_t& cusolver_handle,
const char& jobz,
const char& uplo,
const int& n, std::complex<double>* A,
const int& lda,
double* W)
239 int* d_info =
nullptr;
240 cuDoubleComplex* d_work =
nullptr;
241 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
244 CHECK_CUSOLVER(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
245 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, W, &lwork));
247 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
249 CHECK_CUSOLVER(cusolverDnZheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
250 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, W, d_work, lwork, d_info));
252 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
254 throw std::runtime_error(
"heevd: failed to invert matrix");
256 CHECK_CUDA(cudaFree(d_info));
257 CHECK_CUDA(cudaFree(d_work));
265void heevdx(cusolverDnHandle_t& cusolver_handle,
272 const int il,
const int iu,
273 const float vl,
const float vu,
278 int* d_info =
nullptr;
279 float* d_work =
nullptr;
281 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
283 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
284 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
285 cusolverEigRange_t range_t = cublas_eig_range(range);
287 CHECK_CUSOLVER(cusolverDnSsyevdx_bufferSize(
289 jobz_t, range_t, uplo_t,
297 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
300 CHECK_CUSOLVER(cusolverDnSsyevdx(
302 jobz_t, range_t, uplo_t,
313 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
315 cudaFree(d_info); cudaFree(d_work);
316 throw std::runtime_error(
"heevdx (float) failed with info = " + std::to_string(h_info));
325void heevdx(cusolverDnHandle_t& cusolver_handle,
332 const int il,
const int iu,
333 const double vl,
const double vu,
338 int* d_info =
nullptr;
339 double* d_work =
nullptr;
341 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
343 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
344 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
345 cusolverEigRange_t range_t = cublas_eig_range(range);
347 CHECK_CUSOLVER(cusolverDnDsyevdx_bufferSize(
349 jobz_t, range_t, uplo_t,
357 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
359 CHECK_CUSOLVER(cusolverDnDsyevdx(
361 jobz_t, range_t, uplo_t,
372 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
374 cudaFree(d_info); cudaFree(d_work);
375 throw std::runtime_error(
"heevdx (double) failed with info = " + std::to_string(h_info));
384void heevdx(cusolverDnHandle_t& cusolver_handle,
387 std::complex<float>* d_A,
391 const int il,
const int iu,
392 const float vl,
const float vu,
397 int* d_info =
nullptr;
398 cuComplex* d_work =
nullptr;
400 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
402 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
403 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
404 cusolverEigRange_t range_t = cublas_eig_range(range);
406 CHECK_CUSOLVER(cusolverDnCheevdx_bufferSize(
408 jobz_t, range_t, uplo_t,
410 reinterpret_cast<cuComplex*
>(d_A), lda,
417 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
419 CHECK_CUSOLVER(cusolverDnCheevdx(
421 jobz_t, range_t, uplo_t,
423 reinterpret_cast<cuComplex*
>(d_A), lda,
432 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
434 cudaFree(d_info); cudaFree(d_work);
435 throw std::runtime_error(
"heevdx (complex<float>) failed with info = " + std::to_string(h_info));
444void heevdx(cusolverDnHandle_t& cusolver_handle,
447 std::complex<double>* d_A,
451 const int il,
const int iu,
452 const double vl,
const double vu,
457 int* d_info =
nullptr;
458 cuDoubleComplex* d_work =
nullptr;
460 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
462 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
463 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
464 cusolverEigRange_t range_t = cublas_eig_range(range);
466 CHECK_CUSOLVER(cusolverDnZheevdx_bufferSize(
468 jobz_t, range_t, uplo_t,
470 reinterpret_cast<cuDoubleComplex*
>(d_A), lda,
477 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
479 CHECK_CUSOLVER(cusolverDnZheevdx(
481 jobz_t, range_t, uplo_t,
483 reinterpret_cast<cuDoubleComplex*
>(d_A), lda,
492 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
494 cudaFree(d_info); cudaFree(d_work);
495 throw std::runtime_error(
"heevdx (complex<double>) failed with info = " + std::to_string(h_info));
503void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n,
float* A,
const int& lda,
float* B,
const int& ldb,
float * W)
508 int* d_info =
nullptr;
509 float* d_work =
nullptr;
510 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
513 CHECK_CUSOLVER(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
514 n, A, lda, B, ldb, W, &lwork));
516 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
518 CHECK_CUSOLVER(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
519 n, A, lda, B, ldb, W, d_work, lwork, d_info));
521 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
523 throw std::runtime_error(
"heevd: failed to invert matrix");
525 CHECK_CUDA(cudaFree(d_info));
526 CHECK_CUDA(cudaFree(d_work));
529void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n,
double* A,
const int& lda,
double* B,
const int& ldb,
double * W)
534 int* d_info =
nullptr;
535 double* d_work =
nullptr;
536 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
539 CHECK_CUSOLVER(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
540 n, A, lda, B, ldb, W, &lwork));
542 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
544 CHECK_CUSOLVER(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
545 n, A, lda, B, ldb, W, d_work, lwork, d_info));
547 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
549 throw std::runtime_error(
"heevd: failed to invert matrix");
551 CHECK_CUDA(cudaFree(d_info));
552 CHECK_CUDA(cudaFree(d_work));
555void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n, std::complex<float>* A,
const int& lda, std::complex<float>* B,
const int& ldb,
float* W)
560 int* d_info =
nullptr;
561 cuComplex* d_work =
nullptr;
562 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
565 CHECK_CUSOLVER(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
566 n,
reinterpret_cast<cuComplex*
>(A), lda,
reinterpret_cast<cuComplex*
>(B), ldb, W, &lwork));
568 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
570 CHECK_CUSOLVER(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
571 n,
reinterpret_cast<cuComplex*
>(A), lda,
reinterpret_cast<cuComplex*
>(B), ldb, W, d_work, lwork, d_info));
573 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
575 throw std::runtime_error(
"heevd: failed to invert matrix");
577 CHECK_CUDA(cudaFree(d_info));
578 CHECK_CUDA(cudaFree(d_work));
581void hegvd (cusolverDnHandle_t& cusolver_handle,
const int& itype,
const char& jobz,
const char& uplo,
const int& n, std::complex<double>* A,
const int& lda, std::complex<double>* B,
const int& ldb,
double* W)
586 int* d_info =
nullptr;
587 cuDoubleComplex* d_work =
nullptr;
588 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
591 CHECK_CUSOLVER(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
592 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda,
reinterpret_cast<cuDoubleComplex*
>(B), ldb, W, &lwork));
594 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
596 CHECK_CUSOLVER(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
597 n,
reinterpret_cast<cuDoubleComplex*
>(A), lda,
reinterpret_cast<cuDoubleComplex*
>(B), ldb, W, d_work, lwork, d_info));
599 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
601 throw std::runtime_error(
"heevd: failed to invert matrix");
603 CHECK_CUDA(cudaFree(d_info));
604 CHECK_CUDA(cudaFree(d_work));
615 cusolverDnHandle_t& cusolver_handle,
633 int *d_info =
nullptr;
634 float *d_work =
nullptr;
637 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
640 float *d_A_copy =
nullptr, *d_B_copy =
nullptr;
641 CHECK_CUDA(cudaMalloc((
void**)&d_A_copy,
sizeof(
float) * n * lda));
642 CHECK_CUDA(cudaMalloc((
void**)&d_B_copy,
sizeof(
float) * n * lda));
643 CHECK_CUDA(cudaMemcpy(d_A_copy, d_A,
sizeof(
float) * n * lda, cudaMemcpyDeviceToDevice));
644 CHECK_CUDA(cudaMemcpy(d_B_copy, d_B,
sizeof(
float) * n * lda, cudaMemcpyDeviceToDevice));
647 cusolverEigType_t itype_t = cublas_eig_type(itype);
648 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
649 cusolverEigRange_t range_t = cublas_eig_range(range);
650 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
653 CHECK_CUSOLVER(cusolverDnSsygvdx_bufferSize(
655 itype_t, jobz_t, range_t, uplo_t,
666 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
669 CHECK_CUSOLVER(cusolverDnSsygvdx(
671 itype_t, jobz_t, range_t, uplo_t,
684 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
686 throw std::runtime_error(
"hegvdx (float): illegal argument #" + std::to_string(-h_info));
687 }
else if (h_info > 0) {
690 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
691 throw std::runtime_error(
"hegvdx (float): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
692 }
else if (h_info > n) {
693 throw std::runtime_error(
"hegvdx (float): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
699 const int m = (*h_meig);
700 CHECK_CUDA(cudaMemcpy(d_eigen_vec, d_A_copy,
sizeof(
float) * n * m, cudaMemcpyDeviceToDevice));
714 cusolverDnHandle_t& cusolver_handle,
732 int *d_info =
nullptr;
733 double *d_work =
nullptr;
735 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
737 double *d_A_copy =
nullptr, *d_B_copy =
nullptr;
738 CHECK_CUDA(cudaMalloc((
void**)&d_A_copy,
sizeof(
double) * n * lda));
739 CHECK_CUDA(cudaMalloc((
void**)&d_B_copy,
sizeof(
double) * n * lda));
740 CHECK_CUDA(cudaMemcpy(d_A_copy, d_A,
sizeof(
double) * n * lda, cudaMemcpyDeviceToDevice));
741 CHECK_CUDA(cudaMemcpy(d_B_copy, d_B,
sizeof(
double) * n * lda, cudaMemcpyDeviceToDevice));
743 cusolverEigType_t itype_t = cublas_eig_type(itype);
744 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
745 cusolverEigRange_t range_t = cublas_eig_range(range);
746 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
748 CHECK_CUSOLVER(cusolverDnDsygvdx_bufferSize(
750 itype_t, jobz_t, range_t, uplo_t,
760 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
762 CHECK_CUSOLVER(cusolverDnDsygvdx(
764 itype_t, jobz_t, range_t, uplo_t,
776 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
778 throw std::runtime_error(
"hegvdx (double): illegal argument #" + std::to_string(-h_info));
779 }
else if (h_info > 0) {
780 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
781 throw std::runtime_error(
"hegvdx (double): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
782 }
else if (h_info > n) {
783 throw std::runtime_error(
"hegvdx (double): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
788 const int m = (*h_meig);
789 CHECK_CUDA(cudaMemcpy(d_eigen_vec, d_A_copy,
sizeof(
double) * n * m, cudaMemcpyDeviceToDevice));
802 cusolverDnHandle_t& cusolver_handle,
809 std::complex<float>* d_A,
810 std::complex<float>* d_B,
817 std::complex<float>* d_eigen_vec
820 int *d_info =
nullptr;
821 cuComplex *d_work =
nullptr;
823 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
825 cuComplex *d_A_copy =
nullptr, *d_B_copy =
nullptr;
826 CHECK_CUDA(cudaMalloc((
void**)&d_A_copy,
sizeof(cuComplex) * n * lda));
827 CHECK_CUDA(cudaMalloc((
void**)&d_B_copy,
sizeof(cuComplex) * n * lda));
828 CHECK_CUDA(cudaMemcpy(d_A_copy,
reinterpret_cast<cuComplex*
>(d_A),
sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
829 CHECK_CUDA(cudaMemcpy(d_B_copy,
reinterpret_cast<cuComplex*
>(d_B),
sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
831 cusolverEigType_t itype_t = cublas_eig_type(itype);
832 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
833 cusolverEigRange_t range_t = cublas_eig_range(range);
834 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
836 CHECK_CUSOLVER(cusolverDnChegvdx_bufferSize(
838 itype_t, jobz_t, range_t, uplo_t,
848 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
850 CHECK_CUSOLVER(cusolverDnChegvdx(
852 itype_t, jobz_t, range_t, uplo_t,
864 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
866 throw std::runtime_error(
"hegvdx (complex<float>): illegal argument #" + std::to_string(-h_info));
867 }
else if (h_info > 0) {
868 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
869 throw std::runtime_error(
"hegvdx (complex<float>): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
870 }
else if (h_info > n) {
871 throw std::runtime_error(
"hegvdx (complex<float>): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
876 const int m = (*h_meig);
877 CHECK_CUDA(cudaMemcpy(
reinterpret_cast<cuComplex*
>(d_eigen_vec), d_A_copy,
sizeof(cuComplex) * n * m, cudaMemcpyDeviceToDevice));
890 cusolverDnHandle_t& cusolver_handle,
897 std::complex<double>* d_A,
898 std::complex<double>* d_B,
905 std::complex<double>* d_eigen_vec
908 int *d_info =
nullptr;
909 cuDoubleComplex *d_work =
nullptr;
911 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
913 cuDoubleComplex *d_A_copy =
nullptr, *d_B_copy =
nullptr;
914 CHECK_CUDA(cudaMalloc((
void**)&d_A_copy,
sizeof(cuDoubleComplex) * n * lda));
915 CHECK_CUDA(cudaMalloc((
void**)&d_B_copy,
sizeof(cuDoubleComplex) * n * lda));
916 CHECK_CUDA(cudaMemcpy(d_A_copy,
reinterpret_cast<cuDoubleComplex*
>(d_A),
sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
917 CHECK_CUDA(cudaMemcpy(d_B_copy,
reinterpret_cast<cuDoubleComplex*
>(d_B),
sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
919 cusolverEigType_t itype_t = cublas_eig_type(itype);
920 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
921 cusolverEigRange_t range_t = cublas_eig_range(range);
922 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
924 CHECK_CUSOLVER(cusolverDnZhegvdx_bufferSize(
926 itype_t, jobz_t, range_t, uplo_t,
936 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
938 CHECK_CUSOLVER(cusolverDnZhegvdx(
940 itype_t, jobz_t, range_t, uplo_t,
952 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
954 throw std::runtime_error(
"hegvdx (complex<double>): illegal argument #" + std::to_string(-h_info));
955 }
else if (h_info > 0) {
956 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
957 throw std::runtime_error(
"hegvdx (complex<double>): failed to converge, " + std::to_string(h_info) +
" off-diagonal elements didn't converge");
958 }
else if (h_info > n) {
959 throw std::runtime_error(
"hegvdx (complex<double>): leading minor of order " + std::to_string(h_info - n) +
" of B is not positive definite");
964 const int m = (*h_meig);
965 CHECK_CUDA(cudaMemcpy(
reinterpret_cast<cuDoubleComplex*
>(d_eigen_vec), d_A_copy,
sizeof(cuDoubleComplex) * n * m, cudaMemcpyDeviceToDevice));
977void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n,
float* A,
const int& lda,
int* ipiv)
982 int* d_info =
nullptr;
983 float* d_work =
nullptr;
984 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
987 CHECK_CUSOLVER(cusolverDnSgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
990 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
float) * lwork));
993 CHECK_CUSOLVER(cusolverDnSgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
995 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
997 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1000 CHECK_CUDA(cudaFree(d_work));
1001 CHECK_CUDA(cudaFree(d_info));
1004void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n,
double* A,
const int& lda,
int* ipiv)
1009 int* d_info =
nullptr;
1010 double* d_work =
nullptr;
1011 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1014 CHECK_CUSOLVER(cusolverDnDgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
1017 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(
double) * lwork));
1020 CHECK_CUSOLVER(cusolverDnDgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
1022 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1024 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1027 CHECK_CUDA(cudaFree(d_work));
1028 CHECK_CUDA(cudaFree(d_info));
1031void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n, std::complex<float>* A,
const int& lda,
int* ipiv)
1036 int* d_info =
nullptr;
1037 cuComplex* d_work =
nullptr;
1038 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1041 CHECK_CUSOLVER(cusolverDnCgetrf_bufferSize(cusolver_handle, m, n,
reinterpret_cast<cuComplex*
>(A), lda, &lwork));
1044 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuComplex) * lwork));
1047 CHECK_CUSOLVER(cusolverDnCgetrf(cusolver_handle, m, n,
reinterpret_cast<cuComplex*
>(A), lda, d_work, ipiv, d_info));
1049 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1051 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1054 CHECK_CUDA(cudaFree(d_work));
1055 CHECK_CUDA(cudaFree(d_info));
1058void getrf(cusolverDnHandle_t& cusolver_handle,
const int& m,
const int& n, std::complex<double>* A,
const int& lda,
int* ipiv)
1063 int* d_info =
nullptr;
1064 cuDoubleComplex* d_work =
nullptr;
1065 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1068 CHECK_CUSOLVER(cusolverDnZgetrf_bufferSize(cusolver_handle, m, n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, &lwork));
1071 CHECK_CUDA(cudaMalloc((
void**)&d_work,
sizeof(cuDoubleComplex) * lwork));
1074 CHECK_CUSOLVER(cusolverDnZgetrf(cusolver_handle, m, n,
reinterpret_cast<cuDoubleComplex*
>(A), lda, d_work, ipiv, d_info));
1076 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1078 throw std::runtime_error(
"getrf: failed to compute LU factorization");
1081 CHECK_CUDA(cudaFree(d_work));
1082 CHECK_CUDA(cudaFree(d_info));
1086void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs,
float* A,
const int& lda,
const int* ipiv,
float* B,
const int& ldb)
1089 int* d_info =
nullptr;
1090 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1092 CHECK_CUSOLVER(cusolverDnSgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1094 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1096 throw std::runtime_error(
"getrs: failed to solve the linear system");
1099 CHECK_CUDA(cudaFree(d_info));
1102void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs,
double* A,
const int& lda,
const int* ipiv,
double* B,
const int& ldb)
1105 int* d_info =
nullptr;
1106 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1108 CHECK_CUSOLVER(cusolverDnDgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1110 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1112 throw std::runtime_error(
"getrs: failed to solve the linear system");
1115 CHECK_CUDA(cudaFree(d_info));
1118void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs, std::complex<float>* A,
const int& lda,
const int* ipiv, std::complex<float>* B,
const int& ldb)
1121 int* d_info =
nullptr;
1122 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1124 CHECK_CUSOLVER(cusolverDnCgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs,
reinterpret_cast<cuComplex*
>(A), lda, ipiv,
reinterpret_cast<cuComplex*
>(B), ldb, d_info));
1126 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1128 throw std::runtime_error(
"getrs: failed to solve the linear system");
1131 CHECK_CUDA(cudaFree(d_info));
1134void getrs(cusolverDnHandle_t& cusolver_handle,
const char& trans,
const int& n,
const int& nrhs, std::complex<double>* A,
const int& lda,
const int* ipiv, std::complex<double>* B,
const int& ldb)
1137 int* d_info =
nullptr;
1138 CHECK_CUDA(cudaMalloc((
void**)&d_info,
sizeof(
int)));
1140 CHECK_CUSOLVER(cusolverDnZgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs,
reinterpret_cast<cuDoubleComplex*
>(A), lda, ipiv,
reinterpret_cast<cuDoubleComplex*
>(B), ldb, d_info));
1142 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1144 throw std::runtime_error(
"getrs: failed to solve the linear system");
1147 CHECK_CUDA(cudaFree(d_info));
1243static inline void geqrf(
1244 cusolverDnHandle_t& cusolver_handle,
1252 CHECK_CUSOLVER(cusolverDnSgeqrf_bufferSize(
1253 cusolver_handle, m, n, d_A, lda, &lwork));
1255 float* d_work =
nullptr;
1256 int* d_info =
nullptr;
1259 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
float) * lwork));
1261 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1263 CHECK_CUSOLVER(cusolverDnSgeqrf(
1264 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1267 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1269 std::cout <<
"geqrf (S): info = " << h_info << std::endl;
1270 if (d_work) CHECK_CUDA(cudaFree(d_work));
1271 CHECK_CUDA(cudaFree(d_info));
1272 throw std::runtime_error(
"geqrf (S): QR factorization failed");
1275 if (d_work) CHECK_CUDA(cudaFree(d_work));
1276 CHECK_CUDA(cudaFree(d_info));
1280static inline void geqrf(
1281 cusolverDnHandle_t& cusolver_handle,
1289 CHECK_CUSOLVER(cusolverDnDgeqrf_bufferSize(
1290 cusolver_handle, m, n, d_A, lda, &lwork));
1292 double* d_work =
nullptr;
1293 int* d_info =
nullptr;
1296 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
double) * lwork));
1298 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1300 CHECK_CUSOLVER(cusolverDnDgeqrf(
1301 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1304 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1306 std::cout <<
"geqrf (D): info = " << h_info << std::endl;
1307 if (d_work) CHECK_CUDA(cudaFree(d_work));
1308 CHECK_CUDA(cudaFree(d_info));
1309 throw std::runtime_error(
"geqrf (D): QR factorization failed");
1312 if (d_work) CHECK_CUDA(cudaFree(d_work));
1313 CHECK_CUDA(cudaFree(d_info));
1317static inline void geqrf(
1318 cusolverDnHandle_t& cusolver_handle,
1321 std::complex<float>* d_A,
1323 std::complex<float>* d_tau
1326 CHECK_CUSOLVER(cusolverDnCgeqrf_bufferSize(
1327 cusolver_handle, m, n,
1328 reinterpret_cast<cuComplex*
>(d_A),
1333 cuComplex* d_work =
nullptr;
1334 int* d_info =
nullptr;
1337 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuComplex) * lwork));
1339 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1341 CHECK_CUSOLVER(cusolverDnCgeqrf(
1342 cusolver_handle, m, n,
1343 reinterpret_cast<cuComplex*
>(d_A),
1345 reinterpret_cast<cuComplex*
>(d_tau),
1346 d_work, lwork, d_info));
1349 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1351 std::cout <<
"geqrf (C): info = " << h_info << std::endl;
1352 if (d_work) CHECK_CUDA(cudaFree(d_work));
1353 CHECK_CUDA(cudaFree(d_info));
1354 throw std::runtime_error(
"geqrf (C): QR factorization failed");
1357 if (d_work) CHECK_CUDA(cudaFree(d_work));
1358 CHECK_CUDA(cudaFree(d_info));
1362static inline void geqrf(
1363 cusolverDnHandle_t& cusolver_handle,
1366 std::complex<double>* d_A,
1368 std::complex<double>* d_tau
1371 CHECK_CUSOLVER(cusolverDnZgeqrf_bufferSize(
1372 cusolver_handle, m, n,
1373 reinterpret_cast<cuDoubleComplex*
>(d_A),
1378 cuDoubleComplex* d_work =
nullptr;
1379 int* d_info =
nullptr;
1382 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuDoubleComplex) * lwork));
1384 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1386 CHECK_CUSOLVER(cusolverDnZgeqrf(
1387 cusolver_handle, m, n,
1388 reinterpret_cast<cuDoubleComplex*
>(d_A),
1390 reinterpret_cast<cuDoubleComplex*
>(d_tau),
1391 d_work, lwork, d_info));
1394 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1396 std::cout <<
"geqrf (Z): info = " << h_info << std::endl;
1397 if (d_work) CHECK_CUDA(cudaFree(d_work));
1398 CHECK_CUDA(cudaFree(d_info));
1399 throw std::runtime_error(
"geqrf (Z): QR factorization failed");
1402 if (d_work) CHECK_CUDA(cudaFree(d_work));
1403 CHECK_CUDA(cudaFree(d_info));
1408static inline void orgqr(
1409 cusolverDnHandle_t& cusolver_handle,
1418 CHECK_CUSOLVER(cusolverDnSorgqr_bufferSize(
1419 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1421 float* d_work =
nullptr;
1422 int* d_info =
nullptr;
1425 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
float) * lwork));
1427 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1429 CHECK_CUSOLVER(cusolverDnSorgqr(
1430 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1433 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1435 std::cout <<
"orgqr (S): info = " << h_info <<
" (failure at parameter " << -h_info <<
")" << std::endl;
1436 if (d_work) CHECK_CUDA(cudaFree(d_work));
1437 CHECK_CUDA(cudaFree(d_info));
1438 throw std::runtime_error(
"orgqr (S): failed to generate Q matrix");
1442 if (d_work) CHECK_CUDA(cudaFree(d_work));
1443 CHECK_CUDA(cudaFree(d_info));
1447static inline void orgqr(
1448 cusolverDnHandle_t& cusolver_handle,
1457 CHECK_CUSOLVER(cusolverDnDorgqr_bufferSize(
1458 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1460 double* d_work =
nullptr;
1461 int* d_info =
nullptr;
1464 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(
double) * lwork));
1466 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1468 CHECK_CUSOLVER(cusolverDnDorgqr(
1469 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1472 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1474 std::cout <<
"orgqr (D): info = " << h_info << std::endl;
1475 if (d_work) CHECK_CUDA(cudaFree(d_work));
1476 CHECK_CUDA(cudaFree(d_info));
1477 throw std::runtime_error(
"orgqr (D): failed to generate Q matrix");
1480 if (d_work) CHECK_CUDA(cudaFree(d_work));
1481 CHECK_CUDA(cudaFree(d_info));
1485static inline void orgqr(
1486 cusolverDnHandle_t& cusolver_handle,
1490 std::complex<float>* d_A,
1492 std::complex<float>* d_tau
1495 CHECK_CUSOLVER(cusolverDnCungqr_bufferSize(
1496 cusolver_handle, m, n, k,
1497 reinterpret_cast<cuComplex*
>(d_A),
1499 reinterpret_cast<cuComplex*
>(d_tau),
1502 cuComplex* d_work =
nullptr;
1503 int* d_info =
nullptr;
1506 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuComplex) * lwork));
1508 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1510 CHECK_CUSOLVER(cusolverDnCungqr(
1511 cusolver_handle, m, n, k,
1512 reinterpret_cast<cuComplex*
>(d_A),
1514 reinterpret_cast<cuComplex*
>(d_tau),
1515 d_work, lwork, d_info));
1518 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1520 std::cout <<
"orgqr (C): info = " << h_info << std::endl;
1521 if (d_work) CHECK_CUDA(cudaFree(d_work));
1522 CHECK_CUDA(cudaFree(d_info));
1523 throw std::runtime_error(
"orgqr (C): failed to generate Q matrix");
1526 if (d_work) CHECK_CUDA(cudaFree(d_work));
1527 CHECK_CUDA(cudaFree(d_info));
1531static inline void orgqr(
1532 cusolverDnHandle_t& cusolver_handle,
1536 std::complex<double>* d_A,
1538 std::complex<double>* d_tau
1541 CHECK_CUSOLVER(cusolverDnZungqr_bufferSize(
1542 cusolver_handle, m, n, k,
1543 reinterpret_cast<cuDoubleComplex*
>(d_A),
1545 reinterpret_cast<cuDoubleComplex*
>(d_tau),
1548 cuDoubleComplex* d_work =
nullptr;
1549 int* d_info =
nullptr;
1552 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_work),
sizeof(cuDoubleComplex) * lwork));
1554 CHECK_CUDA(cudaMalloc(
reinterpret_cast<void**
>(&d_info),
sizeof(
int)));
1556 CHECK_CUSOLVER(cusolverDnZungqr(
1557 cusolver_handle, m, n, k,
1558 reinterpret_cast<cuDoubleComplex*
>(d_A),
1560 reinterpret_cast<cuDoubleComplex*
>(d_tau),
1561 d_work, lwork, d_info));
1564 CHECK_CUDA(cudaMemcpy(&h_info, d_info,
sizeof(
int), cudaMemcpyDeviceToHost));
1566 std::cout <<
"orgqr (Z): info = " << h_info << std::endl;
1567 if (d_work) CHECK_CUDA(cudaFree(d_work));
1568 CHECK_CUDA(cudaFree(d_info));
1569 throw std::runtime_error(
"orgqr (Z): failed to generate Q matrix");
1572 if (d_work) CHECK_CUDA(cudaFree(d_work));
1573 CHECK_CUDA(cudaFree(d_info));