ABACUS develop
Atomic-orbital Based Ab-initio Computation at UStc
Loading...
Searching...
No Matches
cusolver.h
Go to the documentation of this file.
1#ifndef BASE_THIRD_PARTY_CUSOLVER_H_
2#define BASE_THIRD_PARTY_CUSOLVER_H_
3
4#include <cublas_v2.h>
5#include <cuda_runtime.h>
6
7// #include <base/third_party/cusolver_utils.h> // traits, needed if generic API is used.
8// header provided by cusolver, including some data types and macros.
9// see https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSOLVER/utils/cusolver_utils.h
10// The cuSolverDN library provides two different APIs; legacy and generic.
11// https://docs.nvidia.com/cuda/cusolver/index.html#naming-conventions
12// now only legacy APIs are used, while the general APIs have the potential to simplify code implementation.
13// for example, cusolverDnXpotrf/getrf/geqrf/sytrf
14// More tests are needed to confirm that the generic APIs are operating normally, as they are not yet fully supported.
15
16#include <base/macros/cuda.h>
17
18namespace container {
19namespace cuSolverConnector {
20
21template <typename T>
22static inline
23void trtri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, T* A, const int& lda)
24{
25 size_t d_lwork = 0, h_lwork = 0;
26 using Type = typename GetTypeThrust<T>::type;
27 cusolverErrcheck(cusolverDnXtrtri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, GetTypeCuda<T>::cuda_data_type, reinterpret_cast<Type*>(A), lda, &d_lwork, &h_lwork));
28 void* d_work = nullptr, *h_work = nullptr;
29 cudaErrcheck(cudaMalloc((void**)&d_work, d_lwork));
30 if (h_lwork) {
31 h_work = malloc(h_lwork);
32 if (h_work == nullptr) {
33 throw std::bad_alloc();
34 }
35 }
36 int h_info = 0;
37 int* d_info = nullptr;
38 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
39 // Perform Cholesky decomposition
40 cusolverErrcheck(cusolverDnXtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, GetTypeCuda<T>::cuda_data_type, reinterpret_cast<Type*>(A), n, d_work, d_lwork, h_work, h_lwork, d_info));
41 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
42 if (h_info != 0) {
43 throw std::runtime_error("trtri: failed to invert matrix");
44 }
45 free(h_work);
46 cudaErrcheck(cudaFree(d_work));
47 cudaErrcheck(cudaFree(d_info));
48}
49
50static inline
51void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, float * A, const int& lda)
52{
53 int lwork;
54 cusolverErrcheck(cusolverDnSpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
55 float* work;
56 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(float)));
57 // Perform Cholesky decomposition
58 cusolverErrcheck(cusolverDnSpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, nullptr));
59 cudaErrcheck(cudaFree(work));
60}
61static inline
62void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, double * A, const int& lda)
63{
64 int lwork;
65 cusolverErrcheck(cusolverDnDpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
66 double* work;
67 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(double)));
68 // Perform Cholesky decomposition
69 cusolverErrcheck(cusolverDnDpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, nullptr));
70 cudaErrcheck(cudaFree(work));
71}
72static inline
73void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<float> * A, const int& lda)
74{
75 int lwork;
76 cusolverErrcheck(cusolverDnCpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex *>(A), n, &lwork));
77 cuComplex* work;
78 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuComplex)));
79 // Perform Cholesky decomposition
80 cusolverErrcheck(cusolverDnCpotri(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex *>(A), n, work, lwork, nullptr));
81 cudaErrcheck(cudaFree(work));
82}
83static inline
84void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<double> * A, const int& lda)
85{
86 int lwork;
87 cusolverErrcheck(cusolverDnZpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex *>(A), n, &lwork));
88 cuDoubleComplex* work;
89 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuDoubleComplex)));
90 // Perform Cholesky decomposition
91 cusolverErrcheck(cusolverDnZpotri(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex *>(A), n, work, lwork, nullptr));
92 cudaErrcheck(cudaFree(work));
93}
94
95
96static inline
97void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, float * A, const int& lda)
98{
99 int lwork;
100 int *info = nullptr;
101 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
102 cusolverErrcheck(cusolverDnSpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
103 float* work;
104 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(float)));
105 // Perform Cholesky decomposition
106 cusolverErrcheck(cusolverDnSpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
107 cudaErrcheck(cudaFree(work));
108 cudaErrcheck(cudaFree(info));
109}
110static inline
111void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, double * A, const int& lda)
112{
113 int lwork;
114 int *info = nullptr;
115 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
116 cusolverErrcheck(cusolverDnDpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
117 double* work;
118 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(double)));
119 // Perform Cholesky decomposition
120 cusolverErrcheck(cusolverDnDpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
121 cudaErrcheck(cudaFree(work));
122 cudaErrcheck(cudaFree(info));
123}
124static inline
125void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, std::complex<float> * A, const int& lda)
126{
127 int lwork;
128 int *info = nullptr;
129 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
130 cusolverErrcheck(cusolverDnCpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex*>(A), lda, &lwork));
131 cuComplex* work;
132 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuComplex)));
133 // Perform Cholesky decomposition
134 cusolverErrcheck(cusolverDnCpotrf(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex*>(A), lda, work, lwork, info));
135 cudaErrcheck(cudaFree(work));
136 cudaErrcheck(cudaFree(info));
137}
138static inline
139void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, std::complex<double> * A, const int& lda)
140{
141 int lwork;
142 int *info = nullptr;
143 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
144 cusolverErrcheck(cusolverDnZpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex*>(A), lda, &lwork));
145 cuDoubleComplex* work;
146 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuDoubleComplex)));
147 // Perform Cholesky decomposition
148 cusolverErrcheck(cusolverDnZpotrf(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex*>(A), lda, work, lwork, info));
149 cudaErrcheck(cudaFree(work));
150 cudaErrcheck(cudaFree(info));
151}
152
153
154static inline
155void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float * W)
156{
157 // prepare some values for cusolverDnSsyevd_bufferSize
158 int lwork = 0;
159 int h_info = 0;
160 int* d_info = nullptr;
161 float* d_work = nullptr;
162 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
163
164 // calculate the sizes needed for pre-allocated buffer.
165 cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
166 n, A, lda, W, &lwork));
167 // allocate memory
168 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
169 // compute eigenvalues and eigenvectors.
170 cusolverErrcheck(cusolverDnSsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
171 n, A, lda, W, d_work, lwork, d_info));
172
173 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
174 if (h_info != 0) {
175 throw std::runtime_error("heevd: failed to invert matrix");
176 }
177 cudaErrcheck(cudaFree(d_info));
178 cudaErrcheck(cudaFree(d_work));
179}
180static inline
181void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double * W)
182{
183 // prepare some values for cusolverDnDsyevd_bufferSize
184 int lwork = 0;
185 int h_info = 0;
186 int* d_info = nullptr;
187 double* d_work = nullptr;
188 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
189
190 // calculate the sizes needed for pre-allocated buffer.
191 cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
192 n, A, lda, W, &lwork));
193 // allocate memory
194 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
195 // compute eigenvalues and eigenvectors.
196 cusolverErrcheck(cusolverDnDsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
197 n, A, lda, W, d_work, lwork, d_info));
198
199 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
200 if (h_info != 0) {
201 throw std::runtime_error("heevd: failed to invert matrix");
202 }
203 cudaErrcheck(cudaFree(d_info));
204 cudaErrcheck(cudaFree(d_work));
205}
206static inline
207void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, float * W)
208{
209 // prepare some values for cusolverDnCheevd_bufferSize
210 int lwork = 0;
211 int h_info = 0;
212 int* d_info = nullptr;
213 cuComplex* d_work = nullptr;
214 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
215
216 // calculate the sizes needed for pre-allocated buffer.
217 cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
218 n, reinterpret_cast<cuComplex*>(A), lda, W, &lwork));
219 // allocate memory
220 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
221 // compute eigenvalues and eigenvectors.
222 cusolverErrcheck(cusolverDnCheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
223 n, reinterpret_cast<cuComplex*>(A), lda, W, d_work, lwork, d_info));
224
225 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
226 if (h_info != 0) {
227 throw std::runtime_error("heevd: failed to invert matrix");
228 }
229 cudaErrcheck(cudaFree(d_info));
230 cudaErrcheck(cudaFree(d_work));
231}
232static inline
233void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<double>* A, const int& lda, double* W)
234{
235 // prepare some values for cusolverDnZheevd_bufferSize
236 int lwork = 0;
237 int h_info = 0;
238 int* d_info = nullptr;
239 cuDoubleComplex* d_work = nullptr;
240 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
241
242 // calculate the sizes needed for pre-allocated buffer.
243 cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
244 n, reinterpret_cast<cuDoubleComplex*>(A), lda, W, &lwork));
245 // allocate memory
246 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
247 // compute eigenvalues and eigenvectors.
248 cusolverErrcheck(cusolverDnZheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
249 n, reinterpret_cast<cuDoubleComplex*>(A), lda, W, d_work, lwork, d_info));
250
251 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
252 if (h_info != 0) {
253 throw std::runtime_error("heevd: failed to invert matrix");
254 }
255 cudaErrcheck(cudaFree(d_info));
256 cudaErrcheck(cudaFree(d_work));
257}
258
259// =====================================================================================================
260// heevdx: Compute eigenvalues and eigenvectors of symmetric/Hermitian matrix
261// =====================================================================================================
262// --- float ---
263static inline
264void heevdx(cusolverDnHandle_t& cusolver_handle,
265 const int n,
266 const int lda,
267 float* d_A,
268 const char jobz,
269 const char uplo,
270 const char range,
271 const int il, const int iu,
272 const float vl, const float vu,
273 float* d_eigen_val,
274 int* h_meig)
275{
276 int lwork = 0;
277 int* d_info = nullptr;
278 float* d_work = nullptr;
279
280 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
281
282 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
283 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
284 cusolverEigRange_t range_t = cublas_eig_range(range);
285
286 cusolverErrcheck(cusolverDnSsyevdx_bufferSize(
287 cusolver_handle,
288 jobz_t, range_t, uplo_t,
289 n, d_A, lda,
290 vl, vu, il, iu,
291 h_meig, // ← int* output: number of eigenvalues found
292 d_eigen_val, // ← const float* W (used for query, can be dummy)
293 &lwork // ← int* lwork (output)
294 ));
295
296 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
297
298 // Main call
299 cusolverErrcheck(cusolverDnSsyevdx(
300 cusolver_handle,
301 jobz_t, range_t, uplo_t,
302 n,
303 d_A, lda,
304 vl, vu, il, iu,
305 h_meig, // ← int* output
306 d_eigen_val, // ← float* W: eigenvalues
307 d_work, lwork,
308 d_info
309 ));
310
311 int h_info = 0;
312 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
313 if (h_info != 0) {
314 cudaFree(d_info); cudaFree(d_work);
315 throw std::runtime_error("heevdx (float) failed with info = " + std::to_string(h_info));
316 }
317
318 cudaFree(d_info);
319 cudaFree(d_work);
320}
321
322// --- double ---
323static inline
324void heevdx(cusolverDnHandle_t& cusolver_handle,
325 const int n,
326 const int lda,
327 double* d_A,
328 const char jobz,
329 const char uplo,
330 const char range,
331 const int il, const int iu,
332 const double vl, const double vu,
333 double* d_eigen_val,
334 int* h_meig)
335{
336 int lwork = 0;
337 int* d_info = nullptr;
338 double* d_work = nullptr;
339
340 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
341
342 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
343 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
344 cusolverEigRange_t range_t = cublas_eig_range(range);
345
346 cusolverErrcheck(cusolverDnDsyevdx_bufferSize(
347 cusolver_handle,
348 jobz_t, range_t, uplo_t,
349 n, d_A, lda,
350 vl, vu, il, iu,
351 h_meig,
352 d_eigen_val,
353 &lwork
354 ));
355
356 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
357
358 cusolverErrcheck(cusolverDnDsyevdx(
359 cusolver_handle,
360 jobz_t, range_t, uplo_t,
361 n,
362 d_A, lda,
363 vl, vu, il, iu,
364 h_meig,
365 d_eigen_val,
366 d_work, lwork,
367 d_info
368 ));
369
370 int h_info = 0;
371 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
372 if (h_info != 0) {
373 cudaFree(d_info); cudaFree(d_work);
374 throw std::runtime_error("heevdx (double) failed with info = " + std::to_string(h_info));
375 }
376
377 cudaFree(d_info);
378 cudaFree(d_work);
379}
380
381// --- complex<float> ---
382static inline
383void heevdx(cusolverDnHandle_t& cusolver_handle,
384 const int n,
385 const int lda,
386 std::complex<float>* d_A,
387 const char jobz,
388 const char uplo,
389 const char range,
390 const int il, const int iu,
391 const float vl, const float vu,
392 float* d_eigen_val,
393 int* h_meig)
394{
395 int lwork = 0;
396 int* d_info = nullptr;
397 cuComplex* d_work = nullptr;
398
399 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
400
401 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
402 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
403 cusolverEigRange_t range_t = cublas_eig_range(range);
404
405 cusolverErrcheck(cusolverDnCheevdx_bufferSize(
406 cusolver_handle,
407 jobz_t, range_t, uplo_t,
408 n,
409 reinterpret_cast<cuComplex*>(d_A), lda,
410 vl, vu, il, iu,
411 h_meig,
412 d_eigen_val,
413 &lwork
414 ));
415
416 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
417
418 cusolverErrcheck(cusolverDnCheevdx(
419 cusolver_handle,
420 jobz_t, range_t, uplo_t,
421 n,
422 reinterpret_cast<cuComplex*>(d_A), lda,
423 vl, vu, il, iu,
424 h_meig,
425 d_eigen_val,
426 d_work, lwork,
427 d_info
428 ));
429
430 int h_info = 0;
431 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
432 if (h_info != 0) {
433 cudaFree(d_info); cudaFree(d_work);
434 throw std::runtime_error("heevdx (complex<float>) failed with info = " + std::to_string(h_info));
435 }
436
437 cudaFree(d_info);
438 cudaFree(d_work);
439}
440
441// --- complex<double> ---
442static inline
443void heevdx(cusolverDnHandle_t& cusolver_handle,
444 const int n,
445 const int lda,
446 std::complex<double>* d_A,
447 const char jobz,
448 const char uplo,
449 const char range,
450 const int il, const int iu,
451 const double vl, const double vu,
452 double* d_eigen_val,
453 int* h_meig)
454{
455 int lwork = 0;
456 int* d_info = nullptr;
457 cuDoubleComplex* d_work = nullptr;
458
459 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
460
461 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
462 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
463 cusolverEigRange_t range_t = cublas_eig_range(range);
464
465 cusolverErrcheck(cusolverDnZheevdx_bufferSize(
466 cusolver_handle,
467 jobz_t, range_t, uplo_t,
468 n,
469 reinterpret_cast<cuDoubleComplex*>(d_A), lda,
470 vl, vu, il, iu,
471 h_meig,
472 d_eigen_val,
473 &lwork
474 ));
475
476 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
477
478 cusolverErrcheck(cusolverDnZheevdx(
479 cusolver_handle,
480 jobz_t, range_t, uplo_t,
481 n,
482 reinterpret_cast<cuDoubleComplex*>(d_A), lda,
483 vl, vu, il, iu,
484 h_meig,
485 d_eigen_val,
486 d_work, lwork,
487 d_info
488 ));
489
490 int h_info = 0;
491 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
492 if (h_info != 0) {
493 cudaFree(d_info); cudaFree(d_work);
494 throw std::runtime_error("heevdx (complex<double>) failed with info = " + std::to_string(h_info));
495 }
496
497 cudaFree(d_info);
498 cudaFree(d_work);
499}
500
501static inline
502void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float* B, const int& ldb, float * W)
503{
504 // prepare some values for cusolverDnSsygvd_bufferSize
505 int lwork = 0;
506 int h_info = 0;
507 int* d_info = nullptr;
508 float* d_work = nullptr;
509 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
510
511 // calculate the sizes needed for pre-allocated buffer.
512 cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
513 n, A, lda, B, ldb, W, &lwork));
514 // allocate memory
515 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
516 // compute eigenvalues and eigenvectors.
517 cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
518 n, A, lda, B, ldb, W, d_work, lwork, d_info));
519
520 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
521 if (h_info != 0) {
522 throw std::runtime_error("heevd: failed to invert matrix");
523 }
524 cudaErrcheck(cudaFree(d_info));
525 cudaErrcheck(cudaFree(d_work));
526}
527static inline
528void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double* B, const int& ldb, double * W)
529{
530 // prepare some values for cusolverDnDsygvd_bufferSize
531 int lwork = 0;
532 int h_info = 0;
533 int* d_info = nullptr;
534 double* d_work = nullptr;
535 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
536
537 // calculate the sizes needed for pre-allocated buffer.
538 cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
539 n, A, lda, B, ldb, W, &lwork));
540 // allocate memory
541 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
542 // compute eigenvalues and eigenvectors.
543 cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
544 n, A, lda, B, ldb, W, d_work, lwork, d_info));
545
546 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
547 if (h_info != 0) {
548 throw std::runtime_error("heevd: failed to invert matrix");
549 }
550 cudaErrcheck(cudaFree(d_info));
551 cudaErrcheck(cudaFree(d_work));
552}
553static inline
554void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb, float* W)
555{
556 // prepare some values for cusolverDnChegvd_bufferSize
557 int lwork = 0;
558 int h_info = 0;
559 int* d_info = nullptr;
560 cuComplex* d_work = nullptr;
561 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
562
563 // calculate the sizes needed for pre-allocated buffer.
564 cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
565 n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb, W, &lwork));
566 // allocate memory
567 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
568 // compute eigenvalues and eigenvectors.
569 cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
570 n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb, W, d_work, lwork, d_info));
571
572 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
573 if (h_info != 0) {
574 throw std::runtime_error("heevd: failed to invert matrix");
575 }
576 cudaErrcheck(cudaFree(d_info));
577 cudaErrcheck(cudaFree(d_work));
578}
579static inline
580void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb, double* W)
581{
582 // prepare some values for cusolverDnZhegvd_bufferSize
583 int lwork = 0;
584 int h_info = 0;
585 int* d_info = nullptr;
586 cuDoubleComplex* d_work = nullptr;
587 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
588
589 // calculate the sizes needed for pre-allocated buffer.
590 cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
591 n, reinterpret_cast<cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb, W, &lwork));
592 // allocate memory
593 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
594 // compute eigenvalues and eigenvectors.
595 cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
596 n, reinterpret_cast<cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb, W, d_work, lwork, d_info));
597
598 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
599 if (h_info != 0) {
600 throw std::runtime_error("heevd: failed to invert matrix");
601 }
602 cudaErrcheck(cudaFree(d_info));
603 cudaErrcheck(cudaFree(d_work));
604}
605
606// =====================================================================================================
607// hegvdx: Compute selected eigenvalues and eigenvectors of generalized Hermitian-definite eigenproblem
608// A * x = lambda * B * x
609// =====================================================================================================
610
611// --- float ---
612static inline
613void hegvdx(
614 cusolverDnHandle_t& cusolver_handle,
615 const int itype, // 1: A*x = lambda*B*x
616 const char jobz, // 'V' or 'N'
617 const char range, // 'I', 'V', 'A'
618 const char uplo, // 'U' or 'L'
619 const int n,
620 const int lda,
621 float* d_A, // Input matrix A (device)
622 float* d_B, // Input matrix B (device)
623 const float vl, // for RANGE='V'
624 const float vu,
625 const int il, // for RANGE='I'
626 const int iu,
627 int* h_meig, // output: number of eigenvalues found
628 float* d_eigen_val, // output: eigenvalues
629 float* d_eigen_vec // output: eigenvectors (if jobz='V'), size ldz × m
630) {
631 int lwork = 0;
632 int *d_info = nullptr;
633 float *d_work = nullptr;
634
635 // Allocate device info
636 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
637
638 // Copy A and B to temporary buffers since sygvdx may modify them
639 float *d_A_copy = nullptr, *d_B_copy = nullptr;
640 cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(float) * n * lda));
641 cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(float) * n * lda));
642 cudaErrcheck(cudaMemcpy(d_A_copy, d_A, sizeof(float) * n * lda, cudaMemcpyDeviceToDevice));
643 cudaErrcheck(cudaMemcpy(d_B_copy, d_B, sizeof(float) * n * lda, cudaMemcpyDeviceToDevice));
644
645 // Set parameters
646 cusolverEigType_t itype_t = cublas_eig_type(itype);
647 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
648 cusolverEigRange_t range_t = cublas_eig_range(range);
649 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
650
651 // Query workspace size
652 cusolverErrcheck(cusolverDnSsygvdx_bufferSize(
653 cusolver_handle,
654 itype_t, jobz_t, range_t, uplo_t,
655 n,
656 d_A_copy, lda,
657 d_B_copy, lda,
658 vl, vu, il, iu,
659 h_meig,
660 d_eigen_val,
661 &lwork
662 ));
663
664 // Allocate workspace
665 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
666
667 // Main call
668 cusolverErrcheck(cusolverDnSsygvdx(
669 cusolver_handle,
670 itype_t, jobz_t, range_t, uplo_t,
671 n,
672 d_A_copy, lda,
673 d_B_copy, lda,
674 vl, vu, il, iu,
675 h_meig,
676 d_eigen_val,
677 d_work, lwork,
678 d_info
679 ));
680
681 // Check result
682 int h_info = 0;
683 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
684 if (h_info < 0) {
685 throw std::runtime_error("hegvdx (float): illegal argument #" + std::to_string(-h_info));
686 } else if (h_info > 0) {
687 // If h_info <= n: convergence issue in tridiag solver (no vec) OR
688 // If h_info > n: B's leading minor of order (h_info - n) is not positive definite
689 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
690 throw std::runtime_error("hegvdx (float): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
691 } else if (h_info > n) {
692 throw std::runtime_error("hegvdx (float): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
693 }
694 }
695
696 // If jobz == 'V', copy eigenvectors from A (which now contains Z) to output
697 if (jobz == 'V') {
698 const int m = (*h_meig); // number of eigenvectors computed
699 cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy, sizeof(float) * n * m, cudaMemcpyDeviceToDevice));
700 }
701
702 // Cleanup
703 cudaFree(d_info);
704 cudaFree(d_work);
705 cudaFree(d_A_copy);
706 cudaFree(d_B_copy);
707}
708
709
710// --- double ---
711static inline
712void hegvdx(
713 cusolverDnHandle_t& cusolver_handle,
714 const int itype,
715 const char jobz,
716 const char range,
717 const char uplo,
718 const int n,
719 const int lda,
720 double* d_A,
721 double* d_B,
722 const double vl,
723 const double vu,
724 const int il,
725 const int iu,
726 int* h_meig,
727 double* d_eigen_val,
728 double* d_eigen_vec
729) {
730 int lwork = 0;
731 int *d_info = nullptr;
732 double *d_work = nullptr;
733
734 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
735
736 double *d_A_copy = nullptr, *d_B_copy = nullptr;
737 cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(double) * n * lda));
738 cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(double) * n * lda));
739 cudaErrcheck(cudaMemcpy(d_A_copy, d_A, sizeof(double) * n * lda, cudaMemcpyDeviceToDevice));
740 cudaErrcheck(cudaMemcpy(d_B_copy, d_B, sizeof(double) * n * lda, cudaMemcpyDeviceToDevice));
741
742 cusolverEigType_t itype_t = cublas_eig_type(itype);
743 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
744 cusolverEigRange_t range_t = cublas_eig_range(range);
745 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
746
747 cusolverErrcheck(cusolverDnDsygvdx_bufferSize(
748 cusolver_handle,
749 itype_t, jobz_t, range_t, uplo_t,
750 n,
751 d_A_copy, lda,
752 d_B_copy, lda,
753 vl, vu, il, iu,
754 h_meig,
755 d_eigen_val,
756 &lwork
757 ));
758
759 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
760
761 cusolverErrcheck(cusolverDnDsygvdx(
762 cusolver_handle,
763 itype_t, jobz_t, range_t, uplo_t,
764 n,
765 d_A_copy, lda,
766 d_B_copy, lda,
767 vl, vu, il, iu,
768 h_meig,
769 d_eigen_val,
770 d_work, lwork,
771 d_info
772 ));
773
774 int h_info = 0;
775 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
776 if (h_info < 0) {
777 throw std::runtime_error("hegvdx (double): illegal argument #" + std::to_string(-h_info));
778 } else if (h_info > 0) {
779 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
780 throw std::runtime_error("hegvdx (double): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
781 } else if (h_info > n) {
782 throw std::runtime_error("hegvdx (double): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
783 }
784 }
785
786 if (jobz == 'V') {
787 const int m = (*h_meig);
788 cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy, sizeof(double) * n * m, cudaMemcpyDeviceToDevice));
789 }
790
791 cudaFree(d_info);
792 cudaFree(d_work);
793 cudaFree(d_A_copy);
794 cudaFree(d_B_copy);
795}
796
797
798// --- complex<float> ---
799static inline
800void hegvdx(
801 cusolverDnHandle_t& cusolver_handle,
802 const int itype,
803 const char jobz,
804 const char range,
805 const char uplo,
806 const int n,
807 const int lda,
808 std::complex<float>* d_A,
809 std::complex<float>* d_B,
810 const float vl,
811 const float vu,
812 const int il,
813 const int iu,
814 int* h_meig,
815 float* d_eigen_val,
816 std::complex<float>* d_eigen_vec
817) {
818 int lwork = 0;
819 int *d_info = nullptr;
820 cuComplex *d_work = nullptr;
821
822 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
823
824 cuComplex *d_A_copy = nullptr, *d_B_copy = nullptr;
825 cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(cuComplex) * n * lda));
826 cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(cuComplex) * n * lda));
827 cudaErrcheck(cudaMemcpy(d_A_copy, reinterpret_cast<cuComplex*>(d_A), sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
828 cudaErrcheck(cudaMemcpy(d_B_copy, reinterpret_cast<cuComplex*>(d_B), sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
829
830 cusolverEigType_t itype_t = cublas_eig_type(itype);
831 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
832 cusolverEigRange_t range_t = cublas_eig_range(range);
833 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
834
835 cusolverErrcheck(cusolverDnChegvdx_bufferSize(
836 cusolver_handle,
837 itype_t, jobz_t, range_t, uplo_t,
838 n,
839 d_A_copy, lda,
840 d_B_copy, lda,
841 vl, vu, il, iu,
842 h_meig,
843 d_eigen_val,
844 &lwork
845 ));
846
847 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
848
849 cusolverErrcheck(cusolverDnChegvdx(
850 cusolver_handle,
851 itype_t, jobz_t, range_t, uplo_t,
852 n,
853 d_A_copy, lda,
854 d_B_copy, lda,
855 vl, vu, il, iu,
856 h_meig,
857 d_eigen_val,
858 d_work, lwork,
859 d_info
860 ));
861
862 int h_info = 0;
863 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
864 if (h_info < 0) {
865 throw std::runtime_error("hegvdx (complex<float>): illegal argument #" + std::to_string(-h_info));
866 } else if (h_info > 0) {
867 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
868 throw std::runtime_error("hegvdx (complex<float>): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
869 } else if (h_info > n) {
870 throw std::runtime_error("hegvdx (complex<float>): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
871 }
872 }
873
874 if (jobz == 'V') {
875 const int m = (*h_meig);
876 cudaErrcheck(cudaMemcpy(reinterpret_cast<cuComplex*>(d_eigen_vec), d_A_copy, sizeof(cuComplex) * n * m, cudaMemcpyDeviceToDevice));
877 }
878
879 cudaFree(d_info);
880 cudaFree(d_work);
881 cudaFree(d_A_copy);
882 cudaFree(d_B_copy);
883}
884
885
886// --- complex<double> ---
887static inline
888void hegvdx(
889 cusolverDnHandle_t& cusolver_handle,
890 const int itype,
891 const char jobz,
892 const char range,
893 const char uplo,
894 const int n,
895 const int lda,
896 std::complex<double>* d_A,
897 std::complex<double>* d_B,
898 const double vl,
899 const double vu,
900 const int il,
901 const int iu,
902 int* h_meig,
903 double* d_eigen_val,
904 std::complex<double>* d_eigen_vec
905) {
906 int lwork = 0;
907 int *d_info = nullptr;
908 cuDoubleComplex *d_work = nullptr;
909
910 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
911
912 cuDoubleComplex *d_A_copy = nullptr, *d_B_copy = nullptr;
913 cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(cuDoubleComplex) * n * lda));
914 cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(cuDoubleComplex) * n * lda));
915 cudaErrcheck(cudaMemcpy(d_A_copy, reinterpret_cast<cuDoubleComplex*>(d_A), sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
916 cudaErrcheck(cudaMemcpy(d_B_copy, reinterpret_cast<cuDoubleComplex*>(d_B), sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
917
918 cusolverEigType_t itype_t = cublas_eig_type(itype);
919 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
920 cusolverEigRange_t range_t = cublas_eig_range(range);
921 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
922
923 cusolverErrcheck(cusolverDnZhegvdx_bufferSize(
924 cusolver_handle,
925 itype_t, jobz_t, range_t, uplo_t,
926 n,
927 d_A_copy, lda,
928 d_B_copy, lda,
929 vl, vu, il, iu,
930 h_meig,
931 d_eigen_val,
932 &lwork
933 ));
934
935 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
936
937 cusolverErrcheck(cusolverDnZhegvdx(
938 cusolver_handle,
939 itype_t, jobz_t, range_t, uplo_t,
940 n,
941 d_A_copy, lda,
942 d_B_copy, lda,
943 vl, vu, il, iu,
944 h_meig,
945 d_eigen_val,
946 d_work, lwork,
947 d_info
948 ));
949
950 int h_info = 0;
951 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
952 if (h_info < 0) {
953 throw std::runtime_error("hegvdx (complex<double>): illegal argument #" + std::to_string(-h_info));
954 } else if (h_info > 0) {
955 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
956 throw std::runtime_error("hegvdx (complex<double>): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
957 } else if (h_info > n) {
958 throw std::runtime_error("hegvdx (complex<double>): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
959 }
960 }
961
962 if (jobz == 'V') {
963 const int m = (*h_meig);
964 cudaErrcheck(cudaMemcpy(reinterpret_cast<cuDoubleComplex*>(d_eigen_vec), d_A_copy, sizeof(cuDoubleComplex) * n * m, cudaMemcpyDeviceToDevice));
965 }
966
967 cudaFree(d_info);
968 cudaFree(d_work);
969 cudaFree(d_A_copy);
970 cudaFree(d_B_copy);
971}
972
973
974// --- getrf
975static inline
976void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, float* A, const int& lda, int* ipiv)
977{
978 // prepare some values for cusolverDnSgetrf_bufferSize
979 int lwork = 0;
980 int h_info = 0;
981 int* d_info = nullptr;
982 float* d_work = nullptr;
983 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
984
985 // calculate the sizes needed for pre-allocated buffer.
986 cusolverErrcheck(cusolverDnSgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
987
988 // allocate memory
989 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
990
991 // Perform LU decomposition
992 cusolverErrcheck(cusolverDnSgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
993
994 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
995 if (h_info != 0) {
996 throw std::runtime_error("getrf: failed to compute LU factorization");
997 }
998
999 cudaErrcheck(cudaFree(d_work));
1000 cudaErrcheck(cudaFree(d_info));
1001}
1002static inline
1003void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, double* A, const int& lda, int* ipiv)
1004{
1005 // prepare some values for cusolverDnDgetrf_bufferSize
1006 int lwork = 0;
1007 int h_info = 0;
1008 int* d_info = nullptr;
1009 double* d_work = nullptr;
1010 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1011
1012 // calculate the sizes needed for pre-allocated buffer.
1013 cusolverErrcheck(cusolverDnDgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
1014
1015 // allocate memory
1016 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
1017
1018 // Perform LU decomposition
1019 cusolverErrcheck(cusolverDnDgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
1020
1021 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1022 if (h_info != 0) {
1023 throw std::runtime_error("getrf: failed to compute LU factorization");
1024 }
1025
1026 cudaErrcheck(cudaFree(d_work));
1027 cudaErrcheck(cudaFree(d_info));
1028}
1029static inline
1030void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex<float>* A, const int& lda, int* ipiv)
1031{
1032 // prepare some values for cusolverDnCgetrf_bufferSize
1033 int lwork = 0;
1034 int h_info = 0;
1035 int* d_info = nullptr;
1036 cuComplex* d_work = nullptr;
1037 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1038
1039 // calculate the sizes needed for pre-allocated buffer.
1040 cusolverErrcheck(cusolverDnCgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, &lwork));
1041
1042 // allocate memory
1043 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
1044
1045 // Perform LU decomposition
1046 cusolverErrcheck(cusolverDnCgetrf(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, d_work, ipiv, d_info));
1047
1048 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1049 if (h_info != 0) {
1050 throw std::runtime_error("getrf: failed to compute LU factorization");
1051 }
1052
1053 cudaErrcheck(cudaFree(d_work));
1054 cudaErrcheck(cudaFree(d_info));
1055}
1056static inline
1057void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex<double>* A, const int& lda, int* ipiv)
1058{
1059 // prepare some values for cusolverDnZgetrf_bufferSize
1060 int lwork = 0;
1061 int h_info = 0;
1062 int* d_info = nullptr;
1063 cuDoubleComplex* d_work = nullptr;
1064 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1065
1066 // calculate the sizes needed for pre-allocated buffer.
1067 cusolverErrcheck(cusolverDnZgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda, &lwork));
1068
1069 // allocate memory
1070 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
1071
1072 // Perform LU decomposition
1073 cusolverErrcheck(cusolverDnZgetrf(cusolver_handle, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda, d_work, ipiv, d_info));
1074
1075 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1076 if (h_info != 0) {
1077 throw std::runtime_error("getrf: failed to compute LU factorization");
1078 }
1079
1080 cudaErrcheck(cudaFree(d_work));
1081 cudaErrcheck(cudaFree(d_info));
1082}
1083
1084static inline
1085void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, float* A, const int& lda, const int* ipiv, float* B, const int& ldb)
1086{
1087 int h_info = 0;
1088 int* d_info = nullptr;
1089 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1090
1091 cusolverErrcheck(cusolverDnSgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1092
1093 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1094 if (h_info != 0) {
1095 throw std::runtime_error("getrs: failed to solve the linear system");
1096 }
1097
1098 cudaErrcheck(cudaFree(d_info));
1099}
1100static inline
1101void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, double* A, const int& lda, const int* ipiv, double* B, const int& ldb)
1102{
1103 int h_info = 0;
1104 int* d_info = nullptr;
1105 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1106
1107 cusolverErrcheck(cusolverDnDgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1108
1109 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1110 if (h_info != 0) {
1111 throw std::runtime_error("getrs: failed to solve the linear system");
1112 }
1113
1114 cudaErrcheck(cudaFree(d_info));
1115}
1116static inline
1117void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex<float>* A, const int& lda, const int* ipiv, std::complex<float>* B, const int& ldb)
1118{
1119 int h_info = 0;
1120 int* d_info = nullptr;
1121 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1122
1123 cusolverErrcheck(cusolverDnCgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast<cuComplex*>(A), lda, ipiv, reinterpret_cast<cuComplex*>(B), ldb, d_info));
1124
1125 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1126 if (h_info != 0) {
1127 throw std::runtime_error("getrs: failed to solve the linear system");
1128 }
1129
1130 cudaErrcheck(cudaFree(d_info));
1131}
1132static inline
1133void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex<double>* A, const int& lda, const int* ipiv, std::complex<double>* B, const int& ldb)
1134{
1135 int h_info = 0;
1136 int* d_info = nullptr;
1137 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1138
1139 cusolverErrcheck(cusolverDnZgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast<cuDoubleComplex*>(A), lda, ipiv, reinterpret_cast<cuDoubleComplex*>(B), ldb, d_info));
1140
1141 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1142 if (h_info != 0) {
1143 throw std::runtime_error("getrs: failed to solve the linear system");
1144 }
1145
1146 cudaErrcheck(cudaFree(d_info));
1147}
1148
1149// QR decomposition
1150// geqrf, orgqr
// Note: cuSOLVER exposes two geqrf entry points: the legacy typed API
// (cusolverDn<t>geqrf, used below) and the generic 64-bit API
// (cusolverDnXgeqrf, sketched in the commented-out template that follows).
// The legacy API is used here until the generic API is validated.
1156//
1157// template<typename T>
1158// static inline void geqrf(
1159// cusolverDnHandle_t& cusolver_handle,
1160// const int64_t m,
1161// const int64_t n,
1162// T* d_A, // device matrix A (m x n, column-major)
1163// const int64_t lda,
1164// T* d_tau // output: scalar factors of elementary reflectors
1165// ) {
1166// // query workspace size
1167// int *d_info = nullptr; /* error info */
1168//
1169// size_t workspaceInBytesOnDevice = 0; /* size of workspace */
1170// void *d_work = nullptr; /* device workspace */
1171// size_t workspaceInBytesOnHost = 0; /* size of workspace */
1172// void *h_work = nullptr; /* host workspace */
1173//
1174// cudaErrcheck(cudaMalloc(reinterpret_cast<void **>(&d_info), sizeof(int)));
1175//
1176// cusolverDnParams_t params = NULL;
1177// cusolverErrcheck(cusolverDnCreateParams(&params));
1178//
1179// cusolverErrcheck(cusolverDnXgeqrf_bufferSize(
1180// cusolver_handle,
1181// params,
1182// m, n,
1183// traits<T>::cuda_data_type,
1184// d_A,
1185// lda,
1186// traits<T>::cuda_data_type,
1187// d_tau,
1188// traits<T>::cuda_data_type,
1189// &workspaceInBytesOnDevice,
1190// &workspaceInBytesOnHost
1191// ));
1192//
1193// // allocate device workspace
1194// cudaErrcheck(cudaMalloc(reinterpret_cast<void **>(&d_work), workspaceInBytesOnDevice));
1195//
1196// // allocate host workspace
1197// if (workspaceInBytesOnHost > 0) {
1198// h_work = reinterpret_cast<void *>(malloc(workspaceInBytesOnHost));
1199// if (h_work == nullptr) {
1200// throw std::runtime_error("Error: h_work not allocated.");
1201// }
1202// }
1203//
1204// // QR factorization
1205// cusolverErrcheck(cusolverDnXgeqrf(
1206// cusolver_handle,
1207// params,
1208// m, n,
1209// traits<T>::cuda_data_type,
1210// d_A,
1211// lda,
1212// traits<T>::cuda_data_type,
1213// d_tau,
1214// traits<T>::cuda_data_type,
1215// d_work,
1216// workspaceInBytesOnDevice,
1217// h_work,
1218// workspaceInBytesOnHost,
1219// d_info
1220// ));
1221//
1222// // check info
1223// int h_info = 0;
1224// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1225// if (h_info != 0) {
1226// // std::printf("%d-th parameter is wrong \n", -info);
1227// // print error message
1228// std::cout << -h_info << "th parameter is wrong" << std::endl;
1229// throw std::runtime_error("geqrf: failed to compute QR decomposition");
1230// }
1231//
1232// // clean workspace
1233// cudaErrcheck(cudaFree(d_info));
1234// cudaErrcheck(cudaFree(d_work));
1235// if (h_work) free(h_work);
1236// cusolverErrcheck(cusolverDnDestroyParams(params));
1237// }
1238
1239// geqrf
1240
1241// --- float ---
1242static inline void geqrf(
1243 cusolverDnHandle_t& cusolver_handle,
1244 const int m,
1245 const int n,
1246 float* d_A,
1247 const int lda,
1248 float* d_tau
1249) {
1250 int lwork = 0;
1251 cusolverErrcheck(cusolverDnSgeqrf_bufferSize(
1252 cusolver_handle, m, n, d_A, lda, &lwork));
1253
1254 float* d_work = nullptr;
1255 int* d_info = nullptr;
1256
1257 if (lwork > 0) {
1258 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(float) * lwork));
1259 }
1260 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1261
1262 cusolverErrcheck(cusolverDnSgeqrf(
1263 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1264
1265 int h_info = 0;
1266 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1267 if (h_info != 0) {
1268 std::cout << "geqrf (S): info = " << h_info << std::endl;
1269 if (d_work) cudaErrcheck(cudaFree(d_work));
1270 cudaErrcheck(cudaFree(d_info));
1271 throw std::runtime_error("geqrf (S): QR factorization failed");
1272 }
1273
1274 if (d_work) cudaErrcheck(cudaFree(d_work));
1275 cudaErrcheck(cudaFree(d_info));
1276}
1277
1278// --- double ---
1279static inline void geqrf(
1280 cusolverDnHandle_t& cusolver_handle,
1281 const int m,
1282 const int n,
1283 double* d_A,
1284 const int lda,
1285 double* d_tau
1286) {
1287 int lwork = 0;
1288 cusolverErrcheck(cusolverDnDgeqrf_bufferSize(
1289 cusolver_handle, m, n, d_A, lda, &lwork));
1290
1291 double* d_work = nullptr;
1292 int* d_info = nullptr;
1293
1294 if (lwork > 0) {
1295 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(double) * lwork));
1296 }
1297 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1298
1299 cusolverErrcheck(cusolverDnDgeqrf(
1300 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1301
1302 int h_info = 0;
1303 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1304 if (h_info != 0) {
1305 std::cout << "geqrf (D): info = " << h_info << std::endl;
1306 if (d_work) cudaErrcheck(cudaFree(d_work));
1307 cudaErrcheck(cudaFree(d_info));
1308 throw std::runtime_error("geqrf (D): QR factorization failed");
1309 }
1310
1311 if (d_work) cudaErrcheck(cudaFree(d_work));
1312 cudaErrcheck(cudaFree(d_info));
1313}
1314
1315// --- std::complex<float> ---
1316static inline void geqrf(
1317 cusolverDnHandle_t& cusolver_handle,
1318 const int m,
1319 const int n,
1320 std::complex<float>* d_A,
1321 const int lda,
1322 std::complex<float>* d_tau
1323) {
1324 int lwork = 0;
1325 cusolverErrcheck(cusolverDnCgeqrf_bufferSize(
1326 cusolver_handle, m, n,
1327 reinterpret_cast<cuComplex*>(d_A),
1328 lda,
1329 &lwork // ← 这里才是 lwork 的地址!
1330 ));
1331
1332 cuComplex* d_work = nullptr;
1333 int* d_info = nullptr;
1334
1335 if (lwork > 0) {
1336 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));
1337 }
1338 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1339
1340 cusolverErrcheck(cusolverDnCgeqrf(
1341 cusolver_handle, m, n,
1342 reinterpret_cast<cuComplex*>(d_A),
1343 lda,
1344 reinterpret_cast<cuComplex*>(d_tau), // ← 这里才是 d_tau
1345 d_work, lwork, d_info));
1346
1347 int h_info = 0;
1348 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1349 if (h_info != 0) {
1350 std::cout << "geqrf (C): info = " << h_info << std::endl;
1351 if (d_work) cudaErrcheck(cudaFree(d_work));
1352 cudaErrcheck(cudaFree(d_info));
1353 throw std::runtime_error("geqrf (C): QR factorization failed");
1354 }
1355
1356 if (d_work) cudaErrcheck(cudaFree(d_work));
1357 cudaErrcheck(cudaFree(d_info));
1358}
1359
1360// --- std::complex<double> ---
1361static inline void geqrf(
1362 cusolverDnHandle_t& cusolver_handle,
1363 const int m,
1364 const int n,
1365 std::complex<double>* d_A,
1366 const int lda,
1367 std::complex<double>* d_tau
1368) {
1369 int lwork = 0;
1370 cusolverErrcheck(cusolverDnZgeqrf_bufferSize(
1371 cusolver_handle, m, n,
1372 reinterpret_cast<cuDoubleComplex*>(d_A),
1373 lda,
1374 &lwork
1375 ));
1376
1377 cuDoubleComplex* d_work = nullptr;
1378 int* d_info = nullptr;
1379
1380 if (lwork > 0) {
1381 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuDoubleComplex) * lwork));
1382 }
1383 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1384
1385 cusolverErrcheck(cusolverDnZgeqrf(
1386 cusolver_handle, m, n,
1387 reinterpret_cast<cuDoubleComplex*>(d_A),
1388 lda,
1389 reinterpret_cast<cuDoubleComplex*>(d_tau),
1390 d_work, lwork, d_info));
1391
1392 int h_info = 0;
1393 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1394 if (h_info != 0) {
1395 std::cout << "geqrf (Z): info = " << h_info << std::endl;
1396 if (d_work) cudaErrcheck(cudaFree(d_work));
1397 cudaErrcheck(cudaFree(d_info));
1398 throw std::runtime_error("geqrf (Z): QR factorization failed");
1399 }
1400
1401 if (d_work) cudaErrcheck(cudaFree(d_work));
1402 cudaErrcheck(cudaFree(d_info));
1403}
1404
1405
1406// --- float ---
1407static inline void orgqr(
1408 cusolverDnHandle_t& cusolver_handle,
1409 const int m,
1410 const int n,
1411 const int k,
1412 float* d_A,
1413 const int lda,
1414 float* d_tau
1415) {
1416 int lwork = 0;
1417 cusolverErrcheck(cusolverDnSorgqr_bufferSize(
1418 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1419
1420 float* d_work = nullptr;
1421 int* d_info = nullptr;
1422
1423 if (lwork > 0) {
1424 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(float) * lwork));
1425 }
1426 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1427
1428 cusolverErrcheck(cusolverDnSorgqr(
1429 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1430
1431 int h_info = 0;
1432 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1433 if (h_info != 0) {
1434 std::cout << "orgqr (S): info = " << h_info << " (failure at parameter " << -h_info << ")" << std::endl;
1435 if (d_work) cudaErrcheck(cudaFree(d_work));
1436 cudaErrcheck(cudaFree(d_info));
1437 throw std::runtime_error("orgqr (S): failed to generate Q matrix");
1438 }
1439
1440 // clean workspace
1441 if (d_work) cudaErrcheck(cudaFree(d_work));
1442 cudaErrcheck(cudaFree(d_info));
1443}
1444
1445// --- double ---
1446static inline void orgqr(
1447 cusolverDnHandle_t& cusolver_handle,
1448 const int m,
1449 const int n,
1450 const int k,
1451 double* d_A,
1452 const int lda,
1453 double* d_tau
1454) {
1455 int lwork = 0;
1456 cusolverErrcheck(cusolverDnDorgqr_bufferSize(
1457 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1458
1459 double* d_work = nullptr;
1460 int* d_info = nullptr;
1461
1462 if (lwork > 0) {
1463 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(double) * lwork));
1464 }
1465 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1466
1467 cusolverErrcheck(cusolverDnDorgqr(
1468 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1469
1470 int h_info = 0;
1471 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1472 if (h_info != 0) {
1473 std::cout << "orgqr (D): info = " << h_info << std::endl;
1474 if (d_work) cudaErrcheck(cudaFree(d_work));
1475 cudaErrcheck(cudaFree(d_info));
1476 throw std::runtime_error("orgqr (D): failed to generate Q matrix");
1477 }
1478
1479 if (d_work) cudaErrcheck(cudaFree(d_work));
1480 cudaErrcheck(cudaFree(d_info));
1481}
1482
1483// --- std::complex<float> ---
1484static inline void orgqr(
1485 cusolverDnHandle_t& cusolver_handle,
1486 const int m,
1487 const int n,
1488 const int k,
1489 std::complex<float>* d_A,
1490 const int lda,
1491 std::complex<float>* d_tau
1492) {
1493 int lwork = 0;
1494 cusolverErrcheck(cusolverDnCungqr_bufferSize(
1495 cusolver_handle, m, n, k,
1496 reinterpret_cast<cuComplex*>(d_A),
1497 lda,
1498 reinterpret_cast<cuComplex*>(d_tau),
1499 &lwork));
1500
1501 cuComplex* d_work = nullptr;
1502 int* d_info = nullptr;
1503
1504 if (lwork > 0) {
1505 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));
1506 }
1507 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1508
1509 cusolverErrcheck(cusolverDnCungqr(
1510 cusolver_handle, m, n, k,
1511 reinterpret_cast<cuComplex*>(d_A),
1512 lda,
1513 reinterpret_cast<cuComplex*>(d_tau),
1514 d_work, lwork, d_info));
1515
1516 int h_info = 0;
1517 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1518 if (h_info != 0) {
1519 std::cout << "orgqr (C): info = " << h_info << std::endl;
1520 if (d_work) cudaErrcheck(cudaFree(d_work));
1521 cudaErrcheck(cudaFree(d_info));
1522 throw std::runtime_error("orgqr (C): failed to generate Q matrix");
1523 }
1524
1525 if (d_work) cudaErrcheck(cudaFree(d_work));
1526 cudaErrcheck(cudaFree(d_info));
1527}
1528
1529// --- std::complex<double> ---
1530static inline void orgqr(
1531 cusolverDnHandle_t& cusolver_handle,
1532 const int m,
1533 const int n,
1534 const int k,
1535 std::complex<double>* d_A,
1536 const int lda,
1537 std::complex<double>* d_tau
1538) {
1539 int lwork = 0;
1540 cusolverErrcheck(cusolverDnZungqr_bufferSize(
1541 cusolver_handle, m, n, k,
1542 reinterpret_cast<cuDoubleComplex*>(d_A),
1543 lda,
1544 reinterpret_cast<cuDoubleComplex*>(d_tau),
1545 &lwork));
1546
1547 cuDoubleComplex* d_work = nullptr;
1548 int* d_info = nullptr;
1549
1550 if (lwork > 0) {
1551 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuDoubleComplex) * lwork));
1552 }
1553 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1554
1555 cusolverErrcheck(cusolverDnZungqr(
1556 cusolver_handle, m, n, k,
1557 reinterpret_cast<cuDoubleComplex*>(d_A),
1558 lda,
1559 reinterpret_cast<cuDoubleComplex*>(d_tau),
1560 d_work, lwork, d_info));
1561
1562 int h_info = 0;
1563 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1564 if (h_info != 0) {
1565 std::cout << "orgqr (Z): info = " << h_info << std::endl;
1566 if (d_work) cudaErrcheck(cudaFree(d_work));
1567 cudaErrcheck(cudaFree(d_info));
1568 throw std::runtime_error("orgqr (Z): failed to generate Q matrix");
1569 }
1570
1571 if (d_work) cudaErrcheck(cudaFree(d_work));
1572 cudaErrcheck(cudaFree(d_info));
1573}
1574
1575
1576} // namespace cuSolverConnector
1577} // namespace container
1578
1579#endif // BASE_THIRD_PARTY_CUSOLVER_H_
#define cudaErrcheck(res)
Definition cuda.h:236
#define cusolverErrcheck(res)
Definition cuda.h:225
#define T
Definition exp.cpp:237
Definition tensor.cpp:8
Definition cuda.h:49
T type
Definition cuda.h:14