ABACUS develop
Atomic-orbital Based Ab-initio Computation at UStc
cusolver.h
1#ifndef BASE_THIRD_PARTY_CUSOLVER_H_
2#define BASE_THIRD_PARTY_CUSOLVER_H_
3
4#include <cublas_v2.h>
5#include <cuda_runtime.h>
6#include <iostream>
7
8// #include <base/third_party/cusolver_utils.h> // traits, needed if the generic API is used.
9// This header is shipped with the cuSOLVER samples and contains some data types and macros.
10// see https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSOLVER/utils/cusolver_utils.h
11// The cuSolverDN library provides two different APIs: legacy and generic.
12// https://docs.nvidia.com/cuda/cusolver/index.html#naming-conventions
13// Currently only the legacy APIs are used, although the generic APIs could simplify the implementation,
14// for example, cusolverDnXpotrf/getrf/geqrf/sytrf.
15// More tests are needed to confirm that the generic APIs behave correctly, as they are not yet fully supported here.
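//
// Usage sketch (illustrative only, not part of this header): the wrappers below all expect
// an existing cusolverDnHandle_t and device pointers. Names such as d_A and n are
// hypothetical placeholders.
//
//   cusolverDnHandle_t handle = nullptr;
//   cusolverErrcheck(cusolverDnCreate(&handle));
//   // e.g. Cholesky factorization of an n x n SPD matrix (upper triangle stored):
//   container::cuSolverConnector::potrf(handle, 'U', n, d_A, n);
//   cusolverErrcheck(cusolverDnDestroy(handle));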
16
17#include <base/macros/cuda.h>
18
19namespace container {
20namespace cuSolverConnector {
21
22template <typename T>
23static inline
24void trtri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, T* A, const int& lda)
25{
26 size_t d_lwork = 0, h_lwork = 0;
27 using Type = typename GetTypeThrust<T>::type;
28 cusolverErrcheck(cusolverDnXtrtri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, GetTypeCuda<T>::cuda_data_type, reinterpret_cast<Type*>(A), lda, &d_lwork, &h_lwork));
29 void* d_work = nullptr, *h_work = nullptr;
30 cudaErrcheck(cudaMalloc((void**)&d_work, d_lwork));
31 if (h_lwork) {
32 h_work = malloc(h_lwork);
33 if (h_work == nullptr) {
34 throw std::bad_alloc();
35 }
36 }
37 int h_info = 0;
38 int* d_info = nullptr;
39 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
40 // Perform triangular matrix inversion
41 cusolverErrcheck(cusolverDnXtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, GetTypeCuda<T>::cuda_data_type, reinterpret_cast<Type*>(A), lda, d_work, d_lwork, h_work, h_lwork, d_info));
42 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
43 if (h_info != 0) {
44 throw std::runtime_error("trtri: failed to invert matrix");
45 }
46 free(h_work);
47 cudaErrcheck(cudaFree(d_work));
48 cudaErrcheck(cudaFree(d_info));
49}
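//
// Usage sketch (illustrative only): in-place inversion of an upper-triangular factor.
// d_R is a hypothetical n x n device matrix whose upper triangle holds R; 'N' marks a
// non-unit diagonal.
//
//   container::cuSolverConnector::trtri(handle, 'U', 'N', n, d_R, n);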
50
51static inline
52void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, float * A, const int& lda)
53{
54 int lwork;
55 cusolverErrcheck(cusolverDnSpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
56 float* work;
57 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(float)));
58 // Compute the matrix inverse from its Cholesky factor
59 cusolverErrcheck(cusolverDnSpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, nullptr));
60 cudaErrcheck(cudaFree(work));
61}
62static inline
63void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, double * A, const int& lda)
64{
65 int lwork;
66 cusolverErrcheck(cusolverDnDpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
67 double* work;
68 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(double)));
69 // Compute the matrix inverse from its Cholesky factor
70 cusolverErrcheck(cusolverDnDpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, nullptr));
71 cudaErrcheck(cudaFree(work));
72}
73static inline
74void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<float> * A, const int& lda)
75{
76 int lwork;
77 cusolverErrcheck(cusolverDnCpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex *>(A), n, &lwork));
78 cuComplex* work;
79 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuComplex)));
80 // Compute the matrix inverse from its Cholesky factor
81 cusolverErrcheck(cusolverDnCpotri(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex *>(A), n, work, lwork, nullptr));
82 cudaErrcheck(cudaFree(work));
83}
84static inline
85void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<double> * A, const int& lda)
86{
87 int lwork;
88 cusolverErrcheck(cusolverDnZpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex *>(A), n, &lwork));
89 cuDoubleComplex* work;
90 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuDoubleComplex)));
91 // Compute the matrix inverse from its Cholesky factor
92 cusolverErrcheck(cusolverDnZpotri(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex *>(A), n, work, lwork, nullptr));
93 cudaErrcheck(cudaFree(work));
94}
95
96
97static inline
98void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, float * A, const int& lda)
99{
100 int lwork;
101 int *info = nullptr;
102 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
103 cusolverErrcheck(cusolverDnSpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
104 float* work;
105 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(float)));
106 // Perform Cholesky decomposition
107 cusolverErrcheck(cusolverDnSpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
108 cudaErrcheck(cudaFree(work));
109 cudaErrcheck(cudaFree(info));
110}
111static inline
112void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, double * A, const int& lda)
113{
114 int lwork;
115 int *info = nullptr;
116 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
117 cusolverErrcheck(cusolverDnDpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
118 double* work;
119 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(double)));
120 // Perform Cholesky decomposition
121 cusolverErrcheck(cusolverDnDpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
122 cudaErrcheck(cudaFree(work));
123 cudaErrcheck(cudaFree(info));
124}
125static inline
126void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, std::complex<float> * A, const int& lda)
127{
128 int lwork;
129 int *info = nullptr;
130 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
131 cusolverErrcheck(cusolverDnCpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex*>(A), lda, &lwork));
132 cuComplex* work;
133 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuComplex)));
134 // Perform Cholesky decomposition
135 cusolverErrcheck(cusolverDnCpotrf(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex*>(A), lda, work, lwork, info));
136 cudaErrcheck(cudaFree(work));
137 cudaErrcheck(cudaFree(info));
138}
139static inline
140void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, std::complex<double> * A, const int& lda)
141{
142 int lwork;
143 int *info = nullptr;
144 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
145 cusolverErrcheck(cusolverDnZpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex*>(A), lda, &lwork));
146 cuDoubleComplex* work;
147 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuDoubleComplex)));
148 // Perform Cholesky decomposition
149 cusolverErrcheck(cusolverDnZpotrf(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex*>(A), lda, work, lwork, info));
150 cudaErrcheck(cudaFree(work));
151 cudaErrcheck(cudaFree(info));
152}
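//
// Usage sketch (illustrative only): inverting an SPD matrix via Cholesky. potrf
// overwrites the stored triangle of d_A with the Cholesky factor, and potri then
// replaces it with the same triangle of A^{-1}. d_A and n are hypothetical.
//
//   container::cuSolverConnector::potrf(handle, 'U', n, d_A, n);
//   container::cuSolverConnector::potri(handle, 'U', 'N', n, d_A, n);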
153
154
155static inline
156void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float * W)
157{
158 // prepare some values for cusolverDnSsyevd_bufferSize
159 int lwork = 0;
160 int h_info = 0;
161 int* d_info = nullptr;
162 float* d_work = nullptr;
163 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
164
165 // calculate the sizes needed for pre-allocated buffer.
166 cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
167 n, A, lda, W, &lwork));
168 // allocate memory
169 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
170 // compute eigenvalues and eigenvectors.
171 cusolverErrcheck(cusolverDnSsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
172 n, A, lda, W, d_work, lwork, d_info));
173
174 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
175 if (h_info != 0) {
176 throw std::runtime_error("heevd: failed to compute the eigenvalue decomposition");
177 }
178 cudaErrcheck(cudaFree(d_info));
179 cudaErrcheck(cudaFree(d_work));
180}
181static inline
182void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double * W)
183{
184 // prepare some values for cusolverDnDsyevd_bufferSize
185 int lwork = 0;
186 int h_info = 0;
187 int* d_info = nullptr;
188 double* d_work = nullptr;
189 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
190
191 // calculate the sizes needed for pre-allocated buffer.
192 cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
193 n, A, lda, W, &lwork));
194 // allocate memory
195 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
196 // compute eigenvalues and eigenvectors.
197 cusolverErrcheck(cusolverDnDsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
198 n, A, lda, W, d_work, lwork, d_info));
199
200 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
201 if (h_info != 0) {
202 throw std::runtime_error("heevd: failed to compute the eigenvalue decomposition");
203 }
204 cudaErrcheck(cudaFree(d_info));
205 cudaErrcheck(cudaFree(d_work));
206}
207static inline
208void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, float * W)
209{
210 // prepare some values for cusolverDnCheevd_bufferSize
211 int lwork = 0;
212 int h_info = 0;
213 int* d_info = nullptr;
214 cuComplex* d_work = nullptr;
215 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
216
217 // calculate the sizes needed for pre-allocated buffer.
218 cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
219 n, reinterpret_cast<cuComplex*>(A), lda, W, &lwork));
220 // allocate memory
221 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
222 // compute eigenvalues and eigenvectors.
223 cusolverErrcheck(cusolverDnCheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
224 n, reinterpret_cast<cuComplex*>(A), lda, W, d_work, lwork, d_info));
225
226 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
227 if (h_info != 0) {
228 throw std::runtime_error("heevd: failed to compute the eigenvalue decomposition");
229 }
230 cudaErrcheck(cudaFree(d_info));
231 cudaErrcheck(cudaFree(d_work));
232}
233static inline
234void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<double>* A, const int& lda, double* W)
235{
236 // prepare some values for cusolverDnZheevd_bufferSize
237 int lwork = 0;
238 int h_info = 0;
239 int* d_info = nullptr;
240 cuDoubleComplex* d_work = nullptr;
241 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
242
243 // calculate the sizes needed for pre-allocated buffer.
244 cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
245 n, reinterpret_cast<cuDoubleComplex*>(A), lda, W, &lwork));
246 // allocate memory
247 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
248 // compute eigenvalues and eigenvectors.
249 cusolverErrcheck(cusolverDnZheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
250 n, reinterpret_cast<cuDoubleComplex*>(A), lda, W, d_work, lwork, d_info));
251
252 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
253 if (h_info != 0) {
254 throw std::runtime_error("heevd: failed to compute the eigenvalue decomposition");
255 }
256 cudaErrcheck(cudaFree(d_info));
257 cudaErrcheck(cudaFree(d_work));
258}
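//
// Usage sketch (illustrative only): full eigendecomposition of a Hermitian matrix.
// d_H (n x n, std::complex<double>, device) and d_W (n doubles, device) are hypothetical;
// on return d_H holds the eigenvectors and d_W the eigenvalues in ascending order.
//
//   container::cuSolverConnector::heevd(handle, 'V', 'U', n, d_H, n, d_W);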
259
260// =====================================================================================================
261// heevdx: Compute selected eigenvalues and, optionally, eigenvectors of a symmetric/Hermitian matrix
262// =====================================================================================================
263// --- float ---
264static inline
265void heevdx(cusolverDnHandle_t& cusolver_handle,
266 const int n,
267 const int lda,
268 float* d_A,
269 const char jobz,
270 const char uplo,
271 const char range,
272 const int il, const int iu,
273 const float vl, const float vu,
274 float* d_eigen_val,
275 int* h_meig)
276{
277 int lwork = 0;
278 int* d_info = nullptr;
279 float* d_work = nullptr;
280
281 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
282
283 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
284 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
285 cusolverEigRange_t range_t = cublas_eig_range(range);
286
287 cusolverErrcheck(cusolverDnSsyevdx_bufferSize(
288 cusolver_handle,
289 jobz_t, range_t, uplo_t,
290 n, d_A, lda,
291 vl, vu, il, iu,
292 h_meig, // ← int* output: number of eigenvalues found
293 d_eigen_val, // ← const float* W (used for query, can be dummy)
294 &lwork // ← int* lwork (output)
295 ));
296
297 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
298
299 // Main call
300 cusolverErrcheck(cusolverDnSsyevdx(
301 cusolver_handle,
302 jobz_t, range_t, uplo_t,
303 n,
304 d_A, lda,
305 vl, vu, il, iu,
306 h_meig, // ← int* output
307 d_eigen_val, // ← float* W: eigenvalues
308 d_work, lwork,
309 d_info
310 ));
311
312 int h_info = 0;
313 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
314 if (h_info != 0) {
315 cudaFree(d_info); cudaFree(d_work);
316 throw std::runtime_error("heevdx (float) failed with info = " + std::to_string(h_info));
317 }
318
319 cudaFree(d_info);
320 cudaFree(d_work);
321}
322
323// --- double ---
324static inline
325void heevdx(cusolverDnHandle_t& cusolver_handle,
326 const int n,
327 const int lda,
328 double* d_A,
329 const char jobz,
330 const char uplo,
331 const char range,
332 const int il, const int iu,
333 const double vl, const double vu,
334 double* d_eigen_val,
335 int* h_meig)
336{
337 int lwork = 0;
338 int* d_info = nullptr;
339 double* d_work = nullptr;
340
341 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
342
343 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
344 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
345 cusolverEigRange_t range_t = cublas_eig_range(range);
346
347 cusolverErrcheck(cusolverDnDsyevdx_bufferSize(
348 cusolver_handle,
349 jobz_t, range_t, uplo_t,
350 n, d_A, lda,
351 vl, vu, il, iu,
352 h_meig,
353 d_eigen_val,
354 &lwork
355 ));
356
357 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
358
359 cusolverErrcheck(cusolverDnDsyevdx(
360 cusolver_handle,
361 jobz_t, range_t, uplo_t,
362 n,
363 d_A, lda,
364 vl, vu, il, iu,
365 h_meig,
366 d_eigen_val,
367 d_work, lwork,
368 d_info
369 ));
370
371 int h_info = 0;
372 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
373 if (h_info != 0) {
374 cudaFree(d_info); cudaFree(d_work);
375 throw std::runtime_error("heevdx (double) failed with info = " + std::to_string(h_info));
376 }
377
378 cudaFree(d_info);
379 cudaFree(d_work);
380}
381
382// --- complex<float> ---
383static inline
384void heevdx(cusolverDnHandle_t& cusolver_handle,
385 const int n,
386 const int lda,
387 std::complex<float>* d_A,
388 const char jobz,
389 const char uplo,
390 const char range,
391 const int il, const int iu,
392 const float vl, const float vu,
393 float* d_eigen_val,
394 int* h_meig)
395{
396 int lwork = 0;
397 int* d_info = nullptr;
398 cuComplex* d_work = nullptr;
399
400 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
401
402 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
403 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
404 cusolverEigRange_t range_t = cublas_eig_range(range);
405
406 cusolverErrcheck(cusolverDnCheevdx_bufferSize(
407 cusolver_handle,
408 jobz_t, range_t, uplo_t,
409 n,
410 reinterpret_cast<cuComplex*>(d_A), lda,
411 vl, vu, il, iu,
412 h_meig,
413 d_eigen_val,
414 &lwork
415 ));
416
417 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
418
419 cusolverErrcheck(cusolverDnCheevdx(
420 cusolver_handle,
421 jobz_t, range_t, uplo_t,
422 n,
423 reinterpret_cast<cuComplex*>(d_A), lda,
424 vl, vu, il, iu,
425 h_meig,
426 d_eigen_val,
427 d_work, lwork,
428 d_info
429 ));
430
431 int h_info = 0;
432 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
433 if (h_info != 0) {
434 cudaFree(d_info); cudaFree(d_work);
435 throw std::runtime_error("heevdx (complex<float>) failed with info = " + std::to_string(h_info));
436 }
437
438 cudaFree(d_info);
439 cudaFree(d_work);
440}
441
442// --- complex<double> ---
443static inline
444void heevdx(cusolverDnHandle_t& cusolver_handle,
445 const int n,
446 const int lda,
447 std::complex<double>* d_A,
448 const char jobz,
449 const char uplo,
450 const char range,
451 const int il, const int iu,
452 const double vl, const double vu,
453 double* d_eigen_val,
454 int* h_meig)
455{
456 int lwork = 0;
457 int* d_info = nullptr;
458 cuDoubleComplex* d_work = nullptr;
459
460 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
461
462 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
463 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
464 cusolverEigRange_t range_t = cublas_eig_range(range);
465
466 cusolverErrcheck(cusolverDnZheevdx_bufferSize(
467 cusolver_handle,
468 jobz_t, range_t, uplo_t,
469 n,
470 reinterpret_cast<cuDoubleComplex*>(d_A), lda,
471 vl, vu, il, iu,
472 h_meig,
473 d_eigen_val,
474 &lwork
475 ));
476
477 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
478
479 cusolverErrcheck(cusolverDnZheevdx(
480 cusolver_handle,
481 jobz_t, range_t, uplo_t,
482 n,
483 reinterpret_cast<cuDoubleComplex*>(d_A), lda,
484 vl, vu, il, iu,
485 h_meig,
486 d_eigen_val,
487 d_work, lwork,
488 d_info
489 ));
490
491 int h_info = 0;
492 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
493 if (h_info != 0) {
494 cudaFree(d_info); cudaFree(d_work);
495 throw std::runtime_error("heevdx (complex<double>) failed with info = " + std::to_string(h_info));
496 }
497
498 cudaFree(d_info);
499 cudaFree(d_work);
500}
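//
// Usage sketch (illustrative only): the lowest nb eigenpairs via heevdx. Note the
// argument order of this wrapper: (n, lda, A, jobz, uplo, range, il, iu, vl, vu, W, meig).
// d_H, d_W and nb are hypothetical; range 'I' selects eigenvalues il..iu (1-based).
//
//   int meig = 0;
//   container::cuSolverConnector::heevdx(handle, n, n, d_H, 'V', 'U', 'I',
//                                        1, nb, 0.0, 0.0, d_W, &meig);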
501
502static inline
503void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float* B, const int& ldb, float * W)
504{
505 // prepare some values for cusolverDnSsygvd_bufferSize
506 int lwork = 0;
507 int h_info = 0;
508 int* d_info = nullptr;
509 float* d_work = nullptr;
510 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
511
512 // calculate the sizes needed for pre-allocated buffer.
513 cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
514 n, A, lda, B, ldb, W, &lwork));
515 // allocate memory
516 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
517 // compute eigenvalues and eigenvectors.
518 cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
519 n, A, lda, B, ldb, W, d_work, lwork, d_info));
520
521 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
522 if (h_info != 0) {
523 throw std::runtime_error("hegvd: failed to compute the generalized eigenvalue decomposition");
524 }
525 cudaErrcheck(cudaFree(d_info));
526 cudaErrcheck(cudaFree(d_work));
527}
528static inline
529void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double* B, const int& ldb, double * W)
530{
531 // prepare some values for cusolverDnDsygvd_bufferSize
532 int lwork = 0;
533 int h_info = 0;
534 int* d_info = nullptr;
535 double* d_work = nullptr;
536 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
537
538 // calculate the sizes needed for pre-allocated buffer.
539 cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
540 n, A, lda, B, ldb, W, &lwork));
541 // allocate memory
542 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
543 // compute eigenvalues and eigenvectors.
544 cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
545 n, A, lda, B, ldb, W, d_work, lwork, d_info));
546
547 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
548 if (h_info != 0) {
549 throw std::runtime_error("hegvd: failed to compute the generalized eigenvalue decomposition");
550 }
551 cudaErrcheck(cudaFree(d_info));
552 cudaErrcheck(cudaFree(d_work));
553}
554static inline
555void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb, float* W)
556{
557 // prepare some values for cusolverDnChegvd_bufferSize
558 int lwork = 0;
559 int h_info = 0;
560 int* d_info = nullptr;
561 cuComplex* d_work = nullptr;
562 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
563
564 // calculate the sizes needed for pre-allocated buffer.
565 cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
566 n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb, W, &lwork));
567 // allocate memory
568 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
569 // compute eigenvalues and eigenvectors.
570 cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
571 n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb, W, d_work, lwork, d_info));
572
573 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
574 if (h_info != 0) {
575 throw std::runtime_error("hegvd: failed to compute the generalized eigenvalue decomposition");
576 }
577 cudaErrcheck(cudaFree(d_info));
578 cudaErrcheck(cudaFree(d_work));
579}
580static inline
581void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb, double* W)
582{
583 // prepare some values for cusolverDnZhegvd_bufferSize
584 int lwork = 0;
585 int h_info = 0;
586 int* d_info = nullptr;
587 cuDoubleComplex* d_work = nullptr;
588 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
589
590 // calculate the sizes needed for pre-allocated buffer.
591 cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
592 n, reinterpret_cast<cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb, W, &lwork));
593 // allocate memory
594 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
595 // compute eigenvalues and eigenvectors.
596 cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
597 n, reinterpret_cast<cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb, W, d_work, lwork, d_info));
598
599 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
600 if (h_info != 0) {
601 throw std::runtime_error("hegvd: failed to compute the generalized eigenvalue decomposition");
602 }
603 cudaErrcheck(cudaFree(d_info));
604 cudaErrcheck(cudaFree(d_work));
605}
606
607// =====================================================================================================
608// hegvdx: Compute selected eigenvalues and eigenvectors of the generalized Hermitian-definite eigenproblem
609// A * x = lambda * B * x
610// =====================================================================================================
611
612// --- float ---
613static inline
614void hegvdx(
615 cusolverDnHandle_t& cusolver_handle,
616 const int itype, // 1: A*x = lambda*B*x
617 const char jobz, // 'V' or 'N'
618 const char range, // 'I', 'V', 'A'
619 const char uplo, // 'U' or 'L'
620 const int n,
621 const int lda,
622 float* d_A, // Input matrix A (device)
623 float* d_B, // Input matrix B (device)
624 const float vl, // for RANGE='V'
625 const float vu,
626 const int il, // for RANGE='I'
627 const int iu,
628 int* h_meig, // output: number of eigenvalues found
629 float* d_eigen_val, // output: eigenvalues
630 float* d_eigen_vec // output: eigenvectors (if jobz='V'), size ldz × m
631) {
632 int lwork = 0;
633 int *d_info = nullptr;
634 float *d_work = nullptr;
635
636 // Allocate device info
637 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
638
639 // Copy A and B to temporary buffers since sygvdx may modify them
640 float *d_A_copy = nullptr, *d_B_copy = nullptr;
641 cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(float) * n * lda));
642 cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(float) * n * lda));
643 cudaErrcheck(cudaMemcpy(d_A_copy, d_A, sizeof(float) * n * lda, cudaMemcpyDeviceToDevice));
644 cudaErrcheck(cudaMemcpy(d_B_copy, d_B, sizeof(float) * n * lda, cudaMemcpyDeviceToDevice));
645
646 // Set parameters
647 cusolverEigType_t itype_t = cublas_eig_type(itype);
648 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
649 cusolverEigRange_t range_t = cublas_eig_range(range);
650 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
651
652 // Query workspace size
653 cusolverErrcheck(cusolverDnSsygvdx_bufferSize(
654 cusolver_handle,
655 itype_t, jobz_t, range_t, uplo_t,
656 n,
657 d_A_copy, lda,
658 d_B_copy, lda,
659 vl, vu, il, iu,
660 h_meig,
661 d_eigen_val,
662 &lwork
663 ));
664
665 // Allocate workspace
666 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
667
668 // Main call
669 cusolverErrcheck(cusolverDnSsygvdx(
670 cusolver_handle,
671 itype_t, jobz_t, range_t, uplo_t,
672 n,
673 d_A_copy, lda,
674 d_B_copy, lda,
675 vl, vu, il, iu,
676 h_meig,
677 d_eigen_val,
678 d_work, lwork,
679 d_info
680 ));
681
682 // Check result
683 int h_info = 0;
684 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
685 if (h_info < 0) {
686 throw std::runtime_error("hegvdx (float): illegal argument #" + std::to_string(-h_info));
687 } else if (h_info > 0) {
688 // If h_info <= n: convergence issue in tridiag solver (no vec) OR
689 // If h_info > n: B's leading minor of order (h_info - n) is not positive definite
690 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
691 throw std::runtime_error("hegvdx (float): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
692 } else if (h_info > n) {
693 throw std::runtime_error("hegvdx (float): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
694 }
695 }
696
697 // If jobz == 'V', copy eigenvectors from A (which now contains Z) to output
698 if (jobz == 'V') {
699 const int m = (*h_meig); // number of eigenvectors computed
700 cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy, sizeof(float) * n * m, cudaMemcpyDeviceToDevice));
701 }
702
703 // Cleanup
704 cudaFree(d_info);
705 cudaFree(d_work);
706 cudaFree(d_A_copy);
707 cudaFree(d_B_copy);
708}
709
710
711// --- double ---
712static inline
713void hegvdx(
714 cusolverDnHandle_t& cusolver_handle,
715 const int itype,
716 const char jobz,
717 const char range,
718 const char uplo,
719 const int n,
720 const int lda,
721 double* d_A,
722 double* d_B,
723 const double vl,
724 const double vu,
725 const int il,
726 const int iu,
727 int* h_meig,
728 double* d_eigen_val,
729 double* d_eigen_vec
730) {
731 int lwork = 0;
732 int *d_info = nullptr;
733 double *d_work = nullptr;
734
735 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
736
737 double *d_A_copy = nullptr, *d_B_copy = nullptr;
738 cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(double) * n * lda));
739 cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(double) * n * lda));
740 cudaErrcheck(cudaMemcpy(d_A_copy, d_A, sizeof(double) * n * lda, cudaMemcpyDeviceToDevice));
741 cudaErrcheck(cudaMemcpy(d_B_copy, d_B, sizeof(double) * n * lda, cudaMemcpyDeviceToDevice));
742
743 cusolverEigType_t itype_t = cublas_eig_type(itype);
744 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
745 cusolverEigRange_t range_t = cublas_eig_range(range);
746 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
747
748 cusolverErrcheck(cusolverDnDsygvdx_bufferSize(
749 cusolver_handle,
750 itype_t, jobz_t, range_t, uplo_t,
751 n,
752 d_A_copy, lda,
753 d_B_copy, lda,
754 vl, vu, il, iu,
755 h_meig,
756 d_eigen_val,
757 &lwork
758 ));
759
760 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
761
762 cusolverErrcheck(cusolverDnDsygvdx(
763 cusolver_handle,
764 itype_t, jobz_t, range_t, uplo_t,
765 n,
766 d_A_copy, lda,
767 d_B_copy, lda,
768 vl, vu, il, iu,
769 h_meig,
770 d_eigen_val,
771 d_work, lwork,
772 d_info
773 ));
774
775 int h_info = 0;
776 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
777 if (h_info < 0) {
778 throw std::runtime_error("hegvdx (double): illegal argument #" + std::to_string(-h_info));
779 } else if (h_info > 0) {
780 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
781 throw std::runtime_error("hegvdx (double): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
782 } else if (h_info > n) {
783 throw std::runtime_error("hegvdx (double): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
784 }
785 }
786
787 if (jobz == 'V') {
788 const int m = (*h_meig);
789 cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy, sizeof(double) * n * m, cudaMemcpyDeviceToDevice));
790 }
791
792 cudaFree(d_info);
793 cudaFree(d_work);
794 cudaFree(d_A_copy);
795 cudaFree(d_B_copy);
796}
797
798
799// --- complex<float> ---
800static inline
801void hegvdx(
802 cusolverDnHandle_t& cusolver_handle,
803 const int itype,
804 const char jobz,
805 const char range,
806 const char uplo,
807 const int n,
808 const int lda,
809 std::complex<float>* d_A,
810 std::complex<float>* d_B,
811 const float vl,
812 const float vu,
813 const int il,
814 const int iu,
815 int* h_meig,
816 float* d_eigen_val,
817 std::complex<float>* d_eigen_vec
818) {
819 int lwork = 0;
820 int *d_info = nullptr;
821 cuComplex *d_work = nullptr;
822
823 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
824
825 cuComplex *d_A_copy = nullptr, *d_B_copy = nullptr;
826 cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(cuComplex) * n * lda));
827 cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(cuComplex) * n * lda));
828 cudaErrcheck(cudaMemcpy(d_A_copy, reinterpret_cast<cuComplex*>(d_A), sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
829 cudaErrcheck(cudaMemcpy(d_B_copy, reinterpret_cast<cuComplex*>(d_B), sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
830
831 cusolverEigType_t itype_t = cublas_eig_type(itype);
832 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
833 cusolverEigRange_t range_t = cublas_eig_range(range);
834 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
835
836 cusolverErrcheck(cusolverDnChegvdx_bufferSize(
837 cusolver_handle,
838 itype_t, jobz_t, range_t, uplo_t,
839 n,
840 d_A_copy, lda,
841 d_B_copy, lda,
842 vl, vu, il, iu,
843 h_meig,
844 d_eigen_val,
845 &lwork
846 ));
847
848 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
849
850 cusolverErrcheck(cusolverDnChegvdx(
851 cusolver_handle,
852 itype_t, jobz_t, range_t, uplo_t,
853 n,
854 d_A_copy, lda,
855 d_B_copy, lda,
856 vl, vu, il, iu,
857 h_meig,
858 d_eigen_val,
859 d_work, lwork,
860 d_info
861 ));
862
863 int h_info = 0;
864 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
865 if (h_info < 0) {
866 throw std::runtime_error("hegvdx (complex<float>): illegal argument #" + std::to_string(-h_info));
867 } else if (h_info > 0) {
868 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
869 throw std::runtime_error("hegvdx (complex<float>): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
870 } else if (h_info > n) {
871 throw std::runtime_error("hegvdx (complex<float>): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
872 }
873 }
874
875 if (jobz == 'V') {
876 const int m = (*h_meig);
877 cudaErrcheck(cudaMemcpy(reinterpret_cast<cuComplex*>(d_eigen_vec), d_A_copy, sizeof(cuComplex) * n * m, cudaMemcpyDeviceToDevice));
878 }
879
880 cudaFree(d_info);
881 cudaFree(d_work);
882 cudaFree(d_A_copy);
883 cudaFree(d_B_copy);
884}
885
886
887// --- complex<double> ---
888static inline
889void hegvdx(
890 cusolverDnHandle_t& cusolver_handle,
891 const int itype,
892 const char jobz,
893 const char range,
894 const char uplo,
895 const int n,
896 const int lda,
897 std::complex<double>* d_A,
898 std::complex<double>* d_B,
899 const double vl,
900 const double vu,
901 const int il,
902 const int iu,
903 int* h_meig,
904 double* d_eigen_val,
905 std::complex<double>* d_eigen_vec
906) {
907 int lwork = 0;
908 int *d_info = nullptr;
909 cuDoubleComplex *d_work = nullptr;
910
911 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
912
913 cuDoubleComplex *d_A_copy = nullptr, *d_B_copy = nullptr;
914 cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(cuDoubleComplex) * n * lda));
915 cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(cuDoubleComplex) * n * lda));
916 cudaErrcheck(cudaMemcpy(d_A_copy, reinterpret_cast<cuDoubleComplex*>(d_A), sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
917 cudaErrcheck(cudaMemcpy(d_B_copy, reinterpret_cast<cuDoubleComplex*>(d_B), sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
918
919 cusolverEigType_t itype_t = cublas_eig_type(itype);
920 cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
921 cusolverEigRange_t range_t = cublas_eig_range(range);
922 cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
923
924 cusolverErrcheck(cusolverDnZhegvdx_bufferSize(
925 cusolver_handle,
926 itype_t, jobz_t, range_t, uplo_t,
927 n,
928 d_A_copy, lda,
929 d_B_copy, lda,
930 vl, vu, il, iu,
931 h_meig,
932 d_eigen_val,
933 &lwork
934 ));
935
936 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
937
938 cusolverErrcheck(cusolverDnZhegvdx(
939 cusolver_handle,
940 itype_t, jobz_t, range_t, uplo_t,
941 n,
942 d_A_copy, lda,
943 d_B_copy, lda,
944 vl, vu, il, iu,
945 h_meig,
946 d_eigen_val,
947 d_work, lwork,
948 d_info
949 ));
950
951 int h_info = 0;
952 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
953 if (h_info < 0) {
954 throw std::runtime_error("hegvdx (complex<double>): illegal argument #" + std::to_string(-h_info));
955 } else if (h_info > 0) {
956 if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
957 throw std::runtime_error("hegvdx (complex<double>): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
958 } else if (h_info > n) {
959 throw std::runtime_error("hegvdx (complex<double>): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
960 }
961 }
962
963 if (jobz == 'V') {
964 const int m = (*h_meig);
965 cudaErrcheck(cudaMemcpy(reinterpret_cast<cuDoubleComplex*>(d_eigen_vec), d_A_copy, sizeof(cuDoubleComplex) * n * m, cudaMemcpyDeviceToDevice));
966 }
967
968 cudaFree(d_info);
969 cudaFree(d_work);
970 cudaFree(d_A_copy);
971 cudaFree(d_B_copy);
972}
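//
// Usage sketch (illustrative only): the lowest nb eigenpairs of A x = lambda B x via
// hegvdx (itype = 1). Unlike the other wrappers, A and B are copied internally, so the
// inputs are preserved and the selected eigenvectors are returned in d_V (n x nb).
// d_A, d_B, d_W, d_V and nb are hypothetical.
//
//   int meig = 0;
//   container::cuSolverConnector::hegvdx(handle, 1, 'V', 'I', 'U', n, n,
//                                        d_A, d_B, 0.0, 0.0, 1, nb,
//                                        &meig, d_W, d_V);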
973
974
975// --- getrf
976static inline
977void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, float* A, const int& lda, int* ipiv)
978{
979 // prepare some values for cusolverDnSgetrf_bufferSize
980 int lwork = 0;
981 int h_info = 0;
982 int* d_info = nullptr;
983 float* d_work = nullptr;
984 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
985
986 // calculate the sizes needed for pre-allocated buffer.
987 cusolverErrcheck(cusolverDnSgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
988
989 // allocate memory
990 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
991
992 // Perform LU decomposition
993 cusolverErrcheck(cusolverDnSgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
994
995 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
996 if (h_info != 0) {
997 throw std::runtime_error("getrf: failed to compute LU factorization");
998 }
999
1000 cudaErrcheck(cudaFree(d_work));
1001 cudaErrcheck(cudaFree(d_info));
1002}
1003static inline
1004void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, double* A, const int& lda, int* ipiv)
1005{
1006 // prepare some values for cusolverDnDgetrf_bufferSize
1007 int lwork = 0;
1008 int h_info = 0;
1009 int* d_info = nullptr;
1010 double* d_work = nullptr;
1011 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1012
1013 // calculate the sizes needed for pre-allocated buffer.
1014 cusolverErrcheck(cusolverDnDgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
1015
1016 // allocate memory
1017 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
1018
1019 // Perform LU decomposition
1020 cusolverErrcheck(cusolverDnDgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
1021
1022 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1023 if (h_info != 0) {
1024 throw std::runtime_error("getrf: failed to compute LU factorization");
1025 }
1026
1027 cudaErrcheck(cudaFree(d_work));
1028 cudaErrcheck(cudaFree(d_info));
1029}
1030static inline
1031void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex<float>* A, const int& lda, int* ipiv)
1032{
1033 // prepare some values for cusolverDnCgetrf_bufferSize
1034 int lwork = 0;
1035 int h_info = 0;
1036 int* d_info = nullptr;
1037 cuComplex* d_work = nullptr;
1038 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1039
1040 // calculate the sizes needed for pre-allocated buffer.
1041 cusolverErrcheck(cusolverDnCgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, &lwork));
1042
1043 // allocate memory
1044 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
1045
1046 // Perform LU decomposition
1047 cusolverErrcheck(cusolverDnCgetrf(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, d_work, ipiv, d_info));
1048
1049 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1050 if (h_info != 0) {
1051 throw std::runtime_error("getrf: failed to compute LU factorization");
1052 }
1053
1054 cudaErrcheck(cudaFree(d_work));
1055 cudaErrcheck(cudaFree(d_info));
1056}
1057static inline
1058void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex<double>* A, const int& lda, int* ipiv)
1059{
1060 // prepare some values for cusolverDnZgetrf_bufferSize
1061 int lwork = 0;
1062 int h_info = 0;
1063 int* d_info = nullptr;
1064 cuDoubleComplex* d_work = nullptr;
1065 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1066
1067 // calculate the sizes needed for pre-allocated buffer.
1068 cusolverErrcheck(cusolverDnZgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda, &lwork));
1069
1070 // allocate memory
1071 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
1072
1073 // Perform LU decomposition
1074 cusolverErrcheck(cusolverDnZgetrf(cusolver_handle, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda, d_work, ipiv, d_info));
1075
1076 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1077 if (h_info != 0) {
1078 throw std::runtime_error("getrf: failed to compute LU factorization");
1079 }
1080
1081 cudaErrcheck(cudaFree(d_work));
1082 cudaErrcheck(cudaFree(d_info));
1083}
1084
1085static inline
1086void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, float* A, const int& lda, const int* ipiv, float* B, const int& ldb)
1087{
1088 int h_info = 0;
1089 int* d_info = nullptr;
1090 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1091
1092 cusolverErrcheck(cusolverDnSgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1093
1094 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1095 if (h_info != 0) {
1096 throw std::runtime_error("getrs: failed to solve the linear system");
1097 }
1098
1099 cudaErrcheck(cudaFree(d_info));
1100}
1101static inline
1102void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, double* A, const int& lda, const int* ipiv, double* B, const int& ldb)
1103{
1104 int h_info = 0;
1105 int* d_info = nullptr;
1106 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1107
1108 cusolverErrcheck(cusolverDnDgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
1109
1110 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1111 if (h_info != 0) {
1112 throw std::runtime_error("getrs: failed to solve the linear system");
1113 }
1114
1115 cudaErrcheck(cudaFree(d_info));
1116}
1117static inline
1118void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex<float>* A, const int& lda, const int* ipiv, std::complex<float>* B, const int& ldb)
1119{
1120 int h_info = 0;
1121 int* d_info = nullptr;
1122 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1123
1124 cusolverErrcheck(cusolverDnCgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast<cuComplex*>(A), lda, ipiv, reinterpret_cast<cuComplex*>(B), ldb, d_info));
1125
1126 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1127 if (h_info != 0) {
1128 throw std::runtime_error("getrs: failed to solve the linear system");
1129 }
1130
1131 cudaErrcheck(cudaFree(d_info));
1132}
1133static inline
1134void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex<double>* A, const int& lda, const int* ipiv, std::complex<double>* B, const int& ldb)
1135{
1136 int h_info = 0;
1137 int* d_info = nullptr;
1138 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
1139
1140 cusolverErrcheck(cusolverDnZgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast<cuDoubleComplex*>(A), lda, ipiv, reinterpret_cast<cuDoubleComplex*>(B), ldb, d_info));
1141
1142 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1143 if (h_info != 0) {
1144 throw std::runtime_error("getrs: failed to solve the linear system");
1145 }
1146
1147 cudaErrcheck(cudaFree(d_info));
1148}
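//
// Usage sketch (illustrative only): solving A x = b through LU factorization. getrf
// overwrites d_A with its LU factors and fills the device pivot array d_ipiv (n ints);
// getrs then overwrites d_b (n x nrhs) with the solution. All names are hypothetical.
//
//   container::cuSolverConnector::getrf(handle, n, n, d_A, n, d_ipiv);
//   container::cuSolverConnector::getrs(handle, 'N', n, nrhs, d_A, n, d_ipiv, d_b, n);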
1149
1150// QR decomposition
1151// geqrf, orgqr
1152// Note:
1153// There are two cuSOLVER geqrf interfaces:
1154// one is the legacy cusolverDn<t>geqrf,
1155// the other is the generic cusolverDnXgeqrf;
1156// which one is preferable is still an open question.
1157//
1158// template<typename T>
1159// static inline void geqrf(
1160// cusolverDnHandle_t& cusolver_handle,
1161// const int64_t m,
1162// const int64_t n,
1163// T* d_A, // device matrix A (m x n, column-major)
1164// const int64_t lda,
1165// T* d_tau // output: scalar factors of elementary reflectors
1166// ) {
1167// // query workspace size
1168// int *d_info = nullptr; /* error info */
1169//
1170// size_t workspaceInBytesOnDevice = 0; /* size of workspace */
1171// void *d_work = nullptr; /* device workspace */
1172// size_t workspaceInBytesOnHost = 0; /* size of workspace */
1173// void *h_work = nullptr; /* host workspace */
1174//
1175// cudaErrcheck(cudaMalloc(reinterpret_cast<void **>(&d_info), sizeof(int)));
1176//
1177// cusolverDnParams_t params = NULL;
1178// cusolverErrcheck(cusolverDnCreateParams(&params));
1179//
1180// cusolverErrcheck(cusolverDnXgeqrf_bufferSize(
1181// cusolver_handle,
1182// params,
1183// m, n,
1184// traits<T>::cuda_data_type,
1185// d_A,
1186// lda,
1187// traits<T>::cuda_data_type,
1188// d_tau,
1189// traits<T>::cuda_data_type,
1190// &workspaceInBytesOnDevice,
1191// &workspaceInBytesOnHost
1192// ));
1193//
1194// // allocate device workspace
1195// cudaErrcheck(cudaMalloc(reinterpret_cast<void **>(&d_work), workspaceInBytesOnDevice));
1196//
1197// // allocate host workspace
1198// if (workspaceInBytesOnHost > 0) {
1199// h_work = reinterpret_cast<void *>(malloc(workspaceInBytesOnHost));
1200// if (h_work == nullptr) {
1201// throw std::runtime_error("Error: h_work not allocated.");
1202// }
1203// }
1204//
1205// // QR factorization
1206// cusolverErrcheck(cusolverDnXgeqrf(
1207// cusolver_handle,
1208// params,
1209// m, n,
1210// traits<T>::cuda_data_type,
1211// d_A,
1212// lda,
1213// traits<T>::cuda_data_type,
1214// d_tau,
1215// traits<T>::cuda_data_type,
1216// d_work,
1217// workspaceInBytesOnDevice,
1218// h_work,
1219// workspaceInBytesOnHost,
1220// d_info
1221// ));
1222//
1223// // check info
1224// int h_info = 0;
1225// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1226// if (h_info != 0) {
1227// // std::printf("%d-th parameter is wrong \n", -info);
1228// // print error message
1229// std::cout << -h_info << "th parameter is wrong" << std::endl;
1230// throw std::runtime_error("geqrf: failed to compute QR decomposition");
1231// }
1232//
1233// // clean workspace
1234// cudaErrcheck(cudaFree(d_info));
1235// cudaErrcheck(cudaFree(d_work));
1236// if (h_work) free(h_work);
1237// cusolverErrcheck(cusolverDnDestroyParams(params));
1238// }
1239
1240// geqrf
1241
1242// --- float ---
1243static inline void geqrf(
1244 cusolverDnHandle_t& cusolver_handle,
1245 const int m,
1246 const int n,
1247 float* d_A,
1248 const int lda,
1249 float* d_tau
1250) {
1251 int lwork = 0;
1252 cusolverErrcheck(cusolverDnSgeqrf_bufferSize(
1253 cusolver_handle, m, n, d_A, lda, &lwork));
1254
1255 float* d_work = nullptr;
1256 int* d_info = nullptr;
1257
1258 if (lwork > 0) {
1259 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(float) * lwork));
1260 }
1261 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1262
1263 cusolverErrcheck(cusolverDnSgeqrf(
1264 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1265
1266 int h_info = 0;
1267 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1268 if (h_info != 0) {
1269 std::cout << "geqrf (S): info = " << h_info << std::endl;
1270 if (d_work) cudaErrcheck(cudaFree(d_work));
1271 cudaErrcheck(cudaFree(d_info));
1272 throw std::runtime_error("geqrf (S): QR factorization failed");
1273 }
1274
1275 if (d_work) cudaErrcheck(cudaFree(d_work));
1276 cudaErrcheck(cudaFree(d_info));
1277}
1278
1279// --- double ---
1280static inline void geqrf(
1281 cusolverDnHandle_t& cusolver_handle,
1282 const int m,
1283 const int n,
1284 double* d_A,
1285 const int lda,
1286 double* d_tau
1287) {
1288 int lwork = 0;
1289 cusolverErrcheck(cusolverDnDgeqrf_bufferSize(
1290 cusolver_handle, m, n, d_A, lda, &lwork));
1291
1292 double* d_work = nullptr;
1293 int* d_info = nullptr;
1294
1295 if (lwork > 0) {
1296 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(double) * lwork));
1297 }
1298 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1299
1300 cusolverErrcheck(cusolverDnDgeqrf(
1301 cusolver_handle, m, n, d_A, lda, d_tau, d_work, lwork, d_info));
1302
1303 int h_info = 0;
1304 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1305 if (h_info != 0) {
1306 std::cout << "geqrf (D): info = " << h_info << std::endl;
1307 if (d_work) cudaErrcheck(cudaFree(d_work));
1308 cudaErrcheck(cudaFree(d_info));
1309 throw std::runtime_error("geqrf (D): QR factorization failed");
1310 }
1311
1312 if (d_work) cudaErrcheck(cudaFree(d_work));
1313 cudaErrcheck(cudaFree(d_info));
1314}
1315
1316// --- std::complex<float> ---
1317static inline void geqrf(
1318 cusolverDnHandle_t& cusolver_handle,
1319 const int m,
1320 const int n,
1321 std::complex<float>* d_A,
1322 const int lda,
1323 std::complex<float>* d_tau
1324) {
1325 int lwork = 0;
1326 cusolverErrcheck(cusolverDnCgeqrf_bufferSize(
1327 cusolver_handle, m, n,
1328 reinterpret_cast<cuComplex*>(d_A),
1329 lda,
1330 &lwork // ← this argument must be the address of lwork
1331 ));
1332
1333 cuComplex* d_work = nullptr;
1334 int* d_info = nullptr;
1335
1336 if (lwork > 0) {
1337 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));
1338 }
1339 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1340
1341 cusolverErrcheck(cusolverDnCgeqrf(
1342 cusolver_handle, m, n,
1343 reinterpret_cast<cuComplex*>(d_A),
1344 lda,
1345 reinterpret_cast<cuComplex*>(d_tau), // ← this argument is d_tau
1346 d_work, lwork, d_info));
1347
1348 int h_info = 0;
1349 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1350 if (h_info != 0) {
1351 std::cout << "geqrf (C): info = " << h_info << std::endl;
1352 if (d_work) cudaErrcheck(cudaFree(d_work));
1353 cudaErrcheck(cudaFree(d_info));
1354 throw std::runtime_error("geqrf (C): QR factorization failed");
1355 }
1356
1357 if (d_work) cudaErrcheck(cudaFree(d_work));
1358 cudaErrcheck(cudaFree(d_info));
1359}
1360
1361// --- std::complex<double> ---
1362static inline void geqrf(
1363 cusolverDnHandle_t& cusolver_handle,
1364 const int m,
1365 const int n,
1366 std::complex<double>* d_A,
1367 const int lda,
1368 std::complex<double>* d_tau
1369) {
1370 int lwork = 0;
1371 cusolverErrcheck(cusolverDnZgeqrf_bufferSize(
1372 cusolver_handle, m, n,
1373 reinterpret_cast<cuDoubleComplex*>(d_A),
1374 lda,
1375 &lwork
1376 ));
1377
1378 cuDoubleComplex* d_work = nullptr;
1379 int* d_info = nullptr;
1380
1381 if (lwork > 0) {
1382 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuDoubleComplex) * lwork));
1383 }
1384 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1385
1386 cusolverErrcheck(cusolverDnZgeqrf(
1387 cusolver_handle, m, n,
1388 reinterpret_cast<cuDoubleComplex*>(d_A),
1389 lda,
1390 reinterpret_cast<cuDoubleComplex*>(d_tau),
1391 d_work, lwork, d_info));
1392
1393 int h_info = 0;
1394 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1395 if (h_info != 0) {
1396 std::cout << "geqrf (Z): info = " << h_info << std::endl;
1397 if (d_work) cudaErrcheck(cudaFree(d_work));
1398 cudaErrcheck(cudaFree(d_info));
1399 throw std::runtime_error("geqrf (Z): QR factorization failed");
1400 }
1401
1402 if (d_work) cudaErrcheck(cudaFree(d_work));
1403 cudaErrcheck(cudaFree(d_info));
1404}
1405
1406
1407// --- float ---
1408static inline void orgqr(
1409 cusolverDnHandle_t& cusolver_handle,
1410 const int m,
1411 const int n,
1412 const int k,
1413 float* d_A,
1414 const int lda,
1415 float* d_tau
1416) {
1417 int lwork = 0;
1418 cusolverErrcheck(cusolverDnSorgqr_bufferSize(
1419 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1420
1421 float* d_work = nullptr;
1422 int* d_info = nullptr;
1423
1424 if (lwork > 0) {
1425 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(float) * lwork));
1426 }
1427 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1428
1429 cusolverErrcheck(cusolverDnSorgqr(
1430 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1431
1432 int h_info = 0;
1433 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1434 if (h_info != 0) {
1435 std::cout << "orgqr (S): info = " << h_info << " (failure at parameter " << -h_info << ")" << std::endl;
1436 if (d_work) cudaErrcheck(cudaFree(d_work));
1437 cudaErrcheck(cudaFree(d_info));
1438 throw std::runtime_error("orgqr (S): failed to generate Q matrix");
1439 }
1440
1441 // clean workspace
1442 if (d_work) cudaErrcheck(cudaFree(d_work));
1443 cudaErrcheck(cudaFree(d_info));
1444}
1445
1446// --- double ---
1447static inline void orgqr(
1448 cusolverDnHandle_t& cusolver_handle,
1449 const int m,
1450 const int n,
1451 const int k,
1452 double* d_A,
1453 const int lda,
1454 double* d_tau
1455) {
1456 int lwork = 0;
1457 cusolverErrcheck(cusolverDnDorgqr_bufferSize(
1458 cusolver_handle, m, n, k, d_A, lda, d_tau, &lwork));
1459
1460 double* d_work = nullptr;
1461 int* d_info = nullptr;
1462
1463 if (lwork > 0) {
1464 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(double) * lwork));
1465 }
1466 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1467
1468 cusolverErrcheck(cusolverDnDorgqr(
1469 cusolver_handle, m, n, k, d_A, lda, d_tau, d_work, lwork, d_info));
1470
1471 int h_info = 0;
1472 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1473 if (h_info != 0) {
1474 std::cout << "orgqr (D): info = " << h_info << std::endl;
1475 if (d_work) cudaErrcheck(cudaFree(d_work));
1476 cudaErrcheck(cudaFree(d_info));
1477 throw std::runtime_error("orgqr (D): failed to generate Q matrix");
1478 }
1479
1480 if (d_work) cudaErrcheck(cudaFree(d_work));
1481 cudaErrcheck(cudaFree(d_info));
1482}
1483
1484// --- std::complex<float> ---
1485static inline void orgqr(
1486 cusolverDnHandle_t& cusolver_handle,
1487 const int m,
1488 const int n,
1489 const int k,
1490 std::complex<float>* d_A,
1491 const int lda,
1492 std::complex<float>* d_tau
1493) {
1494 int lwork = 0;
1495 cusolverErrcheck(cusolverDnCungqr_bufferSize(
1496 cusolver_handle, m, n, k,
1497 reinterpret_cast<cuComplex*>(d_A),
1498 lda,
1499 reinterpret_cast<cuComplex*>(d_tau),
1500 &lwork));
1501
1502 cuComplex* d_work = nullptr;
1503 int* d_info = nullptr;
1504
1505 if (lwork > 0) {
1506 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuComplex) * lwork));
1507 }
1508 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1509
1510 cusolverErrcheck(cusolverDnCungqr(
1511 cusolver_handle, m, n, k,
1512 reinterpret_cast<cuComplex*>(d_A),
1513 lda,
1514 reinterpret_cast<cuComplex*>(d_tau),
1515 d_work, lwork, d_info));
1516
1517 int h_info = 0;
1518 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1519 if (h_info != 0) {
1520 std::cout << "orgqr (C): info = " << h_info << std::endl;
1521 if (d_work) cudaErrcheck(cudaFree(d_work));
1522 cudaErrcheck(cudaFree(d_info));
1523 throw std::runtime_error("orgqr (C): failed to generate Q matrix");
1524 }
1525
1526 if (d_work) cudaErrcheck(cudaFree(d_work));
1527 cudaErrcheck(cudaFree(d_info));
1528}
1529
1530// --- std::complex<double> ---
1531static inline void orgqr(
1532 cusolverDnHandle_t& cusolver_handle,
1533 const int m,
1534 const int n,
1535 const int k,
1536 std::complex<double>* d_A,
1537 const int lda,
1538 std::complex<double>* d_tau
1539) {
1540 int lwork = 0;
1541 cusolverErrcheck(cusolverDnZungqr_bufferSize(
1542 cusolver_handle, m, n, k,
1543 reinterpret_cast<cuDoubleComplex*>(d_A),
1544 lda,
1545 reinterpret_cast<cuDoubleComplex*>(d_tau),
1546 &lwork));
1547
1548 cuDoubleComplex* d_work = nullptr;
1549 int* d_info = nullptr;
1550
1551 if (lwork > 0) {
1552 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_work), sizeof(cuDoubleComplex) * lwork));
1553 }
1554 cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&d_info), sizeof(int)));
1555
1556 cusolverErrcheck(cusolverDnZungqr(
1557 cusolver_handle, m, n, k,
1558 reinterpret_cast<cuDoubleComplex*>(d_A),
1559 lda,
1560 reinterpret_cast<cuDoubleComplex*>(d_tau),
1561 d_work, lwork, d_info));
1562
1563 int h_info = 0;
1564 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
1565 if (h_info != 0) {
1566 std::cout << "orgqr (Z): info = " << h_info << std::endl;
1567 if (d_work) cudaErrcheck(cudaFree(d_work));
1568 cudaErrcheck(cudaFree(d_info));
1569 throw std::runtime_error("orgqr (Z): failed to generate Q matrix");
1570 }
1571
1572 if (d_work) cudaErrcheck(cudaFree(d_work));
1573 cudaErrcheck(cudaFree(d_info));
1574}
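//
// Usage sketch (illustrative only): thin QR of an m x n (m >= n) device matrix d_A.
// geqrf stores the Householder reflectors in d_A and their scalar factors in d_tau
// (n entries, device); orgqr then forms the first n columns of Q explicitly in d_A.
// d_A, d_tau, m and n are hypothetical.
//
//   container::cuSolverConnector::geqrf(handle, m, n, d_A, m, d_tau);
//   container::cuSolverConnector::orgqr(handle, m, n, n, d_A, m, d_tau);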
1575
1576
1577} // namespace cuSolverConnector
1578} // namespace container
1579
1580#endif // BASE_THIRD_PARTY_CUSOLVER_H_