ABACUS develop
Atomic-orbital Based Ab-initio Computation at UStc
Loading...
Searching...
No Matches
cusolver.h
Go to the documentation of this file.
1#ifndef BASE_THIRD_PARTY_CUSOLVER_H_
2#define BASE_THIRD_PARTY_CUSOLVER_H_
3
4#include <cublas_v2.h>
5#include <cuda_runtime.h>
6#include <base/macros/cuda.h>
7
8namespace container {
9namespace cuSolverConnector {
10
11template <typename T>
12static inline
13void trtri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, T* A, const int& lda)
14{
15 size_t d_lwork = 0, h_lwork = 0;
16 using Type = typename GetTypeThrust<T>::type;
17 cusolverErrcheck(cusolverDnXtrtri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, GetTypeCuda<T>::cuda_data_type, reinterpret_cast<Type*>(A), lda, &d_lwork, &h_lwork));
18 void* d_work = nullptr, *h_work = nullptr;
19 cudaErrcheck(cudaMalloc((void**)&d_work, d_lwork));
20 if (h_lwork) {
21 h_work = malloc(h_lwork);
22 if (h_work == nullptr) {
23 throw std::bad_alloc();
24 }
25 }
26 int h_info = 0;
27 int* d_info = nullptr;
28 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
29 // Perform Cholesky decomposition
30 cusolverErrcheck(cusolverDnXtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, GetTypeCuda<T>::cuda_data_type, reinterpret_cast<Type*>(A), n, d_work, d_lwork, h_work, h_lwork, d_info));
31 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
32 if (h_info != 0) {
33 throw std::runtime_error("trtri: failed to invert matrix");
34 }
35 free(h_work);
36 cudaErrcheck(cudaFree(d_work));
37 cudaErrcheck(cudaFree(d_info));
38}
39
40static inline
41void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, float * A, const int& lda)
42{
43 int lwork;
44 cusolverErrcheck(cusolverDnSpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
45 float* work;
46 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(float)));
47 // Perform Cholesky decomposition
48 cusolverErrcheck(cusolverDnSpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, nullptr));
49 cudaErrcheck(cudaFree(work));
50}
51static inline
52void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, double * A, const int& lda)
53{
54 int lwork;
55 cusolverErrcheck(cusolverDnDpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
56 double* work;
57 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(double)));
58 // Perform Cholesky decomposition
59 cusolverErrcheck(cusolverDnDpotri(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, nullptr));
60 cudaErrcheck(cudaFree(work));
61}
62static inline
63void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<float> * A, const int& lda)
64{
65 int lwork;
66 cusolverErrcheck(cusolverDnCpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex *>(A), n, &lwork));
67 cuComplex* work;
68 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuComplex)));
69 // Perform Cholesky decomposition
70 cusolverErrcheck(cusolverDnCpotri(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex *>(A), n, work, lwork, nullptr));
71 cudaErrcheck(cudaFree(work));
72}
73static inline
74void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<double> * A, const int& lda)
75{
76 int lwork;
77 cusolverErrcheck(cusolverDnZpotri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex *>(A), n, &lwork));
78 cuDoubleComplex* work;
79 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuDoubleComplex)));
80 // Perform Cholesky decomposition
81 cusolverErrcheck(cusolverDnZpotri(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex *>(A), n, work, lwork, nullptr));
82 cudaErrcheck(cudaFree(work));
83}
84
85
86static inline
87void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, float * A, const int& lda)
88{
89 int lwork;
90 int *info = nullptr;
91 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
92 cusolverErrcheck(cusolverDnSpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
93 float* work;
94 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(float)));
95 // Perform Cholesky decomposition
96 cusolverErrcheck(cusolverDnSpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
97 cudaErrcheck(cudaFree(work));
98 cudaErrcheck(cudaFree(info));
99}
100static inline
101void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, double * A, const int& lda)
102{
103 int lwork;
104 int *info = nullptr;
105 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
106 cusolverErrcheck(cusolverDnDpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, A, n, &lwork));
107 double* work;
108 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(double)));
109 // Perform Cholesky decomposition
110 cusolverErrcheck(cusolverDnDpotrf(cusolver_handle, cublas_fill_mode(uplo), n, A, n, work, lwork, info));
111 cudaErrcheck(cudaFree(work));
112 cudaErrcheck(cudaFree(info));
113}
114static inline
115void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, std::complex<float> * A, const int& lda)
116{
117 int lwork;
118 int *info = nullptr;
119 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
120 cusolverErrcheck(cusolverDnCpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex*>(A), lda, &lwork));
121 cuComplex* work;
122 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuComplex)));
123 // Perform Cholesky decomposition
124 cusolverErrcheck(cusolverDnCpotrf(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuComplex*>(A), lda, work, lwork, info));
125 cudaErrcheck(cudaFree(work));
126 cudaErrcheck(cudaFree(info));
127}
128static inline
129void potrf (cusolverDnHandle_t& cusolver_handle, const char& uplo, const int& n, std::complex<double> * A, const int& lda)
130{
131 int lwork;
132 int *info = nullptr;
133 cudaErrcheck(cudaMalloc((void**)&info, 1 * sizeof(int)));
134 cusolverErrcheck(cusolverDnZpotrf_bufferSize(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex*>(A), lda, &lwork));
135 cuDoubleComplex* work;
136 cudaErrcheck(cudaMalloc((void**)&work, lwork * sizeof(cuDoubleComplex)));
137 // Perform Cholesky decomposition
138 cusolverErrcheck(cusolverDnZpotrf(cusolver_handle, cublas_fill_mode(uplo), n, reinterpret_cast<cuDoubleComplex*>(A), lda, work, lwork, info));
139 cudaErrcheck(cudaFree(work));
140 cudaErrcheck(cudaFree(info));
141}
142
143
144static inline
145void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float * W)
146{
147 // prepare some values for cusolverDnSsyevd_bufferSize
148 int lwork = 0;
149 int h_info = 0;
150 int* d_info = nullptr;
151 float* d_work = nullptr;
152 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
153
154 // calculate the sizes needed for pre-allocated buffer.
155 cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
156 n, A, lda, W, &lwork));
157 // allocate memory
158 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
159 // compute eigenvalues and eigenvectors.
160 cusolverErrcheck(cusolverDnSsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
161 n, A, lda, W, d_work, lwork, d_info));
162
163 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
164 if (h_info != 0) {
165 throw std::runtime_error("dnevd: failed to invert matrix");
166 }
167 cudaErrcheck(cudaFree(d_info));
168 cudaErrcheck(cudaFree(d_work));
169}
170static inline
171void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double * W)
172{
173 // prepare some values for cusolverDnDsyevd_bufferSize
174 int lwork = 0;
175 int h_info = 0;
176 int* d_info = nullptr;
177 double* d_work = nullptr;
178 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
179
180 // calculate the sizes needed for pre-allocated buffer.
181 cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
182 n, A, lda, W, &lwork));
183 // allocate memory
184 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
185 // compute eigenvalues and eigenvectors.
186 cusolverErrcheck(cusolverDnDsyevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
187 n, A, lda, W, d_work, lwork, d_info));
188
189 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
190 if (h_info != 0) {
191 throw std::runtime_error("dnevd: failed to invert matrix");
192 }
193 cudaErrcheck(cudaFree(d_info));
194 cudaErrcheck(cudaFree(d_work));
195}
196static inline
197void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, float * W)
198{
199 // prepare some values for cusolverDnCheevd_bufferSize
200 int lwork = 0;
201 int h_info = 0;
202 int* d_info = nullptr;
203 cuComplex* d_work = nullptr;
204 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
205
206 // calculate the sizes needed for pre-allocated buffer.
207 cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
208 n, reinterpret_cast<cuComplex*>(A), lda, W, &lwork));
209 // allocate memory
210 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
211 // compute eigenvalues and eigenvectors.
212 cusolverErrcheck(cusolverDnCheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
213 n, reinterpret_cast<cuComplex*>(A), lda, W, d_work, lwork, d_info));
214
215 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
216 if (h_info != 0) {
217 throw std::runtime_error("dnevd: failed to invert matrix");
218 }
219 cudaErrcheck(cudaFree(d_info));
220 cudaErrcheck(cudaFree(d_work));
221}
222static inline
223void dnevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<double>* A, const int& lda, double* W)
224{
225 // prepare some values for cusolverDnZheevd_bufferSize
226 int lwork = 0;
227 int h_info = 0;
228 int* d_info = nullptr;
229 cuDoubleComplex* d_work = nullptr;
230 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
231
232 // calculate the sizes needed for pre-allocated buffer.
233 cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
234 n, reinterpret_cast<cuDoubleComplex*>(A), lda, W, &lwork));
235 // allocate memory
236 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
237 // compute eigenvalues and eigenvectors.
238 cusolverErrcheck(cusolverDnZheevd(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
239 n, reinterpret_cast<cuDoubleComplex*>(A), lda, W, d_work, lwork, d_info));
240
241 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
242 if (h_info != 0) {
243 throw std::runtime_error("dnevd: failed to invert matrix");
244 }
245 cudaErrcheck(cudaFree(d_info));
246 cudaErrcheck(cudaFree(d_work));
247}
248
249static inline
250void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float* B, const int& ldb, float * W)
251{
252 // prepare some values for cusolverDnSsygvd_bufferSize
253 int lwork = 0;
254 int h_info = 0;
255 int* d_info = nullptr;
256 float* d_work = nullptr;
257 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
258
259 // calculate the sizes needed for pre-allocated buffer.
260 cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
261 n, A, lda, B, ldb, W, &lwork));
262 // allocate memory
263 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
264 // compute eigenvalues and eigenvectors.
265 cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
266 n, A, lda, B, ldb, W, d_work, lwork, d_info));
267
268 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
269 if (h_info != 0) {
270 throw std::runtime_error("dnevd: failed to invert matrix");
271 }
272 cudaErrcheck(cudaFree(d_info));
273 cudaErrcheck(cudaFree(d_work));
274}
275static inline
276void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double* B, const int& ldb, double * W)
277{
278 // prepare some values for cusolverDnDsygvd_bufferSize
279 int lwork = 0;
280 int h_info = 0;
281 int* d_info = nullptr;
282 double* d_work = nullptr;
283 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
284
285 // calculate the sizes needed for pre-allocated buffer.
286 cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
287 n, A, lda, B, ldb, W, &lwork));
288 // allocate memory
289 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
290 // compute eigenvalues and eigenvectors.
291 cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
292 n, A, lda, B, ldb, W, d_work, lwork, d_info));
293
294 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
295 if (h_info != 0) {
296 throw std::runtime_error("dnevd: failed to invert matrix");
297 }
298 cudaErrcheck(cudaFree(d_info));
299 cudaErrcheck(cudaFree(d_work));
300}
301static inline
302void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb, float* W)
303{
304 // prepare some values for cusolverDnChegvd_bufferSize
305 int lwork = 0;
306 int h_info = 0;
307 int* d_info = nullptr;
308 cuComplex* d_work = nullptr;
309 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
310
311 // calculate the sizes needed for pre-allocated buffer.
312 cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
313 n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb, W, &lwork));
314 // allocate memory
315 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
316 // compute eigenvalues and eigenvectors.
317 cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
318 n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb, W, d_work, lwork, d_info));
319
320 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
321 if (h_info != 0) {
322 throw std::runtime_error("dnevd: failed to invert matrix");
323 }
324 cudaErrcheck(cudaFree(d_info));
325 cudaErrcheck(cudaFree(d_work));
326}
327static inline
328void dngvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb, double* W)
329{
330 // prepare some values for cusolverDnZhegvd_bufferSize
331 int lwork = 0;
332 int h_info = 0;
333 int* d_info = nullptr;
334 cuDoubleComplex* d_work = nullptr;
335 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
336
337 // calculate the sizes needed for pre-allocated buffer.
338 cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
339 n, reinterpret_cast<cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb, W, &lwork));
340 // allocate memory
341 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
342 // compute eigenvalues and eigenvectors.
343 cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
344 n, reinterpret_cast<cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb, W, d_work, lwork, d_info));
345
346 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
347 if (h_info != 0) {
348 throw std::runtime_error("dnevd: failed to invert matrix");
349 }
350 cudaErrcheck(cudaFree(d_info));
351 cudaErrcheck(cudaFree(d_work));
352}
353
354static inline
355void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, float* A, const int& lda, int* ipiv)
356{
357 // prepare some values for cusolverDnSgetrf_bufferSize
358 int lwork = 0;
359 int h_info = 0;
360 int* d_info = nullptr;
361 float* d_work = nullptr;
362 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
363
364 // calculate the sizes needed for pre-allocated buffer.
365 cusolverErrcheck(cusolverDnSgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
366
367 // allocate memory
368 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
369
370 // Perform LU decomposition
371 cusolverErrcheck(cusolverDnSgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
372
373 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
374 if (h_info != 0) {
375 throw std::runtime_error("getrf: failed to compute LU factorization");
376 }
377
378 cudaErrcheck(cudaFree(d_work));
379 cudaErrcheck(cudaFree(d_info));
380}
381static inline
382void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, double* A, const int& lda, int* ipiv)
383{
384 // prepare some values for cusolverDnDgetrf_bufferSize
385 int lwork = 0;
386 int h_info = 0;
387 int* d_info = nullptr;
388 double* d_work = nullptr;
389 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
390
391 // calculate the sizes needed for pre-allocated buffer.
392 cusolverErrcheck(cusolverDnDgetrf_bufferSize(cusolver_handle, m, n, A, lda, &lwork));
393
394 // allocate memory
395 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
396
397 // Perform LU decomposition
398 cusolverErrcheck(cusolverDnDgetrf(cusolver_handle, m, n, A, lda, d_work, ipiv, d_info));
399
400 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
401 if (h_info != 0) {
402 throw std::runtime_error("getrf: failed to compute LU factorization");
403 }
404
405 cudaErrcheck(cudaFree(d_work));
406 cudaErrcheck(cudaFree(d_info));
407}
408static inline
409void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex<float>* A, const int& lda, int* ipiv)
410{
411 // prepare some values for cusolverDnCgetrf_bufferSize
412 int lwork = 0;
413 int h_info = 0;
414 int* d_info = nullptr;
415 cuComplex* d_work = nullptr;
416 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
417
418 // calculate the sizes needed for pre-allocated buffer.
419 cusolverErrcheck(cusolverDnCgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, &lwork));
420
421 // allocate memory
422 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
423
424 // Perform LU decomposition
425 cusolverErrcheck(cusolverDnCgetrf(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, d_work, ipiv, d_info));
426
427 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
428 if (h_info != 0) {
429 throw std::runtime_error("getrf: failed to compute LU factorization");
430 }
431
432 cudaErrcheck(cudaFree(d_work));
433 cudaErrcheck(cudaFree(d_info));
434}
435static inline
436void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, std::complex<double>* A, const int& lda, int* ipiv)
437{
438 // prepare some values for cusolverDnZgetrf_bufferSize
439 int lwork = 0;
440 int h_info = 0;
441 int* d_info = nullptr;
442 cuDoubleComplex* d_work = nullptr;
443 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
444
445 // calculate the sizes needed for pre-allocated buffer.
446 cusolverErrcheck(cusolverDnZgetrf_bufferSize(cusolver_handle, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda, &lwork));
447
448 // allocate memory
449 cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
450
451 // Perform LU decomposition
452 cusolverErrcheck(cusolverDnZgetrf(cusolver_handle, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda, d_work, ipiv, d_info));
453
454 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
455 if (h_info != 0) {
456 throw std::runtime_error("getrf: failed to compute LU factorization");
457 }
458
459 cudaErrcheck(cudaFree(d_work));
460 cudaErrcheck(cudaFree(d_info));
461}
462
463static inline
464void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, float* A, const int& lda, const int* ipiv, float* B, const int& ldb)
465{
466 int h_info = 0;
467 int* d_info = nullptr;
468 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
469
470 cusolverErrcheck(cusolverDnSgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
471
472 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
473 if (h_info != 0) {
474 throw std::runtime_error("getrs: failed to solve the linear system");
475 }
476
477 cudaErrcheck(cudaFree(d_info));
478}
479static inline
480void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, double* A, const int& lda, const int* ipiv, double* B, const int& ldb)
481{
482 int h_info = 0;
483 int* d_info = nullptr;
484 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
485
486 cusolverErrcheck(cusolverDnDgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, A, lda, ipiv, B, ldb, d_info));
487
488 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
489 if (h_info != 0) {
490 throw std::runtime_error("getrs: failed to solve the linear system");
491 }
492
493 cudaErrcheck(cudaFree(d_info));
494}
495static inline
496void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex<float>* A, const int& lda, const int* ipiv, std::complex<float>* B, const int& ldb)
497{
498 int h_info = 0;
499 int* d_info = nullptr;
500 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
501
502 cusolverErrcheck(cusolverDnCgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast<cuComplex*>(A), lda, ipiv, reinterpret_cast<cuComplex*>(B), ldb, d_info));
503
504 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
505 if (h_info != 0) {
506 throw std::runtime_error("getrs: failed to solve the linear system");
507 }
508
509 cudaErrcheck(cudaFree(d_info));
510}
511static inline
512void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, const int& nrhs, std::complex<double>* A, const int& lda, const int* ipiv, std::complex<double>* B, const int& ldb)
513{
514 int h_info = 0;
515 int* d_info = nullptr;
516 cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
517
518 cusolverErrcheck(cusolverDnZgetrs(cusolver_handle, GetCublasOperation(trans), n, nrhs, reinterpret_cast<cuDoubleComplex*>(A), lda, ipiv, reinterpret_cast<cuDoubleComplex*>(B), ldb, d_info));
519
520 cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
521 if (h_info != 0) {
522 throw std::runtime_error("getrs: failed to solve the linear system");
523 }
524
525 cudaErrcheck(cudaFree(d_info));
526}
527
528} // namespace cuSolverConnector
529} // namespace container
530
531#endif // BASE_THIRD_PARTY_CUSOLVER_H_
#define cudaErrcheck(res)
Definition cuda.h:213
#define cusolverErrcheck(res)
Definition cuda.h:202
#define T
Definition exp.cpp:237
Definition tensor.cpp:8
Definition cuda.h:49
T type
Definition cuda.h:14