|
ABACUS develop
Atomic-orbital Based Ab-initio Computation at UStc
|
This class is used to perform parallel matrix multiplication C = alpha * A^H * B + beta * C. Here, A and B are local matrices in each proc; C can be C_local or C_global, depending on the value of gatherC. C_local is a local matrix in each proc. C_global is a global matrix gathered from all procs, and all procs have their own C_global matrix with the same values. C_global and C_local have the same LDC, but different column numbers. More...
#include <para_gemm.h>
Public Member Functions | |
| PGemmCN () | |
| ~PGemmCN () | |
| void | set_dimension (MPI_Comm comm_col, MPI_Comm comm_row, const int ncolA, const int LDA, const int ncolB, const int LDB, const int nrow, const int LDC, const int mode=1) |
| set the dimension of A, B, and C | |
| void | multiply (const T alpha, const T *A, const T *B, const T beta, T *C) |
| calculate C = alpha * A^H * B + beta * C | |
Public Attributes | |
| MPI_Comm | col_world = MPI_COMM_NULL |
| column communicator world | |
| MPI_Comm | row_world = MPI_COMM_NULL |
| row communicator world | |
| int | col_rank = 0 |
| rank in col_world | |
| int | col_nproc = 1 |
| number of procs in col_world | |
| int | row_rank = 0 |
| rank in row_world | |
| int | row_nproc = 1 |
| number of procs in row_world | |
| std::vector< int > | colA_loc |
| [col_nproc] number of columns of A matrix in each proc | |
| int | max_colA = 0 |
| maximum number of columns of A matrix in all procs | |
| std::vector< int > | colB_loc |
| [col_nproc] number of columns of B matrix in each proc | |
| int | max_colB = 0 |
| maximum number of columns of B matrix in all procs | |
| std::vector< MPI_Request > | requests |
| MPI request. | |
| std::vector< int > | recv_counts |
| receive counts for gathering C_local to C_global | |
| std::vector< int > | displs |
| displacements for gathering C_local to C_global | |
| int | size_C_local = 0 |
| size of C_local, which is a local matrix in each proc | |
| int | size_C_global = 0 |
| size of C_global, which is the global C matrix gathered from all procs | |
| bool | gatherC = true |
| whether to gather C_local into C_global | |
| bool | divideCrow = false |
| whether to divide C_global into C_local | |
| int | ncolA = 0 |
| number of columns of A, which is a local matrix in each proc | |
| int | ncolB = 0 |
| number of columns of B, which is a local matrix in each proc | |
| int | nrow = 0 |
| number of rows of A or B | |
| int | LDA = 0 |
| leading dimension of A in each proc | |
| int | LDB = 0 |
| leading dimension of B in each proc | |
| int | LDC = 0 |
| leading dimension of C, which can be C_local or C_global | |
Private Types | |
| using | resmem_dev_op = base_device::memory::resize_memory_op< T, Device > |
| using | delmem_dev_op = base_device::memory::delete_memory_op< T, Device > |
| using | syncmem_dev_op = base_device::memory::synchronize_memory_op< T, Device, Device > |
| using | syncmem_d2h_op = base_device::memory::synchronize_memory_op< T, base_device::DEVICE_CPU, Device > |
| using | syncmem_h2d_op = base_device::memory::synchronize_memory_op< T, Device, base_device::DEVICE_CPU > |
Private Member Functions | |
| void | multiply_single (const T alpha, const T *A, const T *B, const T beta, T *C) |
| for col_nproc == 1 | |
| void | multiply_col (const T alpha, const T *A, const T *B, const T beta, T *C) |
| for mode = 1 or 2 | |
| void | multiply_row (const T alpha, const T *A, const T *B, const T beta, T *C) |
| for mode = 3 | |
Private Attributes | |
| std::vector< T > | isend_tmp_ |
| temporary memory for sending data | |
| std::vector< T > | A_tmp_ |
| temporary memory for A | |
| std::vector< T > | B_tmp_ |
| temporary memory for B | |
| std::vector< T > | C_tmp_ |
| temporary memory for C | |
| std::vector< T > | C_global_tmp_ |
| temporary memory for C_global | |
| T * | C_local_tmp_ = nullptr |
| temporary memory for C_local | |
| T * | A_tmp_device_ = nullptr |
| temporary memory for A | |
| T * | B_tmp_device_ = nullptr |
| temporary memory for B | |
This class is used to perform parallel matrix multiplication C = alpha * A^H * B + beta * C. Here, A and B are local matrices in each proc; C can be C_local or C_global, depending on the value of gatherC. C_local is a local matrix in each proc. C_global is a global matrix gathered from all procs, and all procs have their own C_global matrix with the same values. C_global and C_local have the same LDC, but different column numbers.
|
private |
|
private |
|
private |
|
private |
|
private |
| ModuleBase::PGemmCN< T, Device >::PGemmCN | ( | ) |
| ModuleBase::PGemmCN< T, Device >::~PGemmCN | ( | ) |
| void ModuleBase::PGemmCN< T, Device >::multiply | ( | const T | alpha, |
| const T * | A, | ||
| const T * | B, | ||
| const T | beta, | ||
| T * | C | ||
| ) |
calculate C = alpha * A^H * B + beta * C
|
private |
for mode = 1 or 2
|
private |
for mode = 3
|
private |
for col_nproc == 1
| void ModuleBase::PGemmCN< T, Device >::set_dimension | ( | MPI_Comm | comm_col, |
| MPI_Comm | comm_row, | ||
| const int | ncolA, | ||
| const int | LDA, | ||
| const int | ncolB, | ||
| const int | LDB, | ||
| const int | nrow, | ||
| const int | LDC, | ||
| const int | mode = 1 |
||
| ) |
set the dimension of A, B, and C
| ncolA | number of columns of A, which is a local matrix in each proc |
| LDA | leading dimension of A in each proc |
| ncolB | number of columns of B, which is a local matrix in each proc |
| LDB | leading dimension of B in each proc |
| nrow | number of rows of A or B |
| LDC | leading dimension of C. C can be C_local or C_global |
| mode | 1: gather C_local to C_global, 2:C_local(nrow * ncol_loc), 3:C_global(nrow_loc * ncol) |
|
private |
temporary memory for A
|
private |
temporary memory for A
|
private |
temporary memory for B
|
private |
temporary memory for B
|
private |
temporary memory for C_global
|
private |
temporary memory for C_local
|
private |
temporary memory for C
| int ModuleBase::PGemmCN< T, Device >::col_nproc = 1 |
number of procs in col_world
| int ModuleBase::PGemmCN< T, Device >::col_rank = 0 |
rank in col_world
| MPI_Comm ModuleBase::PGemmCN< T, Device >::col_world = MPI_COMM_NULL |
column communicator world
| std::vector<int> ModuleBase::PGemmCN< T, Device >::colA_loc |
[col_nproc] number of columns of A matrix in each proc
| std::vector<int> ModuleBase::PGemmCN< T, Device >::colB_loc |
[col_nproc] number of columns of B matrix in each proc
| std::vector<int> ModuleBase::PGemmCN< T, Device >::displs |
displacements for gathering C_local to C_global
| bool ModuleBase::PGemmCN< T, Device >::divideCrow = false |
whether to divide C_global into C_local
| bool ModuleBase::PGemmCN< T, Device >::gatherC = true |
whether to gather C_local into C_global
|
private |
temporary memory for sending data
| int ModuleBase::PGemmCN< T, Device >::LDA = 0 |
leading dimension of A in each proc
| int ModuleBase::PGemmCN< T, Device >::LDB = 0 |
leading dimension of B in each proc
| int ModuleBase::PGemmCN< T, Device >::LDC = 0 |
leading dimension of C, which can be C_local or C_global
| int ModuleBase::PGemmCN< T, Device >::max_colA = 0 |
maximum number of columns of A matrix in all procs
| int ModuleBase::PGemmCN< T, Device >::max_colB = 0 |
maximum number of columns of B matrix in all procs
| int ModuleBase::PGemmCN< T, Device >::ncolA = 0 |
number of columns of A, which is a local matrix in each proc
| int ModuleBase::PGemmCN< T, Device >::ncolB = 0 |
number of columns of B, which is a local matrix in each proc
| int ModuleBase::PGemmCN< T, Device >::nrow = 0 |
number of rows of A or B
| std::vector<int> ModuleBase::PGemmCN< T, Device >::recv_counts |
receive counts for gathering C_local to C_global
| std::vector<MPI_Request> ModuleBase::PGemmCN< T, Device >::requests |
MPI request.
| int ModuleBase::PGemmCN< T, Device >::row_nproc = 1 |
number of procs in row_world
| int ModuleBase::PGemmCN< T, Device >::row_rank = 0 |
rank in row_world
| MPI_Comm ModuleBase::PGemmCN< T, Device >::row_world = MPI_COMM_NULL |
row communicator world
| int ModuleBase::PGemmCN< T, Device >::size_C_global = 0 |
size of C_global, which is the global C matrix gathered from all procs
| int ModuleBase::PGemmCN< T, Device >::size_C_local = 0 |
size of C_local, which is a local matrix in each proc