ABACUS develop
Atomic-orbital Based Ab-initio Computation at UStc

ModuleBase::PGemmCN< T, Device > Class Template Reference

This class performs the parallel matrix multiplication C = alpha * A^H * B + beta * C.

#include <para_gemm.h>
Public Member Functions

PGemmCN ()
~PGemmCN ()
void set_dimension (MPI_Comm comm_col, MPI_Comm comm_row, const int ncolA, const int LDA, const int ncolB, const int LDB, const int nrow, const int LDC, const int mode=1)
    set the dimension of A, B, and C
void multiply (const T alpha, const T *A, const T *B, const T beta, T *C)
    calculate C = alpha * A^H * B + beta * C
Public Attributes

MPI_Comm col_world = MPI_COMM_NULL
    column communicator
MPI_Comm row_world = MPI_COMM_NULL
    row communicator
int col_rank = 0
    rank in col_world
int col_nproc = 1
    number of procs in col_world
int row_rank = 0
    rank in row_world
int row_nproc = 1
    number of procs in row_world
std::vector< int > colA_loc
    [col_nproc] number of columns of the A matrix in each proc
int max_colA = 0
    maximum number of columns of the A matrix over all procs
std::vector< int > colB_loc
    [col_nproc] number of columns of the B matrix in each proc
int max_colB = 0
    maximum number of columns of the B matrix over all procs
std::vector< MPI_Request > requests
    MPI requests
std::vector< int > recv_counts
    receive counts for gathering C_local into C_global
std::vector< int > displs
    displacements for gathering C_local into C_global
int size_C_local = 0
    size of C_local, the local C matrix in each proc
int size_C_global = 0
    size of C_global, the global C matrix gathered from all procs
bool gatherC = true
    whether to gather C_local into C_global
bool divideCrow = false
    whether to divide C_global into C_local
int ncolA = 0
    number of columns of A, which is a local matrix in each proc
int ncolB = 0
    number of columns of B, which is a local matrix in each proc
int nrow = 0
    number of rows of A or B
int LDA = 0
    leading dimension of A in each proc
int LDB = 0
    leading dimension of B in each proc
int LDC = 0
    leading dimension of C, which can be C_local or C_global
Private Types

using resmem_dev_op = base_device::memory::resize_memory_op< T, Device >
using delmem_dev_op = base_device::memory::delete_memory_op< T, Device >
using syncmem_dev_op = base_device::memory::synchronize_memory_op< T, Device, Device >
using syncmem_d2h_op = base_device::memory::synchronize_memory_op< T, base_device::DEVICE_CPU, Device >
using syncmem_h2d_op = base_device::memory::synchronize_memory_op< T, Device, base_device::DEVICE_CPU >
Private Member Functions

void multiply_single (const T alpha, const T *A, const T *B, const T beta, T *C)
    for col_nproc == 1
void multiply_col (const T alpha, const T *A, const T *B, const T beta, T *C)
    for mode = 1 or 2
void multiply_row (const T alpha, const T *A, const T *B, const T beta, T *C)
    for mode = 3
Private Attributes

std::vector< T > isend_tmp_
    temporary memory for sending data
std::vector< T > A_tmp_
    temporary memory for A
std::vector< T > B_tmp_
    temporary memory for B
std::vector< T > C_tmp_
    temporary memory for C
std::vector< T > C_global_tmp_
    temporary memory for C_global
T * C_local_tmp_ = nullptr
    temporary memory for C_local
T * A_tmp_device_ = nullptr
    temporary device memory for A
T * B_tmp_device_ = nullptr
    temporary device memory for B
Detailed Description

template<typename T, typename Device>
class ModuleBase::PGemmCN< T, Device >

This class is used to perform the parallel matrix multiplication C = alpha * A^H * B + beta * C. Here, A and B are local matrices in each proc. C can be C_local or C_global, depending on the value of gatherC: C_local is a local matrix in each proc; C_global is a global matrix gathered from all procs, and every proc holds its own copy of C_global with the same values. C_global and C_local have the same LDC but different numbers of columns.
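For reference, the following serial sketch spells out the operation this class parallelizes. It is not ABACUS code; it assumes column-major (BLAS-style) storage and complex-valued matrices, so A^H is the conjugate transpose:

    #include <complex>

    using cplx = std::complex<double>;

    // Serial reference for C = alpha * A^H * B + beta * C (column-major).
    // A: nrow x ncolA with leading dimension LDA,
    // B: nrow x ncolB with leading dimension LDB,
    // C: ncolA x ncolB with leading dimension LDC.
    void gemm_CN_reference(const cplx alpha, const cplx* A, const int LDA,
                           const cplx* B, const int LDB,
                           const cplx beta, cplx* C, const int LDC,
                           const int nrow, const int ncolA, const int ncolB)
    {
        for (int j = 0; j < ncolB; ++j)
        {
            for (int i = 0; i < ncolA; ++i)
            {
                cplx sum = 0.0;
                for (int k = 0; k < nrow; ++k)
                {
                    // (A^H)(i,k) = conj(A(k,i))
                    sum += std::conj(A[i * LDA + k]) * B[j * LDB + k];
                }
                C[j * LDC + i] = alpha * sum + beta * C[j * LDC + i];
            }
        }
    }

PGemmCN distributes this computation: each proc holds a block of columns of A and B (see colA_loc and colB_loc), computes its portion of C, and the column/row communicators are used to assemble the blocks, e.g. to gather C_local into C_global when gatherC is true.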
Constructor & Destructor Documentation

ModuleBase::PGemmCN< T, Device >::PGemmCN ( )

ModuleBase::PGemmCN< T, Device >::~PGemmCN ( )
Member Function Documentation

void ModuleBase::PGemmCN< T, Device >::multiply (const T alpha, const T *A, const T *B, const T beta, T *C)

calculate C = alpha * A^H * B + beta * C
void ModuleBase::PGemmCN< T, Device >::multiply_col (const T alpha, const T *A, const T *B, const T beta, T *C)  [private]

for mode = 1 or 2

void ModuleBase::PGemmCN< T, Device >::multiply_row (const T alpha, const T *A, const T *B, const T beta, T *C)  [private]

for mode = 3

void ModuleBase::PGemmCN< T, Device >::multiply_single (const T alpha, const T *A, const T *B, const T beta, T *C)  [private]

for col_nproc == 1
void ModuleBase::PGemmCN< T, Device >::set_dimension (MPI_Comm comm_col, MPI_Comm comm_row, const int ncolA, const int LDA, const int ncolB, const int LDB, const int nrow, const int LDC, const int mode = 1)

set the dimension of A, B, and C

Parameters
    comm_col  column communicator
    comm_row  row communicator
    ncolA  number of columns of A, which is a local matrix in each proc
    LDA  leading dimension of A in each proc
    ncolB  number of columns of B, which is a local matrix in each proc
    LDB  leading dimension of B in each proc
    nrow  number of rows of A or B
    LDC  leading dimension of C; C can be C_local or C_global
    mode  1: gather C_local into C_global; 2: C_local (nrow * ncol_loc); 3: C_global (nrow_loc * ncol)
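A hedged usage sketch of the mode = 1 path follows. The communicator setup, the device tag base_device::DEVICE_CPU, and the size bookkeeping (in particular LDC = ncolA_glb for the gathered C_global) are assumptions made for the example, not taken from ABACUS sources:

    #include <mpi.h>
    #include <complex>
    #include <vector>
    #include "para_gemm.h" // provides ModuleBase::PGemmCN

    // Assumed layout: each proc owns all nrow rows but only a block of
    // columns of A and B. With mode = 1, multiply() gathers C_local so that
    // every proc ends up with the same C_global (ncolA_glb x ncolB_glb).
    void example(MPI_Comm comm_col, MPI_Comm comm_row,
                 const int nrow, const int ncolA_loc, const int ncolB_loc,
                 const int ncolA_glb, const int ncolB_glb)
    {
        using T = std::complex<double>;
        ModuleBase::PGemmCN<T, base_device::DEVICE_CPU> pgemm;

        std::vector<T> A(static_cast<size_t>(nrow) * ncolA_loc);      // local columns of A
        std::vector<T> B(static_cast<size_t>(nrow) * ncolB_loc);      // local columns of B
        std::vector<T> C(static_cast<size_t>(ncolA_glb) * ncolB_glb); // gathered C_global

        // Columns are stored contiguously, so LDA = LDB = nrow.
        pgemm.set_dimension(comm_col, comm_row, ncolA_loc, nrow, ncolB_loc, nrow,
                            nrow, ncolA_glb, /*mode=*/1);

        // C = 1.0 * A^H * B + 0.0 * C
        pgemm.multiply(T(1.0), A.data(), B.data(), T(0.0), C.data());
    }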
Member Data Documentation

std::vector< T > ModuleBase::PGemmCN< T, Device >::A_tmp_  [private]

temporary memory for A

T * ModuleBase::PGemmCN< T, Device >::A_tmp_device_ = nullptr  [private]

temporary device memory for A

std::vector< T > ModuleBase::PGemmCN< T, Device >::B_tmp_  [private]

temporary memory for B

T * ModuleBase::PGemmCN< T, Device >::B_tmp_device_ = nullptr  [private]

temporary device memory for B

std::vector< T > ModuleBase::PGemmCN< T, Device >::C_global_tmp_  [private]

temporary memory for C_global

T * ModuleBase::PGemmCN< T, Device >::C_local_tmp_ = nullptr  [private]

temporary memory for C_local

std::vector< T > ModuleBase::PGemmCN< T, Device >::C_tmp_  [private]

temporary memory for C
int ModuleBase::PGemmCN< T, Device >::col_nproc = 1

number of procs in col_world

int ModuleBase::PGemmCN< T, Device >::col_rank = 0

rank in col_world

MPI_Comm ModuleBase::PGemmCN< T, Device >::col_world = MPI_COMM_NULL

column communicator

std::vector<int> ModuleBase::PGemmCN< T, Device >::colA_loc

[col_nproc] number of columns of the A matrix in each proc

std::vector<int> ModuleBase::PGemmCN< T, Device >::colB_loc

[col_nproc] number of columns of the B matrix in each proc

std::vector<int> ModuleBase::PGemmCN< T, Device >::displs

displacements for gathering C_local into C_global
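recv_counts and displs are the standard counts/offsets bookkeeping of a variable-size MPI gather. A generic sketch of that pattern (illustrative only, not the class's actual implementation) shows how the two vectors place each proc's C_local inside C_global:

    #include <mpi.h>
    #include <complex>
    #include <vector>

    // Generic MPI_Allgatherv pattern: proc p contributes recv_counts[p]
    // elements, placed at offset displs[p] of C_global; afterwards every
    // proc in the communicator holds the same C_global.
    // (MPI_CXX_DOUBLE_COMPLEX requires MPI >= 3.0.)
    void gather_C(const std::vector<std::complex<double>>& C_local,
                  std::vector<std::complex<double>>& C_global,
                  const std::vector<int>& recv_counts,
                  const std::vector<int>& displs,
                  MPI_Comm col_world)
    {
        MPI_Allgatherv(C_local.data(), static_cast<int>(C_local.size()),
                       MPI_CXX_DOUBLE_COMPLEX,
                       C_global.data(), recv_counts.data(), displs.data(),
                       MPI_CXX_DOUBLE_COMPLEX, col_world);
    }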
bool ModuleBase::PGemmCN< T, Device >::divideCrow = false

whether to divide C_global into C_local

bool ModuleBase::PGemmCN< T, Device >::gatherC = true

whether to gather C_local into C_global

std::vector< T > ModuleBase::PGemmCN< T, Device >::isend_tmp_  [private]

temporary memory for sending data
int ModuleBase::PGemmCN< T, Device >::LDA = 0

leading dimension of A in each proc

int ModuleBase::PGemmCN< T, Device >::LDB = 0

leading dimension of B in each proc

int ModuleBase::PGemmCN< T, Device >::LDC = 0

leading dimension of C, which can be C_local or C_global

int ModuleBase::PGemmCN< T, Device >::max_colA = 0

maximum number of columns of the A matrix over all procs

int ModuleBase::PGemmCN< T, Device >::max_colB = 0

maximum number of columns of the B matrix over all procs

int ModuleBase::PGemmCN< T, Device >::ncolA = 0

number of columns of A, which is a local matrix in each proc

int ModuleBase::PGemmCN< T, Device >::ncolB = 0

number of columns of B, which is a local matrix in each proc

int ModuleBase::PGemmCN< T, Device >::nrow = 0

number of rows of A or B

std::vector<int> ModuleBase::PGemmCN< T, Device >::recv_counts

receive counts for gathering C_local into C_global

std::vector<MPI_Request> ModuleBase::PGemmCN< T, Device >::requests

MPI requests

int ModuleBase::PGemmCN< T, Device >::row_nproc = 1

number of procs in row_world

int ModuleBase::PGemmCN< T, Device >::row_rank = 0

rank in row_world

MPI_Comm ModuleBase::PGemmCN< T, Device >::row_world = MPI_COMM_NULL

row communicator

int ModuleBase::PGemmCN< T, Device >::size_C_global = 0

size of C_global, the global C matrix gathered from all procs

int ModuleBase::PGemmCN< T, Device >::size_C_local = 0

size of C_local, the local C matrix in each proc