ABACUS develop
Atomic-orbital Based Ab-initio Computation at UStc
Loading...
Searching...
No Matches
Public Member Functions | Public Attributes | Private Types | Private Member Functions | Private Attributes | List of all members
ModuleBase::PGemmCN< T, Device > Class Template Reference

this class is used to perform parallel matrix multiplication C = alpha * A^H * B + beta * C. Here, A and B are local matrices in each proc; C can be C_local or C_global, depending on the value of gatherC. C_local is a local matrix in each proc. C_global is a global matrix gathered from all procs, and all procs have their own C_global matrix with the same values. C_global and C_local have the same LDC, but different column numbers. More...

#include <para_gemm.h>

Collaboration diagram for ModuleBase::PGemmCN< T, Device >:

Public Member Functions

 PGemmCN ()
 
 ~PGemmCN ()
 
void set_dimension (MPI_Comm comm_col, MPI_Comm comm_row, const int ncolA, const int LDA, const int ncolB, const int LDB, const int nrow, const int LDC, const int mode=1)
 set the dimension of A, B, and C
 
void multiply (const T alpha, const T *A, const T *B, const T beta, T *C)
 calculate C = alpha * A^H * B + beta * C
 

Public Attributes

MPI_Comm col_world = MPI_COMM_NULL
 column communicator world
 
MPI_Comm row_world = MPI_COMM_NULL
 row communicator world
 
int col_rank = 0
 rank in col_world
 
int col_nproc = 1
 number of procs in col_world
 
int row_rank = 0
 rank in row_world
 
int row_nproc = 1
 number of procs in row_world
 
std::vector< int > colA_loc
 [col_nproc] number of columns of A matrix in each proc
 
int max_colA = 0
 maximum number of columns of A matrix in all procs
 
std::vector< int > colB_loc
 [col_nproc] number of columns of B matrix in each proc
 
int max_colB = 0
 maximum number of columns of B matrix in all procs
 
std::vector< MPI_Request > requests
 MPI request.
 
std::vector< int > recv_counts
 receive counts for gathering C_local to C_global
 
std::vector< int > displs
 displacements for gathering C_local to C_global
 
int size_C_local = 0
 size of C_local, which is a local matrix in each proc
 
int size_C_global = 0
 size of C_global, which is the global C matrix gathered from all procs
 
bool gatherC = true
 whether gather C_local to C_global
 
bool divideCrow = false
 whether divide C_global to C_local
 
int ncolA = 0
 number of columns of A, which is a local matrix in each proc
 
int ncolB = 0
 number of columns of B, which is a local matrix in each proc
 
int nrow = 0
 number of rows of A or B
 
int LDA = 0
 leading dimension of A in each proc
 
int LDB = 0
 leading dimension of B in each proc
 
int LDC = 0
 leading dimension of C, which can be C_local or C_global
 

Private Types

using resmem_dev_op = base_device::memory::resize_memory_op< T, Device >
 
using delmem_dev_op = base_device::memory::delete_memory_op< T, Device >
 
using syncmem_dev_op = base_device::memory::synchronize_memory_op< T, Device, Device >
 
using syncmem_d2h_op = base_device::memory::synchronize_memory_op< T, base_device::DEVICE_CPU, Device >
 
using syncmem_h2d_op = base_device::memory::synchronize_memory_op< T, Device, base_device::DEVICE_CPU >
 

Private Member Functions

void multiply_single (const T alpha, const T *A, const T *B, const T beta, T *C)
 for col_nproc == 1
 
void multiply_col (const T alpha, const T *A, const T *B, const T beta, T *C)
 for mode = 1 or 2
 
void multiply_row (const T alpha, const T *A, const T *B, const T beta, T *C)
 for mode = 3
 

Private Attributes

std::vector< T > isend_tmp_
 temporary memory for sending data
 
std::vector< T > A_tmp_
 temporary memory for A
 
std::vector< T > B_tmp_
 temporary memory for B
 
std::vector< T > C_tmp_
 temporary memory for C
 
std::vector< T > C_global_tmp_
 temporary memory for C_global
 
T * C_local_tmp_ = nullptr
 temporary memory for C_local
 
T * A_tmp_device_ = nullptr
 temporary memory for A
 
T * B_tmp_device_ = nullptr
 temporary memory for B
 

Detailed Description

template<typename T, typename Device = base_device::DEVICE_CPU>
class ModuleBase::PGemmCN< T, Device >

this class is used to perform parallel matrix multiplication C = alpha * A^H * B + beta * C. Here, A and B are local matrices in each proc; C can be C_local or C_global, depending on the value of gatherC. C_local is a local matrix in each proc. C_global is a global matrix gathered from all procs, and all procs have their own C_global matrix with the same values. C_global and C_local have the same LDC, but different column numbers.

Member Typedef Documentation

◆ delmem_dev_op

template<typename T , typename Device = base_device::DEVICE_CPU>
using ModuleBase::PGemmCN< T, Device >::delmem_dev_op = base_device::memory::delete_memory_op<T, Device>
private

◆ resmem_dev_op

template<typename T , typename Device = base_device::DEVICE_CPU>
using ModuleBase::PGemmCN< T, Device >::resmem_dev_op = base_device::memory::resize_memory_op<T, Device>
private

◆ syncmem_d2h_op

template<typename T , typename Device = base_device::DEVICE_CPU>
using ModuleBase::PGemmCN< T, Device >::syncmem_d2h_op = base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>
private

◆ syncmem_dev_op

template<typename T , typename Device = base_device::DEVICE_CPU>
using ModuleBase::PGemmCN< T, Device >::syncmem_dev_op = base_device::memory::synchronize_memory_op<T, Device, Device>
private

◆ syncmem_h2d_op

template<typename T , typename Device = base_device::DEVICE_CPU>
using ModuleBase::PGemmCN< T, Device >::syncmem_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>
private

Constructor & Destructor Documentation

◆ PGemmCN()

template<typename T , typename Device >
ModuleBase::PGemmCN< T, Device >::PGemmCN ( )

◆ ~PGemmCN()

template<typename T , typename Device >
ModuleBase::PGemmCN< T, Device >::~PGemmCN ( )

Member Function Documentation

◆ multiply()

template<typename T , typename Device >
void ModuleBase::PGemmCN< T, Device >::multiply ( const T  alpha,
const T *  A,
const T *  B,
const T  beta,
T *  C 
)

calculate C = alpha * A^H * B + beta * C

Here is the call graph for this function:
Here is the caller graph for this function:

◆ multiply_col()

template<typename T , typename Device >
void ModuleBase::PGemmCN< T, Device >::multiply_col ( const T  alpha,
const T *  A,
const T *  B,
const T  beta,
T *  C 
)
private

for mode = 1 or 2

Here is the call graph for this function:

◆ multiply_row()

template<typename T , typename Device >
void ModuleBase::PGemmCN< T, Device >::multiply_row ( const T  alpha,
const T *  A,
const T *  B,
const T  beta,
T *  C 
)
private

for mode = 3

◆ multiply_single()

template<typename T , typename Device >
void ModuleBase::PGemmCN< T, Device >::multiply_single ( const T  alpha,
const T *  A,
const T *  B,
const T  beta,
T *  C 
)
private

for col_nproc == 1

◆ set_dimension()

template<typename T , typename Device >
void ModuleBase::PGemmCN< T, Device >::set_dimension ( MPI_Comm  comm_col,
MPI_Comm  comm_row,
const int  ncolA,
const int  LDA,
const int  ncolB,
const int  LDB,
const int  nrow,
const int  LDC,
const int  mode = 1 
)

set the dimension of A, B, and C

Parameters
ncolA	number of columns of A, which is a local matrix in each proc
LDA	leading dimension of A in each proc
ncolB	number of columns of B, which is a local matrix in each proc
LDB	leading dimension of B in each proc
nrow	number of rows of A or B
LDC	leading dimension of C. C can be C_local or C_global
mode	1: gather C_local to C_global, 2: C_local(nrow * ncol_loc), 3: C_global(nrow_loc * ncol)
Here is the caller graph for this function:

Member Data Documentation

◆ A_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<T> ModuleBase::PGemmCN< T, Device >::A_tmp_
private

temporary memory for A

◆ A_tmp_device_

template<typename T , typename Device = base_device::DEVICE_CPU>
T* ModuleBase::PGemmCN< T, Device >::A_tmp_device_ = nullptr
private

temporary memory for A

◆ B_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<T> ModuleBase::PGemmCN< T, Device >::B_tmp_
private

temporary memory for B

◆ B_tmp_device_

template<typename T , typename Device = base_device::DEVICE_CPU>
T* ModuleBase::PGemmCN< T, Device >::B_tmp_device_ = nullptr
private

temporary memory for B

◆ C_global_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<T> ModuleBase::PGemmCN< T, Device >::C_global_tmp_
private

temporary memory for C_global

◆ C_local_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>
T* ModuleBase::PGemmCN< T, Device >::C_local_tmp_ = nullptr
private

temporary memory for C_local

◆ C_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<T> ModuleBase::PGemmCN< T, Device >::C_tmp_
private

temporary memory for C

◆ col_nproc

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::col_nproc = 1

number of procs in col_world

◆ col_rank

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::col_rank = 0

rank in col_world

◆ col_world

template<typename T , typename Device = base_device::DEVICE_CPU>
MPI_Comm ModuleBase::PGemmCN< T, Device >::col_world = MPI_COMM_NULL

column communicator world

◆ colA_loc

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<int> ModuleBase::PGemmCN< T, Device >::colA_loc

[col_nproc] number of columns of A matrix in each proc

◆ colB_loc

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<int> ModuleBase::PGemmCN< T, Device >::colB_loc

[col_nproc] number of columns of B matrix in each proc

◆ displs

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<int> ModuleBase::PGemmCN< T, Device >::displs

displacements for gathering C_local to C_global

◆ divideCrow

template<typename T , typename Device = base_device::DEVICE_CPU>
bool ModuleBase::PGemmCN< T, Device >::divideCrow = false

whether divide C_global to C_local

◆ gatherC

template<typename T , typename Device = base_device::DEVICE_CPU>
bool ModuleBase::PGemmCN< T, Device >::gatherC = true

whether gather C_local to C_global

◆ isend_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<T> ModuleBase::PGemmCN< T, Device >::isend_tmp_
private

temporary memory for sending data

◆ LDA

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::LDA = 0

leading dimension of A in each proc

◆ LDB

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::LDB = 0

leading dimension of B in each proc

◆ LDC

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::LDC = 0

leading dimension of C, which can be C_local or C_global

◆ max_colA

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::max_colA = 0

maximum number of columns of A matrix in all procs

◆ max_colB

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::max_colB = 0

maximum number of columns of B matrix in all procs

◆ ncolA

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::ncolA = 0

number of columns of A, which is a local matrix in each proc

◆ ncolB

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::ncolB = 0

number of columns of B, which is a local matrix in each proc

◆ nrow

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::nrow = 0

number of rows of A or B

◆ recv_counts

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<int> ModuleBase::PGemmCN< T, Device >::recv_counts

receive counts for gathering C_local to C_global

◆ requests

template<typename T , typename Device = base_device::DEVICE_CPU>
std::vector<MPI_Request> ModuleBase::PGemmCN< T, Device >::requests

MPI request.

◆ row_nproc

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::row_nproc = 1

number of procs in row_world

◆ row_rank

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::row_rank = 0

rank in row_world

◆ row_world

template<typename T , typename Device = base_device::DEVICE_CPU>
MPI_Comm ModuleBase::PGemmCN< T, Device >::row_world = MPI_COMM_NULL

row communicator world

◆ size_C_global

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::size_C_global = 0

size of C_global, which is the global C matrix gathered from all procs

◆ size_C_local

template<typename T , typename Device = base_device::DEVICE_CPU>
int ModuleBase::PGemmCN< T, Device >::size_C_local = 0

size of C_local, which is a local matrix in each proc


The documentation for this class was generated from the following files: