This class performs the parallel matrix multiplication C = alpha * A^H * B + beta * C. Here, A and B are local matrices in each proc; C can be C_local or C_global, depending on the value of gatherC. C_local is a local matrix in each proc. C_global is a global matrix gathered from all procs, and all procs have their own C_global matrix with the same values. C_global and C_local have the same LDC, but different column numbers. More...

#include <para_gemm.h>

Collaboration diagram for ModuleBase::PGemmCN< T, Device >:

Public Member Functions
	PGemmCN ()

	~PGemmCN ()

void	set_dimension (MPI_Comm comm_col, MPI_Comm comm_row, const int ncolA, const int LDA, const int ncolB, const int LDB, const int nrow, const int LDC, const int mode=1)
	set the dimension of A, B, and C

void	multiply (const T alpha, const T *A, const T *B, const T beta, T *C)
	calculate C = alpha * A^H * B + beta * C

Public Attributes
MPI_Comm	col_world = MPI_COMM_NULL
	column communicator world

MPI_Comm	row_world = MPI_COMM_NULL
	row communicator world

int	col_rank = 0
	rank in col_world

int	col_nproc = 1
	number of procs in col_world

int	row_rank = 0
	rank in row_world

int	row_nproc = 1
	number of procs in row_world

std::vector< int >	colA_loc
	[col_nproc] number of columns of A matrix in each proc

int	max_colA = 0
	maximum number of columns of A matrix in all procs

std::vector< int >	colB_loc
	[col_nproc] number of columns of B matrix in each proc

int	max_colB = 0
	maximum number of columns of B matrix in all procs

std::vector< MPI_Request >	requests
	MPI request.

std::vector< int >	recv_counts
	receive counts for gathering C_local to C_global

std::vector< int >	displs
	displacements for gathering C_local to C_global

int	size_C_local = 0
	size of C_local, which is a local matrix in each proc

int	size_C_global = 0
	size of C_global, which is the global C matrix gathered from all procs

bool	gatherC = true
	whether to gather C_local into C_global

bool	divideCrow = false
	whether to divide C_global into C_local

int	ncolA = 0
	number of columns of A, which is a local matrix in each proc

int	ncolB = 0
	number of columns of B, which is a local matrix in each proc

int	nrow = 0
	number of rows of A or B

int	LDA = 0
	leading dimension of A in each proc

int	LDB = 0
	leading dimension of B in each proc

int	LDC = 0
	leading dimension of C, which can be C_local or C_global

Private Types
using	resmem_dev_op = base_device::memory::resize_memory_op< T, Device >

using	delmem_dev_op = base_device::memory::delete_memory_op< T, Device >

using	syncmem_dev_op = base_device::memory::synchronize_memory_op< T, Device, Device >

using	syncmem_d2h_op = base_device::memory::synchronize_memory_op< T, base_device::DEVICE_CPU, Device >

using	syncmem_h2d_op = base_device::memory::synchronize_memory_op< T, Device, base_device::DEVICE_CPU >

Private Member Functions
void	multiply_single (const T alpha, const T *A, const T *B, const T beta, T *C)
	for col_nproc == 1

void	multiply_col (const T alpha, const T *A, const T *B, const T beta, T *C)
	for mode = 1 or 2

void	multiply_row (const T alpha, const T *A, const T *B, const T beta, T *C)
	for mode = 3

Private Attributes
std::vector< T >	isend_tmp_
	temporary memory for sending data

std::vector< T >	A_tmp_
	temporary memory for A

std::vector< T >	B_tmp_
	temporary memory for B

std::vector< T >	C_tmp_
	temporary memory for C

std::vector< T >	C_global_tmp_
	temporary memory for C_global

T *	C_local_tmp_ = nullptr
	temporary memory for C_local

T *	A_tmp_device_ = nullptr
	temporary memory for A

T *	B_tmp_device_ = nullptr
	temporary memory for B

Detailed Description

template<typename T, typename Device = base_device::DEVICE_CPU>
class ModuleBase::PGemmCN< T, Device >

This class performs the parallel matrix multiplication C = alpha * A^H * B + beta * C. Here, A and B are local matrices in each proc; C can be C_local or C_global, depending on the value of gatherC. C_local is a local matrix in each proc. C_global is a global matrix gathered from all procs, and all procs have their own C_global matrix with the same values. C_global and C_local have the same LDC, but different column numbers.

Member Typedef Documentation

◆ delmem_dev_op

template<typename T , typename Device = base_device::DEVICE_CPU>

using ModuleBase::PGemmCN< T, Device >::delmem_dev_op = base_device::memory::delete_memory_op<T, Device>

private

◆ resmem_dev_op

template<typename T , typename Device = base_device::DEVICE_CPU>

using ModuleBase::PGemmCN< T, Device >::resmem_dev_op = base_device::memory::resize_memory_op<T, Device>

private

◆ syncmem_d2h_op

template<typename T , typename Device = base_device::DEVICE_CPU>

using ModuleBase::PGemmCN< T, Device >::syncmem_d2h_op = base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>

private

◆ syncmem_dev_op

template<typename T , typename Device = base_device::DEVICE_CPU>

using ModuleBase::PGemmCN< T, Device >::syncmem_dev_op = base_device::memory::synchronize_memory_op<T, Device, Device>

private

◆ syncmem_h2d_op

template<typename T , typename Device = base_device::DEVICE_CPU>

using ModuleBase::PGemmCN< T, Device >::syncmem_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>

private

Constructor & Destructor Documentation

◆ PGemmCN()

template<typename T , typename Device >

ModuleBase::PGemmCN< T, Device >::PGemmCN ( )

◆ ~PGemmCN()

template<typename T , typename Device >

ModuleBase::PGemmCN< T, Device >::~PGemmCN ( )

Member Function Documentation

◆ multiply()

template<typename T , typename Device >

void ModuleBase::PGemmCN< T, Device >::multiply	(	const T	alpha,
		const T *	A,
		const T *	B,
		const T	beta,
		T *	C
	)

calculate C = alpha * A^H * B + beta * C

Here is the call graph for this function:

Here is the caller graph for this function:

◆ multiply_col()

template<typename T , typename Device >

void ModuleBase::PGemmCN< T, Device >::multiply_col	(	const T	alpha,
		const T *	A,
		const T *	B,
		const T	beta,
		T *	C
	)

private

for mode = 1 or 2

Here is the call graph for this function:

◆ multiply_row()

template<typename T , typename Device >

void ModuleBase::PGemmCN< T, Device >::multiply_row	(	const T	alpha,
		const T *	A,
		const T *	B,
		const T	beta,
		T *	C
	)

private

for mode = 3

◆ multiply_single()

template<typename T , typename Device >

void ModuleBase::PGemmCN< T, Device >::multiply_single	(	const T	alpha,
		const T *	A,
		const T *	B,
		const T	beta,
		T *	C
	)

private

for col_nproc == 1

◆ set_dimension()

template<typename T , typename Device >

void ModuleBase::PGemmCN< T, Device >::set_dimension	(	MPI_Comm	comm_col,
		MPI_Comm	comm_row,
		const int	ncolA,
		const int	LDA,
		const int	ncolB,
		const int	LDB,
		const int	nrow,
		const int	LDC,
		const int	mode = 1
	)

set the dimension of A, B, and C

Parameters

ncolA	number of columns of A, which is a local matrix in each proc
LDA	leading dimension of A in each proc
ncolB	number of columns of B, which is a local matrix in each proc
LDB	leading dimension of B in each proc
nrow	number of rows of A or B
LDC	leading dimension of C. C can be C_local or C_global
mode	1: gather C_local to C_global, 2:C_local(nrow * ncol_loc), 3:C_global(nrow_loc * ncol)

Here is the caller graph for this function:

Member Data Documentation

◆ A_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<T> ModuleBase::PGemmCN< T, Device >::A_tmp_

private

temporary memory for A

◆ A_tmp_device_

template<typename T , typename Device = base_device::DEVICE_CPU>

T* ModuleBase::PGemmCN< T, Device >::A_tmp_device_ = nullptr

private

temporary memory for A

◆ B_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<T> ModuleBase::PGemmCN< T, Device >::B_tmp_

private

temporary memory for B

◆ B_tmp_device_

template<typename T , typename Device = base_device::DEVICE_CPU>

T* ModuleBase::PGemmCN< T, Device >::B_tmp_device_ = nullptr

private

temporary memory for B

◆ C_global_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<T> ModuleBase::PGemmCN< T, Device >::C_global_tmp_

private

temporary memory for C_global

◆ C_local_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>

T* ModuleBase::PGemmCN< T, Device >::C_local_tmp_ = nullptr

private

temporary memory for C_local

◆ C_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<T> ModuleBase::PGemmCN< T, Device >::C_tmp_

private

temporary memory for C

◆ col_nproc

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::col_nproc = 1

number of procs in col_world

◆ col_rank

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::col_rank = 0

rank in col_world

◆ col_world

template<typename T , typename Device = base_device::DEVICE_CPU>

MPI_Comm ModuleBase::PGemmCN< T, Device >::col_world = MPI_COMM_NULL

column communicator world

◆ colA_loc

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<int> ModuleBase::PGemmCN< T, Device >::colA_loc

[col_nproc] number of columns of A matrix in each proc

◆ colB_loc

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<int> ModuleBase::PGemmCN< T, Device >::colB_loc

[col_nproc] number of columns of B matrix in each proc

◆ displs

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<int> ModuleBase::PGemmCN< T, Device >::displs

displacements for gathering C_local to C_global

◆ divideCrow

template<typename T , typename Device = base_device::DEVICE_CPU>

bool ModuleBase::PGemmCN< T, Device >::divideCrow = false

whether to divide C_global into C_local

◆ gatherC

template<typename T , typename Device = base_device::DEVICE_CPU>

bool ModuleBase::PGemmCN< T, Device >::gatherC = true

whether to gather C_local into C_global

◆ isend_tmp_

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<T> ModuleBase::PGemmCN< T, Device >::isend_tmp_

private

temporary memory for sending data

◆ LDA

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::LDA = 0

leading dimension of A in each proc

◆ LDB

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::LDB = 0

leading dimension of B in each proc

◆ LDC

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::LDC = 0

leading dimension of C, which can be C_local or C_global

◆ max_colA

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::max_colA = 0

maximum number of columns of A matrix in all procs

◆ max_colB

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::max_colB = 0

maximum number of columns of B matrix in all procs

◆ ncolA

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::ncolA = 0

number of columns of A, which is a local matrix in each proc

◆ ncolB

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::ncolB = 0

number of columns of B, which is a local matrix in each proc

◆ nrow

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::nrow = 0

number of rows of A or B

◆ recv_counts

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<int> ModuleBase::PGemmCN< T, Device >::recv_counts

receive counts for gathering C_local to C_global

◆ requests

template<typename T , typename Device = base_device::DEVICE_CPU>

std::vector<MPI_Request> ModuleBase::PGemmCN< T, Device >::requests

MPI request.

◆ row_nproc

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::row_nproc = 1

number of procs in row_world

◆ row_rank

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::row_rank = 0

rank in row_world

◆ row_world

template<typename T , typename Device = base_device::DEVICE_CPU>

MPI_Comm ModuleBase::PGemmCN< T, Device >::row_world = MPI_COMM_NULL

row communicator world

◆ size_C_global

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::size_C_global = 0

size of C_global, which is the global C matrix gathered from all procs

◆ size_C_local

template<typename T , typename Device = base_device::DEVICE_CPU>

int ModuleBase::PGemmCN< T, Device >::size_C_local = 0

size of C_local, which is a local matrix in each proc

The documentation for this class was generated from the following files:

/home/runner/work/abacus-develop/abacus-develop/source/source_base/para_gemm.h
/home/runner/work/abacus-develop/abacus-develop/source/source_base/para_gemm.cpp

Public Member Functions

Public Attributes

Private Types

Private Member Functions

Private Attributes

Detailed Description

Member Typedef Documentation

◆ delmem_dev_op

◆ resmem_dev_op

◆ syncmem_d2h_op

◆ syncmem_dev_op

◆ syncmem_h2d_op

Constructor & Destructor Documentation

◆ PGemmCN()

◆ ~PGemmCN()

Member Function Documentation

◆ multiply()

◆ multiply_col()

◆ multiply_row()

◆ multiply_single()

◆ set_dimension()

Member Data Documentation

◆ A_tmp_

◆ A_tmp_device_

◆ B_tmp_

◆ B_tmp_device_

◆ C_global_tmp_

◆ C_local_tmp_

◆ C_tmp_

◆ col_nproc

◆ col_rank

◆ col_world

◆ colA_loc

◆ colB_loc

◆ displs

◆ divideCrow

◆ gatherC

◆ isend_tmp_

◆ LDA

◆ LDB

◆ LDC

◆ max_colA

◆ max_colB

◆ ncolA

◆ ncolB

◆ nrow

◆ recv_counts

◆ requests

◆ row_nproc

◆ row_rank

◆ row_world

◆ size_C_global

◆ size_C_local