23template <
typename T,
typename Device = base_device::DEVICE_CPU>
58 void multiply(
const T alpha,
const T* A,
const T* B,
const T beta,
T* C);
This class is used to perform parallel matrix multiplication C = alpha * A^H * B + beta * C. Here,...
Definition para_gemm.h:25
int row_rank
rank in row_world
Definition para_gemm.h:65
int nrow
number of rows of A or B
Definition para_gemm.h:83
std::vector< int > colA_loc
[col_nproc] number of columns of A matrix in each proc
Definition para_gemm.h:68
int col_nproc
number of procs in col_world
Definition para_gemm.h:64
T * B_tmp_device_
temporary memory for B
Definition para_gemm.h:111
T * C_local_tmp_
temporary memory for C_local
Definition para_gemm.h:109
std::vector< T > isend_tmp_
temporary memory for sending data
Definition para_gemm.h:104
~PGemmCN()
Definition para_gemm.cpp:13
std::vector< MPI_Request > requests
MPI request.
Definition para_gemm.h:73
std::vector< T > A_tmp_
temporary memory for A
Definition para_gemm.h:105
void multiply_row(const T alpha, const T *A, const T *B, const T beta, T *C)
for mode = 3
Definition para_gemm.cpp:323
void multiply(const T alpha, const T *A, const T *B, const T beta, T *C)
calculate C = alpha * A^H * B + beta * C
Definition para_gemm.cpp:147
bool divideCrow
whether to divide C_global into C_local
Definition para_gemm.h:79
int max_colB
maximum number of columns of B matrix in all procs
Definition para_gemm.h:71
bool gatherC
whether to gather C_local into C_global
Definition para_gemm.h:78
MPI_Comm row_world
row communicator world
Definition para_gemm.h:61
void set_dimension(MPI_Comm comm_col, MPI_Comm comm_row, const int ncolA, const int LDA, const int ncolB, const int LDB, const int nrow, const int LDC, const int mode=1)
set the dimension of A, B, and C
Definition para_gemm.cpp:23
std::vector< T > B_tmp_
temporary memory for B
Definition para_gemm.h:106
int col_rank
rank in col_world
Definition para_gemm.h:63
int size_C_global
size of C_global, which is the global C matrix gathered from all procs
Definition para_gemm.h:77
PGemmCN()
Definition para_gemm.cpp:9
MPI_Comm col_world
column communicator world
Definition para_gemm.h:60
int ncolB
number of columns of B, which is a local matrix in each proc
Definition para_gemm.h:82
std::vector< T > C_tmp_
temporary memory for C
Definition para_gemm.h:107
void multiply_col(const T alpha, const T *A, const T *B, const T beta, T *C)
for mode = 1 or 2
Definition para_gemm.cpp:191
T * A_tmp_device_
temporary memory for A
Definition para_gemm.h:110
void multiply_single(const T alpha, const T *A, const T *B, const T beta, T *C)
for col_nproc == 1
Definition para_gemm.cpp:171
int max_colA
maximum number of columns of A matrix in all procs
Definition para_gemm.h:69
int LDC
leading dimension of C, which can be C_local or C_global
Definition para_gemm.h:86
std::vector< int > recv_counts
receive counts for gathering C_local to C_global
Definition para_gemm.h:74
int LDA
leading dimension of A in each proc
Definition para_gemm.h:84
std::vector< int > displs
displacements for gathering C_local to C_global
Definition para_gemm.h:75
std::vector< int > colB_loc
[col_nproc] number of columns of B matrix in each proc
Definition para_gemm.h:70
std::vector< T > C_global_tmp_
temporary memory for C_global
Definition para_gemm.h:108
int ncolA
number of columns of A, which is a local matrix in each proc
Definition para_gemm.h:81
int LDB
leading dimension of B in each proc
Definition para_gemm.h:85
int row_nproc
number of procs in row_world
Definition para_gemm.h:66
int size_C_local
size of C_local, which is a local matrix in each proc
Definition para_gemm.h:76
#define T
Definition exp.cpp:237
Definition array_pool.h:6
base device SOURCES math_dngvd_test cpp endif() if(ENABLE_GOOGLEBENCH) AddTest(TARGET PERF_MODULE_HSOLVER_KERNELS LIBS parameter $
Definition CMakeLists.txt:10
Definition memory_op.h:77
Definition memory_op.h:17
Definition memory_op.h:45