SuperLU Distributed 9.0.0
gpu3d
|
Definitions which are precision-neutral. More...
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <stdint.h>
#include "superlu_dist_config.h"
#include "oneside.h"
#include "gpu_api_utils.h"
#include "superlu_FortranCInterface.h"
#include "superlu_FCnames.h"
#include "superlu_enum_consts.h"
#include "supermatrix.h"
#include "util_dist.h"
#include "psymbfact.h"
Go to the source code of this file.
Classes | |
struct | superlu_scope_t |
struct | gridinfo_t |
struct | gridinfo3d_t |
struct | Glu_persist_t |
struct | Glu_freeable_t |
struct | pxgstrs_comm_t |
struct | superlu_dist_options_t |
struct | superlu_dist_mem_usage_t |
struct | Ucb_indptr_t |
struct | Ublock_info_t |
struct | Remain_info_t |
struct | etree_node |
struct | superlu_pair |
struct | uPanelInfo_t |
struct | lPanelInfo_t |
struct | packLUInfo_t |
struct | HyP_t |
struct | local_l_blk_info_t |
struct | local_u_blk_info_t |
struct | perm_array_t |
struct | factStat_t |
struct | d2Hreduce_t |
struct | treeList_t |
struct | treeTopoInfo_t |
struct | gEtreeInfo_t |
struct | sForest_t |
struct | commRequests_t |
struct | factNodelists_t |
struct | msgs_t |
struct | xtrsTimer_t |
struct | C_Tree |
Macros | |
#define | SUPERLU_DIST_MAJOR_VERSION 9 |
#define | SUPERLU_DIST_MINOR_VERSION 0 |
#define | SUPERLU_DIST_PATCH_VERSION 0 |
#define | SUPERLU_DIST_RELEASE_DATE "May 8, 2024" |
#define | GPU_ACC |
#define | mpi_int_t MPI_LONG_LONG_INT |
#define | IFMT " %lld" |
#define | SuperLU_MPI_COMPLEX MPI_C_COMPLEX |
#define | SuperLU_MPI_DOUBLE_COMPLEX MPI_C_DOUBLE_COMPLEX |
#define | ISORT /* NOTE: qsort() has bug on Mac */ |
#define | MAX_SUPER_SIZE 512 /* Sherry: moved from superlu_gpu.cu */ |
#define | BC_HEADER 2 |
#define | LB_DESCRIPTOR 2 |
#define | BR_HEADER 3 |
#define | UB_DESCRIPTOR 2 |
#define | BC_HEADER_NEWU 3 |
#define | UB_DESCRIPTOR_NEWU 2 |
#define | UB_DESCRIPTOR_NEWUCPP 3 |
#define | NBUFFERS 5 |
#define | NTAGS INT_MAX |
#define | UjROW 10 |
#define | UkSUB 11 |
#define | UkVAL 12 |
#define | LkSUB 13 |
#define | LkVAL 14 |
#define | LkkDIAG 15 |
#define | XK_H 2 /* The header preceding each X block. */ |
#define | LSUM_H 2 /* The header preceding each MOD block. */ |
#define | GSUM 20 |
#define | Xk 21 |
#define | Yk 22 |
#define | LSUM 23 |
#define | COMM_ALL 100 |
#define | COMM_COLUMN 101 |
#define | COMM_ROW 102 |
#define | SUPER_LINEAR 11 |
#define | SUPER_BLOCK 12 |
#define | NO_MARKER 3 |
#define | IAM(comm) { int rank; MPI_Comm_rank ( comm, &rank ); rank}; |
#define | MYROW(iam, grid) ( (iam) / grid->npcol ) |
#define | MYCOL(iam, grid) ( (iam) % grid->npcol ) |
#define | BlockNum(i) ( supno[i] ) |
#define | FstBlockC(bnum) ( xsup[bnum] ) |
#define | SuperSize(bnum) ( xsup[bnum+1]-xsup[bnum] ) |
#define | LBi(bnum, grid) ( (bnum)/grid->nprow )/* Global to local block rowwise */ |
#define | LBj(bnum, grid) ( (bnum)/grid->npcol )/* Global to local block columnwise*/ |
#define | PROW(bnum, grid) ( (bnum) % grid->nprow ) |
#define | PCOL(bnum, grid) ( (bnum) % grid->npcol ) |
#define | PNUM(i, j, grid) ( (i)*grid->npcol + j ) /* Process number at coord(i,j) */ |
#define | CEILING(a, b) ( ((a)%(b)) ? ((a)/(b) + 1) : ((a)/(b)) ) |
#define | RHS_ITERATE(i) for (i = 0; i < nrhs; ++i) |
#define | X_BLK(i) ilsum[i] * nrhs + (i+1) * XK_H |
#define | XT_BLK(i) ilsumT[i] * nrhs + (i+1) * XK_H |
#define | LSUM_BLK(i) ilsum[i] * nrhs + (i+1) * LSUM_H |
#define | SuperLU_timer_ SuperLU_timer_dist_ |
#define | LOG2(x) (log10((double) x) / log10(2.0)) |
#define | VT_TRACEON |
#define | VT_TRACEOFF |
#define | SUPERLU_DIST_EXPORT |
#define | MAGMA_CONST |
#define | DIM_X 32 |
#define | DIM_Y 16 |
#define | DIM_XA DIM_X |
#define | DIM_YA DIM_Y |
#define | DIM_XB DIM_X |
#define | DIM_YB DIM_Y |
#define | WARP_SIZE 32 |
#define | NWARP DIM_X*DIM_Y/WARP_SIZE |
#define | THR_M ( BLK_M / DIM_X ) |
#define | THR_N ( BLK_N / DIM_Y ) |
#define | fetch(A, m, n, bound) offs_d##A[min(n*LD##A+m, bound)] |
#define | fma(A, B, C) C += (A*B) |
#define | HANDLE_SIZE 8 |
#define | cmax(a, b) ((a) > (b) ? (a) : (b)) |
#define | SLU_MPI_TAG(id, num) ( (6*(num)+id) % tag_ub ) |
#define | __SUPERLU_ASYNC_TREE |
#define | DEG_TREE 2 |
Typedefs | |
typedef int64_t | int_t |
typedef int64_t | handle_t |
typedef enum treePartStrat | treePartStrat |
typedef struct xtrsTimer_t | xtrsTimer_t |
typedef enum trtype_t | trtype_t |
Enumerations | |
enum | treePartStrat { ND , GD } |
enum | SupernodeToGridMap_t { NOT_IN_GRID , IN_GRID_ZERO , IN_GRID_AIJ } |
enum | trtype_t { UPPER_TRI , LOWER_TRI } |
Functions | |
void | superlu_gridinit (MPI_Comm, int, int, gridinfo_t *) |
All processes in the MPI communicator must call this routine. More... | |
void | superlu_gridmap (MPI_Comm, int, int, int[], int, gridinfo_t *) |
All processes in the MPI communicator must call this routine. More... | |
void | superlu_gridexit (gridinfo_t *) |
void | superlu_gridinit3d (MPI_Comm Bcomm, int nprow, int npcol, int npdep, gridinfo3d_t *grid) |
All processes in the MPI communicator must call this routine. More... | |
void | superlu_gridmap3d (MPI_Comm, int, int, int, int[], gridinfo3d_t *) |
All processes in the MPI communicator must call this routine. On output, if a process is not in the SuperLU group, the following values are assigned to it: grid->comm = MPI_COMM_NULL and grid->iam = -1. More... | |
void | superlu_gridexit3d (gridinfo3d_t *grid) |
void | set_default_options_dist (superlu_dist_options_t *) |
Set the default values for the options argument. More... | |
void | print_options_dist (superlu_dist_options_t *) |
Print the options setting. More... | |
void | print_sp_ienv_dist (superlu_dist_options_t *) |
Print the blocking parameters. More... | |
void | Destroy_CompCol_Matrix_dist (SuperMatrix *) |
void | Destroy_SuperNode_Matrix_dist (SuperMatrix *) |
void | Destroy_SuperMatrix_Store_dist (SuperMatrix *) |
Deallocate the structure pointing to the actual storage of the matrix. More... | |
void | Destroy_CompCol_Permuted_dist (SuperMatrix *) |
A is of type Stype==NCP. More... | |
void | Destroy_CompRowLoc_Matrix_dist (SuperMatrix *) |
void | Destroy_CompRow_Matrix_dist (SuperMatrix *) |
void | sp_colorder (superlu_dist_options_t *, SuperMatrix *, int_t *, int_t *, SuperMatrix *) |
int | sp_symetree_dist (int_t *, int_t *, int_t *, int_t, int_t *) |
Symmetric elimination tree. More... | |
int | sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t, int_t *) |
Nonsymmetric elimination tree. More... | |
void | get_perm_c_dist (int_t, int_t, SuperMatrix *, int_t *) |
void | get_perm_c_batch (superlu_dist_options_t *options, int batchCount, handle_t *SparseMatrix_handles, int **CpivPtr) |
Gets sparsity permutations for a batch of matrices. More... | |
void | at_plus_a_dist (const int_t, const int_t, int_t *, int_t *, int_t *, int_t **, int_t **) |
void | getata_dist (const int_t m, const int_t n, const int_t nz, int_t *colptr, int_t *rowind, int_t *atanz, int_t **ata_colptr, int_t **ata_rowind) |
void | get_metis_dist (int_t n, int_t bnz, int_t *b_colptr, int_t *b_rowind, int_t *perm_c) |
void | get_colamd_dist (const int m, const int n, const int nnz, int_t *colptr, int_t *rowind, int_t *perm_c) |
int | genmmd_dist_ (int_t *, int_t *, int_t *a, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *) |
void | bcast_tree (void *, int, MPI_Datatype, int, int, gridinfo_t *, int, int *) |
int_t | symbfact (superlu_dist_options_t *, int, SuperMatrix *, int_t *, int_t *, Glu_persist_t *, Glu_freeable_t *) |
int_t | symbfact_SubInit (superlu_dist_options_t *options, fact_t, void *, int_t, int_t, int_t, int_t, Glu_persist_t *, Glu_freeable_t *) |
int_t | symbfact_SubXpand (int_t, int_t, int_t, MemType, int_t *, Glu_freeable_t *) |
int | symbfact_SubFree (Glu_freeable_t *) |
void | countnz_dist (const int_t, int_t *, int_t *, int_t *, Glu_persist_t *, Glu_freeable_t *) |
int64_t | fixupL_dist (const int_t, const int_t *, Glu_persist_t *, Glu_freeable_t *) |
int_t * | TreePostorder_dist (int_t, int_t *) |
float | smach_dist (const char *) |
double | dmach_dist (const char *) |
void * | superlu_malloc_dist (size_t) |
void | superlu_free_dist (void *) |
int * | int32Malloc_dist (int) |
int * | int32Calloc_dist (int) |
int_t * | intMalloc_dist (int_t) |
int_t * | intCalloc_dist (int_t) |
int | mc64id_dist (int *) |
void | arrive_at_ublock (int, int_t *, int_t *, int *, int *, int *, int_t, int_t, int_t *, int_t *, int_t *, gridinfo_t *) |
int_t | estimate_bigu_size (int_t, int_t **, Glu_persist_t *, gridinfo_t *, int_t *, int_t *) |
void | superlu_abort_and_exit_dist (char *) |
int | sp_ienv_dist (int, superlu_dist_options_t *) |
void | ifill_dist (int_t *, int_t, int_t) |
Fills an integer array with a given value. More... | |
void | super_stats_dist (int_t, int_t *) |
void | get_diag_procs (int_t, Glu_persist_t *, gridinfo_t *, int_t *, int_t **, int_t **) |
int_t | QuerySpace_dist (int_t, int_t, Glu_freeable_t *, superlu_dist_mem_usage_t *) |
int | xerr_dist (char *, int *) |
void | pxerr_dist (char *, gridinfo_t *, int_t) |
void | PStatInit (SuperLUStat_t *) |
void | PStatClear (SuperLUStat_t *) |
void | PStatFree (SuperLUStat_t *) |
void | PStatPrint (superlu_dist_options_t *, SuperLUStat_t *, gridinfo_t *) |
void | log_memory (int64_t, SuperLUStat_t *) |
void | print_memorylog (SuperLUStat_t *, char *) |
int | superlu_dist_GetVersionNumber (int *, int *, int *) |
void | quickSort (int_t *, int_t, int_t, int_t) |
void | quickSortM (int_t *, int_t, int_t, int_t, int_t, int_t) |
int_t | partition (int_t *, int_t, int_t, int_t) |
int_t | partitionM (int_t *, int_t, int_t, int_t, int_t, int_t) |
int | compareInt_t (void *a, void *b) |
Compares two integers for equality. More... | |
int | compareInt (void *a, void *b) |
Compares two integers for equality. More... | |
int | compareDouble (void *a, void *b) |
Compares two doubles for equality. More... | |
int | dist_checkArrayEq (void *arr, int length, MPI_Datatype datatype, int src_rank, int dest_rank, MPI_Comm communicator, int(*compare)(void *, void *)) |
Checks whether arrays at two MPI ranks are identical. More... | |
float | symbfact_dist (superlu_dist_options_t *, int, int, SuperMatrix *, int_t *, int_t *, int_t *, int_t *, Pslu_freeable_t *, MPI_Comm *, MPI_Comm *, superlu_dist_mem_usage_t *) |
float | get_perm_c_parmetis (SuperMatrix *, int_t *, int_t *, int, int, int_t **, int_t **, gridinfo_t *, MPI_Comm *) |
int_t | psymbfact_LUXpandMem (int, int_t, int_t, int_t, int_t, int, int, int, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *) |
int_t | psymbfact_LUXpand (int_t, int_t, int_t, int_t, int_t *, int_t, int_t, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *) |
int_t | psymbfact_LUXpand_RL (int_t, int_t, int_t, int_t, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *) |
int_t | psymbfact_prLUXpand (int_t, int_t, int, Llu_symbfact_t *, psymbfact_stat_t *) |
void | isort (int_t N, int_t *ARRAY1, int_t *ARRAY2) |
void | isort1 (int_t N, int_t *ARRAY) |
void | gemm_division_cpu_gpu (superlu_dist_options_t *, int *, int *, int *, int, int, int, int *, int, int_t) |
int_t | get_gpublas_nb (void) |
int_t | get_num_gpu_streams (void) |
int | getnGPUStreams (void) |
int | get_mpi_process_per_gpu (void) |
void | printGPUStats (int nsupers, SuperLUStat_t *stat, gridinfo3d_t *) |
double | estimate_cpu_time (int m, int n, int k) |
int | get_thread_per_process (void) |
int_t | get_max_buffer_size (void) |
int_t | get_min (int_t *, int_t) |
int | compare_pair (const void *, const void *) |
int_t | static_partition (struct superlu_pair *, int_t, int_t *, int_t, int_t *, int_t *, int) |
int | get_acc_offload (superlu_dist_options_t *) |
int | get_acc_solve (void) |
int | get_new3dsolve (void) |
int | get_new3dsolvetreecomm (void) |
void | print_panel_seg_dist (int_t, int_t, int_t, int_t, int_t *, int_t *) |
Diagnostic print of segment info after panel_dfs(). More... | |
void | check_repfnz_dist (int_t, int_t, int_t, int_t *) |
Check whether repfnz[] == SLU_EMPTY after reset. More... | |
int_t | CheckZeroDiagonal (int_t, int_t *, int_t *, int_t *) |
int | check_perm_dist (char *what, int_t n, int_t *perm) |
void | PrintDouble5 (char *, int_t, double *) |
void | PrintInt10 (char *, int_t, int_t *) |
void | PrintInt32 (char *, int, int *) |
int | file_PrintInt10 (FILE *, char *, int_t, int_t *) |
int | file_PrintInt32 (FILE *, char *, int, int *) |
int | file_PrintLong10 (FILE *, char *, int_t, int_t *) |
void | C_RdTree_Create_nv (C_Tree *tree, MPI_Comm comm, int *ranks, int rank_cnt, int msgSize, char precision, int *needrecvrd, int *needsendrd) |
void | C_RdTree_Nullify (C_Tree *tree) |
yes_no_t | C_RdTree_IsRoot (C_Tree *tree) |
void | C_RdTree_forwardMessageSimple (C_Tree *Tree, void *localBuffer, int msgSize) |
void | C_RdTree_waitSendRequest (C_Tree *Tree) |
void | C_BcTree_Create_nv (C_Tree *tree, MPI_Comm comm, int *ranks, int rank_cnt, int msgSize, char precision, int *needrecv) |
void | C_BcTree_Nullify (C_Tree *tree) |
yes_no_t | C_BcTree_IsRoot (C_Tree *tree) |
void | C_BcTree_forwardMessageSimple (C_Tree *tree, void *localBuffer, int msgSize) |
void | C_BcTree_waitSendRequest (C_Tree *tree) |
void | DistPrint (char *function_name, double value, char *Units, gridinfo_t *grid) |
void | DistPrint3D (char *function_name, double value, char *Units, gridinfo3d_t *grid3d) |
void | treeImbalance3D (gridinfo3d_t *grid3d, SCT_t *SCT) |
void | slu_SCT_printComm3D (gridinfo3d_t *grid3d, SCT_t *SCT) |
int_t | zAllocBcast (int_t size, void **ptr, gridinfo3d_t *grid3d) |
int_t | zAllocBcast_gridID (int_t size, void **ptr, int_t gridID, gridinfo3d_t *grid3d) |
void | permCol_SymbolicFact3d (superlu_dist_options_t *options, int n, SuperMatrix *GA, int_t *perm_c, int_t *etree, Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable, SuperLUStat_t *stat, superlu_dist_mem_usage_t *symb_mem_usage, gridinfo3d_t *grid3d) |
This function performs the symbolic factorization on matrix Pc*Pr*A*Pc' and sets up the nonzero data structures for L & U matrices. In the process, the matrix is also ordered and its memory usage information is fetched. More... | |
SupernodeToGridMap_t * | createSuperGridMap (int_t nsuper, int_t maxLvl, int_t *myTreeIdxs, int_t *myZeroTrIdxs, int_t *gNodeCount, int_t **gNodeLists) |
int_t * | createSupernode2TreeMap (int_t nsupers, int_t maxLvl, int_t *gNodeCount, int_t **gNodeLists) |
void | allocBcastArray (void **array, int_t size, int root, MPI_Comm comm) |
Allocates and broadcasts an array in a MPI environment. More... | |
void | allocBcastLargeArray (void **array, int64_t size, int root, MPI_Comm comm) |
int_t * | create_iperm_c_supno (int_t nsupers, superlu_dist_options_t *options, Glu_persist_t *Glu_persist, int_t *etree, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo3d_t *grid3d) |
gEtreeInfo_t | fillEtreeInfo (int_t nsupers, int_t *setree, treeList_t *treeList) |
sForest_t ** | compute_sForests (int_t nsupers, Glu_persist_t *Glu_persist, int_t *etree, gridinfo3d_t *grid3d) |
int * | getBrecvTree (int_t nlb, sForest_t *sforest, int *bmod, gridinfo_t *grid) |
int * | getBrecvTree_newsolve (int_t nlb, int_t nsupers, int *supernodeMask, int *bmod, gridinfo_t *grid) |
int | getNrootUsolveTree (int_t *nbrecvmod, sForest_t *sforest, int *brecv, int *bmod, gridinfo_t *grid) |
int | getNbrecvX (sForest_t *sforest, int_t *Urbs, gridinfo_t *grid) |
int | getNbrecvX_newsolve (int_t nsupers, int *supernodeMask, int_t *Urbs, Ucb_indptr_t **Ucb_indptr, gridinfo_t *grid) |
int | getNrootUsolveTree_newsolve (int_t *nbrecvmod, int_t nsupers, int *supernodeMask, int *brecv, int *bmod, gridinfo_t *grid) |
int_t | getNfrecvmodLeaf (int *nleaf, sForest_t *sforest, int *frecv, int *fmod, gridinfo_t *grid) |
int_t | getNfrecvmod_newsolve (int *nleaf, int_t nsupers, int *supernodeMask, int *frecv, int *fmod, gridinfo_t *grid) |
int * | getfrecv_newsolve (int_t nsupers, int *supernodeMask, int_t nlb, int *fmod, int *mod_bit, gridinfo_t *grid) |
int * | getfrecvLeaf (sForest_t *sforest, int_t nlb, int *fmod, int *mod_bit, gridinfo_t *grid) |
int | getNfrecvx_newsolve (int_t nsupers, int *supernodeMask, int_t **Lrowind_bc_ptr, int_t **Lindval_loc_bc_ptr, gridinfo_t *grid) |
int | getNfrecvxLeaf (sForest_t *sforest, int_t **Lrowind_bc_ptr, gridinfo_t *grid) |
int * | getfmod_newsolve (int_t nlb, int_t nsupers, int *supernodeMask, int_t **Lrowind_bc_ptr, int_t **Lindval_loc_bc_ptr, gridinfo_t *grid) |
int * | getfmodLeaf (int_t nlb, int *fmod_i) |
int | getldu (int_t knsupc, int_t iklrow, int_t *usub) |
int * | getBmod3d (int_t treeId, int_t nlb, sForest_t *sforest, int_t *xsup, int_t **Ufstnz_br_ptr, int_t *supernode2treeMap, gridinfo_t *grid) |
int * | getBmod3d_newsolve (int_t nlb, int_t nsupers, int *supernodeMask, int_t *xsup, int_t **Ufstnz_br_ptr, gridinfo_t *grid) |
int_t * | getPerm_c_supno (int_t nsupers, superlu_dist_options_t *, int_t *etree, Glu_persist_t *Glu_persist, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo_t *) |
int_t * | getPerm_c_supno_allgrid (int_t nsupers, superlu_dist_options_t *options, int_t *etree, Glu_persist_t *Glu_persist, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo3d_t *grid3d) |
void | slu_SCT_init (SCT_t *) |
void | slu_SCT_print (gridinfo_t *grid, SCT_t *SCT) |
void | slu_SCT_printSummary (gridinfo_t *grid, SCT_t *SCT) |
void | slu_SCT_print3D (gridinfo3d_t *grid3d, SCT_t *SCT) |
void | slu_SCT_free (SCT_t *) |
treeList_t * | setree2list (int_t nsuper, int_t *setree) |
int | free_treelist (int_t nsuper, treeList_t *treeList) |
int_t | calcTreeWeight (int_t nsupers, int_t *setree, treeList_t *treeList, int_t *xsup) |
int_t | getDescendList (int_t k, int_t *dlist, treeList_t *treeList) |
int_t | getCommonAncestorList (int_t k, int_t *alist, int_t *seTree, treeList_t *treeList) |
int_t | getCommonAncsCount (int_t k, treeList_t *treeList) |
int_t * | getPermNodeList (int_t nnode, int_t *nlist, int_t *perm_c_sup, int_t *iperm_c_sup) |
int_t * | getEtreeLB (int_t nnodes, int_t *perm_l, int_t *gTopOrder) |
int_t * | getSubTreeRoots (int_t k, int_t *numSubtrees, treeList_t *treeList) |
int_t * | merg_perms (int_t nperms, int_t *nnodes, int_t **perms) |
int_t * | getGlobal_iperm (int_t nsupers, int_t nperms, int_t **perms, int_t *nnodes) |
int_t | log2i (int_t index) |
int_t * | supernodal_etree (int_t nsuper, int_t *etree, int_t *supno, int_t *xsup) |
int_t | testSubtreeNodelist (int_t nsupers, int_t numList, int_t **nodeList, int_t *nodeCount) |
int_t | testListPerm (int_t nodeCount, int_t *nodeList, int_t *permList, int_t *gTopLevel) |
int_t * | topological_ordering (int_t nsuper, int_t *setree) |
int_t * | Etree_LevelBoundry (int_t *perm, int_t *tsort_etree, int_t nsuper) |
int_t * | calculate_num_children (int_t nsuper, int_t *setree) |
void | Print_EtreeLevelBoundry (int_t *Etree_LvlBdry, int_t max_level, int_t nsuper) |
void | print_etree_leveled (int_t *setree, int_t *tsort_etree, int_t nsuper) |
void | print_etree (int_t *setree, int_t *iperm, int_t nsuper) |
int_t | printFileList (char *sname, int_t nnodes, int_t *dlist, int_t *setree) |
int * | getLastDepBtree (int_t nsupers, treeList_t *treeList) |
int_t * | getReplicatedTrees (gridinfo3d_t *grid3d) |
int_t * | getGridTrees (gridinfo3d_t *grid3d) |
int_t ** | getNodeList (int_t maxLvl, int_t *setree, int_t *nnodes, int_t *treeHeads, treeList_t *treeList) |
int_t * | calcNumNodes (int_t maxLvl, int_t *treeHeads, treeList_t *treeList) |
int_t * | getTreeHeads (int_t maxLvl, int_t nsupers, treeList_t *treeList) |
int_t * | getMyIperm (int_t nnodes, int_t nsupers, int_t *myPerm) |
int_t * | getMyTopOrder (int_t nnodes, int_t *myPerm, int_t *myIperm, int_t *setree) |
int_t * | getMyEtLims (int_t nnodes, int_t *myTopOrder) |
treeTopoInfo_t | getMyTreeTopoInfo (int_t nnodes, int_t nsupers, int_t *myPerm, int_t *setree) |
sForest_t ** | getNestDissForests (int_t maxLvl, int_t nsupers, int_t *setree, treeList_t *treeList) |
int_t ** | getTreePermForest (int_t *myTreeIdxs, int_t *myZeroTrIdxs, sForest_t *sForests, int_t *perm_c_supno, int_t *iperm_c_supno, gridinfo3d_t *grid3d) |
int_t ** | getTreePermFr (int_t *myTreeIdxs, sForest_t **sForests, gridinfo3d_t *grid3d) |
int_t * | getMyNodeCountsFr (int_t maxLvl, int_t *myTreeIdxs, sForest_t **sForests) |
int_t ** | getNodeListFr (int_t maxLvl, sForest_t **sForests) |
int_t * | getNodeCountsFr (int_t maxLvl, sForest_t **sForests) |
int * | getIsNodeInMyGrid (int_t nsupers, int_t maxLvl, int_t *myNodeCount, int_t **treePerm) |
void | printForestWeightCost (sForest_t **sForests, SCT_t *SCT, gridinfo3d_t *grid3d) |
sForest_t ** | getGreedyLoadBalForests (int_t maxLvl, int_t nsupers, int_t *setree, treeList_t *treeList) |
sForest_t ** | getForests (int_t maxLvl, int_t nsupers, int_t *setree, treeList_t *treeList) |
int_t | getBigUSize (superlu_dist_options_t *, int_t nsupers, gridinfo_t *grid, int_t **Lrowind_bc_ptr) |
void | getSCUweight (int_t nsupers, treeList_t *treeList, int_t *xsup, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo3d_t *grid3d) |
void | getSCUweight_allgrid (int_t nsupers, treeList_t *treeList, int_t *xsup, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo3d_t *grid3d) |
int | Wait_LUDiagSend (int_t k, MPI_Request *U_diag_blk_send_req, MPI_Request *L_diag_blk_send_req, gridinfo_t *grid, SCT_t *SCT) |
void | applyRowPerm (int_t *colptr, int_t *rowind, int_t *perm_r, int_t n) |
int | getNsupers (int n, Glu_persist_t *Glu_persist) |
int | set_tag_ub (void) |
int | getNumThreads (int) |
int_t | num_full_cols_U (int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, gridinfo_t *, int_t *, int_t *) |
int_t * | getFactPerm (int_t) |
int_t * | getFactIperm (int_t *, int_t) |
int_t | initCommRequests (commRequests_t *comReqs, gridinfo_t *grid) |
int_t | initFactStat (int nsupers, factStat_t *factStat) |
int | freeFactStat (factStat_t *factStat) |
int_t | initFactNodelists (int_t, int_t, int_t, factNodelists_t *) |
int | freeFactNodelists (factNodelists_t *fNlists) |
int_t | initMsgs (msgs_t *msgs) |
int_t | getNumLookAhead (superlu_dist_options_t *) |
commRequests_t ** | initCommRequestsArr (int_t mxLeafNode, int_t ldt, gridinfo_t *grid) |
int | freeCommRequestsArr (int_t mxLeafNode, commRequests_t **comReqss) |
msgs_t ** | initMsgsArr (int_t numLA) |
int | freeMsgsArr (int_t numLA, msgs_t **msgss) |
int_t | Trs2_InitUblock_info (int_t klst, int_t nb, Ublock_info_t *, int_t *usub, Glu_persist_t *, SuperLUStat_t *) |
int | Cmpfunc_R_info (const void *a, const void *b) |
int | Cmpfunc_U_info (const void *a, const void *b) |
int | sort_R_info (Remain_info_t *Remain_info, int n) |
int | sort_U_info (Ublock_info_t *Ublock_info, int n) |
int | sort_R_info_elm (Remain_info_t *Remain_info, int n) |
int | sort_U_info_elm (Ublock_info_t *Ublock_info, int n) |
void | printTRStimer (xtrsTimer_t *xtrsTimer, gridinfo3d_t *grid3d) |
void | initTRStimer (xtrsTimer_t *xtrsTimer, gridinfo_t *grid) |
int_t ** | getTreePerm (int_t *myTreeIdxs, int_t *myZeroTrIdxs, int_t *nodeCount, int_t **nodeList, int_t *perm_c_supno, int_t *iperm_c_supno, gridinfo3d_t *grid3d) |
int_t * | getMyNodeCounts (int_t maxLvl, int_t *myTreeIdxs, int_t *gNodeCount) |
int_t | checkIntVector3d (int_t *vec, int_t len, gridinfo3d_t *grid3d) |
int_t | reduceStat (PhaseType PHASE, SuperLUStat_t *stat, gridinfo3d_t *grid3d) |
int_t | Wait_LSend (int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *s, SCT_t *) |
int_t | Wait_USend (MPI_Request *, gridinfo_t *, SCT_t *) |
int_t | Check_LRecv (MPI_Request *, int *msgcnt) |
int_t | Wait_UDiagBlockSend (MPI_Request *, gridinfo_t *, SCT_t *) |
int_t | Wait_LDiagBlockSend (MPI_Request *, gridinfo_t *, SCT_t *) |
int_t | Wait_UDiagBlock_Recv (MPI_Request *, SCT_t *) |
int_t | Test_UDiagBlock_Recv (MPI_Request *, SCT_t *) |
int_t | Wait_LDiagBlock_Recv (MPI_Request *, SCT_t *) |
int_t | Test_LDiagBlock_Recv (MPI_Request *, SCT_t *) |
int_t | LDiagBlockRecvWait (int_t k, int *factored_U, MPI_Request *, gridinfo_t *) |
int_t | num_full_cols_U_mod (int_t kk, int_t *usub, int_t *xsup, gridinfo_t *grid, int_t *perm_u, int_t *ldu) |
Variables | |
static const int | BC_L =1 |
static const int | RD_L =2 |
static const int | BC_U =3 |
static const int | RD_U =4 |
Definitions which are precision-neutral.
Copyright (c) 2003, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy).
All rights reserved.
The source code is distributed under the BSD license; see the file License.txt in the top-level directory.
-- Distributed SuperLU routine (version 9.0) -- Lawrence Berkeley National Lab, Univ. of California Berkeley. November 1, 2007. Modified: February 20, 2008; October 11, 2014; September 18, 2018 (version 6.0); February 8, 2019 (version 6.1.1); November 12, 2019 (version 6.2.0); October 23, 2020 (version 6.4.0); May 12, 2021 (version 7.0.0); October 5, 2021 (version 7.1.0); October 18, 2021 (version 7.1.1); December 12, 2021 (version 7.2.0); May 22, 2022 (version 8.0.0); July 5, 2022 (version 8.1.0); October 1, 2022 (version 8.1.1); November 12, 2022 (version 8.1.2); November 17, 2023 (version 8.2.1); May 8, 2024 (version 9.0.0).
#define __SUPERLU_ASYNC_TREE |
#define BC_HEADER 2 |
#define BC_HEADER_NEWU 3 |
#define BR_HEADER 3 |
#define CEILING | ( | a, | |
b | |||
) | ( ((a)%(b)) ? ((a)/(b) + 1) : ((a)/(b)) ) |
#define cmax | ( | a, | |
b | |||
) | ((a) > (b) ? (a) : (b)) |
#define COMM_ALL 100 |
#define COMM_COLUMN 101 |
#define COMM_ROW 102 |
#define DEG_TREE 2 |
#define DIM_X 32 |
#define DIM_XA DIM_X |
#define DIM_XB DIM_X |
#define DIM_Y 16 |
#define DIM_YA DIM_Y |
#define DIM_YB DIM_Y |
#define fetch | ( | A, | |
m, | |||
n, | |||
bound | |||
) | offs_d##A[min(n*LD##A+m, bound)] |
#define fma | ( | A, | |
B, | |||
C | |||
) | C += (A*B) |
#define FstBlockC | ( | bnum | ) | ( xsup[bnum] ) |
#define GPU_ACC |
#define GSUM 20 |
#define HANDLE_SIZE 8 |
#define IAM | ( | comm | ) | { int rank; MPI_Comm_rank ( comm, &rank ); rank}; |
#define IFMT " %lld" |
#define ISORT /* NOTE: qsort() has bug on Mac */ |
#define LB_DESCRIPTOR 2 |
#define LBi | ( | bnum, | |
grid | |||
) | ( (bnum)/grid->nprow )/* Global to local block rowwise */ |
#define LBj | ( | bnum, | |
grid | |||
) | ( (bnum)/grid->npcol )/* Global to local block columnwise*/ |
#define LkkDIAG 15 |
#define LkSUB 13 |
#define LkVAL 14 |
#define LOG2 | ( | x | ) | (log10((double) x) / log10(2.0)) |
#define LSUM 23 |
#define LSUM_H 2 /* The header preceding each MOD block. */ |
#define MAGMA_CONST |
#define MAX_SUPER_SIZE 512 /* Sherry: moved from superlu_gpu.cu */ |
#define mpi_int_t MPI_LONG_LONG_INT |
#define MYCOL | ( | iam, | |
grid | |||
) | ( (iam) % grid->npcol ) |
#define MYROW | ( | iam, | |
grid | |||
) | ( (iam) / grid->npcol ) |
#define NBUFFERS 5 |
#define NO_MARKER 3 |
#define NTAGS INT_MAX |
#define PCOL | ( | bnum, | |
grid | |||
) | ( (bnum) % grid->npcol ) |
#define PROW | ( | bnum, | |
grid | |||
) | ( (bnum) % grid->nprow ) |
#define SLU_MPI_TAG | ( | id, | |
num | |||
) | ( (6*(num)+id) % tag_ub ) |
#define SUPER_BLOCK 12 |
#define SUPER_LINEAR 11 |
#define SUPERLU_DIST_EXPORT |
#define SUPERLU_DIST_MAJOR_VERSION 9 |
#define SUPERLU_DIST_MINOR_VERSION 0 |
#define SUPERLU_DIST_PATCH_VERSION 0 |
#define SUPERLU_DIST_RELEASE_DATE "May 8, 2024" |
#define SuperLU_MPI_COMPLEX MPI_C_COMPLEX |
#define SuperLU_MPI_DOUBLE_COMPLEX MPI_C_DOUBLE_COMPLEX |
#define SuperLU_timer_ SuperLU_timer_dist_ |
#define SuperSize | ( | bnum | ) | ( xsup[bnum+1]-xsup[bnum] ) |
#define THR_M ( BLK_M / DIM_X ) |
#define THR_N ( BLK_N / DIM_Y ) |
#define UB_DESCRIPTOR 2 |
#define UB_DESCRIPTOR_NEWU 2 |
#define UB_DESCRIPTOR_NEWUCPP 3 |
#define UjROW 10 |
#define UkSUB 11 |
#define UkVAL 12 |
#define VT_TRACEOFF |
#define VT_TRACEON |
#define WARP_SIZE 32 |
#define Xk 21 |
#define XK_H 2 /* The header preceding each X block. */ |
#define Yk 22 |
typedef int64_t handle_t |
typedef int64_t int_t |
typedef enum treePartStrat treePartStrat |
typedef struct xtrsTimer_t xtrsTimer_t |
enum SupernodeToGridMap_t |
enum treePartStrat |
enum trtype_t |
void allocBcastArray | ( | void ** | array, |
int_t | size, | ||
int | root, | ||
MPI_Comm | comm | ||
) |
Allocates and broadcasts an array in a MPI environment.
This function sends the size from the root process to all other processes in the communicator. If the process is not the root, it receives the size from the root and allocates the array. Then, the function broadcasts the array from the root process to all other processes in the communicator.
array | Pointer to the array to be allocated and broadcast. |
size | The size of the array. |
comm | The MPI communicator. |
root | The root process. |
void allocBcastLargeArray | ( | void ** | array, |
int64_t | size, | ||
int | root, | ||
MPI_Comm | comm | ||
) |
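void applyRowPerm | ( | int_t * | colptr, |
int_t * | rowind, | ||
int_t * | perm_r, | ||
int_t | n | ||
) |
Performs a row permutation operation on a sparse matrix (CSC format) using a user-provided permutation array.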
colptr | The column pointer array of the sparse matrix (CSC format). |
rowind | The row index array of the sparse matrix (CSC format). |
perm_r | The user-provided permutation array for the rows. |
n | The number of columns in the sparse matrix. |
void arrive_at_ublock | ( | int | j, |
int_t * | iukp, | ||
int_t * | rukp, | ||
int * | jb, | ||
int * | ljb, | ||
int * | nsupc, | ||
int_t | iukp0, | ||
int_t | rukp0, | ||
int_t * | usub, | ||
int_t * | perm_u, | ||
int_t * | xsup, | ||
gridinfo_t * | grid | ||
) |
void at_plus_a_dist | ( | const int_t | n, |
const int_t | nz, | ||
int_t * | colptr, | ||
int_t * | rowind, | ||
int_t * | bnz, | ||
int_t ** | b_colptr, | ||
int_t ** | b_rowind | ||
) |
Purpose ======= Form the structure of A'+A. A is an n-by-n matrix in column oriented format represented by (colptr, rowind). The output A'+A is in column oriented format (symmetrically, also row oriented), represented by (b_colptr, b_rowind).
void bcast_tree | ( | void * | buf, |
int | count, | ||
MPI_Datatype | dtype, | ||
int | root, | ||
int | tag, | ||
gridinfo_t * | grid, | ||
int | scope, | ||
int * | recvcnt | ||
) |
Purpose ======= Broadcast an array of *dtype* numbers. The communication pattern is a tree with number of branches equal to NBRANCHES. The process ranks are between 0 and Np-1. The following two pairs of graphs give different ways of viewing the same algorithm. The first pair shows the trees as they should be visualized when examining the algorithm. The second pair are isomorphic graphs of the first, which show the actual pattern of data movement. Note that a tree broadcast with NBRANCHES = 2 is isomorphic with a hypercube broadcast (however, it does not require the number of nodes to be a power of two to work). TREE BROADCAST, NBRANCHES = 2 * TREE BROADCAST, NBRANCHES = 3 root=2 i=4 &______________ * | \ * root=2 i=2 &______ &______ * i=3 &______________________ | \ | \ * | \ \ i=1 &__ &__ &__ &__ * i=1 &______ &______ &__ | \ | \ | \ | \ * | \ \ | \ \ | \ 2 3 4 5 6 7 0 1 * 2 3 4 5 6 7 0 1 ISOMORPHIC GRAPHS OF ABOVE, SHOWN IN MORE FAMILIAR TERMS: 2 2 _________|_________ ___________|____________ / | \ / | | \ 6 4 3 5 0 3 4 / \ | / \ | 0 7 5 6 7 1 | 1 Arguments ========= scope
void C_BcTree_Create_nv | ( | C_Tree * | tree, |
MPI_Comm | comm, | ||
int * | ranks, | ||
int | rank_cnt, | ||
int | msgSize, | ||
char | precision, | ||
int * | needrecv | ||
) |
void C_BcTree_forwardMessageSimple | ( | C_Tree * | tree, |
void * | localBuffer, | ||
int | msgSize | ||
) |
void C_BcTree_Nullify | ( | C_Tree * | tree | ) |
void C_BcTree_waitSendRequest | ( | C_Tree * | tree | ) |
void C_RdTree_Create_nv | ( | C_Tree * | tree, |
MPI_Comm | comm, | ||
int * | ranks, | ||
int | rank_cnt, | ||
int | msgSize, | ||
char | precision, | ||
int * | needrecvrd, | ||
int * | needsendrd | ||
) |
void C_RdTree_forwardMessageSimple | ( | C_Tree * | Tree, |
void * | localBuffer, | ||
int | msgSize | ||
) |
void C_RdTree_Nullify | ( | C_Tree * | tree | ) |
void C_RdTree_waitSendRequest | ( | C_Tree * | Tree | ) |
int_t * calcNumNodes | ( | int_t | maxLvl, |
int_t * | treeHeads, | ||
treeList_t * | treeList | ||
) |
int_t calcTreeWeight | ( | int_t | nsupers, |
int_t * | setree, | ||
treeList_t * | treeList, | ||
int_t * | xsup | ||
) |
int_t Check_LRecv | ( | MPI_Request * | recv_req, |
int * | msgcnt | ||
) |
Check whether repfnz[] == SLU_EMPTY after reset.
Check whether repfnz[] == SLU_EMPTY after reset.
int_t checkIntVector3d | ( | int_t * | vec, |
int_t | len, | ||
gridinfo3d_t * | grid3d | ||
) |
int Cmpfunc_R_info | ( | const void * | a, |
const void * | b | ||
) |
int Cmpfunc_U_info | ( | const void * | a, |
const void * | b | ||
) |
int compare_pair | ( | const void * | a, |
const void * | b | ||
) |
int compareDouble | ( | void * | a, |
void * | b | ||
) |
Compares two doubles for equality.
a | Void pointer to the first double |
b | Void pointer to the second double |
int compareInt | ( | void * | a, |
void * | b | ||
) |
Compares two integers for equality.
a | Void pointer to the first integer |
b | Void pointer to the second integer |
int compareInt_t | ( | void * | a, |
void * | b | ||
) |
Compares two integers for equality.
a | Void pointer to the first integer |
b | Void pointer to the second integer |
sForest_t ** compute_sForests | ( | int_t | nsupers, |
Glu_persist_t * | Glu_persist, | ||
int_t * | etree, | ||
gridinfo3d_t * | grid3d | ||
) |
void countnz_dist | ( | const int_t | n, |
int_t * | xprune, | ||
int_t * | nnzL, | ||
int_t * | nnzU, | ||
Glu_persist_t * | Glu_persist, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Count the total number of nonzeros in factors L and U, and in the symmetrically reduced L.
int_t * create_iperm_c_supno | ( | int_t | nsupers, |
superlu_dist_options_t * | options, | ||
Glu_persist_t * | Glu_persist, | ||
int_t * | etree, | ||
int_t ** | Lrowind_bc_ptr, | ||
int_t ** | Ufstnz_br_ptr, | ||
gridinfo3d_t * | grid3d | ||
) |
SupernodeToGridMap_t * createSuperGridMap | ( | int_t | nsuper, |
int_t | maxLvl, | ||
int_t * | myTreeIdxs, | ||
int_t * | myZeroTrIdxs, | ||
int_t * | gNodeCount, | ||
int_t ** | gNodeLists | ||
) |
int_t * createSupernode2TreeMap | ( | int_t | nsupers, |
int_t | maxLvl, | ||
int_t * | gNodeCount, | ||
int_t ** | gNodeLists | ||
) |
void Destroy_CompCol_Matrix_dist | ( | SuperMatrix * | A | ) |
void Destroy_CompCol_Permuted_dist | ( | SuperMatrix * | A | ) |
A is of type Stype==NCP.
void Destroy_CompRow_Matrix_dist | ( | SuperMatrix * | A | ) |
void Destroy_CompRowLoc_Matrix_dist | ( | SuperMatrix * | A | ) |
void Destroy_SuperMatrix_Store_dist | ( | SuperMatrix * | A | ) |
Deallocate the structure pointing to the actual storage of the matrix.
void Destroy_SuperNode_Matrix_dist | ( | SuperMatrix * | A | ) |
int dist_checkArrayEq | ( | void * | arr, |
int | length, | ||
MPI_Datatype | datatype, | ||
int | src_rank, | ||
int | dest_rank, | ||
MPI_Comm | communicator, | ||
int(*)(void *, void *) | compare | ||
) |
Checks whether arrays at two MPI ranks are identical.
This function is used to check whether the copies of an array held at two different MPI ranks are the same. It uses MPI_Send and MPI_Recv to transfer data between ranks, then compares the arrays.
arr | Void pointer to the array to be compared |
length | The length of the array |
datatype | MPI_Datatype of the array elements |
src_rank | The source rank that has the original array |
dest_rank | The destination rank that has the copied array |
communicator | The MPI_Comm communicator that includes both ranks |
compare | A function pointer to the function used to compare elements. Should take two void pointers and return 0 if they are equal and a non-zero value otherwise. |
void DistPrint | ( | char * | function_name, |
double | value, | ||
char * | Units, | ||
gridinfo_t * | grid | ||
) |
void DistPrint3D | ( | char * | function_name, |
double | value, | ||
char * | Units, | ||
gridinfo3d_t * | grid3d | ||
) |
double dmach_dist | ( | const char * | cmach | ) |
int_t estimate_bigu_size | ( | int_t | nsupers, |
int_t ** | Ufstnz_br_ptr, | ||
Glu_persist_t * | Glu_persist, | ||
gridinfo_t * | grid, | ||
int_t * | perm_u, | ||
int_t * | max_ncols | ||
) |
double estimate_cpu_time | ( | int | m, |
int | n, | ||
int | k | ||
) |
int file_PrintInt32 | ( | FILE * | fp, |
char * | name, | ||
int | len, | ||
int * | x | ||
) |
gEtreeInfo_t fillEtreeInfo | ( | int_t | nsupers, |
int_t * | setree, | ||
treeList_t * | treeList | ||
) |
int64_t fixupL_dist | ( | const int_t | n, |
const int_t * | perm_r, | ||
Glu_persist_t * | Glu_persist, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Fix up the data storage lsub[] for L-subscripts. It removes the subscript sets for structural pruning, and applies permutation to the remaining subscripts. Return value: number of entries in lsub[], which includes the size of the pruned graph, which is interspersed in the supernodal graph in the lsub[] array.
Fix up the data storage lsub for L-subscripts. It removes the subscript sets for structural pruning, and applies permutation to the remaining subscripts.
int free_treelist | ( | int_t | nsuper, |
treeList_t * | treeList | ||
) |
int freeCommRequestsArr | ( | int_t | mxLeafNode, |
commRequests_t ** | comReqss | ||
) |
int freeFactNodelists | ( | factNodelists_t * | fNlists | ) |
int freeFactStat | ( | factStat_t * | factStat | ) |
void gemm_division_cpu_gpu | ( | superlu_dist_options_t * | options, |
int * | num_streams_used, | ||
int * | stream_end_col, | ||
int * | ncpu_blks, | ||
int | nbrow, | ||
int | ldu, | ||
int | nstreams, | ||
int * | full_u_cols, | ||
int | num_blks, | ||
int_t | gemmBufferSize | ||
) |
int genmmd_dist_ | ( | int_t * | neqns, |
int_t * | xadj, | ||
int_t * | a, | ||
int_t * | invp, | ||
int_t * | perm, | ||
int_t * | delta, | ||
int_t * | dhead, | ||
int_t * | qsize, | ||
int_t * | llist, | ||
int_t * | marker, | ||
int_t * | maxint, | ||
int_t * | nofsub | ||
) |
int get_acc_offload | ( | superlu_dist_options_t * | options | ) |
int get_acc_solve | ( | void | ) |
void get_colamd_dist | ( | const int | m, |
const int | n, | ||
const int | nnz, | ||
int_t * | colptr, | ||
int_t * | rowind, | ||
int_t * | perm_c | ||
) |
void get_diag_procs | ( | int_t | n, |
Glu_persist_t * | Glu_persist, | ||
gridinfo_t * | grid, | ||
int_t * | num_diag_procs, | ||
int_t ** | diag_procs, | ||
int_t ** | diag_len | ||
) |
int_t get_gpublas_nb | ( | void | ) |
int_t get_max_buffer_size | ( | void | ) |
int get_mpi_process_per_gpu | ( | void | ) |
int get_new3dsolve | ( | void | ) |
int get_new3dsolvetreecomm | ( | void | ) |
int_t get_num_gpu_streams | ( | void | ) |
void get_perm_c_batch | ( | superlu_dist_options_t * | options, |
int | batchCount, | ||
handle_t * | SparseMatrix_handles, | ||
int ** | CpivPtr | ||
) |
Gets sparsity permutations for a batch of matrices.
[in] | options | solver options |
[in] | batchCount | number of matrices in the batch |
[in] | SparseMatrix_handles | pointers to the matrices in the batch, each pointing to the actual storage in CSC format. On entry, the original matrices; may be overwritten by A1 <- Pr*diag(R)*A*diag(C) from dequil_batch() and dpivot_batch() |
[out] | CpivPtr | pointers to column permutation vectors for each matrix, each of size n |
void get_perm_c_dist | ( | int_t | pnum, |
int_t | ispec, | ||
SuperMatrix * | A, | ||
int_t * | perm_c | ||
) |
Purpose ======= GET_PERM_C_DIST obtains a permutation matrix Pc, by applying the multiple minimum degree ordering code by Joseph Liu to matrix A'*A or A+A', or using approximate minimum degree column ordering by Davis et al. The LU factorization of A*Pc tends to have less fill than the LU factorization of A. Arguments ========= ispec (input) colperm_t Specifies what type of column permutation to use to reduce fill. = NATURAL: natural ordering (i.e., Pc = I) = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A = MMD_ATA: minimum degree ordering on structure of A'*A = METIS_AT_PLUS_A: MeTis on A'+A A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of linear equations is A->nrow. Currently, the type of A can be: Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. In the future, more general A can be handled. perm_c (output) int* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc.
float get_perm_c_parmetis | ( | SuperMatrix * | A, |
int_t * | perm_r, | ||
int_t * | perm_c, | ||
int | nprocs_i, | ||
int | noDomains, | ||
int_t ** | sizes, | ||
int_t ** | fstVtxSep, | ||
gridinfo_t * | grid, | ||
MPI_Comm * | metis_comm | ||
) |
Purpose ======= GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a graph partitioning algorithm to the symmetrized graph A+A'. The multilevel graph partitioning algorithm used is the ParMETIS_V3_NodeND routine available in the parallel graph partitioning package parMETIS. The number of independent sub-domains noDomains computed by this algorithm has to be a power of 2. Hence noDomains is the largest power of 2 that does not exceed nprocs_i, where nprocs_i = nprow * npcol is the number of processors used in SuperLU_DIST. Arguments ========= A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of linear equations is A->nrow. Matrix A is distributed in NRformat_loc format. perm_r (input) int_t* Row permutation vector of size A->nrow, which defines the permutation matrix Pr; perm_r[i] = j means row i of A is in position j in Pr*A. perm_c (output) int_t* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. nprocs_i (input) int* Number of processors the input matrix is distributed on in a block row format. It corresponds to number of processors used in SuperLU_DIST. noDomains (input) int*, must be power of 2 Number of independent domains to be computed by the graph partitioning algorithm. ( noDomains <= nprocs_i ) sizes (output) int_t**, of size 2 * noDomains Returns pointer to an array containing the number of nodes for each sub-domain and each separator. Separators are stored from left to right. Memory for the array is allocated in this routine. fstVtxSep (output) int_t**, of size 2 * noDomains Returns pointer to an array containing first node for each sub-domain and each separator. Memory for the array is allocated in this routine. Return value ============ < 0, number of bytes allocated on return from the symbolic factorization. > 0, number of bytes allocated when out of memory.
int get_thread_per_process | ( | void | ) |
void getata_dist | ( | const int_t | m, |
const int_t | n, | ||
const int_t | nz, | ||
int_t * | colptr, | ||
int_t * | rowind, | ||
int_t * | atanz, | ||
int_t ** | ata_colptr, | ||
int_t ** | ata_rowind | ||
) |
Purpose ======= Form the structure of A'*A. A is an m-by-n matrix in column oriented format represented by (colptr, rowind). The output A'*A is in column oriented format (symmetrically, also row oriented), represented by (ata_colptr, ata_rowind). This routine is modified from GETATA routine by Tim Davis. The complexity of this algorithm is: SUM_{i=1,m} r(i)^2, i.e., the sum of the square of the row counts. Questions ========= o Do I need to withhold the *dense* rows? o How do I know the number of nonzeros in A'*A?
int_t getBigUSize | ( | superlu_dist_options_t * | options, |
int_t | nsupers, | ||
gridinfo_t * | grid, | ||
int_t ** | Lrowind_bc_ptr | ||
) |
int * getBmod3d | ( | int_t | treeId, |
int_t | nlb, | ||
sForest_t * | sforest, | ||
int_t * | xsup, | ||
int_t ** | Ufstnz_br_ptr, | ||
int_t * | supernode2treeMap, | ||
gridinfo_t * | grid | ||
) |
int * getBmod3d_newsolve | ( | int_t | nlb, |
int_t | nsupers, | ||
int * | supernodeMask, | ||
int_t * | xsup, | ||
int_t ** | Ufstnz_br_ptr, | ||
gridinfo_t * | grid | ||
) |
int * getBrecvTree | ( | int_t | nlb, |
sForest_t * | sforest, | ||
int * | bmod, | ||
gridinfo_t * | grid | ||
) |
int * getBrecvTree_newsolve | ( | int_t | nlb, |
int_t | nsupers, | ||
int * | supernodeMask, | ||
int * | bmod, | ||
gridinfo_t * | grid | ||
) |
int_t getCommonAncestorList | ( | int_t | k, |
int_t * | alist, | ||
int_t * | seTree, | ||
treeList_t * | treeList | ||
) |
int_t getCommonAncsCount | ( | int_t | k, |
treeList_t * | treeList | ||
) |
int_t getDescendList | ( | int_t | k, |
int_t * | dlist, | ||
treeList_t * | treeList | ||
) |
int * getfmod_newsolve | ( | int_t | nlb, |
int_t | nsupers, | ||
int * | supernodeMask, | ||
int_t ** | Lrowind_bc_ptr, | ||
int_t ** | Lindval_loc_bc_ptr, | ||
gridinfo_t * | grid | ||
) |
int * getfmodLeaf | ( | int_t | nlb, |
int * | fmod_i | ||
) |
sForest_t ** getForests | ( | int_t | maxLvl, |
int_t | nsupers, | ||
int_t * | setree, | ||
treeList_t * | treeList | ||
) |
int * getfrecv_newsolve | ( | int_t | nsupers, |
int * | supernodeMask, | ||
int_t | nlb, | ||
int * | fmod, | ||
int * | mod_bit, | ||
gridinfo_t * | grid | ||
) |
int * getfrecvLeaf | ( | sForest_t * | sforest, |
int_t | nlb, | ||
int * | fmod, | ||
int * | mod_bit, | ||
gridinfo_t * | grid | ||
) |
sForest_t ** getGreedyLoadBalForests | ( | int_t | maxLvl, |
int_t | nsupers, | ||
int_t * | setree, | ||
treeList_t * | treeList | ||
) |
int_t * getGridTrees | ( | gridinfo3d_t * | grid3d | ) |
int * getLastDepBtree | ( | int_t | nsupers, |
treeList_t * | treeList | ||
) |
treeTopoInfo_t getMyTreeTopoInfo | ( | int_t | nnodes, |
int_t | nsupers, | ||
int_t * | myPerm, | ||
int_t * | setree | ||
) |
int getNbrecvX | ( | sForest_t * | sforest, |
int_t * | Urbs, | ||
gridinfo_t * | grid | ||
) |
int getNbrecvX_newsolve | ( | int_t | nsupers, |
int * | supernodeMask, | ||
int_t * | Urbs, | ||
Ucb_indptr_t ** | Ucb_indptr, | ||
gridinfo_t * | grid | ||
) |
sForest_t ** getNestDissForests | ( | int_t | maxLvl, |
int_t | nsupers, | ||
int_t * | setree, | ||
treeList_t * | treeList | ||
) |
int_t getNfrecvmod_newsolve | ( | int * | nleaf, |
int_t | nsupers, | ||
int * | supernodeMask, | ||
int * | frecv, | ||
int * | fmod, | ||
gridinfo_t * | grid | ||
) |
int_t getNfrecvmodLeaf | ( | int * | nleaf, |
sForest_t * | sforest, | ||
int * | frecv, | ||
int * | fmod, | ||
gridinfo_t * | grid | ||
) |
int getNfrecvx_newsolve | ( | int_t | nsupers, |
int * | supernodeMask, | ||
int_t ** | Lrowind_bc_ptr, | ||
int_t ** | Lindval_loc_bc_ptr, | ||
gridinfo_t * | grid | ||
) |
int getNfrecvxLeaf | ( | sForest_t * | sforest, |
int_t ** | Lrowind_bc_ptr, | ||
gridinfo_t * | grid | ||
) |
int getnGPUStreams | ( | void | ) |
int_t ** getNodeList | ( | int_t | maxLvl, |
int_t * | setree, | ||
int_t * | nnodes, | ||
int_t * | treeHeads, | ||
treeList_t * | treeList | ||
) |
int getNrootUsolveTree | ( | int_t * | nbrecvmod, |
sForest_t * | sforest, | ||
int * | brecv, | ||
int * | bmod, | ||
gridinfo_t * | grid | ||
) |
int getNrootUsolveTree_newsolve | ( | int_t * | nbrecvmod, |
int_t | nsupers, | ||
int * | supernodeMask, | ||
int * | brecv, | ||
int * | bmod, | ||
gridinfo_t * | grid | ||
) |
int getNsupers | ( | int | n, |
Glu_persist_t * | Glu_persist | ||
) |
int_t getNumLookAhead | ( | superlu_dist_options_t * | options | ) |
int getNumThreads | ( | int | iam | ) |
int_t * getPerm_c_supno | ( | int_t | nsupers, |
superlu_dist_options_t * | options, | ||
int_t * | etree, | ||
Glu_persist_t * | Glu_persist, | ||
int_t ** | Lrowind_bc_ptr, | ||
int_t ** | Ufstnz_br_ptr, | ||
gridinfo_t * | grid | ||
) |
int_t * getPerm_c_supno_allgrid | ( | int_t | nsupers, |
superlu_dist_options_t * | options, | ||
int_t * | etree, | ||
Glu_persist_t * | Glu_persist, | ||
int_t ** | Lrowind_bc_ptr, | ||
int_t ** | Ufstnz_br_ptr, | ||
gridinfo3d_t * | grid3d | ||
) |
int_t * getReplicatedTrees | ( | gridinfo3d_t * | grid3d | ) |
void getSCUweight | ( | int_t | nsupers, |
treeList_t * | treeList, | ||
int_t * | xsup, | ||
int_t ** | Lrowind_bc_ptr, | ||
int_t ** | Ufstnz_br_ptr, | ||
gridinfo3d_t * | grid3d | ||
) |
void getSCUweight_allgrid | ( | int_t | nsupers, |
treeList_t * | treeList, | ||
int_t * | xsup, | ||
int_t ** | Lrowind_bc_ptr, | ||
int_t ** | Ufstnz_br_ptr, | ||
gridinfo3d_t * | grid3d | ||
) |
int_t * getSubTreeRoots | ( | int_t | k, |
int_t * | numSubtrees, | ||
treeList_t * | treeList | ||
) |
int_t * getTreeHeads | ( | int_t | maxLvl, |
int_t | nsupers, | ||
treeList_t * | treeList | ||
) |
int_t ** getTreePerm | ( | int_t * | myTreeIdxs, |
int_t * | myZeroTrIdxs, | ||
int_t * | nodeCount, | ||
int_t ** | nodeList, | ||
int_t * | perm_c_supno, | ||
int_t * | iperm_c_supno, | ||
gridinfo3d_t * | grid3d | ||
) |
int_t ** getTreePermForest | ( | int_t * | myTreeIdxs, |
int_t * | myZeroTrIdxs, | ||
sForest_t * | sForests, | ||
int_t * | perm_c_supno, | ||
int_t * | iperm_c_supno, | ||
gridinfo3d_t * | grid3d | ||
) |
int_t ** getTreePermFr | ( | int_t * | myTreeIdxs, |
sForest_t ** | sForests, | ||
gridinfo3d_t * | grid3d | ||
) |
Fills an integer array with a given value.
int_t initCommRequests | ( | commRequests_t * | comReqs, |
gridinfo_t * | grid | ||
) |
commRequests_t ** initCommRequestsArr | ( | int_t | mxLeafNode, |
int_t | ldt, | ||
gridinfo_t * | grid | ||
) |
int_t initFactNodelists | ( | int_t | ldt, |
int_t | num_threads, | ||
int_t | nsupers, | ||
factNodelists_t * | fNlists | ||
) |
int_t initFactStat | ( | int | nsupers, |
factStat_t * | factStat | ||
) |
void initTRStimer | ( | xtrsTimer_t * | xtrsTimer, |
gridinfo_t * | grid | ||
) |
int * int32Calloc_dist | ( | int | n | ) |
int * int32Malloc_dist | ( | int | n | ) |
int_t LDiagBlockRecvWait | ( | int_t | k, |
int * | factored_U, | ||
MPI_Request * | L_diag_blk_recv_req, | ||
gridinfo_t * | grid | ||
) |
void log_memory | ( | int64_t | cur_bytes, |
SuperLUStat_t * | stat | ||
) |
int mc64id_dist | ( | int * | icntl | ) |
int_t num_full_cols_U | ( | int_t | kk, |
int_t ** | Ufstnz_br_ptr, | ||
int_t * | xsup, | ||
gridinfo_t * | grid, | ||
int_t * | perm_u, | ||
int_t * | ldu | ||
) |
int_t num_full_cols_U_mod | ( | int_t | kk, |
int_t * | usub, | ||
int_t * | xsup, | ||
gridinfo_t * | grid, | ||
int_t * | perm_u, | ||
int_t * | ldu | ||
) |
void permCol_SymbolicFact3d | ( | superlu_dist_options_t * | options, |
int | n, | ||
SuperMatrix * | GA, | ||
int_t * | perm_c, | ||
int_t * | etree, | ||
Glu_persist_t * | Glu_persist, | ||
Glu_freeable_t * | Glu_freeable, | ||
SuperLUStat_t * | stat, | ||
superlu_dist_mem_usage_t * | symb_mem_usage, | ||
gridinfo3d_t * | grid3d | ||
) |
This function performs the symbolic factorization on matrix Pc*Pr*A*Pc' and sets up the nonzero data structures for L & U matrices. In the process, the matrix is also ordered and its memory usage information is fetched.
options | The options for the SuperLU distribution. |
n | Dimension of the global matrix A. |
GA | A pointer to the global matrix A. |
perm_c | The column permutation vector. |
etree | The elimination tree of Pc*Pr*A*Pc'. |
Glu_persist | Pointer to the structure which tracks the symbolic factorization information. |
Glu_freeable | Pointer to the structure which tracks the space used to store L/U data structures. |
stat | Information on program execution. |
grid3d | The 3D process grid. |
void print_memorylog | ( | SuperLUStat_t * | stat, |
char * | msg | ||
) |
void print_options_dist | ( | superlu_dist_options_t * | options | ) |
Print the options setting.
void print_panel_seg_dist | ( | int_t | n, |
int_t | w, | ||
int_t | jcol, | ||
int_t | nseg, | ||
int_t * | segrep, | ||
int_t * | repfnz | ||
) |
Diagnostic print of segment info after panel_dfs().
void print_sp_ienv_dist | ( | superlu_dist_options_t * | options | ) |
Print the blocking parameters.
void PrintDouble5 | ( | char * | , |
int_t | , | ||
double * | |||
) |
void printForestWeightCost | ( | sForest_t ** | sForests, |
SCT_t * | SCT, | ||
gridinfo3d_t * | grid3d | ||
) |
void printGPUStats | ( | int | nsupers, |
SuperLUStat_t * | stat, | ||
gridinfo3d_t * | |||
) |
void PrintInt32 | ( | char * | name, |
int | len, | ||
int * | x | ||
) |
void printTRStimer | ( | xtrsTimer_t * | xtrsTimer, |
gridinfo3d_t * | grid3d | ||
) |
void PStatClear | ( | SuperLUStat_t * | stat | ) |
void PStatFree | ( | SuperLUStat_t * | stat | ) |
void PStatInit | ( | SuperLUStat_t * | stat | ) |
void PStatPrint | ( | superlu_dist_options_t * | options, |
SuperLUStat_t * | stat, | ||
gridinfo_t * | grid | ||
) |
int_t psymbfact_LUXpand | ( | int_t | iam, |
int_t | n, | ||
int_t | fstVtxLvl_loc, | ||
int_t | vtxXp, | ||
int_t * | p_next, | ||
int_t | min_new_len, | ||
int_t | mem_type, | ||
int_t | rout_type, | ||
int_t | free_prev_mem, | ||
Pslu_freeable_t * | Pslu_freeable, | ||
Llu_symbfact_t * | Llu_symbfact, | ||
vtcsInfo_symbfact_t * | VInfo, | ||
psymbfact_stat_t * | PS | ||
) |
Expand the data structures for L and U during the factorization. Return value: SUCCES_RET - successful return ERROR_RET - error due to a memory allocation failure
Sherry: this function is used in the upper separator tree above the domains. It does not call 'expand()'
int_t psymbfact_LUXpand_RL | ( | int_t | iam, |
int_t | n, | ||
int_t | vtxXp, | ||
int_t | next, | ||
int_t | len_texp, | ||
int_t | mem_type, | ||
Pslu_freeable_t * | Pslu_freeable, | ||
Llu_symbfact_t * | Llu_symbfact, | ||
vtcsInfo_symbfact_t * | VInfo, | ||
psymbfact_stat_t * | PS | ||
) |
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when run out of space
Sherry: this function calls psymbfact_LUXpandMem().
int_t psymbfact_LUXpandMem | ( | int | iam, |
int_t | n, | ||
int_t | vtxXp, | ||
int_t | next, | ||
int_t | min_new_len, | ||
int | mem_type, | ||
int | rout_type, | ||
int | free_prev_mem, | ||
Pslu_freeable_t * | Pslu_freeable, | ||
Llu_symbfact_t * | Llu_symbfact, | ||
vtcsInfo_symbfact_t * | VInfo, | ||
psymbfact_stat_t * | PS | ||
) |
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when run out of space
Sherry: this function is used inside the domains.
int_t psymbfact_prLUXpand | ( | int_t | iam, |
int_t | min_new_len, | ||
int | mem_type, | ||
Llu_symbfact_t * | Llu_symbfact, | ||
psymbfact_stat_t * | PS | ||
) |
Expand the data structures for L and U pruned during the factorization. Return value: SUCCES_RET - successful return ERROR_RET - error when run out of space
Sherry: this function calls 'expand()' directly.
void pxerr_dist | ( | char * | srname, |
gridinfo_t * | grid, | ||
int_t | info | ||
) |
int_t QuerySpace_dist | ( | int_t | n, |
int_t | lsub_size, | ||
Glu_freeable_t * | Glu_freeable, | ||
superlu_dist_mem_usage_t * | mem_usage | ||
) |
mem_usage consists of the following fields:
int_t reduceStat | ( | PhaseType | PHASE, |
SuperLUStat_t * | stat, | ||
gridinfo3d_t * | grid3d | ||
) |
Reduce the stats from all the 2D grids before printing them out. See the definition of enum PhaseType in superlu_enum_consts.h
void set_default_options_dist | ( | superlu_dist_options_t * | options | ) |
Set the default values for the options argument.
int set_tag_ub | ( | void | ) |
treeList_t * setree2list | ( | int_t | nsuper, |
int_t * | setree | ||
) |
void slu_SCT_free | ( | SCT_t * | SCT | ) |
void slu_SCT_init | ( | SCT_t * | SCT | ) |
void slu_SCT_print | ( | gridinfo_t * | grid, |
SCT_t * | SCT | ||
) |
void slu_SCT_print3D | ( | gridinfo3d_t * | grid3d, |
SCT_t * | SCT | ||
) |
void slu_SCT_printComm3D | ( | gridinfo3d_t * | grid3d, |
SCT_t * | SCT | ||
) |
void slu_SCT_printSummary | ( | gridinfo_t * | grid, |
SCT_t * | SCT | ||
) |
float smach_dist | ( | const char * | cmach | ) |
int sort_R_info | ( | Remain_info_t * | Remain_info, |
int | n | ||
) |
int sort_R_info_elm | ( | Remain_info_t * | Remain_info, |
int | n | ||
) |
int sort_U_info | ( | Ublock_info_t * | Ublock_info, |
int | n | ||
) |
int sort_U_info_elm | ( | Ublock_info_t * | Ublock_info, |
int | n | ||
) |
int sp_coletree_dist | ( | int_t * | acolst, |
int_t * | acolend, | ||
int_t * | arow, | ||
int_t | nr, | ||
int_t | nc, | ||
int_t * | parent | ||
) |
Nonsymmetric elimination tree.
Find the elimination tree for A'*A. This uses something similar to Liu's algorithm. It runs in time O(nz(A)*log n) and does not form A'*A. Input: Sparse matrix A. Numeric values are ignored, so any explicit zeros are treated as nonzero. Output: Integer array of parents representing the elimination tree of the symbolic product A'*A. Each vertex is a column of A, and nc means a root of the elimination forest. John R. Gilbert, Xerox, 10 Dec 1990 Based on code by JRG dated 1987, 1988, and 1990.
void sp_colorder | ( | superlu_dist_options_t * | options, |
SuperMatrix * | A, | ||
int_t * | perm_c, | ||
int_t * | etree, | ||
SuperMatrix * | AC | ||
) |
Purpose ======= sp_colorder() permutes the columns of the original matrix. It performs the following steps: 1. Apply column permutation perm_c[] to A's column pointers to form AC; 2. If options->Fact = DOFACT, then (1) Compute column elimination tree etree[] of AC'AC; (2) Post order etree[] to get a postordered elimination tree etree[], and a postorder permutation post[]; (3) Apply post[] permutation to columns of AC; (4) Overwrite perm_c[] with the product perm_c * post. Arguments ========= options (input) superlu_dist_options_t* Specifies whether or not the elimination tree will be re-used. If options->Fact == DOFACT, this means first time factor A, etree is computed and output. Otherwise, re-factor A, etree is input, unchanged on exit. A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of linear equations is A->nrow. Currently, the type of A can be: Stype = SLU_NC or SLU_NCP; Dtype = SLU_D; Mtype = SLU_GE. In the future, more general A can be handled. perm_c (input/output) int* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. If options->Fact == DOFACT, perm_c is both input and output. On output, it is changed according to a postorder of etree. Otherwise, perm_c is input. etree (input/output) int* Elimination tree of Pc*(A'+A)*Pc', dimension A->ncol. If options->Fact == DOFACT, etree is an output argument, otherwise it is an input argument. Note: etree is a vector of parent pointers for a forest whose vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. AC (output) SuperMatrix* The resulting matrix after applying the column permutation perm_c[] to matrix A. The type of AC can be: Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE.
int sp_ienv_dist | ( | int | ispec, |
superlu_dist_options_t * | options | ||
) |
sp_ienv_dist() is inquired to choose machine-dependent integer parameters for the local environment. See ISPEC for a description of the parameters.
This version provides a set of parameters which should give good,
but not optimal, performance on many of the currently available
computers. Users are encouraged to set the environment variable to change the tuning parameters for their particular machines.
ISPEC (input) int Specifies the parameter to be returned as the value of SP_IENV_DIST.
= 1: the panel size w; a panel consists of w consecutive columns of matrix A in the process of Gaussian elimination. The best value depends on machine's cache characteristics. = 2: the relaxation parameter relax; if the number of nodes (columns) in a subtree of the elimination tree is less than relax, this subtree is considered as one supernode, regardless of their row structures. = 3: the maximum size for a supernode, which must be greater than or equal to relaxation parameter (see case 2); = 4: the minimum row dimension for 2-D blocking to be used; = 5: the minimum column dimension for 2-D blocking to be used; = 6: the estimated fill factor for the adjacency structures of L and U, compared with A; = 7: the minimum value of the product M*N*K for a GEMM call worth being offloaded to accelerator (e.g., GPU, Xeon Phi). = 8: the maximum buffer size on GPU that can hold the "dC" matrix in the GEMM call for the Schur complement update. If this is too small, the Schur complement update will be done in multiple partitions, and it may be slower. = 9: number of GPU streams = 10: whether to offload computations to GPU or not = 11: whether to offload triangular solve to GPU or not
options (input) superlu_dist_options_t* The structure defines the input parameters to control how the LU decomposition and the solves are performed.
(SP_IENV_DIST) (output) int >= 0: the value of the parameter specified by ISPEC
< 0: if SP_IENV_DIST = -k, the k-th argument had an illegal value.
sp_ienv_dist() is inquired to choose machine-dependent integer parameters for the local environment. See ISPEC for a description of the parameters.
This version provides a set of parameters which should give good,
but not optimal, performance on many of the currently available
computers. Users are encouraged to set the environment variable to change the tuning parameters for their particular machines.
ISPEC (input) int Specifies the parameter to be returned as the value of SP_IENV_DIST.
= 1: the panel size w; a panel consists of w consecutive columns of matrix A in the process of Gaussian elimination. The best value depends on machine's cache characteristics. = 2: the relaxation parameter relax; if the number of nodes (columns) in a subtree of the elimination tree is less than relax, this subtree is considered as one supernode, regardless of their row structures. = 3: the maximum size for a supernode, which must be greater than or equal to relaxation parameter (see case 2); = 4: the minimum row dimension for 2-D blocking to be used; = 5: the minimum column dimension for 2-D blocking to be used; = 6: the estimated fill factor for the adjacency structures of L and U, compared with A; = 7: the minimum value of the product M*N*K for a GEMM call worth being offloaded to accelerator (e.g., GPU, Xeon Phi). = 8: the maximum buffer size on GPU that can hold the "dC" matrix in the GEMM call for the Schur complement update. If this is too small, the Schur complement update will be done in multiple partitions, and it may be slower. = 9: number of GPU streams = 10: whether to offload work to GPU or not
options (input) superlu_dist_options_t* The structure defines the input parameters to control how the LU decomposition and the solves are performed.
(SP_IENV_DIST) (output) int >= 0: the value of the parameter specified by ISPEC
< 0: if SP_IENV_DIST = -k, the k-th argument had an illegal value.
Symmetric elimination tree.
p = spsymetree (A); Find the elimination tree for symmetric matrix A. This uses Liu's algorithm, and runs in time O(nz*log n). Input: Square sparse matrix A. No check is made for symmetry; elements below and on the diagonal are ignored. Numeric values are ignored, so any explicit zeros are treated as nonzero. Output: Integer array of parents representing the etree, with n meaning a root of the elimination forest. Note: This routine uses only the upper triangle, while sparse Cholesky (as in spchol.c) uses only the lower. Matlab's dense Cholesky uses only the upper. This routine could be modified to use the lower triangle either by transposing the matrix or by traversing it by rows with auxiliary pointer and link arrays. John R. Gilbert, Xerox, 10 Dec 1990 Based on code by JRG dated 1987, 1988, and 1990. Modified by X.S. Li, November 1999.
int_t static_partition | ( | struct superlu_pair * | work_load, |
int_t | nwl, | ||
int_t * | partition, | ||
int_t | ldp, | ||
int_t * | sums, | ||
int_t * | counts, | ||
int | nprocs | ||
) |
void superlu_abort_and_exit_dist | ( | char * | msg | ) |
int superlu_dist_GetVersionNumber | ( | int * | major, |
int * | minor, | ||
int * | bugfix | ||
) |
void superlu_free_dist | ( | void * | addr | ) |
void superlu_gridexit | ( | gridinfo_t * | grid | ) |
void superlu_gridexit3d | ( | gridinfo3d_t * | grid | ) |
void superlu_gridinit | ( | MPI_Comm | Bcomm, |
int | nprow, | ||
int | npcol, | ||
gridinfo_t * | grid | ||
) |
All processes in the MPI communicator must call this routine.
On output, if a process is not in the SuperLU group, the following values are assigned to it: grid->comm = MPI_COMM_NULL grid->iam = -1
void superlu_gridinit3d | ( | MPI_Comm | Bcomm, |
int | nprow, | ||
int | npcol, | ||
int | npdep, | ||
gridinfo3d_t * | grid | ||
) |
All processes in the MPI communicator must call this routine.
void superlu_gridmap | ( | MPI_Comm | Bcomm, |
int | nprow, | ||
int | npcol, | ||
int | usermap[], | ||
int | ldumap, | ||
gridinfo_t * | grid | ||
) |
All processes in the MPI communicator must call this routine.
On output, if a process is not in the SuperLU group, the following values are assigned to it: grid->comm = MPI_COMM_NULL grid->iam = -1
void superlu_gridmap3d | ( | MPI_Comm | Bcomm, |
int | nprow, | ||
int | npcol, | ||
int | npdep, | ||
int | usermap[], | ||
gridinfo3d_t * | grid | ||
) |
All processes in the MPI communicator must call this routine. On output, if a process is not in the SuperLU group, the following values are assigned to it: grid->comm = MPI_COMM_NULL grid->iam = -1.
void * superlu_malloc_dist | ( | size_t | size | ) |
Returns Supernodal Elimination Tree
nsuper | Number of Supernodes |
etree | Scalar elimination tree |
supno | Vertex to supernode mapping |
xsup | Supernodal boundaries |
int_t symbfact | ( | superlu_dist_options_t * | options, |
int | pnum, | ||
SuperMatrix * | A, | ||
int_t * | perm_c, | ||
int_t * | etree, | ||
Glu_persist_t * | Glu_persist, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Purpose ======= symbfact() performs a symbolic factorization on matrix A and sets up the nonzero data structures which are suitable for supernodal Gaussian elimination with no pivoting (GENP). This routine features: o depth-first search (DFS) o supernodes o symmetric structure pruning Return value ============ < 0, number of bytes needed for LSUB. = 0, matrix dimension is 1. > 0, number of bytes allocated when out of memory.
float symbfact_dist | ( | superlu_dist_options_t * | options, |
int | nprocs_num, | ||
int | nprocs_symb, | ||
SuperMatrix * | A, | ||
int_t * | perm_c, | ||
int_t * | perm_r, | ||
int_t * | sizes, | ||
int_t * | fstVtxSep, | ||
Pslu_freeable_t * | Pslu_freeable, | ||
MPI_Comm * | num_comm, | ||
MPI_Comm * | symb_comm, | ||
superlu_dist_mem_usage_t * | symb_mem_usage | ||
) |
Purpose ======= symbfact_dist() performs symbolic factorization of matrix A suitable for performing the supernodal Gaussian elimination with no pivoting (GENP). This routine computes the structure of one column of L and one row of U at a time. It uses: o distributed input matrix o supernodes o symmetric structure pruning Arguments ========= nprocs_num (input) int Number of processors SuperLU_DIST is executed on, and the input matrix is distributed on. nprocs_symb (input) int Number of processors on which the symbolic factorization is performed. It is equal to the number of independent domains identified in the graph partitioning algorithm executed previously and has to be a power of 2. It corresponds to the number of leaves in the separator tree. A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Matrix A is distributed in NRformat_loc format. Matrix A is not yet permuted by perm_c. perm_c (input) int_t* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. perm_r (input) int_t* Row permutation vector of size A->nrow, which defines the permutation matrix Pr; perm_r[i] = j means row i of A is in position j in Pr*A. sizes (input) int_t* Contains the number of vertices in each separator. fstVtxSep (input) int_t* Contains first vertex for each separator. Pslu_freeable (output) Pslu_freeable_t* Returns the local L and U structure, and global to local information on the indexing of the vertices. Contains all the information necessary for performing the data distribution towards the numeric factorization. num_comm (input) MPI_Comm* Communicator for numerical factorization symb_comm (input) MPI_Comm* Communicator for symbolic factorization symb_mem_usage (input) superlu_dist_mem_usage_t * Statistics on memory usage. Return value ============ < 0, number of bytes allocated on return from the symbolic factorization.
> 0, number of bytes allocated when out of memory. Sketch of the algorithm ======================= Distribute the vertices on the processors using a subtree to subcube algorithm. Redistribute the structure of the input matrix A according to the subtree to subcube computed previously for the symbolic factorization routine. This implies in particular a distribution from nprocs_num processors to nprocs_symb processors. Perform symbolic factorization guided by the separator tree provided by a graph partitioning algorithm. The symbolic factorization uses a combined left-looking, right-looking approach.
Purpose ======= symbfact_dist() performs symbolic factorization of matrix A suitable for performing the supernodal Gaussian elimination with no pivoting (GENP). This routine computes the structure of one column of L and one row of U at a time. It uses: o distributed input matrix o supernodes o symmetric structure pruning Arguments ========= nprocs_num (input) int Number of processors SuperLU_DIST is executed on, and the input matrix is distributed on. nprocs_symb (input) int Number of processors on which the symbolic factorization is performed. It is equal to the number of independent domains identified in the graph partitioning algorithm executed previously and has to be a power of 2. It corresponds to the number of leaves in the separator tree. A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Matrix A is distributed in NRformat_loc format. Matrix A is not yet permuted by perm_c. perm_c (input) int_t* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. perm_r (input) int_t* Row permutation vector of size A->nrow, which defines the permutation matrix Pr; perm_r[i] = j means row i of A is in position j in Pr*A. sizes (input) int_t* Contains the number of vertices in each separator. fstVtxSep (input) int_t* Contains first vertex for each separator. Pslu_freeable (output) Pslu_freeable_t* Returns the local L and U structure, and global to local information on the indexing of the vertices. Contains all the information necessary for performing the data distribution towards the numeric factorization. num_comm (input) MPI_Comm* Communicator for numerical factorization symb_comm (input) MPI_Comm* Communicator for symbolic factorization symb_mem_usage (input) superlu_dist_mem_usage_t * Statistics on memory usage. Return value ============ < 0, number of bytes allocated on return from the symbolic factorization.
> 0, number of bytes allocated when out of memory. Sketch of the algorithm ======================= Distribute the vertices on the processors using a subtree to subcube algorithm. Redistribute the structure of the input matrix A according to the subtree to subcube computed previously for the symbolic factorization routine. This implies in particular a distribution from nprocs_num processors to nprocs_symb processors. Perform symbolic factorization guided by the separator tree provided by a graph partitioning algorithm. The symbolic factorization uses a combined left-looking, right-looking approach.
Purpose ======= symbfact_dist() performs symbolic factorization of matrix A suitable for performing the supernodal Gaussian elimination with no pivoting (GENP). This routine computes the structure of one column of L and one row of U at a time. It uses: o distributed input matrix o supernodes o symmetric structure pruning Arguments ========= nprocs_num (input) int Number of processors SuperLU_DIST is executed on, and the input matrix is distributed on. nprocs_symb (input) int Number of processors on which the symbolic factorization is performed. It is equal to the number of independent domains identified in the graph partitioning algorithm executed previously and has to be a power of 2. It corresponds to the number of leaves in the separator tree. A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Matrix A is distributed in NRformat_loc format. Matrix A is not yet permuted by perm_c. perm_c (input) int_t* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. perm_r (input) int_t* Row permutation vector of size A->nrow, which defines the permutation matrix Pr; perm_r[i] = j means row i of A is in position j in Pr*A. sizes (input) int_t* Contains the number of vertices in each separator. fstVtxSep (input) int_t* Contains first vertex for each separator. Pslu_freeable (output) Pslu_freeable_t* Returns the local L and U structure, and global to local information on the indexing of the vertices. Contains all the information necessary for performing the data distribution towards the numeric factorization. num_comm (input) MPI_Comm* Communicator for numerical factorization symb_comm (input) MPI_Comm* Communicator for symbolic factorization symb_mem_usage (input) superlu_dist_mem_usage_t * Statistics on memory usage. Return value ============ < 0, number of bytes allocated on return from the symbolic factorization.
> 0, number of bytes allocated when out of memory. Sketch of the algorithm ======================= Distribute the vertices on the processors using a subtree to subcube algorithm. Redistribute the structure of the input matrix A according to the subtree to subcube computed previously for the symbolic factorization routine. This implies in particular a distribution from nprocs_num processors to nprocs_symb processors. Perform symbolic factorization guided by the separator tree provided by a graph partitioning algorithm. The symbolic factorization uses a combined left-looking, right-looking approach.
int symbfact_SubFree | ( | Glu_freeable_t * | Glu_freeable | ) |
Deallocate storage of the data structures common to symbolic factorization routines.
int_t symbfact_SubInit | ( | superlu_dist_options_t * | options, |
fact_t | fact, | ||
void * | work, | ||
int_t | lwork, | ||
int_t | m, | ||
int_t | n, | ||
int_t | annz, | ||
Glu_persist_t * | Glu_persist, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Allocate storage for the data structures common to symbolic factorization routines. For those of unpredictable size, make a guess as FILL * nnz(A). Return value: If lwork = -1, return the estimated amount of space required, plus n; otherwise, return the amount of space actually allocated when memory allocation failure occurred.
int_t symbfact_SubXpand | ( | int_t | n, |
int_t | jcol, | ||
int_t | next, | ||
MemType | mem_type, | ||
int_t * | maxlen, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when running out of space
void treeImbalance3D | ( | gridinfo3d_t * | grid3d, |
SCT_t * | SCT | ||
) |
int_t Trs2_InitUblock_info | ( | int_t | klst, |
int_t | nb, | ||
Ublock_info_t * | Ublock_info, | ||
int_t * | usub, | ||
Glu_persist_t * | Glu_persist, | ||
SuperLUStat_t * | stat | ||
) |
int_t Wait_LDiagBlockSend | ( | MPI_Request * | L_diag_blk_send_req, |
gridinfo_t * | grid, | ||
SCT_t * | SCT | ||
) |
int_t Wait_LSend | ( | int_t | k, |
gridinfo_t * | grid, | ||
int ** | ToSendR, | ||
MPI_Request * | s, | ||
SCT_t * | SCT | ||
) |
int Wait_LUDiagSend | ( | int_t | k, |
MPI_Request * | U_diag_blk_send_req, | ||
MPI_Request * | L_diag_blk_send_req, | ||
gridinfo_t * | grid, | ||
SCT_t * | SCT | ||
) |
int_t Wait_UDiagBlockSend | ( | MPI_Request * | U_diag_blk_send_req, |
gridinfo_t * | grid, | ||
SCT_t * | SCT | ||
) |
int_t Wait_USend | ( | MPI_Request * | send_req, |
gridinfo_t * | grid, | ||
SCT_t * | SCT | ||
) |
int xerr_dist | ( | char * | srname, |
int * | info | ||
) |
int_t zAllocBcast | ( | int_t | size, |
void ** | ptr, | ||
gridinfo3d_t * | grid3d | ||
) |
int_t zAllocBcast_gridID | ( | int_t | size, |
void ** | ptr, | ||
int_t | gridID, | ||
gridinfo3d_t * | grid3d | ||
) |
|
static |
|
static |
|
static |
|
static |