SuperLU Distributed 8.2.1
Distributed memory sparse direct solver
|
Definitions which are precision-neutral. More...
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <stdint.h>
#include "superlu_dist_config.h"
#include "superlu_FortranCInterface.h"
#include "superlu_FCnames.h"
#include "superlu_enum_consts.h"
#include "supermatrix.h"
#include "util_dist.h"
#include "psymbfact.h"
Go to the source code of this file.
Classes | |
struct | superlu_scope_t |
struct | gridinfo_t |
struct | gridinfo3d_t |
struct | Glu_persist_t |
struct | Glu_freeable_t |
struct | pxgstrs_comm_t |
struct | superlu_dist_options_t |
struct | superlu_dist_mem_usage_t |
struct | Ucb_indptr_t |
struct | Ublock_info_t |
struct | Remain_info_t |
struct | etree_node |
struct | superlu_pair |
struct | uPanelInfo_t |
struct | lPanelInfo_t |
struct | packLUInfo_t |
struct | HyP_t |
struct | local_l_blk_info_t |
struct | local_u_blk_info_t |
struct | perm_array_t |
struct | factStat_t |
struct | d2Hreduce_t |
struct | treeList_t |
struct | treeTopoInfo_t |
struct | gEtreeInfo_t |
struct | sForest_t |
struct | commRequests_t |
struct | factNodelists_t |
struct | msgs_t |
struct | xtrsTimer_t |
struct | C_Tree |
Macros | |
#define | SUPERLU_DIST_MAJOR_VERSION 8 |
#define | SUPERLU_DIST_MINOR_VERSION 2 |
#define | SUPERLU_DIST_PATCH_VERSION 1 |
#define | SUPERLU_DIST_RELEASE_DATE "November 17, 2023" |
#define | mpi_int_t MPI_LONG_LONG_INT |
#define | IFMT "%lld" |
#define | SuperLU_MPI_COMPLEX MPI_C_COMPLEX |
#define | SuperLU_MPI_DOUBLE_COMPLEX MPI_C_DOUBLE_COMPLEX |
#define | MAX_SUPER_SIZE 512 /* Sherry: moved from superlu_gpu.cu */ |
#define | ISORT /* NOTE: qsort() has bug on Mac */ |
#define | BC_HEADER 2 |
#define | LB_DESCRIPTOR 2 |
#define | BR_HEADER 3 |
#define | UB_DESCRIPTOR 2 |
#define | BC_HEADER_NEWU 3 |
#define | UB_DESCRIPTOR_NEWU 2 |
#define | NBUFFERS 5 |
#define | NTAGS INT_MAX |
#define | UjROW 10 |
#define | UkSUB 11 |
#define | UkVAL 12 |
#define | LkSUB 13 |
#define | LkVAL 14 |
#define | LkkDIAG 15 |
#define | XK_H 2 /* The header preceding each X block. */ |
#define | LSUM_H 2 /* The header preceding each MOD block. */ |
#define | GSUM 20 |
#define | Xk 21 |
#define | Yk 22 |
#define | LSUM 23 |
#define | COMM_ALL 100 |
#define | COMM_COLUMN 101 |
#define | COMM_ROW 102 |
#define | SUPER_LINEAR 11 |
#define | SUPER_BLOCK 12 |
#define | NO_MARKER 3 |
#define | IAM(comm) { int rank; MPI_Comm_rank ( comm, &rank ); rank}; |
#define | MYROW(iam, grid) ( (iam) / grid->npcol ) |
#define | MYCOL(iam, grid) ( (iam) % grid->npcol ) |
#define | BlockNum(i) ( supno[i] ) |
#define | FstBlockC(bnum) ( xsup[bnum] ) |
#define | SuperSize(bnum) ( xsup[bnum+1]-xsup[bnum] ) |
#define | LBi(bnum, grid) ( (bnum)/grid->nprow )/* Global to local block rowwise */ |
#define | LBj(bnum, grid) ( (bnum)/grid->npcol )/* Global to local block columnwise*/ |
#define | PROW(bnum, grid) ( (bnum) % grid->nprow ) |
#define | PCOL(bnum, grid) ( (bnum) % grid->npcol ) |
#define | PNUM(i, j, grid) ( (i)*grid->npcol + j ) /* Process number at coord(i,j) */ |
#define | CEILING(a, b) ( ((a)%(b)) ? ((a)/(b) + 1) : ((a)/(b)) ) |
#define | RHS_ITERATE(i) for (i = 0; i < nrhs; ++i) |
#define | X_BLK(i) ilsum[i] * nrhs + (i+1) * XK_H |
#define | LSUM_BLK(i) ilsum[i] * nrhs + (i+1) * LSUM_H |
#define | SuperLU_timer_ SuperLU_timer_dist_ |
#define | LOG2(x) (log10((double) x) / log10(2.0)) |
#define | VT_TRACEON |
#define | VT_TRACEOFF |
#define | SUPERLU_DIST_EXPORT |
#define | MAGMA_CONST |
#define | DIM_X 16 |
#define | DIM_Y 16 |
#define | BLK_M DIM_X*4 |
#define | BLK_N DIM_Y*4 |
#define | BLK_K 2048/(BLK_M) |
#define | DIM_XA DIM_X |
#define | DIM_YA DIM_Y |
#define | DIM_XB DIM_X |
#define | DIM_YB DIM_Y |
#define | NWARP DIM_X*DIM_Y/32 |
#define | THR_M ( BLK_M / DIM_X ) |
#define | THR_N ( BLK_N / DIM_Y ) |
#define | fetch(A, m, n, bound) offs_d##A[min(n*LD##A+m, bound)] |
#define | fma(A, B, C) C += (A*B) |
#define | cmax(a, b) ((a) > (b) ? (a) : (b)) |
#define | SLU_MPI_TAG(id, num) ( (6*(num)+id) % tag_ub ) |
#define | __SUPERLU_ASYNC_TREE |
#define | DEG_TREE 2 |
Typedefs | |
typedef int64_t | int_t |
typedef enum treePartStrat | treePartStrat |
typedef struct xtrsTimer_t | xtrsTimer_t |
Enumerations | |
enum | treePartStrat { ND , GD } |
Variables | |
static const int | BC_L =1 |
static const int | RD_L =2 |
static const int | BC_U =3 |
static const int | RD_U =4 |
Definitions which are precision-neutral.
Copyright (c) 2003, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from U.S. Dept. of Energy)
All rights reserved.
The source code is distributed under BSD license, see the file License.txt at the top-level directory.
-- Distributed SuperLU routine (version 8.1.2) -- Lawrence Berkeley National Lab, Univ. of California Berkeley. November 1, 2007 Modified: February 20, 2008 October 11, 2014 September 18, 2018 version 6.0 February 8, 2019 version 6.1.1 November 12, 2019 version 6.2.0 October 23, 2020 version 6.4.0 May 12, 2021 version 7.0.0 October 5, 2021 version 7.1.0 October 18, 2021 version 7.1.1 December 12, 2021 version 7.2.0 May 22, 2022 version 8.0.0 July 5, 2022 version 8.1.0 October 1, 2022 version 8.1.1 November 12, 2022 version 8.1.2 November 17, 2023 version 8.2.1
#define __SUPERLU_ASYNC_TREE |
#define BC_HEADER 2 |
#define BC_HEADER_NEWU 3 |
#define BLK_K 2048/(BLK_M) |
#define BLK_M DIM_X*4 |
#define BLK_N DIM_Y*4 |
#define BR_HEADER 3 |
#define CEILING | ( | a, | |
b | |||
) | ( ((a)%(b)) ? ((a)/(b) + 1) : ((a)/(b)) ) |
#define cmax | ( | a, | |
b | |||
) | ((a) > (b) ? (a) : (b)) |
#define COMM_ALL 100 |
#define COMM_COLUMN 101 |
#define COMM_ROW 102 |
#define DEG_TREE 2 |
#define DIM_X 16 |
#define DIM_XA DIM_X |
#define DIM_XB DIM_X |
#define DIM_Y 16 |
#define DIM_YA DIM_Y |
#define DIM_YB DIM_Y |
#define fetch | ( | A, | |
m, | |||
n, | |||
bound | |||
) | offs_d##A[min(n*LD##A+m, bound)] |
#define fma | ( | A, | |
B, | |||
C | |||
) | C += (A*B) |
#define FstBlockC | ( | bnum | ) | ( xsup[bnum] ) |
#define GSUM 20 |
#define IAM | ( | comm | ) | { int rank; MPI_Comm_rank ( comm, &rank ); rank}; |
#define IFMT "%lld" |
#define ISORT /* NOTE: qsort() has bug on Mac */ |
#define LB_DESCRIPTOR 2 |
#define LBi | ( | bnum, | |
grid | |||
) | ( (bnum)/grid->nprow )/* Global to local block rowwise */ |
#define LBj | ( | bnum, | |
grid | |||
) | ( (bnum)/grid->npcol )/* Global to local block columnwise*/ |
#define LkkDIAG 15 |
#define LkSUB 13 |
#define LkVAL 14 |
#define LOG2 | ( | x | ) | (log10((double) x) / log10(2.0)) |
#define LSUM 23 |
#define LSUM_H 2 /* The header preceding each MOD block. */ |
#define MAGMA_CONST |
#define MAX_SUPER_SIZE 512 /* Sherry: moved from superlu_gpu.cu */ |
#define mpi_int_t MPI_LONG_LONG_INT |
#define MYCOL | ( | iam, | |
grid | |||
) | ( (iam) % grid->npcol ) |
#define MYROW | ( | iam, | |
grid | |||
) | ( (iam) / grid->npcol ) |
#define NBUFFERS 5 |
#define NO_MARKER 3 |
#define NTAGS INT_MAX |
#define PCOL | ( | bnum, | |
grid | |||
) | ( (bnum) % grid->npcol ) |
#define PROW | ( | bnum, | |
grid | |||
) | ( (bnum) % grid->nprow ) |
#define SLU_MPI_TAG | ( | id, | |
num | |||
) | ( (6*(num)+id) % tag_ub ) |
#define SUPER_BLOCK 12 |
#define SUPER_LINEAR 11 |
#define SUPERLU_DIST_EXPORT |
#define SUPERLU_DIST_MAJOR_VERSION 8 |
#define SUPERLU_DIST_MINOR_VERSION 2 |
#define SUPERLU_DIST_PATCH_VERSION 1 |
#define SUPERLU_DIST_RELEASE_DATE "November 17, 2023" |
#define SuperLU_MPI_COMPLEX MPI_C_COMPLEX |
#define SuperLU_MPI_DOUBLE_COMPLEX MPI_C_DOUBLE_COMPLEX |
#define SuperLU_timer_ | ( | void | ) | SuperLU_timer_dist_ |
#define SuperSize | ( | bnum | ) | ( xsup[bnum+1]-xsup[bnum] ) |
#define UB_DESCRIPTOR 2 |
#define UB_DESCRIPTOR_NEWU 2 |
#define UjROW 10 |
#define UkSUB 11 |
#define UkVAL 12 |
#define VT_TRACEOFF |
#define VT_TRACEON |
#define Xk 21 |
#define XK_H 2 /* The header preceding each X block. */ |
#define Yk 22 |
typedef int64_t int_t |
typedef enum treePartStrat treePartStrat |
typedef struct xtrsTimer_t xtrsTimer_t |
enum treePartStrat |
void arrive_at_ublock | ( | int_t | j, |
int_t * | iukp, | ||
int_t * | rukp, | ||
int_t * | jb, | ||
int_t * | ljb, | ||
int_t * | nsupc, | ||
int_t | iukp0, | ||
int_t | rukp0, | ||
int_t * | usub, | ||
int_t * | perm_u, | ||
int_t * | xsup, | ||
gridinfo_t * | grid | ||
) |
void at_plus_a_dist | ( | const int_t | n, |
const int_t | nz, | ||
int_t * | colptr, | ||
int_t * | rowind, | ||
int_t * | bnz, | ||
int_t ** | b_colptr, | ||
int_t ** | b_rowind | ||
) |
Purpose ======= Form the structure of A'+A. A is an n-by-n matrix in column oriented format represented by (colptr, rowind). The output A'+A is in column oriented format (symmetrically, also row oriented), represented by (b_colptr, b_rowind).
void bcast_tree | ( | void * | buf, |
int | count, | ||
MPI_Datatype | dtype, | ||
int | root, | ||
int | tag, | ||
gridinfo_t * | grid, | ||
int | scope, | ||
int * | recvcnt | ||
) |
Purpose ======= Broadcast an array of *dtype* numbers. The communication pattern is a tree with number of branches equal to NBRANCHES. The process ranks are between 0 and Np-1. The following two pairs of graphs give different ways of viewing the same algorithm. The first pair shows the trees as they should be visualized when examining the algorithm. The second pair are isomorphic graphs of the first, which show the actual pattern of data movement. Note that a tree broadcast with NBRANCHES = 2 is isomorphic with a hypercube broadcast (however, it does not require the nodes be a power of two to work). TREE BROADCAST, NBRANCHES = 2 * TREE BROADCAST, NBRANCHES = 3 root=2 i=4 &______________ * | \ * root=2 i=2 &______ &______ * i=3 &______________________ | \ | \ * | \ \ i=1 &__ &__ &__ &__ * i=1 &______ &______ &__ | \ | \ | \ | \ * | \ \ | \ \ | \ 2 3 4 5 6 7 0 1 * 2 3 4 5 6 7 0 1 ISOMORPHIC GRAPHS OF ABOVE, SHOWN IN MORE FAMILIAR TERMS: 2 2 _________|_________ ___________|____________ / | \ / | | \ 6 4 3 5 0 3 4 / \ | / \ | 0 7 5 6 7 1 | 1 Arguments ========= scope
void C_BcTree_Create | ( | C_Tree * | tree, |
MPI_Comm | comm, | ||
int * | ranks, | ||
int | rank_cnt, | ||
int | msgSize, | ||
char | precision | ||
) |
void C_BcTree_forwardMessageSimple | ( | C_Tree * | tree, |
void * | localBuffer, | ||
int | msgSize | ||
) |
void C_BcTree_Nullify | ( | C_Tree * | tree | ) |
void C_BcTree_waitSendRequest | ( | C_Tree * | tree | ) |
void C_RdTree_Create | ( | C_Tree * | tree, |
MPI_Comm | comm, | ||
int * | ranks, | ||
int | rank_cnt, | ||
int | msgSize, | ||
char | precision | ||
) |
void C_RdTree_forwardMessageSimple | ( | C_Tree * | Tree, |
void * | localBuffer, | ||
int | msgSize | ||
) |
void C_RdTree_Nullify | ( | C_Tree * | tree | ) |
void C_RdTree_waitSendRequest | ( | C_Tree * | Tree | ) |
int_t * calcNumNodes | ( | int_t | maxLvl, |
int_t * | treeHeads, | ||
treeList_t * | treeList | ||
) |
int_t calcTreeWeight | ( | int_t | nsupers, |
int_t * | setree, | ||
treeList_t * | treeList, | ||
int_t * | xsup | ||
) |
int_t Check_LRecv | ( | MPI_Request * | recv_req, |
int * | msgcnt | ||
) |
Check whether repfnz[] == EMPTY after reset.
Check whether repfnz[] == EMPTY after reset.
int_t checkIntVector3d | ( | int_t * | vec, |
int_t | len, | ||
gridinfo3d_t * | grid3d | ||
) |
int Cmpfunc_R_info | ( | const void * | a, |
const void * | b | ||
) |
int Cmpfunc_U_info | ( | const void * | a, |
const void * | b | ||
) |
int compare_pair | ( | const void * | a, |
const void * | b | ||
) |
void countnz_dist | ( | const int_t | n, |
int_t * | xprune, | ||
int_t * | nnzL, | ||
int_t * | nnzU, | ||
Glu_persist_t * | Glu_persist, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Count the total number of nonzeros in factors L and U, and in the symmetrically reduced L.
void Destroy_CompCol_Matrix_dist | ( | SuperMatrix * | A | ) |
void Destroy_CompCol_Permuted_dist | ( | SuperMatrix * | A | ) |
A is of type Stype==NCP.
void Destroy_CompRow_Matrix_dist | ( | SuperMatrix * | A | ) |
void Destroy_CompRowLoc_Matrix_dist | ( | SuperMatrix * | A | ) |
void Destroy_SuperMatrix_Store_dist | ( | SuperMatrix * | A | ) |
Deallocate the structure pointing to the actual storage of the matrix.
void Destroy_SuperNode_Matrix_dist | ( | SuperMatrix * | A | ) |
void DistPrint | ( | char * | function_name, |
double | value, | ||
char * | Units, | ||
gridinfo_t * | grid | ||
) |
void DistPrint3D | ( | char * | function_name, |
double | value, | ||
char * | Units, | ||
gridinfo3d_t * | grid3d | ||
) |
double dmach_dist | ( | char * | cmach | ) |
int_t estimate_bigu_size | ( | int_t | nsupers, |
int_t ** | Ufstnz_br_ptr, | ||
Glu_persist_t * | Glu_persist, | ||
gridinfo_t * | grid, | ||
int_t * | perm_u, | ||
int_t * | max_ncols | ||
) |
double estimate_cpu_time | ( | int | m, |
int | n, | ||
int | k | ||
) |
int file_PrintInt32 | ( | FILE * | fp, |
char * | name, | ||
int | len, | ||
int * | x | ||
) |
int64_t fixupL_dist | ( | const int_t | n, |
const int_t * | perm_r, | ||
Glu_persist_t * | Glu_persist, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Fix up the data storage lsub for L-subscripts. It removes the subscript sets for structural pruning, and applies permutation to the remaining subscripts.
Fix up the data storage lsub[] for L-subscripts. It removes the subscript sets for structural pruning, and applies permutation to the remaining subscripts. Return value: number of entries in lsub[], which includes the size of the pruned graph, which is interspersed in the supernodal graph in the lsub[] array.
int free_treelist | ( | int_t | nsuper, |
treeList_t * | treeList | ||
) |
int freeCommRequestsArr | ( | int_t | mxLeafNode, |
commRequests_t ** | comReqss | ||
) |
int freeFactNodelists | ( | factNodelists_t * | fNlists | ) |
int freeFactStat | ( | factStat_t * | factStat | ) |
int genmmd_dist_ | ( | int_t * | neqns, |
int_t * | xadj, | ||
int_t * | a, | ||
int_t * | invp, | ||
int_t * | perm, | ||
int_t * | delta, | ||
int_t * | dhead, | ||
int_t * | qsize, | ||
int_t * | llist, | ||
int_t * | marker, | ||
int_t * | maxint, | ||
int_t * | nofsub | ||
) |
int get_acc_offload | ( | void | ) |
void get_diag_procs | ( | int_t | n, |
Glu_persist_t * | Glu_persist, | ||
gridinfo_t * | grid, | ||
int_t * | num_diag_procs, | ||
int_t ** | diag_procs, | ||
int_t ** | diag_len | ||
) |
int_t get_max_buffer_size | ( | void | ) |
void get_perm_c_dist | ( | int_t | pnum, |
int_t | ispec, | ||
SuperMatrix * | A, | ||
int_t * | perm_c | ||
) |
Purpose ======= GET_PERM_C_DIST obtains a permutation matrix Pc, by applying the multiple minimum degree ordering code by Joseph Liu to matrix A'*A or A+A', or using approximate minimum degree column ordering by Davis et al. The LU factorization of A*Pc tends to have less fill than the LU factorization of A. Arguments ========= ispec (input) colperm_t Specifies what type of column permutation to use to reduce fill. = NATURAL: natural ordering (i.e., Pc = I) = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A = MMD_ATA: minimum degree ordering on structure of A'*A = METIS_AT_PLUS_A: MeTis on A'+A A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Currently, the type of A can be: Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE. In the future, more general A can be handled. perm_c (output) int* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc.
float get_perm_c_parmetis | ( | SuperMatrix * | A, |
int_t * | perm_r, | ||
int_t * | perm_c, | ||
int | nprocs_i, | ||
int | noDomains, | ||
int_t ** | sizes, | ||
int_t ** | fstVtxSep, | ||
gridinfo_t * | grid, | ||
MPI_Comm * | metis_comm | ||
) |
Purpose ======= GET_PERM_C_PARMETIS obtains a permutation matrix Pc, by applying a graph partitioning algorithm to the symmetrized graph A+A'. The multilevel graph partitioning algorithm used is the ParMETIS_V3_NodeND routine available in the parallel graph partitioning package parMETIS. The number of independent sub-domains noDomains computed by this algorithm has to be a power of 2. Hence noDomains is the largest power of 2 that is no larger than nprocs_i, where nprocs_i = nprow * npcol is the number of processors used in SuperLU_DIST. Arguments ========= A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Matrix A is distributed in NRformat_loc format. perm_r (input) int_t* Row permutation vector of size A->nrow, which defines the permutation matrix Pr; perm_r[i] = j means row i of A is in position j in Pr*A. perm_c (output) int_t* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. nprocs_i (input) int* Number of processors the input matrix is distributed on in a block row format. It corresponds to number of processors used in SuperLU_DIST. noDomains (input) int*, must be power of 2 Number of independent domains to be computed by the graph partitioning algorithm. ( noDomains <= nprocs_i ) sizes (output) int_t**, of size 2 * noDomains Returns pointer to an array containing the number of nodes for each sub-domain and each separator. Separators are stored from left to right. Memory for the array is allocated in this routine. fstVtxSep (output) int_t**, of size 2 * noDomains Returns pointer to an array containing first node for each sub-domain and each separator. Memory for the array is allocated in this routine. Return value ============ < 0, number of bytes allocated on return from the symbolic factorization. > 0, number of bytes allocated when out of memory.
int get_thread_per_process | ( | void | ) |
int_t getBigUSize | ( | superlu_dist_options_t * | options, |
int_t | nsupers, | ||
gridinfo_t * | grid, | ||
int_t ** | Lrowind_bc_ptr | ||
) |
int_t getCommonAncestorList | ( | int_t | k, |
int_t * | alist, | ||
int_t * | seTree, | ||
treeList_t * | treeList | ||
) |
int_t getCommonAncsCount | ( | int_t | k, |
treeList_t * | treeList | ||
) |
int_t getDescendList | ( | int_t | k, |
int_t * | dlist, | ||
treeList_t * | treeList | ||
) |
sForest_t ** getForests | ( | int_t | maxLvl, |
int_t | nsupers, | ||
int_t * | setree, | ||
treeList_t * | treeList | ||
) |
sForest_t ** getGreedyLoadBalForests | ( | int_t | maxLvl, |
int_t | nsupers, | ||
int_t * | setree, | ||
treeList_t * | treeList | ||
) |
int_t * getGridTrees | ( | gridinfo3d_t * | grid3d | ) |
int * getLastDepBtree | ( | int_t | nsupers, |
treeList_t * | treeList | ||
) |
treeTopoInfo_t getMyTreeTopoInfo | ( | int_t | nnodes, |
int_t | nsupers, | ||
int_t * | myPerm, | ||
int_t * | setree | ||
) |
sForest_t ** getNestDissForests | ( | int_t | maxLvl, |
int_t | nsupers, | ||
int_t * | setree, | ||
treeList_t * | treeList | ||
) |
int_t ** getNodeList | ( | int_t | maxLvl, |
int_t * | setree, | ||
int_t * | nnodes, | ||
int_t * | treeHeads, | ||
treeList_t * | treeList | ||
) |
int getNsupers | ( | int | n, |
Glu_persist_t * | Glu_persist | ||
) |
int_t getNumLookAhead | ( | superlu_dist_options_t * | options | ) |
int getNumThreads | ( | int | iam | ) |
int_t * getPerm_c_supno | ( | int_t | nsupers, |
superlu_dist_options_t * | options, | ||
int_t * | etree, | ||
Glu_persist_t * | Glu_persist, | ||
int_t ** | Lrowind_bc_ptr, | ||
int_t ** | Ufstnz_br_ptr, | ||
gridinfo_t * | grid | ||
) |
int_t * getReplicatedTrees | ( | gridinfo3d_t * | grid3d | ) |
void getSCUweight | ( | int_t | nsupers, |
treeList_t * | treeList, | ||
int_t * | xsup, | ||
int_t ** | Lrowind_bc_ptr, | ||
int_t ** | Ufstnz_br_ptr, | ||
gridinfo3d_t * | grid3d | ||
) |
int_t * getSubTreeRoots | ( | int_t | k, |
treeList_t * | treeList | ||
) |
int_t * getTreeHeads | ( | int_t | maxLvl, |
int_t | nsupers, | ||
treeList_t * | treeList | ||
) |
int_t ** getTreePerm | ( | int_t * | myTreeIdxs, |
int_t * | myZeroTrIdxs, | ||
int_t * | nodeCount, | ||
int_t ** | nodeList, | ||
int_t * | perm_c_supno, | ||
int_t * | iperm_c_supno, | ||
gridinfo3d_t * | grid3d | ||
) |
int_t ** getTreePermForest | ( | int_t * | myTreeIdxs, |
int_t * | myZeroTrIdxs, | ||
sForest_t * | sForests, | ||
int_t * | perm_c_supno, | ||
int_t * | iperm_c_supno, | ||
gridinfo3d_t * | grid3d | ||
) |
int_t ** getTreePermFr | ( | int_t * | myTreeIdxs, |
sForest_t ** | sForests, | ||
gridinfo3d_t * | grid3d | ||
) |
Fills an integer array with a given value.
int_t initCommRequests | ( | commRequests_t * | comReqs, |
gridinfo_t * | grid | ||
) |
commRequests_t ** initCommRequestsArr | ( | int_t | mxLeafNode, |
int_t | ldt, | ||
gridinfo_t * | grid | ||
) |
int_t initFactNodelists | ( | int_t | ldt, |
int_t | num_threads, | ||
int_t | nsupers, | ||
factNodelists_t * | fNlists | ||
) |
int_t initFactStat | ( | int_t | nsupers, |
factStat_t * | factStat | ||
) |
void initTRStimer | ( | xtrsTimer_t * | xtrsTimer, |
gridinfo_t * | grid | ||
) |
int * int32Calloc_dist | ( | int | n | ) |
int * int32Malloc_dist | ( | int | n | ) |
int_t LDiagBlockRecvWait | ( | int_t | k, |
int_t * | factored_U, | ||
MPI_Request * | L_diag_blk_recv_req, | ||
gridinfo_t * | grid | ||
) |
void log_memory | ( | int64_t | cur_bytes, |
SuperLUStat_t * | stat | ||
) |
int mc64id_dist | ( | int * | icntl | ) |
int_t num_full_cols_U | ( | int_t | kk, |
int_t ** | Ufstnz_br_ptr, | ||
int_t * | xsup, | ||
gridinfo_t * | grid, | ||
int_t * | perm_u, | ||
int_t * | ldu | ||
) |
void print_memorylog | ( | SuperLUStat_t * | stat, |
char * | msg | ||
) |
void print_options_dist | ( | superlu_dist_options_t * | options | ) |
Print the options setting.
void print_panel_seg_dist | ( | int_t | n, |
int_t | w, | ||
int_t | jcol, | ||
int_t | nseg, | ||
int_t * | segrep, | ||
int_t * | repfnz | ||
) |
Diagnostic print of segment info after panel_dfs().
void print_sp_ienv_dist | ( | superlu_dist_options_t * | options | ) |
Print the blocking parameters.
void PrintDouble5 | ( | char * | , |
int_t | , | ||
double * | |||
) |
void printForestWeightCost | ( | sForest_t ** | sForests, |
SCT_t * | SCT, | ||
gridinfo3d_t * | grid3d | ||
) |
void PrintInt32 | ( | char * | name, |
int | len, | ||
int * | x | ||
) |
void printTRStimer | ( | xtrsTimer_t * | xtrsTimer, |
gridinfo3d_t * | grid3d | ||
) |
void PStatClear | ( | SuperLUStat_t * | stat | ) |
void PStatFree | ( | SuperLUStat_t * | stat | ) |
void PStatInit | ( | SuperLUStat_t * | stat | ) |
void PStatPrint | ( | superlu_dist_options_t * | options, |
SuperLUStat_t * | stat, | ||
gridinfo_t * | grid | ||
) |
int_t psymbfact_LUXpand | ( | int_t | iam, |
int_t | n, | ||
int_t | fstVtxLvl_loc, | ||
int_t | vtxXp, | ||
int_t * | p_next, | ||
int_t | min_new_len, | ||
int_t | mem_type, | ||
int_t | rout_type, | ||
int_t | free_prev_mem, | ||
Pslu_freeable_t * | Pslu_freeable, | ||
Llu_symbfact_t * | Llu_symbfact, | ||
vtcsInfo_symbfact_t * | VInfo, | ||
psymbfact_stat_t * | PS | ||
) |
Expand the data structures for L and U during the factorization. Return value: SUCCES_RET - successful return ERROR_RET - error due to a memory allocation failure
Expand the data structures for L and U during the factorization. Return value: SUCCES_RET - successful return ERROR_RET - error due to a memory allocation failure
Sherry: this function is used in the upper separator tree above the domains. It does not call 'expand()'
int_t psymbfact_LUXpand_RL | ( | int_t | iam, |
int_t | n, | ||
int_t | vtxXp, | ||
int_t | next, | ||
int_t | len_texp, | ||
int_t | mem_type, | ||
Pslu_freeable_t * | Pslu_freeable, | ||
Llu_symbfact_t * | Llu_symbfact, | ||
vtcsInfo_symbfact_t * | VInfo, | ||
psymbfact_stat_t * | PS | ||
) |
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when run out of space
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when run out of space
Sherry: this function calls psymbfact_LUXpandMem().
int_t psymbfact_LUXpandMem | ( | int | iam, |
int_t | n, | ||
int_t | vtxXp, | ||
int_t | next, | ||
int_t | min_new_len, | ||
int | mem_type, | ||
int | rout_type, | ||
int | free_prev_mem, | ||
Pslu_freeable_t * | Pslu_freeable, | ||
Llu_symbfact_t * | Llu_symbfact, | ||
vtcsInfo_symbfact_t * | VInfo, | ||
psymbfact_stat_t * | PS | ||
) |
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when run out of space
Expand the data structures for L and U during the factorization. Return value: 0 - successful return > 0 - number of bytes allocated when run out of space
Sherry: this function is used inside the domains.
int_t psymbfact_prLUXpand | ( | int_t | iam, |
int_t | min_new_len, | ||
int | mem_type, | ||
Llu_symbfact_t * | Llu_symbfact, | ||
psymbfact_stat_t * | PS | ||
) |
Expand the data structures for L and U pruned during the factorization. Return value: SUCCES_RET - successful return ERROR_RET - error when run out of space
Expand the data structures for L and U pruned during the factorization. Return value: SUCCES_RET - successful return ERROR_RET - error when run out of space
Sherry: this function calls 'expand()' directly.
void pxerr_dist | ( | char * | srname, |
gridinfo_t * | grid, | ||
int_t | info | ||
) |
int_t QuerySpace_dist | ( | int_t | n, |
int_t | lsub_size, | ||
Glu_freeable_t * | Glu_freeable, | ||
superlu_dist_mem_usage_t * | mem_usage | ||
) |
mem_usage consists of the following fields:
int_t reduceStat | ( | PhaseType | PHASE, |
SuperLUStat_t * | stat, | ||
gridinfo3d_t * | grid3d | ||
) |
Reduce the states from all the two grids before printing it out. See the definition of enum PhaseType in superlu_enum_consts.h
void SCT_free | ( | SCT_t * | SCT | ) |
void SCT_init | ( | SCT_t * | SCT | ) |
void SCT_print | ( | gridinfo_t * | grid, |
SCT_t * | SCT | ||
) |
void SCT_print3D | ( | gridinfo3d_t * | grid3d, |
SCT_t * | SCT | ||
) |
void SCT_printComm3D | ( | gridinfo3d_t * | grid3d, |
SCT_t * | SCT | ||
) |
void set_default_options_dist | ( | superlu_dist_options_t * | options | ) |
Set the default values for the options argument.
int set_tag_ub | ( | void | ) |
treeList_t * setree2list | ( | int_t | nsuper, |
int_t * | setree | ||
) |
float smach_dist | ( | char * | cmach | ) |
int sort_R_info | ( | Remain_info_t * | Remain_info, |
int | n | ||
) |
int sort_R_info_elm | ( | Remain_info_t * | Remain_info, |
int | n | ||
) |
int sort_U_info | ( | Ublock_info_t * | Ublock_info, |
int | n | ||
) |
int sort_U_info_elm | ( | Ublock_info_t * | Ublock_info, |
int | n | ||
) |
int sp_coletree_dist | ( | int_t * | acolst, |
int_t * | acolend, | ||
int_t * | arow, | ||
int_t | nr, | ||
int_t | nc, | ||
int_t * | parent | ||
) |
Nonsymmetric elimination tree.
Find the elimination tree for A'*A. This uses something similar to Liu's algorithm. It runs in time O(nz(A)*log n) and does not form A'*A. Input: Sparse matrix A. Numeric values are ignored, so any explicit zeros are treated as nonzero. Output: Integer array of parents representing the elimination tree of the symbolic product A'*A. Each vertex is a column of A, and nc means a root of the elimination forest. John R. Gilbert, Xerox, 10 Dec 1990 Based on code by JRG dated 1987, 1988, and 1990.
void sp_colorder | ( | superlu_dist_options_t * | options, |
SuperMatrix * | A, | ||
int_t * | perm_c, | ||
int_t * | etree, | ||
SuperMatrix * | AC | ||
) |
Purpose ======= sp_colorder() permutes the columns of the original matrix. It performs the following steps: 1. Apply column permutation perm_c[] to A's column pointers to form AC; 2. If options->Fact = DOFACT, then (1) Compute column elimination tree etree[] of AC'AC; (2) Post order etree[] to get a postordered elimination tree etree[], and a postorder permutation post[]; (3) Apply post[] permutation to columns of AC; (4) Overwrite perm_c[] with the product perm_c * post. Arguments ========= options (input) superlu_dist_options_t* Specifies whether or not the elimination tree will be re-used. If options->Fact == DOFACT, this means first time factor A, etree is computed and output. Otherwise, re-factor A, etree is input, unchanged on exit. A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Currently, the type of A can be: Stype = SLU_NC or SLU_NCP; Dtype = SLU__D; Mtype = SLU_GE. In the future, more general A can be handled. perm_c (input/output) int* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. If options->Fact == DOFACT, perm_c is both input and output. On output, it is changed according to a postorder of etree. Otherwise, perm_c is input. etree (input/output) int* Elimination tree of Pc*(A'+A)*Pc', dimension A->ncol. If options->Fact == DOFACT, etree is an output argument, otherwise it is an input argument. Note: etree is a vector of parent pointers for a forest whose vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol. AC (output) SuperMatrix* The resulting matrix after applied the column permutation perm_c[] to matrix A. The type of AC can be: Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE.
int sp_ienv_dist | ( | int | ispec, |
superlu_dist_options_t * | options | ||
) |
sp_ienv_dist() is inquired to choose machine-dependent integer parameters for the local environment. See ISPEC for a description of the parameters.
This version provides a set of parameters which should give good,
but not optimal, performance on many of the currently available
computers. Users are encouraged to set the environment variable to change the tuning parameters for their particular machines.
ISPEC (input) int Specifies the parameter to be returned as the value of SP_IENV_DIST.
= 1: the panel size w; a panel consists of w consecutive columns of matrix A in the process of Gaussian elimination. The best value depends on machine's cache characteristics. = 2: the relaxation parameter relax; if the number of nodes (columns) in a subtree of the elimination tree is less than relax, this subtree is considered as one supernode, regardless of their row structures. = 3: the maximum size for a supernode, which must be greater than or equal to relaxation parameter (see case 2); = 4: the minimum row dimension for 2-D blocking to be used; = 5: the minimum column dimension for 2-D blocking to be used; = 6: the estimated fills factor for the adjacency structures of L and U, compared with A; = 7: the minimum value of the product M*N*K for a GEMM call worth being offloaded to accelerator (e.g., GPU, Xeon Phi). = 8: the maximum buffer size on GPU that can hold the "dC" matrix in the GEMM call for the Schur complement update. If this is too small, the Schur complement update will be done in multiple partitions, may be slower. = 9: number of GPU streams = 10: whether to offload work to GPU or not
options (input) superlu_dist_options_t* The structure defines the input parameters to control how the LU decomposition and the solves are performed.
(SP_IENV_DIST) (output) int >= 0: the value of the parameter specified by ISPEC
< 0: if SP_IENV_DIST = -k, the k-th argument had an illegal value.
Symmetric elimination tree.
p = spsymetree (A); Find the elimination tree for symmetric matrix A. This uses Liu's algorithm, and runs in time O(nz*log n). Input: Square sparse matrix A. No check is made for symmetry; elements below and on the diagonal are ignored. Numeric values are ignored, so any explicit zeros are treated as nonzero. Output: Integer array of parents representing the etree, with n meaning a root of the elimination forest. Note: This routine uses only the upper triangle, while sparse Cholesky (as in spchol.c) uses only the lower. Matlab's dense Cholesky uses only the upper. This routine could be modified to use the lower triangle either by transposing the matrix or by traversing it by rows with auxiliary pointer and link arrays. John R. Gilbert, Xerox, 10 Dec 1990 Based on code by JRG dated 1987, 1988, and 1990. Modified by X.S. Li, November 1999.
int_t static_partition | ( | struct superlu_pair * | work_load, |
int_t | nwl, | ||
int_t * | partition, | ||
int_t | ldp, | ||
int_t * | sums, | ||
int_t * | counts, | ||
int | nprocs | ||
) |
void superlu_abort_and_exit_dist | ( | char * | msg | ) |
int superlu_dist_GetVersionNumber | ( | int * | major, |
int * | minor, | ||
int * | bugfix | ||
) |
void superlu_free_dist | ( | void * | addr | ) |
void superlu_gridexit | ( | gridinfo_t * | grid | ) |
void superlu_gridexit3d | ( | gridinfo3d_t * | grid | ) |
void superlu_gridinit | ( | MPI_Comm | Bcomm, |
int | nprow, | ||
int | npcol, | ||
gridinfo_t * | grid | ||
) |
All processes in the MPI communicator must call this routine.
On output, if a process is not in the SuperLU group, the following values are assigned to it: grid->comm = MPI_COMM_NULL grid->iam = -1
void superlu_gridinit3d | ( | MPI_Comm | Bcomm, |
int | nprow, | ||
int | npcol, | ||
int | npdep, | ||
gridinfo3d_t * | grid | ||
) |
All processes in the MPI communicator must call this routine.
void superlu_gridmap | ( | MPI_Comm | Bcomm, |
int | nprow, | ||
int | npcol, | ||
int | usermap[], | ||
int | ldumap, | ||
gridinfo_t * | grid | ||
) |
All processes in the MPI communicator must call this routine.
On output, if a process is not in the SuperLU group, the following values are assigned to it: grid->comm = MPI_COMM_NULL grid->iam = -1
void superlu_gridmap3d | ( | MPI_Comm | Bcomm, |
int | nprow, | ||
int | npcol, | ||
int | npdep, | ||
int | usermap[], | ||
gridinfo3d_t * | grid | ||
) |
All processes in the MPI communicator must call this routine. On output, if a process is not in the SuperLU group, the following values are assigned to it: grid->comm = MPI_COMM_NULL grid->iam = -1.
void * superlu_malloc_dist | ( | size_t | size | ) |
Returns Supernodal Elimination Tree
nsuper | Number of Supernodes |
etree | Scalar elimination tree |
supno | Vertex to supernode mapping |
xsup | Supernodal boundaries |
int_t symbfact | ( | superlu_dist_options_t * | options, |
int | pnum, | ||
SuperMatrix * | A, | ||
int_t * | perm_c, | ||
int_t * | etree, | ||
Glu_persist_t * | Glu_persist, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Purpose ======= symbfact() performs a symbolic factorization on matrix A and sets up the nonzero data structures which are suitable for supernodal Gaussian elimination with no pivoting (GENP). This routine features: o depth-first search (DFS) o supernodes o symmetric structure pruning Return value ============ < 0, number of bytes needed for LSUB. = 0, matrix dimension is 1. > 0, number of bytes allocated when out of memory.
float symbfact_dist | ( | superlu_dist_options_t * | options, |
int | nprocs_num, | ||
int | nprocs_symb, | ||
SuperMatrix * | A, | ||
int_t * | perm_c, | ||
int_t * | perm_r, | ||
int_t * | sizes, | ||
int_t * | fstVtxSep, | ||
Pslu_freeable_t * | Pslu_freeable, | ||
MPI_Comm * | num_comm, | ||
MPI_Comm * | symb_comm, | ||
superlu_dist_mem_usage_t * | symb_mem_usage | ||
) |
Purpose ======= symbfact_dist() performs symbolic factorization of matrix A suitable for performing the supernodal Gaussian elimination with no pivoting (GENP). This routine computes the structure of one column of L and one row of U at a time. It uses: o distributed input matrix o supernodes o symmetric structure pruning Arguments ========= nprocs_num (input) int Number of processors SuperLU_DIST is executed on, and the input matrix is distributed on. nprocs_symb (input) int Number of processors on which the symbolic factorization is performed. It is equal to the number of independent domains identified in the graph partitioning algorithm executed previously and has to be a power of 2. It corresponds to the number of leaves in the separator tree. A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Matrix A is distributed in NRformat_loc format. Matrix A is not yet permuted by perm_c. perm_c (input) int_t* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. perm_r (input) int_t* Row permutation vector of size A->nrow, which defines the permutation matrix Pr; perm_r[i] = j means row i of A is in position j in Pr*A. sizes (input) int_t* Contains the number of vertices in each separator. fstVtxSep (input) int_t* Contains first vertex for each separator. Pslu_freeable (output) Pslu_freeable_t* Returns the local L and U structure, and global to local information on the indexing of the vertices. Contains all the information necessary for performing the data distribution towards the numeric factorization. num_comm (input) MPI_Comm* Communicator for numerical factorization symb_comm (input) MPI_Comm* Communicator for symbolic factorization symb_mem_usage (input) superlu_dist_mem_usage_t * Statistics on memory usage. Return value ============ < 0, number of bytes allocated on return from the symbolic factorization. 
> 0, number of bytes allocated when out of memory. Sketch of the algorithm ======================= Distribute the vertices on the processors using a subtree-to-subcube algorithm. Redistribute the structure of the input matrix A according to the subtree-to-subcube mapping computed previously for the symbolic factorization routine. This implies in particular a distribution from nprocs_num processors to nprocs_symb processors. Perform symbolic factorization guided by the separator tree provided by a graph partitioning algorithm. The symbolic factorization uses a combined left-looking, right-looking approach.
Purpose ======= symbfact_dist() performs symbolic factorization of matrix A suitable for performing the supernodal Gaussian elimination with no pivoting (GENP). This routine computes the structure of one column of L and one row of U at a time. It uses: o distributed input matrix o supernodes o symmetric structure pruning Arguments ========= nprocs_num (input) int Number of processors SuperLU_DIST is executed on, and the input matrix is distributed on. nprocs_symb (input) int Number of processors on which the symbolic factorization is performed. It is equal to the number of independent domains identified in the graph partitioning algorithm executed previously and has to be a power of 2. It corresponds to the number of leaves in the separator tree. A (input) SuperMatrix* Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number of the linear equations is A->nrow. Matrix A is distributed in NRformat_loc format. Matrix A is not yet permuted by perm_c. perm_c (input) int_t* Column permutation vector of size A->ncol, which defines the permutation matrix Pc; perm_c[i] = j means column i of A is in position j in A*Pc. perm_r (input) int_t* Row permutation vector of size A->nrow, which defines the permutation matrix Pr; perm_r[i] = j means row i of A is in position j in Pr*A. sizes (input) int_t* Contains the number of vertices in each separator. fstVtxSep (input) int_t* Contains first vertex for each separator. Pslu_freeable (output) Pslu_freeable_t* Returns the local L and U structure, and global to local information on the indexing of the vertices. Contains all the information necessary for performing the data distribution towards the numeric factorization. num_comm (input) MPI_Comm* Communicator for numerical factorization symb_comm (input) MPI_Comm* Communicator for symbolic factorization symb_mem_usage (input) superlu_dist_mem_usage_t * Statistics on memory usage. Return value ============ < 0, number of bytes allocated on return from the symbolic factorization. 
> 0, number of bytes allocated when out of memory. Sketch of the algorithm ======================= Distribute the vertices on the processors using a subtree-to-subcube algorithm. Redistribute the structure of the input matrix A according to the subtree-to-subcube mapping computed previously for the symbolic factorization routine. This implies in particular a distribution from nprocs_num processors to nprocs_symb processors. Perform symbolic factorization guided by the separator tree provided by a graph partitioning algorithm. The symbolic factorization uses a combined left-looking, right-looking approach.
int_t symbfact_SubFree | ( | Glu_freeable_t * | Glu_freeable | ) |
Deallocate storage of the data structures common to symbolic factorization routines.
int_t symbfact_SubInit | ( | superlu_dist_options_t * | options, |
fact_t | fact, | ||
void * | work, | ||
int_t | lwork, | ||
int_t | m, | ||
int_t | n, | ||
int_t | annz, | ||
Glu_persist_t * | Glu_persist, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Allocate storage for the data structures common to symbolic factorization routines. For those of unpredictable size, make a guess of FILL * nnz(A). Return value: If lwork = -1, return the estimated amount of space required, plus n; otherwise, return the amount of space actually allocated when memory allocation failure occurred.
int_t symbfact_SubXpand | ( | int_t | n, |
int_t | jcol, | ||
int_t | next, | ||
MemType | mem_type, | ||
int_t * | maxlen, | ||
Glu_freeable_t * | Glu_freeable | ||
) |
Expand the data structures for L and U during the factorization. Return value: 0 - successful return; > 0 - number of bytes allocated when running out of space
void treeImbalance3D | ( | gridinfo3d_t * | grid3d, |
SCT_t * | SCT | ||
) |
int_t Trs2_InitUblock_info | ( | int_t | klst, |
int_t | nb, | ||
Ublock_info_t * | Ublock_info, | ||
int_t * | usub, | ||
Glu_persist_t * | Glu_persist, | ||
SuperLUStat_t * | stat | ||
) |
int_t Wait_LDiagBlockSend | ( | MPI_Request * | L_diag_blk_send_req, |
gridinfo_t * | grid, | ||
SCT_t * | SCT | ||
) |
int_t Wait_LSend | ( | int_t | k, |
gridinfo_t * | grid, | ||
int ** | ToSendR, | ||
MPI_Request * | s, | ||
SCT_t * | SCT | ||
) |
int Wait_LUDiagSend | ( | int_t | k, |
MPI_Request * | U_diag_blk_send_req, | ||
MPI_Request * | L_diag_blk_send_req, | ||
gridinfo_t * | grid, | ||
SCT_t * | SCT | ||
) |
int_t Wait_UDiagBlockSend | ( | MPI_Request * | U_diag_blk_send_req, |
gridinfo_t * | grid, | ||
SCT_t * | SCT | ||
) |
int_t Wait_USend | ( | MPI_Request * | send_req, |
gridinfo_t * | grid, | ||
SCT_t * | SCT | ||
) |
int xerr_dist | ( | char * | srname, |
int * | info | ||
) |
|
static |
|
static |
|
static |
|
static |