SuperLU Distributed 9.0.0
gpu3d
superlu_defs.h
Go to the documentation of this file.
1
40#ifndef __SUPERLU_DEFS /* allow multiple inclusions */
41#define __SUPERLU_DEFS
42
43/*
44 * File name: superlu_defs.h
45 * Purpose: Definitions which are precision-neutral
46 */
47#ifdef _CRAY
48 #include <fortran.h>
49#endif
50
51#ifdef _OPENMP
52 #include <omp.h>
53#endif
54
55
56#include <mpi.h>
57#include <stdlib.h>
58#include <stdio.h>
59#include <limits.h>
60#include <string.h>
61#include <ctype.h>
62// #include <stdatomic.h>
63#include <math.h>
64#include <stdint.h>
65//#include <malloc.h> Sherry: not available on Mac OS
66// /* Following is for vtune */
67// #if 0
68// #include <ittnotify.h>
69// #define USE_VTUNE
70// #endif
71
72#if defined(VTUNE) && (VTUNE>=1)
73#include <ittnotify.h>
74#endif
75
76/*************************************************************************
77 * Constants
78 **************************************************************************/
79/*
80 * You can support older version of SuperLU_DIST.
81 * At compile-time, you can catch the new release as:
82 * #if SUPERLU_DIST_MAJOR_VERSION == 5
83 * use the new interface
84 * #else
85 * use the old interface
86 * #endif
87 * Versions 4.x and earlier do not include #define'd version numbers.
88 */
89#define SUPERLU_DIST_MAJOR_VERSION 9
90#define SUPERLU_DIST_MINOR_VERSION 0
91#define SUPERLU_DIST_PATCH_VERSION 0
92#define SUPERLU_DIST_RELEASE_DATE "May 8, 2024"
93
94#include "superlu_dist_config.h"
95
96#ifdef HAVE_CUDA
97#define GPU_ACC
98//#include "cublas_utils.h"
99#endif
100
101#ifdef HAVE_HIP
102#ifndef GPU_ACC
103#define GPU_ACC
104#endif
105#endif
106
107#ifdef GPU_ACC
108#include "oneside.h"
109#include "gpu_api_utils.h"
110#endif
111
112
113/* Define my integer size int_t, together with the matching MPI datatype
 * (mpi_int_t) and the printf-style format string for it (IFMT).
 * Selection is made at compile time: _CRAY -> short, _LONGINT -> int64_t,
 * otherwise plain int.
 * NOTE(review): the _CRAY branch defines no IFMT, so code using IFMT will
 * not compile there -- confirm whether that configuration is still supported.
 * NOTE(review): the _LONGINT branch pairs int64_t with "%lld" and
 * MPI_LONG_LONG_INT, which assumes int64_t is compatible with long long
 * on the target platform -- confirm. */
114#ifdef _CRAY
115 typedef short int_t;
116 /*#undef int Revert back to int of default size. */
117 #define mpi_int_t MPI_SHORT
118#elif defined (_LONGINT)
119 typedef int64_t int_t;
120 #define mpi_int_t MPI_LONG_LONG_INT
121 #define IFMT " %lld"
122#else /* Default */
123 typedef int int_t;
124 #define mpi_int_t MPI_INT
125 #define IFMT "%8d"
126#endif
127
128
129/* MPI C complex datatype */
130#define SuperLU_MPI_COMPLEX MPI_C_COMPLEX
131#define SuperLU_MPI_DOUBLE_COMPLEX MPI_C_DOUBLE_COMPLEX
132
133/* MPI_Datatype cannot be used in C typedef
134typedef MPI_C_COMPLEX SuperLU_MPI_COMPLEX;
135typedef MPI_C_DOUBLE_COMPLEX SuperLU_MPI_DOUBLE_COMPLEX;
136*/
137
139#include "superlu_FCnames.h"
140#include "superlu_enum_consts.h"
141#include "supermatrix.h"
142#include "util_dist.h"
143#include "psymbfact.h"
144
145#define ISORT /* NOTE: qsort() has bug on Mac */
146
147/***********************************************************************
148 * Constants
149 ***********************************************************************/
150
151#define MAX_SUPER_SIZE 512 /* Sherry: moved from superlu_gpu.cu */
152
153/*
154 * For each block column of L, the index[] array contains both the row
155 * subscripts and the integers describing the size of the blocks.
156 * The organization of index[] looks like:
157 *
158 * [ BLOCK COLUMN HEADER (size BC_HEADER)
159 * number of blocks
160 * number of row subscripts, i.e., LDA of nzval[]
161 * BLOCK 0 <----
162 * BLOCK DESCRIPTOR (of size LB_DESCRIPTOR) |
163 * block number (global) |
164 * number of full rows in the block |
165 * actual row subscripts |
166 * BLOCK 1 | Repeat ...
167 * BLOCK DESCRIPTOR | number of blocks
168 * block number (global) |
169 * number of full rows in the block |
170 * actual row subscripts |
171 * . |
172 * . |
173 * . <----
174 * ]
175 *
176 * For each block row of U, the organization of index[] looks like:
177 *
178 * [ BLOCK ROW HEADER (of size BR_HEADER)
179 * number of blocks
180 * number of entries in nzval[]
181 * number of entries in index[]
182 * BLOCK 0 <----
183 * BLOCK DESCRIPTOR (of size UB_DESCRIPTOR) |
184 * block number (global) |
185 * number of nonzeros in the block |
186 * actual fstnz subscripts |
187 * BLOCK 1 | Repeat ...
188 * BLOCK DESCRIPTOR | number of blocks
189 * block number (global) |
190 * number of nonzeros in the block |
191 * actual fstnz subscripts |
192 * . |
193 * . |
194 * . <----
195 * ]
196 *
197 */
198#define BC_HEADER 2
199#define LB_DESCRIPTOR 2
200#define BR_HEADER 3
201#define UB_DESCRIPTOR 2
202#define BC_HEADER_NEWU 3
203#define UB_DESCRIPTOR_NEWU 2
204#define UB_DESCRIPTOR_NEWUCPP 3 // this should be the same as UPANEL_HEADER_SIZE, but only the highest skyline is used as the LDA
205#define NBUFFERS 5
206
207/*
208 * Communication tags
209 */
210/* Return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 *
211 * for each supernodal column "num", the five communications are: *
212 * 0,1: for sending L to "right" *
213 * 2,3: for sending off-diagonal blocks of U "down" *
214 * 4 : for sending the diagonal block down (in pxgstrf2) */
215//#define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub )
216
217 /* For numeric factorization. */
218#if 0
219#define NTAGS 10000
220#else
221#define NTAGS INT_MAX
222#endif
223#define UjROW 10
224#define UkSUB 11
225#define UkVAL 12
226#define LkSUB 13
227#define LkVAL 14
228#define LkkDIAG 15
229 /* For triangular solves. */
230#define XK_H 2 /* The header preceding each X block. */
231#define LSUM_H 2 /* The header preceding each MOD block. */
232#define GSUM 20
233#define Xk 21
234#define Yk 22
235#define LSUM 23
236
237
/* MPI tags used by the triangular solves.  They are declared static const,
 * so every translation unit that includes this header gets its own copy. */
238static const int BC_L=1; /* MPI tag for x in L-solve*/
239static const int RD_L=2; /* MPI tag for lsum in L-solve*/
240static const int BC_U=3; /* MPI tag for x in U-solve*/
241static const int RD_U=4; /* MPI tag for lsum in U-solve*/
242
243/*
244 * Communication scopes
245 */
246#define COMM_ALL 100
247#define COMM_COLUMN 101
248#define COMM_ROW 102
249
250/*
251 * Matrix distribution for sparse matrix-vector multiplication
252 */
253#define SUPER_LINEAR 11
254#define SUPER_BLOCK 12
255
256/*
257 * No of marker arrays used in the symbolic factorization, each of size n
258 */
259#define NO_MARKER 3
260
261
262
263/***********************************************************************
264 * Macros
265 ***********************************************************************/
/*
 * Helper macros for the 2D process grid, supernode lookup and the
 * triangular-solve work arrays.
 *
 * Several macros read variables from the scope in which they are expanded:
 *   BlockNum              -> supno[]
 *   FstBlockC, SuperSize  -> xsup[]
 *   RHS_ITERATE           -> nrhs
 *   X_BLK, LSUM_BLK       -> ilsum[], nrhs
 *   XT_BLK                -> ilsumT[], nrhs
 *
 * Every macro parameter and every arithmetic expansion is parenthesized so
 * that arguments such as "c & 1", "flag ? a : b", "&grid" or "n / 2" and
 * surrounding operators such as "2 * X_BLK(k)" associate as intended.
 * CEILING still evaluates each argument more than once; do not pass
 * expressions with side effects.
 */

/* NOTE(review): IAM() expands to a brace block ending in "rank}" (no ';'
 * after rank) and therefore cannot be used as an expression or a statement
 * in standard C.  It is left unchanged here; confirm whether it is used
 * anywhere before repairing or removing it. */
#define IAM(comm) { int rank; MPI_Comm_rank ( comm, &rank ); rank};
#define MYROW(iam,grid) ( (iam) / (grid)->npcol )
#define MYCOL(iam,grid) ( (iam) % (grid)->npcol )
#define BlockNum(i)     ( supno[i] )
#define FstBlockC(bnum) ( xsup[bnum] )
#define SuperSize(bnum) ( xsup[(bnum)+1]-xsup[bnum] )
#define LBi(bnum,grid)  ( (bnum)/(grid)->nprow ) /* Global to local block rowwise */
#define LBj(bnum,grid)  ( (bnum)/(grid)->npcol ) /* Global to local block columnwise*/
#define PROW(bnum,grid) ( (bnum) % (grid)->nprow )
#define PCOL(bnum,grid) ( (bnum) % (grid)->npcol )
#define PNUM(i,j,grid)  ( (i)*(grid)->npcol + (j) ) /* Process number at coord(i,j) */
#define CEILING(a,b)    ( ((a)%(b)) ? ((a)/(b) + 1) : ((a)/(b)) )
    /* For triangular solves */
#define RHS_ITERATE(i)                    \
    for ((i) = 0; (i) < nrhs; ++(i))
#define X_BLK(i)                          \
    ( ilsum[i] * nrhs + ((i)+1) * XK_H )
#define XT_BLK(i)                         \
    ( ilsumT[i] * nrhs + ((i)+1) * XK_H )
#define LSUM_BLK(i)                       \
    ( ilsum[i] * nrhs + ((i)+1) * LSUM_H )

#define SuperLU_timer_  SuperLU_timer_dist_
/* Base-2 logarithm; the whole argument is converted to double before use. */
#define LOG2(x)   (log10((double) (x)) / log10(2.0))
290
291#if defined(VAMPIR) && (VAMPIR>=1)
292#define VT_TRACEON VT_traceon()
293#define VT_TRACEOFF VT_traceoff()
294#else
295#define VT_TRACEON
296#define VT_TRACEOFF
297#endif
298
299/* Support Windows */
300#ifndef SUPERLU_DIST_EXPORT
301#if defined(MSVC) && MSVC
302#ifdef SUPERLU_DIST_EXPORTS
303#define SUPERLU_DIST_EXPORT __declspec(dllexport)
304#else
305#define SUPERLU_DIST_EXPORT __declspec(dllimport)
306#endif /* SUPERLU_DIST_EXPORTS */
307#else
308#define SUPERLU_DIST_EXPORT
309#endif /* MSVC */
310#endif /* SUPERLU_DIST_EXPORT */
311
312
313/*
314 * CONSTANTS in MAGMA
315 */
#ifndef MAGMA_CONST
#define MAGMA_CONST

/* Thread-block dimensions used by the MAGMA-derived kernels. */
#define DIM_X  32
#define DIM_Y  16

// #define DIM_X  16
// #define DIM_Y  16

#define DIM_XA  DIM_X
#define DIM_YA  DIM_Y
#define DIM_XB  DIM_X
#define DIM_YB  DIM_Y

#define WARP_SIZE 32
/* Parenthesized so that uses such as "x % NWARP" or "n / NWARP" associate
 * with the whole quotient rather than with DIM_X alone. */
#define NWARP  ( DIM_X*DIM_Y/WARP_SIZE )
// #define U_BLOCK_PER_ROW_ROWDATA 1 // Use row-wise storage of U in single-GPU U solve
// #define SINGLE_RHS_OPT 1 // Use optimized kernels for single-GPU L and U solve

// // // // // // #define TILE_SIZE 32


/* BLK_M and BLK_N must be defined by the including kernel source. */
#define THR_M ( BLK_M / DIM_X )
#define THR_N ( BLK_N / DIM_Y )

/* fetch(): read offs_d<A>[...] at column-major offset n*LD<A>+m, clamped to
 * `bound` via min(); offs_d<A>, LD<A> and min must be visible at the
 * expansion site. */
#define fetch(A, m, n, bound) offs_d##A[min((n)*LD##A+(m), (bound))]
/* fma(): C += A*B as a statement.
 * NOTE(review): this name shadows the C99 <math.h> function fma() for all
 * code that follows this header -- confirm that is intended. */
#define fma(A, B, C) (C) += ((A)*(B))
#endif
/*---- end MAGMA ----*/
345
346/* New batched types */
347#define HANDLE_SIZE 8
348typedef int64_t handle_t;
349
350
351#ifdef __cplusplus
352extern "C" {
353#endif
354
355
/* cmax(a,b): the larger of a and b.  Each argument may be evaluated twice,
 * so do not pass expressions with side effects.
 * NOTE(review): the guard tests `max`, but the macro being defined is `cmax`;
 * cmax is therefore left undefined whenever `max` is already defined --
 * confirm that this is intended. */
356#ifndef max
357 #define cmax(a,b) ((a) > (b) ? (a) : (b))
358#endif
359
360#ifdef __cplusplus
361 }
362#endif
363
364
365/***********************************************************************
366 * New data types
367 ***********************************************************************/
368
369/*
370 * Define the 2D mapping of matrix blocks to process grid.
371 *
372 * Process grid:
373 * Processes are numbered (0 : P-1).
374 * P = Pr x Pc, where Pr, Pc are the number of process rows and columns.
375 * (pr,pc) is the coordinate of IAM; 0 <= pr < Pr, 0 <= pc < Pc.
376 *
377 * Matrix blocks:
378 * Matrix is partitioned according to supernode partitions, both
379 * column and row-wise.
380 * The k-th block columns (rows) contains columns (rows) (s:t), where
381 * s=xsup[k], t=xsup[k+1]-1.
382 * Block A(I,J) contains
383 * rows from (xsup[I]:xsup[I+1]-1) and
384 * columns from (xsup[J]:xsup[J+1]-1)
385 *
386 * Mapping of matrix entry (i,j) to matrix block (I,J):
387 * (I,J) = ( supno[i], supno[j] )
388 *
389 * Mapping of matrix block (I,J) to process grid (pr,pc):
390 * (pr,pc) = ( MOD(I,NPROW), MOD(J,NPCOL) )
391 *
392 * (xsup[nsupers],supno[n]) are replicated on all processors.
393 *
394 */
395
396/*-- Communication subgroup */
397typedef struct {
398 MPI_Comm comm; /* MPI communicator */
399 int Np; /* number of processes */
400 int Iam; /* my process number */
402
403/*-- 2D process grid definition */
/* 2D process grid: the grid communicator, its row and column scopes, this
 * process's number in the grid, and the grid dimensions.
 * Note that iam is a plain int while nprow/npcol are int_t. */
404typedef struct {
405 MPI_Comm comm; /* MPI communicator */
406 superlu_scope_t rscp; /* process scope in rowwise, horizontal direction */
407 superlu_scope_t cscp; /* process scope in columnwise, vertical direction */
408 int iam; /* my process number in this grid */
409 int_t nprow; /* number of process rows */
410 int_t npcol; /* number of process columns */
411} gridinfo_t;
412
413/*-- 3D process grid definition */
414typedef struct {
415 MPI_Comm comm; /* MPI communicator */
416 superlu_scope_t rscp; /* row scope */
417 superlu_scope_t cscp; /* column scope */
418 superlu_scope_t zscp; /* scope in third dimension */
419 gridinfo_t grid2d; /* for using 2D functions */
420 int iam; /* my process number in this grid */
421 int_t nprow; /* number of process rows */
422 int_t npcol; /* number of process columns */
423 int_t npdep; /* number of replication factor in Z-dimension */
424 int rankorder; /* = 0: Z-major ( default )
425 * e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
426 * 0 3 6 9
427 * 1 4 7 10
428 * 2 5 8 11
429 * = 1: XY-major (need set env. var.: SUPERLU_RANKORDER=XY)
430 * e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
431 * 0 1 2 3
432 * 4 5 6 7
433 * 8 9 10 11
434 */
436
437
438/*
439 *-- The structures are determined by SYMBFACT and used thereafter.
440 *
441 * (xsup,supno) describes mapping between supernode and column:
442 * xsup[s] is the leading column of the s-th supernode.
443 * supno[i] is the supernode no to which column i belongs;
444 * e.g. supno 0 1 2 2 3 3 3 4 4 4 4 4 (n=12)
445 * xsup 0 1 2 4 7 12
446 * Note: dfs will be performed on supernode rep. relative to the new
447 * row pivoting ordering
448 *
449 * This is allocated during symbolic factorization SYMBFACT.
450 */
451typedef struct {
455
456/*
457 *-- The structures are determined by SYMBFACT and used by DDISTRIBUTE.
458 *
459 * (xlsub,lsub): lsub[*] contains the compressed subscript of
460 * rectangular supernodes; xlsub[j] points to the starting
461 * location of the j-th column in lsub[*]. Note that xlsub
462 * is indexed by column.
463 * Storage: original row subscripts
464 *
465 * During the course of sparse LU factorization, we also use
466 * (xlsub,lsub) for the purpose of symmetric pruning. For each
467 * supernode {s,s+1,...,t=s+r} with first column s and last
468 * column t, the subscript set
469 * lsub[j], j=xlsub[s], .., xlsub[s+1]-1
470 * is the structure of column s (i.e. structure of this supernode).
471 * It is used for the storage of numerical values.
472 * Furthermore,
473 * lsub[j], j=xlsub[t], .., xlsub[t+1]-1
474 * is the structure of the last column t of this supernode.
475 * It is for the purpose of symmetric pruning. Therefore, the
476 * structural subscripts can be rearranged without making physical
477 * interchanges among the numerical values.
478 *
479 * However, if the supernode has only one column, then we
480 * only keep one set of subscripts. For any subscript interchange
481 * performed, similar interchange must be done on the numerical
482 * values.
483 *
484 * The last column structures (for pruning) will be removed
485 * after the numerical LU factorization phase.
486 *
487 * (xusub,usub): xusub[i] points to the starting location of column i
488 * in usub[]. For each U-segment, only the row index of first nonzero
489 * is stored in usub[].
490 *
491 * Each U column consists of a number of full segments. Each full segment
492 * starts from a leading nonzero, running up to the supernode (block)
493 * boundary. (Recall that the column-wise supernode partition is also
494 * imposed on the rows.) Because the segment is full, we don't store all
495 * the row indices. Instead, only the leading nonzero index is stored.
496 * The rest can be found together with xsup/supno pair.
497 * For example,
498 * xusub[j+1] - xusub[j] = number of segments in column j.
499 * for any i in usub[],
500 * supno[i] = block number in which i belongs to
501 * xsup[supno[i]+1] = first row of the next block
502 * The nonzeros of this segment are:
503 * i, i+1 ... xsup[supno[i]+1]-1 (only i is stored in usub[])
504 *
505 */
506typedef struct {
507 int_t *lsub; /* compressed L subscripts */
509 int_t *usub; /* compressed U subscripts */
511 int_t nzlmax; /* current max size of lsub */
512 int_t nzumax; /* " " " usub */
513 LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
514 //int_t *llvl; /* keep track of level in L for level-based ILU */
515 //int_t *ulvl; /* keep track of level in U for level-based ILU */
516 int64_t nnzLU; /* number of nonzeros in L+U*/
518
519#if 0 // Sherry: move to precision-dependent file
520/*
521 *-- The structure used to store matrix A of the linear system and
522 * several vectors describing the transformations done to matrix A.
523 *
524 * A (SuperMatrix*)
525 * Matrix A in A*X=B, of dimension (A->nrow, A->ncol).
526 * The number of linear equations is A->nrow. The type of A can be:
527 * Stype = SLU_NC; Dtype = SLU_D; Mtype = SLU_GE.
528 *
529 * DiagScale (DiagScale_t)
530 * Specifies the form of equilibration that was done.
531 * = NOEQUIL: No equilibration.
532 * = ROW: Row equilibration, i.e., A was premultiplied by diag(R).
533 * = COL: Column equilibration, i.e., A was postmultiplied by diag(C).
534 * = BOTH: Both row and column equilibration, i.e., A was replaced
535 * by diag(R)*A*diag(C).
536 *
537 * R double*, dimension (A->nrow)
538 * The row scale factors for A.
539 * If DiagScale = ROW or BOTH, A is multiplied on the left by diag(R).
540 * If DiagScale = NOEQUIL or COL, R is not defined.
541 *
542 * C double*, dimension (A->ncol)
543 * The column scale factors for A.
544 * If DiagScale = COL or BOTH, A is multiplied on the right by diag(C).
545 * If DiagScale = NOEQUIL or ROW, C is not defined.
546 *
547 * perm_r (int*) dimension (A->nrow)
548 * Row permutation vector which defines the permutation matrix Pr,
549 * perm_r[i] = j means row i of A is in position j in Pr*A.
550 *
551 * perm_c (int*) dimension (A->ncol)
552 * Column permutation vector, which defines the
553 * permutation matrix Pc; perm_c[i] = j means column i of A is
554 * in position j in A*Pc.
555 *
556 */
557typedef struct {
558 DiagScale_t DiagScale;
559 double *R;
560 double *C;
561 int_t *perm_r;
562 int_t *perm_c;
563} ScalePermstruct_t;
564#endif
565
566/*-- Data structure for redistribution of B and X --*/
567typedef struct {
570 int *ptr_to_ibuf, *ptr_to_dbuf;
571
572 /* the following are needed in the hybrid solver PDSLin */
578
579 int_t x2b, b2x;
585
586/*
587 *-- This contains the options used to control the solution process.
588 *
589 * Fact (fact_t)
590 * Specifies whether or not the factored form of the matrix
591 * A is supplied on entry, and if not, how the matrix A should
592 * be factorized.
593 * = DOFACT: The matrix A will be factorized from scratch, and the
594 * factors will be stored in L and U.
595 * = SamePattern: The matrix A will be factorized assuming
596 * that a factorization of a matrix with the same sparsity
597 * pattern was performed prior to this one. Therefore, this
598 * factorization will reuse column permutation vector
599 * ScalePermstruct->perm_c and the column elimination tree
600 * LUstruct->etree.
601 * = SamePattern_SameRowPerm: The matrix A will be factorized
602 * assuming that a factorization of a matrix with the same
603 * sparsity pattern and similar numerical values was performed
604 * prior to this one. Therefore, this factorization will reuse
605 * both row and column scaling factors R and C, both row and
606 * column permutation vectors perm_r and perm_c, and the
607 * data structure set up from the previous symbolic factorization.
608 * = FACTORED: On entry, L, U, perm_r and perm_c contain the
609 * factored form of A. If DiagScale is not NOEQUIL, the matrix
610 * A has been equilibrated with scaling factors R and C.
611 *
612 * Equil (yes_no_t)
613 * Specifies whether to equilibrate the system (scale A's row and
614 * columns to have unit norm).
615 *
616 * DiagInv (yes_no_t)
617 * Specifies whether to invert the diagonal blocks of the LU
618 * triangular matrices.
619 *
620 * ColPerm (colperm_t)
621 * Specifies what type of column permutation to use to reduce fill.
622 * = NATURAL: use the natural ordering
623 * = MMD_ATA: use minimum degree ordering on structure of A'*A
624 * = MMD_AT_PLUS_A: use minimum degree ordering on structure of A'+A
625 * = COLAMD: use approximate minimum degree column ordering
626 * = MY_PERMC: use the ordering specified by the user
627 *
628 * Trans (trans_t)
629 * Specifies the form of the system of equations:
630 * = NOTRANS: A * X = B (No transpose)
631 * = TRANS: A**T * X = B (Transpose)
632 * = CONJ: A**H * X = B (Conjugate transpose)
633 *
634 * IterRefine (IterRefine_t)
635 * Specifies whether to perform iterative refinement.
636 * = NO: no iterative refinement
637 * = SINGLE: perform iterative refinement in single precision
638 * = DOUBLE: perform iterative refinement in double precision
639 * = EXTRA: perform iterative refinement in extra precision
640 *
641 * DiagPivotThresh (double, in [0.0, 1.0]) (only for serial SuperLU)
642 * Specifies the threshold used for a diagonal entry to be an
643 * acceptable pivot.
644 *
645 * SymmetricMode (yes_no_t) (only for serial SuperLU)
646 * Specifies whether to use symmetric mode. Symmetric mode gives
647 * preference to diagonal pivots, and uses an (A'+A)-based column
648 * permutation algorithm.
649 *
650 * PivotGrowth (yes_no_t) (only for serial SuperLU)
651 * Specifies whether to compute the reciprocal pivot growth.
652 *
653 * ConditionNumber (yes_no_t) (only for serial SuperLU)
654 * Specifies whether to compute the reciprocal condition number.
655 *
656 * RowPerm (rowperm_t) (only for SuperLU_DIST or ILU in serial SuperLU)
657 * Specifies whether to permute rows of the original matrix.
658 * = NO: not to permute the rows
659 * = LargeDiag: make the diagonal large relative to the off-diagonal
660 * = MY_PERMR: use the permutation given by the user
661 *
662 * ILU_DropRule (int) (only for serial SuperLU)
663 * Specifies the dropping rule:
664 * = DROP_BASIC: Basic dropping rule, supernodal based ILUTP(tau).
665 * = DROP_PROWS: Supernodal based ILUTP(p,tau), p = gamma * nnz(A)/n.
666 * = DROP_COLUMN: Variant of ILUTP(p,tau), for j-th column,
667 * p = gamma * nnz(A(:,j)).
668 * = DROP_AREA: Variation of ILUTP, for j-th column, use
669 * nnz(F(:,1:j)) / nnz(A(:,1:j)) to control memory.
670 * = DROP_DYNAMIC: Modify the threshold tau during factorization:
671 * If nnz(L(:,1:j)) / nnz(A(:,1:j)) > gamma
672 * tau_L(j) := MIN(tau_0, tau_L(j-1) * 2);
673 * Otherwise
674 * tau_L(j) := MAX(tau_0, tau_L(j-1) / 2);
675 * tau_U(j) uses the similar rule.
676 * NOTE: the thresholds used by L and U are separate.
677 * = DROP_INTERP: Compute the second dropping threshold by
678 * interpolation instead of sorting (default).
679 * In this case, the actual fill ratio is not
680 * guaranteed to be smaller than gamma.
681 * Note: DROP_PROWS, DROP_COLUMN and DROP_AREA are mutually exclusive.
682 * ( Default: DROP_BASIC | DROP_AREA )
683 *
684 * ILU_DropTol (double) (only for serial SuperLU)
685 * numerical threshold for dropping.
686 *
687 * ILU_FillFactor (double) (only for serial SuperLU)
688 * Gamma in the secondary dropping.
689 *
690 * ILU_Norm (norm_t) (only for serial SuperLU)
691 * Specify which norm to use to measure the row size in a
692 * supernode: infinity-norm, 1-norm, or 2-norm.
693 *
694 * ILU_FillTol (double) (only for serial SuperLU)
695 * numerical threshold for zero pivot perturbation.
696 *
697 * ILU_MILU (milu_t) (only for serial SuperLU)
698 * Specifies which version of MILU to use.
699 *
700 * ILU_MILU_Dim (double)
701 * Dimension of the PDE if available.
702 *
703 * ReplaceTinyPivot (yes_no_t) (only for SuperLU_DIST)
704 * Specifies whether to replace the tiny diagonals by
705 * sqrt(epsilon)*||A|| during LU factorization.
706 *
707 * SolveInitialized (yes_no_t) (only for SuperLU_DIST)
708 * Specifies whether the initialization has been performed to the
709 * triangular solve.
710 *
711 * RefineInitialized (yes_no_t) (only for SuperLU_DIST)
712 * Specifies whether the initialization has been performed to the
713 * sparse matrix-vector multiplication routine needed in iterative
714 * refinement.
715 *
716 * num_lookaheads (int) (only for SuperLU_DIST)
717 * Specifies the number of levels in the look-ahead factorization
718 *
719 * lookahead_etree (yes_no_t) (only for SuperLU_DIST)
720 * Specifies whether to use the elimination tree computed from the
721 * serial symbolic factorization to perform scheduling.
722 *
723 * SymPattern (yes_no_t) (only for SuperLU_DIST)
724 * Gives the scheduling algorithm a hint whether the matrix
725 * would have symmetric pattern.
726 *
727 */
728typedef struct {
740 int SolveOnly; /* Treat the input matrix A as the L & U factors */
741 int ILU_level; /* level-based ILU */
743 double ILU_DropTol; /* threshold for dropping */
744 double ILU_FillFactor; /* gamma in the secondary dropping */
745 norm_t ILU_Norm; /* infinity-norm, 1-norm, or 2-norm */
746 double ILU_FillTol; /* threshold for zero pivot perturbation */
748 double ILU_MILU_Dim; /* Dimension of PDE (if available) */
750 yes_no_t ReplaceTinyPivot; /* used in SuperLU_DIST */
754 //int nnzL, nnzU; /* used to store nnzs for now */
755 yes_no_t lookahead_etree; /* use etree computed from the
756 serial symbolic factorization */
757 int num_lookaheads; /* num of levels in look-ahead */
758 int superlu_relax; /* max. allowed relaxed supernode size; see sp_ienv(2) */
759 int superlu_maxsup; /* max. allowed supernode size; see sp_ienv(3) */
760 char superlu_rankorder[4]; /* Z-major or XY-majir order in 3D grid */
761 char superlu_lbs[4]; /* etree load balancing strategy in 3D algorithm */
762 int superlu_n_gemm; /* one of GEMM offload criteria; see sp_ienv(7) */
763 int superlu_max_buffer_size; /* max. buffer size on GPU; see sp_ienv(8) */
764 int superlu_num_gpu_streams; /* number of GPU streams; see sp_ienv(9) */
765 int superlu_acc_offload; /* whether to offload work to GPU; see sp_ienv(10) */
766 int batchCount; /* number of systems in the batched interface
767 0 : not to use batch interface (default) */
768 yes_no_t SymPattern; /* symmetric factorization */
769 yes_no_t Use_TensorCore; /* Use Tensor Core or not */
770 yes_no_t Algo3d; /* use 3D factorization/solve algorithms */
772
773typedef struct {
774 float for_lu;
775 float total;
777 int64_t nnzL, nnzU;
779
780/*-- Auxiliary data type used in PxGSTRS/PxGSTRS1. */
781typedef struct {
782 int_t lbnum; /* Row block number (local). */
783 int_t indpos; /* Starting position in Uindex[]. */
785
786/*
787 *-- The new structures added in the hybrid GPU + OpenMP + MPI code.
788 */
789typedef struct {
794 int_t eo; /* order of elimination. For 3D algorithm */
798
799typedef struct {
802 int_t eo; /* order of elimination, for 3D code */
807
/* Node record holding an integer id and an integer key, plus an untyped
 * `next` link (presumably to the following node in a linked list -- confirm
 * against the code that uses it). */
808typedef struct
809{
810 int id, key;
811 void *next;
812} etree_node;
813
815{
816 int ind;
817 int val;
818};
819
822/*==== For 3D code ====*/
823
824typedef struct
825{
830 //double *uval;
832
833typedef struct
834{
836 //double *lusup;
837 void *lusup;
839 int_t nlb; //number of l blocks
842
843typedef struct
844{
850
851
852/* HyP_t is the data structure to assist HALO offload of Schur-complement. */
853typedef struct
854{
856 Ublock_info_t *Ublock_info, *Ublock_info_Phi;
857
858 int_t first_l_block_acc , first_u_block_acc;
860 int_t *Lblock_dirty_bit, * Ublock_dirty_bit;
861 void *lookAhead_L_buff, *Remain_L_buff;
862 int_t lookAheadBlk; /* number of blocks in look-ahead window */
863 int_t RemainBlk ; /* number of blocks outside look-ahead window */
864 int_t num_look_aheads, nsupers;
865 int_t ldu, ldu_Phi;
866 int_t num_u_blks, num_u_blks_Phi;
867
869 void *bigU_Phi;
873
879} HyP_t;
880
881
882
883/* return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 *
884 * for each supernodal column, the five communications are: *
885 * 0,1: for sending L to "right" *
886 * 2,3: for sending off-diagonal blocks of U "down" *
887 * 4 : for sending the diagonal block down (in pxgstrf2) */
888// int tag_ub;
889// #define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub )
890
891// #undef SLU_MPI_TAG
892/*defining my own MPI tags */
893/* return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 *
894 * for each supernodal column, the five communications are: *
895 * 0,1: for sending L to "right" *
896 * 2,3: for sending off-diagonal blocks of U "down" *
897 * 4 : for sending the diagonal block down (in pxgstrf2) *
898 * 5 : for sending the diagonal L block right () : added by piyush */
/* SLU_MPI_TAG(id,num): MPI tag for communication slot `id` of supernodal
 * column `num`, computed as (6*num + id) modulo tag_ub.  A variable named
 * `tag_ub` must be visible where the macro is expanded.  Both parameters are
 * parenthesized so that arguments such as "a | b" associate correctly. */
#define SLU_MPI_TAG(id,num)  ( (6*(num)+(id)) % tag_ub )
900
901/*structs for quick look up */
902typedef struct
903{
908
909typedef struct
910{
915
916
917//global variable
918// extern double CPU_CLOCK_RATE;
919
920typedef struct
921{
925
926typedef struct
927{
933 int* IbcastPanel_L; /*I bcast and recv placed for the k-th L panel*/
934 int* IbcastPanel_U; /*I bcast and recv placed for the k-th U panel*/
935 //int* numChildLeft; /* (NOT USED in this structure) number of children left to be factored*/
936 int* gpuLUreduced; /*New for GPU acceleration*/
937} factStat_t;
938
939typedef struct
940{
955
956typedef struct{
963 int_t depth; // distance from the top
964 double weight; // weight of the supernode
965 double iWeight; // weight of the whole subtree below
966 double scuWeight; // weight of schur complement update = max|n_k||L_k||U_k|
967} treeList_t;
968
969typedef struct
970{
971 int_t numLvl; // number of level in tree;
972 int_t* eTreeTopLims; // boundaries of each level of size
973 int_t* myIperm; // Iperm for my tree size nsupers;
974
976
977typedef struct
978{
979 int_t* setree; // global supernodal elimination tree
982
983typedef enum treePartStrat{
984 ND, // nested dissection ordering or natural ordering
985 GD // greedy load balance stregy
987
988typedef struct
989{
990 /* data */
991 int_t nNodes; // total number of nodes
992 int_t* nodeList; // list of nodes, should be in order of factorization
993#if 0 // Sherry: the following array is used on rForest_t. ???
994 int_t* treeHeads;
995#endif
996 /*topological information about the tree*/
997 int_t numLvl; // number of Topological levels in the forest
998 int_t numTrees; // number of tree in the forest
1000#if 0 // Sherry fix: the following two structures are in treeTopoInfo_t. ???
1001 int_t* eTreeTopLims; // boundaries of each level of size
1002 int_t* myIperm; // Iperm for my tree size nsupers;
1003#endif
1004
1005 /*information about load balance*/
1006 double weight; // estimated cost
1007 double cost; // measured cost
1008
1009} sForest_t;
1010
1011typedef struct
1012{
1013 /* data */
1018 MPI_Request* recv_req;
1019 MPI_Request* recv_requ;
1020 MPI_Request* send_req;
1021 MPI_Request* send_requ;
1023
1024typedef struct
1025{
1032
1033typedef struct
1034{
1037} msgs_t;
1038
1039typedef struct xtrsTimer_t
1040{
1049 double tfs_comm;
1053 double tbs_comm;
1056
1057 // counters for communication and computation volume
1058
1063
1064 double ppXmem; // perprocess X-memory
1066
1067
1068/*==== end For 3D code ====*/
1069
1070/*====================*/
1071
1072/***********************************************************************
1073 * Function prototypes
1074 ***********************************************************************/
1075
1076#ifdef __cplusplus
1077extern "C" {
1078#endif
1079
1080extern void superlu_gridinit(MPI_Comm, int, int, gridinfo_t *);
1081extern void superlu_gridmap(MPI_Comm, int, int, int [], int, gridinfo_t *);
1082extern void superlu_gridexit(gridinfo_t *);
1083extern void superlu_gridinit3d(MPI_Comm Bcomm, int nprow, int npcol, int npdep,
1084 gridinfo3d_t *grid) ;
1085extern void superlu_gridmap3d(MPI_Comm, int, int, int, int [], gridinfo3d_t *);
1086extern void superlu_gridexit3d(gridinfo3d_t *grid);
1087
1098 SuperMatrix*);
1099extern int sp_symetree_dist(int_t *, int_t *, int_t *, int_t, int_t *);
1100extern int sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t, int_t *);
1101extern void get_perm_c_dist(int_t, int_t, SuperMatrix *, int_t *);
1102extern void get_perm_c_batch(superlu_dist_options_t *options, int batchCount,
1103 handle_t *SparseMatrix_handles, int **CpivPtr);
1104extern void at_plus_a_dist(const int_t, const int_t, int_t *, int_t *,
1105 int_t *, int_t **, int_t **);
1106extern void getata_dist(const int_t m, const int_t n, const int_t nz, int_t *colptr, int_t *rowind,
1107 int_t *atanz, int_t **ata_colptr, int_t **ata_rowind);
1108extern void get_metis_dist(int_t n, int_t bnz, int_t *b_colptr, int_t *b_rowind, int_t *perm_c);
1109extern void get_colamd_dist(const int m, const int n, const int nnz,
1110 int_t *colptr, int_t *rowind, int_t *perm_c);
1111extern int genmmd_dist_(int_t *, int_t *, int_t *a,
1112 int_t *, int_t *, int_t *, int_t *,
1113 int_t *, int_t *, int_t *, int_t *, int_t *);
1114extern void bcast_tree(void *, int, MPI_Datatype, int, int,
1115 gridinfo_t *, int, int *);
1119 fact_t, void *, int_t, int_t, int_t, int_t,
1122 Glu_freeable_t *);
1123extern int symbfact_SubFree(Glu_freeable_t *);
1124extern void countnz_dist (const int_t, int_t *, int_t *, int_t *,
1126extern int64_t fixupL_dist (const int_t, const int_t *, Glu_persist_t *,
1127 Glu_freeable_t *);
1128extern int_t *TreePostorder_dist (int_t, int_t *);
1129extern float smach_dist(const char *);
1130extern double dmach_dist(const char *);
1131extern void *superlu_malloc_dist (size_t);
1132extern void superlu_free_dist (void*);
1133extern int *int32Malloc_dist (int);
1134extern int *int32Calloc_dist (int);
1135extern int_t *intMalloc_dist (int_t);
1136extern int_t *intCalloc_dist (int_t);
1137extern int mc64id_dist(int *);
1138extern void arrive_at_ublock (int, int_t *, int_t *, int *, int *, int *,
1139 int_t, int_t, int_t *, int_t *, int_t *, gridinfo_t *);
1141 gridinfo_t *, int_t *, int_t*);
1142
1143/* Auxiliary routines */
1144extern double SuperLU_timer_ (void);
1145extern void superlu_abort_and_exit_dist(char *);
1146extern int sp_ienv_dist (int, superlu_dist_options_t *);
1147extern void ifill_dist (int_t *, int_t, int_t);
1148extern void super_stats_dist (int_t, int_t *);
1150 int_t **, int_t **);
1152extern int xerr_dist (char *, int *);
1153extern void pxerr_dist (char *, gridinfo_t *, int_t);
1154extern void PStatInit(SuperLUStat_t *);
1155extern void PStatClear(SuperLUStat_t *);
1156extern void PStatFree(SuperLUStat_t *);
1158extern void log_memory(int64_t, SuperLUStat_t *);
1159extern void print_memorylog(SuperLUStat_t *, char *);
1160extern int superlu_dist_GetVersionNumber(int *, int *, int *);
1161extern void quickSort( int_t*, int_t, int_t, int_t);
1162extern void quickSortM( int_t*, int_t, int_t, int_t, int_t, int_t);
1163extern int_t partition( int_t*, int_t, int_t, int_t);
1165
1166extern int compareInt_t(void *a, void *b);
1167extern int compareInt(void *a, void *b);
1168extern int compareDouble(void *a, void *b);
1169extern int dist_checkArrayEq(void *arr, int length, MPI_Datatype datatype, int src_rank, int dest_rank, MPI_Comm communicator, int (*compare)(void *, void *));
1170
1171
1172
1173/* Prototypes for parallel symbolic factorization */
1174extern float symbfact_dist
1175(superlu_dist_options_t *, int, int,
1176 SuperMatrix *, int_t *, int_t *, int_t *, int_t *,
1177 Pslu_freeable_t *, MPI_Comm *, MPI_Comm *, superlu_dist_mem_usage_t *);
1178
1179/* Get the column permutation using parmetis */
1180extern float get_perm_c_parmetis
1181(SuperMatrix *, int_t *, int_t *, int, int,
1182 int_t **, int_t **, gridinfo_t *, MPI_Comm *);
1183
1184/* Auxiliary routines for memory expansions used during
1185 the parallel symbolic factorization routine */
1186
1188(int, int_t, int_t, int_t, int_t, int, int, int,
1190
1194
1198
1201
1202#ifdef ISORT
1203extern void isort (int_t N, int_t *ARRAY1, int_t *ARRAY2);
1204extern void isort1 (int_t N, int_t *ARRAY);
1205#else
1206int superlu_sort_perm (const void *arg1, const void *arg2)
1207{
1208 const int_t *val1 = (const int_t *) arg1;
1209 const int_t *val2 = (const int_t *) arg2;
1210 return (*val2 < *val1);
1211}
1212#endif
1213
1214#ifdef GPU_ACC /* GPU related */
1216 int *, int *, int *, int,
1217 int, int, int *, int, int_t);
1218extern int_t get_gpublas_nb (void);
1219extern int_t get_num_gpu_streams (void);
1220extern int getnGPUStreams(void);
1221extern int get_mpi_process_per_gpu (void);
1222/*to print out various statistics from GPU activities*/
1223extern void printGPUStats(int nsupers, SuperLUStat_t *stat, gridinfo3d_t*);
1224#endif
1225
1226extern double estimate_cpu_time(int m, int n , int k);
1227
1228extern int get_thread_per_process(void);
1230extern int_t get_min (int_t *, int_t);
1231extern int compare_pair (const void *, const void *);
1232extern int_t static_partition (struct superlu_pair *, int_t, int_t *, int_t,
1233 int_t *, int_t *, int);
1235extern int get_acc_solve(void);
1236extern int get_new3dsolve(void);
1237extern int get_new3dsolvetreecomm(void);
1238
1239/* Routines for debugging */
1240extern void print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *);
1241extern void check_repfnz_dist(int_t, int_t, int_t, int_t *);
1242extern int_t CheckZeroDiagonal(int_t, int_t *, int_t *, int_t *);
1243extern int check_perm_dist(char *what, int_t n, int_t *perm);
1244extern void PrintDouble5(char *, int_t, double *);
1245extern void PrintInt10(char *, int_t, int_t *);
1246extern void PrintInt32(char *, int, int *);
1247extern int file_PrintInt10(FILE *, char *, int_t, int_t *);
1248extern int file_PrintInt32(FILE *, char *, int, int *);
1249extern int file_PrintLong10(FILE *, char *, int_t, int_t *);
1250
1251/* Routines for Async_tree communication*/
1252
1253#ifndef __SUPERLU_ASYNC_TREE /* allow multiple inclusions */
1254#define __SUPERLU_ASYNC_TREE
1255typedef struct
1256{
1257 MPI_Request sendRequests_[2];
1258 MPI_Comm comm_;
1261 int myDests_[2];
1264 int tag_;
1266 MPI_Datatype type_;
1268} C_Tree;
1269
1270#ifndef DEG_TREE
1271#define DEG_TREE 2
1272#endif
1273
1274#endif
1275
1276// extern void C_RdTree_Create(C_Tree* tree, MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, char precision);
1277extern void C_RdTree_Create_nv(C_Tree* tree, MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, char precision, int* needrecvrd,int* needsendrd);
1278extern void C_RdTree_Nullify(C_Tree* tree);
1279extern yes_no_t C_RdTree_IsRoot(C_Tree* tree);
1280extern void C_RdTree_forwardMessageSimple(C_Tree* Tree, void* localBuffer, int msgSize);
1281extern void C_RdTree_waitSendRequest(C_Tree* Tree);
1282
1283// extern void C_BcTree_Create(C_Tree* tree, MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, char precision);
1284extern void C_BcTree_Create_nv(C_Tree* tree, MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, char precision, int* needrecv);
1285extern void C_BcTree_Nullify(C_Tree* tree);
1286extern yes_no_t C_BcTree_IsRoot(C_Tree* tree);
1287extern void C_BcTree_forwardMessageSimple(C_Tree* tree, void* localBuffer, int msgSize);
1288extern void C_BcTree_waitSendRequest(C_Tree* tree);
1289
1290/*==== For 3D code ====*/
1291typedef enum {
1292 NOT_IN_GRID, // doesn't belong to my grid
1293 IN_GRID_ZERO, // belongsto my grid but doesn't initialized with zeros
1294 IN_GRID_AIJ // belongsto my grid and initialized with non-zeros
1296
1297extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid);
1298extern void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d);
1299extern void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT);
1300extern void slu_SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT);
1301extern int_t zAllocBcast(int_t size, void** ptr, gridinfo3d_t* grid3d);
1302extern int_t zAllocBcast_gridID(int_t size, void** ptr, int_t gridID, gridinfo3d_t* grid3d);
1303extern void permCol_SymbolicFact3d(superlu_dist_options_t *options, int n, SuperMatrix *GA, int_t *perm_c, int_t *etree,
1304 Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable, SuperLUStat_t *stat,
1305 superlu_dist_mem_usage_t*symb_mem_usage,
1306 gridinfo3d_t* grid3d);
1307extern SupernodeToGridMap_t* createSuperGridMap(int_t nsuper,int_t maxLvl, int_t *myTreeIdxs,
1308 int_t *myZeroTrIdxs, int_t* gNodeCount, int_t** gNodeLists);
1309extern int_t *createSupernode2TreeMap(int_t nsupers, int_t maxLvl, int_t *gNodeCount, int_t **gNodeLists);
1310extern void allocBcastArray(void **array, int_t size, int root, MPI_Comm comm);
1311extern void allocBcastLargeArray(void **array, int64_t size, int root, MPI_Comm comm);
1312extern int_t* create_iperm_c_supno(int_t nsupers, superlu_dist_options_t *options, Glu_persist_t *Glu_persist, int_t *etree, int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr, gridinfo3d_t *grid3d);
1313extern gEtreeInfo_t fillEtreeInfo( int_t nsupers, int_t* setree, treeList_t *treeList);
1314extern sForest_t **compute_sForests(int_t nsupers, Glu_persist_t *Glu_persist, int_t *etree, gridinfo3d_t *grid3d);
1315
1316// 3D SpTRSV
1318
1319extern int* getBrecvTree(int_t nlb, sForest_t* sforest, int* bmod, gridinfo_t * grid);
1320extern int* getBrecvTree_newsolve(int_t nlb, int_t nsupers, int* supernodeMask, int* bmod, gridinfo_t * grid);
1321extern int getNrootUsolveTree(int_t* nbrecvmod, sForest_t* sforest, int* brecv,
1322 int* bmod, gridinfo_t * grid);
1323extern int getNbrecvX(sForest_t* sforest, int_t* Urbs, gridinfo_t * grid);
1324extern int getNbrecvX_newsolve(int_t nsupers, int* supernodeMask, int_t* Urbs, Ucb_indptr_t **Ucb_indptr, gridinfo_t * grid);
1325extern int getNrootUsolveTree_newsolve(int_t* nbrecvmod, int_t nsupers, int* supernodeMask, int* brecv, int* bmod, gridinfo_t * grid);
1326extern int_t getNfrecvmodLeaf(int* nleaf, sForest_t* sforest, int* frecv, int* fmod, gridinfo_t * grid);
1327extern int_t getNfrecvmod_newsolve(int* nleaf, int_t nsupers, int* supernodeMask, int* frecv, int* fmod, gridinfo_t * grid);
1328extern int* getfrecv_newsolve(int_t nsupers, int* supernodeMask, int_t nlb, int* fmod,
1329 int *mod_bit, gridinfo_t * grid);
1330extern int* getfrecvLeaf( sForest_t* sforest, int_t nlb, int* fmod,
1331 int *mod_bit, gridinfo_t * grid);
1332extern int getNfrecvx_newsolve(int_t nsupers, int* supernodeMask, int_t** Lrowind_bc_ptr, int_t** Lindval_loc_bc_ptr, gridinfo_t * grid);
1333extern int getNfrecvxLeaf(sForest_t* sforest, int_t** Lrowind_bc_ptr, gridinfo_t * grid);
1334extern int* getfmod_newsolve(int_t nlb, int_t nsupers, int* supernodeMask, int_t** Lrowind_bc_ptr, int_t** Lindval_loc_bc_ptr, gridinfo_t * grid);
1335extern int* getfmodLeaf(int_t nlb, int* fmod_i);
1336extern int getldu(int_t knsupc, int_t iklrow, int_t* usub );
1337extern int* getBmod3d(int_t treeId, int_t nlb, sForest_t* sforest, int_t* xsup, int_t **Ufstnz_br_ptr, int_t* supernode2treeMap, gridinfo_t * grid);
1338extern int* getBmod3d_newsolve(int_t nlb, int_t nsupers, int* supernodeMask, int_t* xsup, int_t **Ufstnz_br_ptr, gridinfo_t * grid);
1339
1340// permutation from superLU default
1342 int_t *etree, Glu_persist_t *Glu_persist,
1343 int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr,
1344 gridinfo_t *);
1345
1347 int_t *etree, Glu_persist_t *Glu_persist,
1348 int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr,
1349 gridinfo3d_t * grid3d);
1350
1351/* Manipulate counters */
1352extern void slu_SCT_init(SCT_t*);
1353extern void slu_SCT_print(gridinfo_t *grid, SCT_t* SCT);
1354extern void slu_SCT_printSummary(gridinfo_t *grid, SCT_t* SCT);
1355extern void slu_SCT_print3D(gridinfo3d_t *grid3d, SCT_t* SCT);
1356extern void slu_SCT_free(SCT_t*);
1357
1358extern treeList_t* setree2list(int_t nsuper, int_t* setree );
1359extern int free_treelist(int_t nsuper, treeList_t* treeList);
1360
1361// int_t calcTreeWeight(int_t nsupers, treeList_t* treeList, int_t* xsup);
1362extern int_t calcTreeWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup);
1363extern int_t getDescendList(int_t k, int_t*dlist, treeList_t* treeList);
1364extern int_t getCommonAncestorList(int_t k, int_t* alist, int_t* seTree, treeList_t* treeList);
1365extern int_t getCommonAncsCount(int_t k, treeList_t* treeList);
1366extern int_t* getPermNodeList(int_t nnode, // number of nodes
1367 int_t* nlist, int_t* perm_c_sup,int_t* iperm_c_sup);
1368extern int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder);
1369// extern int_t* getSubTreeRoots(int_t k, treeList_t* treeList);
1370extern int_t* getSubTreeRoots(int_t k, int_t *numSubtrees, treeList_t* treeList);
1371// int_t* treeList2perm(treeList_t* , ..);
1372extern int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms);
1373// returns a concatenated permutation for three permutation arrays
1374
1375extern int_t* getGlobal_iperm(int_t nsupers, int_t nperms, int_t** perms,
1376 int_t* nnodes);
1377extern int_t log2i(int_t index);
1378extern int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup);
1379extern int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t* nodeCount);
1380extern int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTopLevel);
1381
1382/*takes supernodal elimination tree and for each
1383 supernode calculates "level" in elimination tree*/
1384extern int_t* topological_ordering(int_t nsuper, int_t* setree);
1385extern int_t* Etree_LevelBoundry(int_t* perm,int_t* tsort_etree, int_t nsuper);
1386
1387/* calculates the boundaries of the topological levels */
1388extern int_t* calculate_num_children(int_t nsuper, int_t* setree);
1389extern void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper);
1390extern void print_etree_leveled(int_t *setree, int_t* tsort_etree, int_t nsuper);
1391extern void print_etree(int_t *setree, int_t* iperm, int_t nsuper);
1392extern int_t printFileList(char* sname, int_t nnodes, int_t*dlist, int_t*setree);
1393int* getLastDepBtree( int_t nsupers, treeList_t* treeList);
1394
1395/* Returns an array R of size maxLevel whose entries are either 0 or 1.
1396 If R[i] = 1, then Tree[level-i] is set to zero so that it only
1397 accumulates the results. */
1398extern int_t* getReplicatedTrees( gridinfo3d_t* grid3d);
1399
1400/* returns the indices in gNodeList of the trees that belong to my layer */
1401extern int_t* getGridTrees( gridinfo3d_t* grid3d);
1402
1403
1404/*returns global nodelist*/
1405extern int_t** getNodeList(int_t maxLvl, int_t* setree, int_t* nnodes,
1406 int_t* treeHeads, treeList_t* treeList);
1407
1408/* calculate number of nodes in subtrees starting from treeHead[i]*/
1409extern int_t* calcNumNodes(int_t maxLvl, int_t* treeHeads, treeList_t* treeList);
1410
1411/*Returns list of (last) node of the trees */
1412extern int_t* getTreeHeads(int_t maxLvl, int_t nsupers, treeList_t* treeList);
1413
1414extern int_t* getMyIperm(int_t nnodes, int_t nsupers, int_t* myPerm);
1415
1416extern int_t* getMyTopOrder(int_t nnodes, int_t* myPerm, int_t* myIperm, int_t* setree );
1417
1418extern int_t* getMyEtLims(int_t nnodes, int_t* myTopOrder);
1419
1420
1421extern treeTopoInfo_t getMyTreeTopoInfo(int_t nnodes, int_t nsupers,
1422 int_t* myPerm,int_t* setree);
1423
1424extern sForest_t** getNestDissForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList);
1425
1426extern int_t** getTreePermForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
1427 sForest_t* sForests,
1428 int_t* perm_c_supno, int_t* iperm_c_supno,
1429 gridinfo3d_t* grid3d);
1430extern int_t** getTreePermFr( int_t* myTreeIdxs,
1431 sForest_t** sForests, gridinfo3d_t* grid3d);
1432extern int_t* getMyNodeCountsFr(int_t maxLvl, int_t* myTreeIdxs,
1433 sForest_t** sForests);
1434extern int_t** getNodeListFr(int_t maxLvl, sForest_t** sForests);
1435extern int_t* getNodeCountsFr(int_t maxLvl, sForest_t** sForests);
1436// int_t* getNodeToForstMap(int_t nsupers, sForest_t** sForests, gridinfo3d_t* grid3d);
1437extern int* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm);
1438extern void printForestWeightCost(sForest_t** sForests, SCT_t* SCT, gridinfo3d_t* grid3d);
1439extern sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t* setree, treeList_t* treeList);
1440extern sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList);
1441
1442 /* from trfAux.h */
1444 gridinfo_t *grid, int_t **Lrowind_bc_ptr);
1445extern void getSCUweight(int_t nsupers, treeList_t* treeList, int_t* xsup,
1446 int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr,
1447 gridinfo3d_t * grid3d);
1448
1449extern void getSCUweight_allgrid(int_t nsupers, treeList_t* treeList, int_t* xsup,
1450 int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr,
1451 gridinfo3d_t * grid3d
1452 );
1453
1454extern int Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req,
1455 MPI_Request *L_diag_blk_send_req,
1456 gridinfo_t *grid, SCT_t *SCT);
1457extern void applyRowPerm(int_t* colptr, int_t* rowind, int_t* perm_r, int_t n);
1458
1459
1460extern int getNsupers(int n, Glu_persist_t *Glu_persist);
1461extern int set_tag_ub(void);
1462extern int getNumThreads(int);
1463extern int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup,
1464 gridinfo_t *, int_t *, int_t *);
1465
1466#if 0 // Sherry: conflicting with existing routine
1467extern int_t estimate_bigu_size(int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr,
1468 Glu_persist_t *, gridinfo_t*, int_t* perm_u);
1469#endif
1470
1471extern int_t* getFactPerm(int_t);
1472extern int_t* getFactIperm(int_t*, int_t);
1473
1474extern int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid);
1475extern int_t initFactStat(int nsupers, factStat_t* factStat);
1476extern int freeFactStat(factStat_t* factStat);
1478extern int freeFactNodelists(factNodelists_t* fNlists);
1479extern int_t initMsgs(msgs_t* msgs);
1481extern commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
1482extern int freeCommRequestsArr(int_t mxLeafNode, commRequests_t** comReqss);
1483
1484extern msgs_t** initMsgsArr(int_t numLA);
1485extern int freeMsgsArr(int_t numLA, msgs_t **msgss);
1486
1489
1490 /* from sec_structs.h */
1491extern int Cmpfunc_R_info (const void * a, const void * b);
1492extern int Cmpfunc_U_info (const void * a, const void * b);
1493extern int sort_R_info( Remain_info_t* Remain_info, int n );
1494extern int sort_U_info( Ublock_info_t* Ublock_info, int n );
1495extern int sort_R_info_elm( Remain_info_t* Remain_info, int n );
1496extern int sort_U_info_elm( Ublock_info_t* Ublock_info, int n );
1497
1498 /* from pdgstrs.h */
1499extern void printTRStimer(xtrsTimer_t *xtrsTimer, gridinfo3d_t *grid3d);
1500extern void initTRStimer(xtrsTimer_t *xtrsTimer, gridinfo_t *grid);
1501
1502 /* from p3dcomm.c */
1503extern int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
1504 int_t* nodeCount, int_t** nodeList,
1505 int_t* perm_c_supno, int_t* iperm_c_supno,
1506 gridinfo3d_t* grid3d);
1507extern int_t* getMyNodeCounts(int_t maxLvl, int_t* myTreeIdxs, int_t* gNodeCount);
1508extern int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d);
1509extern int_t reduceStat(PhaseType PHASE, SuperLUStat_t *stat, gridinfo3d_t * grid3d);
1510
1511 /* from communication_aux.h */
1512extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR,
1513 MPI_Request *s, SCT_t*);
1514extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *);
1515extern int_t Check_LRecv(MPI_Request*, int* msgcnt);
1516extern int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *);
1517extern int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *);
1518extern int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *);
1519extern int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *);
1520extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *);
1521extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *);
1522extern int_t LDiagBlockRecvWait( int_t k, int* factored_U, MPI_Request *, gridinfo_t *);
1523
1524
1526 int_t kk, int_t *usub, int_t *xsup,
1527 gridinfo_t *grid, int_t *perm_u,
1528 int_t *ldu /* max. segment size of nonzero columns in U(kk,:) */
1529);
1530
1531/*=====================*/
1532
1533#ifdef __cplusplus
1534 }
1535#endif
1536
1537#endif /* __SUPERLU_DEFS */
integer, parameter, public usub
Definition: superlupara.f90:35
int superlu_sort_perm(const void *arg1, const void *arg2)
Definition: psgstrf.c:160
Definitions for parallel symbolic factorization routine.
Definition: superlu_defs.h:1256
int tag_
Definition: superlu_defs.h:1264
int msgSize_
Definition: superlu_defs.h:1263
MPI_Comm comm_
Definition: superlu_defs.h:1258
int destCnt_
Definition: superlu_defs.h:1260
MPI_Datatype type_
Definition: superlu_defs.h:1266
int myIdx
Definition: superlu_defs.h:1267
yes_no_t empty_
Definition: superlu_defs.h:1265
int myRank_
Definition: superlu_defs.h:1262
int myRoot_
Definition: superlu_defs.h:1259
Definition: superlu_defs.h:506
int_t * usub
Definition: superlu_defs.h:509
int_t nzumax
Definition: superlu_defs.h:512
LU_space_t MemModel
Definition: superlu_defs.h:513
int_t nzlmax
Definition: superlu_defs.h:511
int_t * xusub
Definition: superlu_defs.h:510
int_t * lsub
Definition: superlu_defs.h:507
int64_t nnzLU
Definition: superlu_defs.h:516
int_t * xlsub
Definition: superlu_defs.h:508
Definition: superlu_defs.h:451
int_t * xsup
Definition: superlu_defs.h:452
int_t * supno
Definition: superlu_defs.h:453
Definition: superlu_defs.h:854
void * lookAhead_L_buff
Definition: superlu_defs.h:861
int_t buffer_size
Definition: superlu_defs.h:874
int nGPUStreams
Definition: superlu_defs.h:878
int offloadCondition
Definition: superlu_defs.h:876
Remain_info_t * lookAhead_info
Definition: superlu_defs.h:855
int_t bigu_size
Definition: superlu_defs.h:875
int_t Rnbrow
Definition: superlu_defs.h:872
void * bigU_host
Definition: superlu_defs.h:870
int_t RemainBlk
Definition: superlu_defs.h:863
int_t * Lblock_dirty_bit
Definition: superlu_defs.h:860
int_t first_l_block_acc
Definition: superlu_defs.h:858
int_t num_u_blks
Definition: superlu_defs.h:866
void * bigU_Phi
Definition: superlu_defs.h:869
int_t last_offload
Definition: superlu_defs.h:859
int_t Lnbrow
Definition: superlu_defs.h:871
int_t ldu
Definition: superlu_defs.h:865
Ublock_info_t * Ublock_info
Definition: superlu_defs.h:856
int superlu_acc_offload
Definition: superlu_defs.h:877
int_t nsupers
Definition: superlu_defs.h:864
int_t lookAheadBlk
Definition: superlu_defs.h:862
int_t jj_cpu
Definition: superlu_defs.h:868
Definition: psymbfact.h:106
Definition: psymbfact.h:57
Definition: superlu_defs.h:799
int_t lptr
Definition: superlu_defs.h:800
int_t StRow
Definition: superlu_defs.h:805
int_t eo
Definition: superlu_defs.h:802
int_t nrows
Definition: superlu_defs.h:803
int_t ib
Definition: superlu_defs.h:801
int_t FullRow
Definition: superlu_defs.h:804
Definition: util_dist.h:199
Definition: util_dist.h:101
Definition: supermatrix.h:54
Definition: superlu_defs.h:789
int_t eo
Definition: superlu_defs.h:794
int_t jb
Definition: superlu_defs.h:792
int_t StCol
Definition: superlu_defs.h:796
int_t iukp
Definition: superlu_defs.h:791
int_t full_u_cols
Definition: superlu_defs.h:793
int_t ncols
Definition: superlu_defs.h:795
int_t rukp
Definition: superlu_defs.h:790
Definition: superlu_defs.h:781
int_t lbnum
Definition: superlu_defs.h:782
int_t indpos
Definition: superlu_defs.h:783
Definition: superlu_defs.h:1012
MPI_Request * L_diag_blk_send_req
Definition: superlu_defs.h:1015
MPI_Request * send_req
Definition: superlu_defs.h:1020
MPI_Request * U_diag_blk_recv_req
Definition: superlu_defs.h:1016
MPI_Request * L_diag_blk_recv_req
Definition: superlu_defs.h:1014
MPI_Request * U_diag_blk_send_req
Definition: superlu_defs.h:1017
MPI_Request * recv_requ
Definition: superlu_defs.h:1019
MPI_Request * recv_req
Definition: superlu_defs.h:1018
MPI_Request * send_requ
Definition: superlu_defs.h:1021
Definition: superlu_defs.h:940
int_t ksup_size
Definition: superlu_defs.h:953
int_t * kindexU
Definition: superlu_defs.h:950
int_t copyU_kljb
Definition: superlu_defs.h:946
int_t next_col
Definition: superlu_defs.h:941
int_t kijb
Definition: superlu_defs.h:944
int_t next_k
Definition: superlu_defs.h:942
int_t mkrow
Definition: superlu_defs.h:951
int_t copyL_kljb
Definition: superlu_defs.h:945
int_t u_copy_len
Definition: superlu_defs.h:948
int_t kljb
Definition: superlu_defs.h:943
int_t * kindexL
Definition: superlu_defs.h:949
int_t mkcol
Definition: superlu_defs.h:952
int_t l_copy_len
Definition: superlu_defs.h:947
Definition: superlu_defs.h:809
int id
Definition: superlu_defs.h:810
void * next
Definition: superlu_defs.h:811
Definition: superlu_defs.h:1025
int * indirect
Definition: superlu_defs.h:1029
int * indirect2
Definition: superlu_defs.h:1030
int_t * perm_u
Definition: superlu_defs.h:1028
int_t * iperm_c_supno
Definition: superlu_defs.h:1026
int_t * iperm_u
Definition: superlu_defs.h:1027
Definition: superlu_defs.h:927
int * IbcastPanel_U
Definition: superlu_defs.h:934
int * IbcastPanel_L
Definition: superlu_defs.h:933
int * factored
Definition: superlu_defs.h:928
int * gpuLUreduced
Definition: superlu_defs.h:936
int * IrecvPlcd_D
Definition: superlu_defs.h:932
int * factored_U
Definition: superlu_defs.h:931
int * factored_D
Definition: superlu_defs.h:929
int * factored_L
Definition: superlu_defs.h:930
Definition: superlu_defs.h:978
int_t * numChildLeft
Definition: superlu_defs.h:980
int_t * setree
Definition: superlu_defs.h:979
Definition: superlu_defs.h:414
int_t npdep
Definition: superlu_defs.h:423
int_t nprow
Definition: superlu_defs.h:421
gridinfo_t grid2d
Definition: superlu_defs.h:419
superlu_scope_t zscp
Definition: superlu_defs.h:418
superlu_scope_t rscp
Definition: superlu_defs.h:416
int iam
Definition: superlu_defs.h:420
int_t npcol
Definition: superlu_defs.h:422
MPI_Comm comm
Definition: superlu_defs.h:415
int rankorder
Definition: superlu_defs.h:424
superlu_scope_t cscp
Definition: superlu_defs.h:417
Definition: superlu_defs.h:404
int_t nprow
Definition: superlu_defs.h:409
int_t npcol
Definition: superlu_defs.h:410
superlu_scope_t cscp
Definition: superlu_defs.h:407
superlu_scope_t rscp
Definition: superlu_defs.h:406
MPI_Comm comm
Definition: superlu_defs.h:405
int iam
Definition: superlu_defs.h:408
Definition: superlu_defs.h:834
int_t nsupr
Definition: superlu_defs.h:840
int_t luptr0
Definition: superlu_defs.h:838
int_t nlb
Definition: superlu_defs.h:839
int_t * lsub
Definition: superlu_defs.h:835
void * lusup
Definition: superlu_defs.h:837
Definition: superlu_defs.h:903
int_t lptrj
Definition: superlu_defs.h:905
int_t lib
Definition: superlu_defs.h:906
int_t luptrj
Definition: superlu_defs.h:904
Definition: superlu_defs.h:910
int_t iuip
Definition: superlu_defs.h:911
int_t ruip
Definition: superlu_defs.h:912
int_t ljb
Definition: superlu_defs.h:913
Definition: superlu_defs.h:1034
int * msgcntU
Definition: superlu_defs.h:1036
int * msgcnt
Definition: superlu_defs.h:1035
Definition: superlu_defs.h:844
lPanelInfo_t * lPanelInfo
Definition: superlu_defs.h:848
Ublock_info_t * Ublock_info
Definition: superlu_defs.h:845
uPanelInfo_t * uPanelInfo
Definition: superlu_defs.h:847
Remain_info_t * Remain_info
Definition: superlu_defs.h:846
Definition: superlu_defs.h:921
int_t * iperm_c_supno
Definition: superlu_defs.h:923
int_t * perm_c_supno
Definition: superlu_defs.h:922
statistics collected during parallel symbolic factorization
Definition: psymbfact.h:194
Definition: superlu_defs.h:567
int_t b2x
Definition: superlu_defs.h:579
void * recv_dbuf2
Definition: superlu_defs.h:583
void * send_dbuf
Definition: superlu_defs.h:577
int * ptr_to_dbuf
Definition: superlu_defs.h:570
int * X_to_B_vSendCnt
Definition: superlu_defs.h:574
int * X_to_B_SendCnt
Definition: superlu_defs.h:569
void * send_dbuf2
Definition: superlu_defs.h:582
int * X_to_B_iSendCnt
Definition: superlu_defs.h:573
int * disp_ibuf
Definition: superlu_defs.h:575
int * B_to_X_SendCnt
Definition: superlu_defs.h:568
int_t * recv_ibuf2
Definition: superlu_defs.h:581
int_t * send_ibuf
Definition: superlu_defs.h:576
int_t * send_ibuf2
Definition: superlu_defs.h:580
Definition: superlu_defs.h:989
treeTopoInfo_t topoInfo
Definition: superlu_defs.h:999
int_t * nodeList
Definition: superlu_defs.h:992
int_t numLvl
Definition: superlu_defs.h:997
double weight
Definition: superlu_defs.h:1006
int_t numTrees
Definition: superlu_defs.h:998
double cost
Definition: superlu_defs.h:1007
int_t nNodes
Definition: superlu_defs.h:991
Definition: superlu_defs.h:773
int expansions
Definition: superlu_defs.h:776
float for_lu
Definition: superlu_defs.h:774
float total
Definition: superlu_defs.h:775
int64_t nnzL
Definition: superlu_defs.h:777
Definition: superlu_defs.h:728
int superlu_acc_offload
Definition: superlu_defs.h:765
yes_no_t PrintStat
Definition: superlu_defs.h:753
yes_no_t SymmetricMode
Definition: superlu_defs.h:736
double ILU_DropTol
Definition: superlu_defs.h:743
int batchCount
Definition: superlu_defs.h:766
yes_no_t RefineInitialized
Definition: superlu_defs.h:752
yes_no_t Algo3d
Definition: superlu_defs.h:770
yes_no_t PivotGrowth
Definition: superlu_defs.h:737
double DiagPivotThresh
Definition: superlu_defs.h:735
yes_no_t Equil
Definition: superlu_defs.h:730
yes_no_t Use_TensorCore
Definition: superlu_defs.h:769
norm_t ILU_Norm
Definition: superlu_defs.h:745
int superlu_maxsup
Definition: superlu_defs.h:759
int superlu_relax
Definition: superlu_defs.h:758
yes_no_t lookahead_etree
Definition: superlu_defs.h:755
int superlu_num_gpu_streams
Definition: superlu_defs.h:764
int num_lookaheads
Definition: superlu_defs.h:757
yes_no_t ConditionNumber
Definition: superlu_defs.h:738
int ILU_level
Definition: superlu_defs.h:741
trans_t Trans
Definition: superlu_defs.h:733
yes_no_t SymPattern
Definition: superlu_defs.h:768
milu_t ILU_MILU
Definition: superlu_defs.h:747
IterRefine_t IterRefine
Definition: superlu_defs.h:734
double ILU_MILU_Dim
Definition: superlu_defs.h:748
yes_no_t SolveInitialized
Definition: superlu_defs.h:751
int superlu_max_buffer_size
Definition: superlu_defs.h:763
fact_t Fact
Definition: superlu_defs.h:729
yes_no_t DiagInv
Definition: superlu_defs.h:731
double ILU_FillTol
Definition: superlu_defs.h:746
int superlu_n_gemm
Definition: superlu_defs.h:762
double ILU_FillFactor
Definition: superlu_defs.h:744
colperm_t ColPerm
Definition: superlu_defs.h:732
int ILU_DropRule
Definition: superlu_defs.h:742
yes_no_t ParSymbFact
Definition: superlu_defs.h:749
int SolveOnly
Definition: superlu_defs.h:740
rowperm_t RowPerm
Definition: superlu_defs.h:739
yes_no_t ReplaceTinyPivot
Definition: superlu_defs.h:750
Definition: superlu_defs.h:815
int val
Definition: superlu_defs.h:817
int ind
Definition: superlu_defs.h:816
Definition: superlu_defs.h:397
int Np
Definition: superlu_defs.h:399
MPI_Comm comm
Definition: superlu_defs.h:398
int Iam
Definition: superlu_defs.h:400
Definition: superlu_defs.h:956
int_t depth
Definition: superlu_defs.h:963
int_t left
Definition: superlu_defs.h:959
int_t right
Definition: superlu_defs.h:960
double scuWeight
Definition: superlu_defs.h:966
int_t numChild
Definition: superlu_defs.h:957
int_t extra
Definition: superlu_defs.h:961
double iWeight
Definition: superlu_defs.h:965
int_t numDescendents
Definition: superlu_defs.h:958
int_t * childrenList
Definition: superlu_defs.h:962
double weight
Definition: superlu_defs.h:964
Definition: superlu_defs.h:970
int_t * myIperm
Definition: superlu_defs.h:973
int_t numLvl
Definition: superlu_defs.h:971
int_t * eTreeTopLims
Definition: superlu_defs.h:972
Definition: superlu_defs.h:825
int_t nub
Definition: superlu_defs.h:826
int_t * usub
Definition: superlu_defs.h:829
int_t klst
Definition: superlu_defs.h:827
int_t ldu
Definition: superlu_defs.h:828
Local information on vertices distribution.
Definition: psymbfact.h:140
Definition: superlu_defs.h:1040
int_t trsMsgRecvZ
Definition: superlu_defs.h:1062
double t_forwardSolve
Definition: superlu_defs.h:1047
double tbs_comm
Definition: superlu_defs.h:1053
double tfs_tree[2 *MAX_3D_LEVEL]
Definition: superlu_defs.h:1055
double tbs_tree[2 *MAX_3D_LEVEL]
Definition: superlu_defs.h:1054
int_t trsMsgRecvXY
Definition: superlu_defs.h:1061
double tfs_comm
Definition: superlu_defs.h:1049
int_t trsMsgSentZ
Definition: superlu_defs.h:1060
double trsDataSendZ
Definition: superlu_defs.h:1042
double trsDataRecvZ
Definition: superlu_defs.h:1044
double t_pxReDistribute_X_to_B
Definition: superlu_defs.h:1045
double trsDataSendXY
Definition: superlu_defs.h:1041
double ppXmem
Definition: superlu_defs.h:1064
double tbs_compute
Definition: superlu_defs.h:1052
int_t trsMsgSentXY
Definition: superlu_defs.h:1059
double t_backwardSolve
Definition: superlu_defs.h:1051
double trs_comm_z
Definition: superlu_defs.h:1050
double trsDataRecvXY
Definition: superlu_defs.h:1043
double t_pxReDistribute_B_to_X
Definition: superlu_defs.h:1046
double tfs_compute
Definition: superlu_defs.h:1048
Macro definitions.
void superlu_free_dist(void *)
Definition: memory.c:160
void ifill_dist(int_t *, int_t, int_t)
Fills an integer array with a given value.
Definition: util.c:553
int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *)
Definition: communication_aux.c:56
void applyRowPerm(int_t *colptr, int_t *rowind, int_t *perm_r, int_t n)
Definition: trfAux.c:2514
void superlu_gridinit3d(MPI_Comm Bcomm, int nprow, int npcol, int npdep, gridinfo3d_t *grid)
All processes in the MPI communicator must call this routine.
Definition: superlu_grid3d.c:16
void PrintDouble5(char *, int_t, double *)
int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, gridinfo_t *, int_t *, int_t *)
Definition: util.c:1013
void PStatClear(SuperLUStat_t *)
Definition: util.c:360
int_t ** getTreePermForest(int_t *myTreeIdxs, int_t *myZeroTrIdxs, sForest_t *sForests, int_t *perm_c_supno, int_t *iperm_c_supno, gridinfo3d_t *grid3d)
void print_options_dist(superlu_dist_options_t *)
Print the option settings.
Definition: util.c:250
int_t zAllocBcast_gridID(int_t size, void **ptr, int_t gridID, gridinfo3d_t *grid3d)
Definition: supernodalForest.c:1314
int compareDouble(void *a, void *b)
Compares two doubles for equality.
Definition: distCheckArray.c:63
void C_BcTree_forwardMessageSimple(C_Tree *tree, void *localBuffer, int msgSize)
Definition: comm_tree.c:149
void get_perm_c_batch(superlu_dist_options_t *options, int batchCount, handle_t *SparseMatrix_handles, int **CpivPtr)
Gets sparsity permutations for a batch of matrices.
Definition: get_perm_c_batch.c:38
treePartStrat
Definition: superlu_defs.h:983
@ GD
Definition: superlu_defs.h:985
@ ND
Definition: superlu_defs.h:984
int getNsupers(int n, Glu_persist_t *Glu_persist)
Definition: trfAux.c:42
void superlu_gridexit3d(gridinfo3d_t *grid)
Definition: superlu_grid3d.c:268
void C_BcTree_Create_nv(C_Tree *tree, MPI_Comm comm, int *ranks, int rank_cnt, int msgSize, char precision, int *needrecv)
Definition: comm_tree.c:5
int xerr_dist(char *, int *)
Definition: xerr_dist.c:26
void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t *SCT)
Definition: sec_structs.c:533
int_t psymbfact_LUXpand_RL(int_t, int_t, int_t, int_t, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *)
Definition: psymbfact_util.c:398
double dmach_dist(const char *)
Definition: dmach_dist.c:16
void print_etree(int_t *setree, int_t *iperm, int_t nsuper)
Definition: supernodal_etree.c:1084
int_t log2i(int_t index)
Definition: supernodal_etree.c:17
int * getfmodLeaf(int_t nlb, int *fmod_i)
Definition: supernodalForest.c:1546
int_t get_gpublas_nb(void)
Definition: util.c:893
int_t partitionM(int_t *, int_t, int_t, int_t, int_t, int_t)
Definition: util.c:1235
void Destroy_CompCol_Matrix_dist(SuperMatrix *)
Definition: util.c:35
int_t getDescendList(int_t k, int_t *dlist, treeList_t *treeList)
Definition: supernodal_etree.c:286
int_t * getMyNodeCountsFr(int_t maxLvl, int_t *myTreeIdxs, sForest_t **sForests)
Definition: supernodalForest.c:278
int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *)
Definition: communication_aux.c:112
float smach_dist(const char *)
Definition: smach_dist.c:16
int_t * getMyTopOrder(int_t nnodes, int_t *myPerm, int_t *myIperm, int_t *setree)
Definition: supernodal_etree.c:891
int_t * getFactPerm(int_t)
Definition: trfAux.c:207
void slu_SCT_init(SCT_t *)
Definition: sec_structs.c:166
int free_treelist(int_t nsuper, treeList_t *treeList)
Definition: supernodal_etree.c:119
void C_BcTree_Nullify(C_Tree *tree)
Definition: comm_tree.c:130
int_t * getEtreeLB(int_t nnodes, int_t *perm_l, int_t *gTopOrder)
Definition: supernodal_etree.c:366
int get_thread_per_process(void)
Definition: util.c:869
int_t initCommRequests(commRequests_t *comReqs, gridinfo_t *grid)
Definition: treeFactorization.c:28
int_t * calcNumNodes(int_t maxLvl, int_t *treeHeads, treeList_t *treeList)
Definition: supernodal_etree.c:765
int_t * getFactIperm(int_t *, int_t)
Definition: trfAux.c:220
void DistPrint(char *function_name, double value, char *Units, gridinfo_t *grid)
Definition: sec_structs.c:314
int_t * getPerm_c_supno(int_t nsupers, superlu_dist_options_t *, int_t *etree, Glu_persist_t *Glu_persist, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo_t *)
Definition: trfAux.c:233
int_t * intMalloc_dist(int_t)
Definition: memory.c:210
int_t reduceStat(PhaseType PHASE, SuperLUStat_t *stat, gridinfo3d_t *grid3d)
Definition: util.c:1366
int get_acc_solve(void)
Definition: sec_structs.c:593
int compareInt(void *a, void *b)
Compares two integers for equality.
Definition: distCheckArray.c:42
int_t * merg_perms(int_t nperms, int_t *nnodes, int_t **perms)
Definition: supernodal_etree.c:511
void allocBcastArray(void **array, int_t size, int root, MPI_Comm comm)
Allocates and broadcasts an array in an MPI environment.
Definition: trfAux.c:2690
int_t static_partition(struct superlu_pair *, int_t, int_t *, int_t, int_t *, int_t *, int)
Definition: util.c:933
int * getfrecv_newsolve(int_t nsupers, int *supernodeMask, int_t nlb, int *fmod, int *mod_bit, gridinfo_t *grid)
Definition: supernodalForest.c:1329
int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *)
Definition: communication_aux.c:172
sForest_t ** getNestDissForests(int_t maxLvl, int_t nsupers, int_t *setree, treeList_t *treeList)
Definition: supernodalForest.c:62
void slu_SCT_free(SCT_t *)
Definition: sec_structs.c:295
int get_acc_offload(superlu_dist_options_t *)
Definition: sec_structs.c:583
int_t * getPerm_c_supno_allgrid(int_t nsupers, superlu_dist_options_t *options, int_t *etree, Glu_persist_t *Glu_persist, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo3d_t *grid3d)
Definition: trfAux.c:1176
int_t ** getNodeListFr(int_t maxLvl, sForest_t **sForests)
Definition: supernodalForest.c:234
int set_tag_ub(void)
Definition: trfAux.c:48
int_t get_min(int_t *, int_t)
Definition: util.c:916
double estimate_cpu_time(int m, int n, int k)
Definition: acc_aux.c:214
void countnz_dist(const int_t, int_t *, int_t *, int_t *, Glu_persist_t *, Glu_freeable_t *)
Definition: util.c:96
int sort_R_info(Remain_info_t *Remain_info, int n)
Definition: sec_structs.c:53
int_t * getTreeHeads(int_t maxLvl, int_t nsupers, treeList_t *treeList)
Definition: supernodal_etree.c:734
void log_memory(int64_t, SuperLUStat_t *)
Definition: util.c:848
int symbfact_SubFree(Glu_freeable_t *)
Definition: memory.c:471
float symbfact_dist(superlu_dist_options_t *, int, int, SuperMatrix *, int_t *, int_t *, int_t *, int_t *, Pslu_freeable_t *, MPI_Comm *, MPI_Comm *, superlu_dist_mem_usage_t *)
Definition: psymbfact.c:153
int freeMsgsArr(int_t numLA, msgs_t **msgss)
Definition: treeFactorization.c:157
sForest_t ** getForests(int_t maxLvl, int_t nsupers, int_t *setree, treeList_t *treeList)
Definition: supernodalForest.c:29
void get_metis_dist(int_t n, int_t bnz, int_t *b_colptr, int_t *b_rowind, int_t *perm_c)
Definition: get_perm_c.c:33
int_t getCommonAncestorList(int_t k, int_t *alist, int_t *seTree, treeList_t *treeList)
Definition: supernodal_etree.c:317
int dist_checkArrayEq(void *arr, int length, MPI_Datatype datatype, int src_rank, int dest_rank, MPI_Comm communicator, int(*compare)(void *, void *))
Checks whether arrays at two MPI ranks are identical.
Definition: distCheckArray.c:93
SupernodeToGridMap_t * createSuperGridMap(int_t nsuper, int_t maxLvl, int_t *myTreeIdxs, int_t *myZeroTrIdxs, int_t *gNodeCount, int_t **gNodeLists)
Definition: trfAux.c:2829
void print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *)
Diagnostic print of segment info after panel_dfs().
Definition: util.c:328
int_t symbfact_SubInit(superlu_dist_options_t *options, fact_t, void *, int_t, int_t, int_t, int_t, Glu_persist_t *, Glu_freeable_t *)
Definition: memory.c:295
int getNbrecvX(sForest_t *sforest, int_t *Urbs, gridinfo_t *grid)
Definition: supernodalForest.c:1134
void superlu_abort_and_exit_dist(char *)
Definition: memory.c:48
int sort_R_info_elm(Remain_info_t *Remain_info, int n)
Definition: sec_structs.c:81
void Destroy_SuperNode_Matrix_dist(SuperMatrix *)
Definition: util.c:62
int * getBrecvTree(int_t nlb, sForest_t *sforest, int *bmod, gridinfo_t *grid)
Definition: supernodalForest.c:1013
int getnGPUStreams(void)
Definition: util.c:1495
int_t * getReplicatedTrees(gridinfo3d_t *grid3d)
Definition: supernodal_etree.c:853
int * getfmod_newsolve(int_t nlb, int_t nsupers, int *supernodeMask, int_t **Lrowind_bc_ptr, int_t **Lindval_loc_bc_ptr, gridinfo_t *grid)
Definition: supernodalForest.c:1488
int sp_symetree_dist(int_t *, int_t *, int_t *, int_t, int_t *)
Symmetric elimination tree.
Definition: etree.c:156
int_t symbfact_SubXpand(int_t, int_t, int_t, MemType, int_t *, Glu_freeable_t *)
Definition: memory.c:425
int freeFactNodelists(factNodelists_t *fNlists)
Definition: treeFactorization.c:128
int_t psymbfact_LUXpandMem(int, int_t, int_t, int_t, int_t, int, int, int, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *)
Definition: psymbfact_util.c:93
int_t getNfrecvmodLeaf(int *nleaf, sForest_t *sforest, int *frecv, int *fmod, gridinfo_t *grid)
Definition: supernodalForest.c:1234
gEtreeInfo_t fillEtreeInfo(int_t nsupers, int_t *setree, treeList_t *treeList)
Definition: trfAux.c:2787
int_t ** getTreePerm(int_t *myTreeIdxs, int_t *myZeroTrIdxs, int_t *nodeCount, int_t **nodeList, int_t *perm_c_supno, int_t *iperm_c_supno, gridinfo3d_t *grid3d)
Definition: util.c:1300
#define SuperLU_timer_
Definition: superlu_defs.h:288
int compare_pair(const void *, const void *)
Definition: util.c:864
void Destroy_CompCol_Permuted_dist(SuperMatrix *)
A is of type Stype==NCP.
Definition: util.c:74
int file_PrintInt10(FILE *, char *, int_t, int_t *)
Definition: util.c:693
int_t partition(int_t *, int_t, int_t, int_t)
Definition: util.c:1167
void super_stats_dist(int_t, int_t *)
Definition: util.c:607
int_t * getMyEtLims(int_t nnodes, int_t *myTopOrder)
Definition: supernodal_etree.c:925
void PStatPrint(superlu_dist_options_t *, SuperLUStat_t *, gridinfo_t *)
Definition: util.c:374
void get_perm_c_dist(int_t, int_t, SuperMatrix *, int_t *)
Definition: get_perm_c.c:479
void arrive_at_ublock(int, int_t *, int_t *, int *, int *, int *, int_t, int_t, int_t *, int_t *, int_t *, gridinfo_t *)
Definition: util.c:959
int_t * getNodeCountsFr(int_t maxLvl, sForest_t **sForests)
Definition: supernodalForest.c:216
yes_no_t C_BcTree_IsRoot(C_Tree *tree)
Definition: comm_tree.c:145
void PStatInit(SuperLUStat_t *)
Definition: util.c:342
void slu_SCT_printSummary(gridinfo_t *grid, SCT_t *SCT)
void permCol_SymbolicFact3d(superlu_dist_options_t *options, int n, SuperMatrix *GA, int_t *perm_c, int_t *etree, Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable, SuperLUStat_t *stat, superlu_dist_mem_usage_t *symb_mem_usage, gridinfo3d_t *grid3d)
This function performs the symbolic factorization on matrix Pc*Pr*A*Pc' and sets up the nonzero data ...
Definition: trfAux.c:2546
void C_RdTree_waitSendRequest(C_Tree *Tree)
Definition: comm_tree.c:321
void print_sp_ienv_dist(superlu_dist_options_t *)
Print the blocking parameters.
Definition: util.c:284
int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t **nodeList, int_t *nodeCount)
Definition: supernodal_etree.c:436
int_t * create_iperm_c_supno(int_t nsupers, superlu_dist_options_t *options, Glu_persist_t *Glu_persist, int_t *etree, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo3d_t *grid3d)
Definition: trfAux.c:2800
int_t symbfact(superlu_dist_options_t *, int, SuperMatrix *, int_t *, int_t *, Glu_persist_t *, Glu_freeable_t *)
Definition: symbfact.c:83
int Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req, MPI_Request *L_diag_blk_send_req, gridinfo_t *grid, SCT_t *SCT)
Definition: communication_aux.c:195
int sort_U_info_elm(Ublock_info_t *Ublock_info, int n)
Definition: sec_structs.c:89
void PStatFree(SuperLUStat_t *)
Definition: util.c:545
int_t get_max_buffer_size(void)
float get_perm_c_parmetis(SuperMatrix *, int_t *, int_t *, int, int, int_t **, int_t **, gridinfo_t *, MPI_Comm *)
Definition: get_perm_c_parmetis.c:104
int mc64id_dist(int *)
Definition: mc64ad_dist.c:57
int_t QuerySpace_dist(int_t, int_t, Glu_freeable_t *, superlu_dist_mem_usage_t *)
Definition: memory.c:610
int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *)
Definition: communication_aux.c:161
sForest_t ** compute_sForests(int_t nsupers, Glu_persist_t *Glu_persist, int_t *etree, gridinfo3d_t *grid3d)
Definition: trfAux.c:2762
void at_plus_a_dist(const int_t, const int_t, int_t *, int_t *, int_t *, int_t **, int_t **)
Definition: get_perm_c.c:316
void get_diag_procs(int_t, Glu_persist_t *, gridinfo_t *, int_t *, int_t **, int_t **)
Definition: util.c:560
int sp_coletree_dist(int_t *, int_t *, int_t *, int_t, int_t, int_t *)
Nonsymmetric elimination tree.
Definition: etree.c:223
int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *)
Definition: communication_aux.c:132
int_t Trs2_InitUblock_info(int_t klst, int_t nb, Ublock_info_t *, int_t *usub, Glu_persist_t *, SuperLUStat_t *)
Definition: trfAux.c:2272
int sp_ienv_dist(int, superlu_dist_options_t *)
Definition: sp_ienv.c:80
int_t ** getNodeList(int_t maxLvl, int_t *setree, int_t *nnodes, int_t *treeHeads, treeList_t *treeList)
Definition: supernodal_etree.c:791
int * int32Calloc_dist(int)
Definition: memory.c:200
int_t getBigUSize(superlu_dist_options_t *, int_t nsupers, gridinfo_t *grid, int_t **Lrowind_bc_ptr)
Definition: trfAux.c:160
int Cmpfunc_U_info(const void *a, const void *b)
Definition: sec_structs.c:47
int_t getNumLookAhead(superlu_dist_options_t *)
Definition: treeFactorization.c:186
int superlu_dist_GetVersionNumber(int *, int *, int *)
Definition: superlu_dist_version.c:22
int_t Check_LRecv(MPI_Request *, int *msgcnt)
Definition: communication_aux.c:79
struct xtrsTimer_t xtrsTimer_t
int getNfrecvx_newsolve(int_t nsupers, int *supernodeMask, int_t **Lrowind_bc_ptr, int_t **Lindval_loc_bc_ptr, gridinfo_t *grid)
Definition: supernodalForest.c:1409
SupernodeToGridMap_t
Definition: superlu_defs.h:1291
@ IN_GRID_AIJ
Definition: superlu_defs.h:1294
@ NOT_IN_GRID
Definition: superlu_defs.h:1292
@ IN_GRID_ZERO
Definition: superlu_defs.h:1293
void C_RdTree_forwardMessageSimple(C_Tree *Tree, void *localBuffer, int msgSize)
Definition: comm_tree.c:301
void slu_SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t *SCT)
Definition: sec_structs.c:564
trtype_t
Definition: superlu_defs.h:1317
@ UPPER_TRI
Definition: superlu_defs.h:1317
@ LOWER_TRI
Definition: superlu_defs.h:1317
void printForestWeightCost(sForest_t **sForests, SCT_t *SCT, gridinfo3d_t *grid3d)
Definition: supernodalForest.c:354
yes_no_t C_RdTree_IsRoot(C_Tree *tree)
Definition: comm_tree.c:296
int * int32Malloc_dist(int)
Definition: memory.c:193
int_t * getSubTreeRoots(int_t k, int_t *numSubtrees, treeList_t *treeList)
Definition: supernodal_etree.c:406
void Destroy_CompRow_Matrix_dist(SuperMatrix *)
Definition: util.c:54
int_t num_full_cols_U_mod(int_t kk, int_t *usub, int_t *xsup, gridinfo_t *grid, int_t *perm_u, int_t *ldu)
Definition: util.c:1060
void PrintInt32(char *, int, int *)
Definition: util.c:679
void C_RdTree_Create_nv(C_Tree *tree, MPI_Comm comm, int *ranks, int rank_cnt, int msgSize, char precision, int *needrecvrd, int *needsendrd)
Definition: comm_tree.c:228
void slu_SCT_print3D(gridinfo3d_t *grid3d, SCT_t *SCT)
Definition: sec_structs.c:509
int_t getNfrecvmod_newsolve(int *nleaf, int_t nsupers, int *supernodeMask, int *frecv, int *fmod, gridinfo_t *grid)
Definition: supernodalForest.c:1265
void getSCUweight(int_t nsupers, treeList_t *treeList, int_t *xsup, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo3d_t *grid3d)
Definition: trfAux.c:2305
treeList_t * setree2list(int_t nsuper, int_t *setree)
Definition: supernodal_etree.c:76
int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *)
Definition: communication_aux.c:152
int_t checkIntVector3d(int_t *vec, int_t len, gridinfo3d_t *grid3d)
Definition: util.c:1329
int_t ** getTreePermFr(int_t *myTreeIdxs, sForest_t **sForests, gridinfo3d_t *grid3d)
Definition: supernodalForest.c:292
void print_memorylog(SuperLUStat_t *, char *)
Definition: util.c:858
int64_t fixupL_dist(const int_t, const int_t *, Glu_persist_t *, Glu_freeable_t *)
Definition: util.c:169
void printTRStimer(xtrsTimer_t *xtrsTimer, gridinfo3d_t *grid3d)
Definition: sec_structs.c:700
int_t get_num_gpu_streams(void)
Definition: util.c:904
void superlu_gridmap(MPI_Comm, int, int, int[], int, gridinfo_t *)
All processes in the MPI communicator must call this routine.
Definition: superlu_grid.c:95
static const int RD_U
Definition: superlu_defs.h:241
int_t calcTreeWeight(int_t nsupers, int_t *setree, treeList_t *treeList, int_t *xsup)
Definition: supernodal_etree.c:210
int get_new3dsolvetreecomm(void)
Definition: sec_structs.c:615
int_t initFactStat(int nsupers, factStat_t *factStat)
Definition: treeFactorization.c:75
int64_t handle_t
Definition: superlu_defs.h:348
int_t * getGridTrees(gridinfo3d_t *grid3d)
Definition: supernodal_etree.c:840
int_t * topological_ordering(int_t nsuper, int_t *setree)
Definition: supernodal_etree.c:59
int_t * calculate_num_children(int_t nsuper, int_t *setree)
Definition: supernodal_etree.c:997
int * getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t *myNodeCount, int_t **treePerm)
Definition: supernodalForest.c:307
void slu_SCT_print(gridinfo_t *grid, SCT_t *SCT)
Definition: sec_structs.c:434
void pxerr_dist(char *, gridinfo_t *, int_t)
Definition: pxerr_dist.c:27
int_t * getPermNodeList(int_t nnode, int_t *nlist, int_t *perm_c_sup, int_t *iperm_c_sup)
Definition: supernodal_etree.c:344
int_t * getMyNodeCounts(int_t maxLvl, int_t *myTreeIdxs, int_t *gNodeCount)
Definition: util.c:1318
int * getLastDepBtree(int_t nsupers, treeList_t *treeList)
int get_new3dsolve(void)
Definition: sec_structs.c:604
int freeCommRequestsArr(int_t mxLeafNode, commRequests_t **comReqss)
Definition: treeFactorization.c:58
int getNumThreads(int)
Definition: trfAux.c:61
int file_PrintInt32(FILE *, char *, int, int *)
Definition: util.c:708
int_t testListPerm(int_t nodeCount, int_t *nodeList, int_t *permList, int_t *gTopLevel)
Definition: supernodal_etree.c:465
int_t * getMyIperm(int_t nnodes, int_t nsupers, int_t *myPerm)
Definition: supernodal_etree.c:873
int sort_U_info(Ublock_info_t *Ublock_info, int n)
Definition: sec_structs.c:60
void getata_dist(const int_t m, const int_t n, const int_t nz, int_t *colptr, int_t *rowind, int_t *atanz, int_t **ata_colptr, int_t **ata_rowind)
Definition: get_perm_c.c:179
int file_PrintLong10(FILE *, char *, int_t, int_t *)
void PrintInt10(char *, int_t, int_t *)
Definition: util.c:665
int compareInt_t(void *a, void *b)
Compares two integers for equality.
Definition: distCheckArray.c:20
treeTopoInfo_t getMyTreeTopoInfo(int_t nnodes, int_t nsupers, int_t *myPerm, int_t *setree)
Definition: supernodal_etree.c:952
void superlu_gridexit(gridinfo_t *)
Definition: superlu_grid.c:208
int * getBrecvTree_newsolve(int_t nlb, int_t nsupers, int *supernodeMask, int *bmod, gridinfo_t *grid)
Definition: supernodalForest.c:1059
int_t printFileList(char *sname, int_t nnodes, int_t *dlist, int_t *setree)
Definition: supernodal_etree.c:268
int_t initMsgs(msgs_t *msgs)
Definition: treeFactorization.c:137
void gemm_division_cpu_gpu(superlu_dist_options_t *, int *, int *, int *, int, int, int, int *, int, int_t)
Definition: util.c:1394
void DistPrint3D(char *function_name, double value, char *Units, gridinfo3d_t *grid3d)
Definition: sec_structs.c:342
int_t psymbfact_LUXpand(int_t, int_t, int_t, int_t, int_t *, int_t, int_t, int_t, int_t, Pslu_freeable_t *, Llu_symbfact_t *, vtcsInfo_symbfact_t *, psymbfact_stat_t *)
Definition: psymbfact_util.c:224
int getNbrecvX_newsolve(int_t nsupers, int *supernodeMask, int_t *Urbs, Ucb_indptr_t **Ucb_indptr, gridinfo_t *grid)
Definition: supernodalForest.c:1165
void C_RdTree_Nullify(C_Tree *tree)
Definition: comm_tree.c:280
int check_perm_dist(char *what, int_t n, int_t *perm)
Definition: sp_colorder.c:224
void C_BcTree_waitSendRequest(C_Tree *tree)
Definition: comm_tree.c:168
void initTRStimer(xtrsTimer_t *xtrsTimer, gridinfo_t *grid)
Definition: sec_structs.c:745
void * superlu_malloc_dist(size_t)
Definition: memory.c:155
void isort1(int_t N, int_t *ARRAY)
Definition: util.c:803
int_t * TreePostorder_dist(int_t, int_t *)
Definition: etree.c:393
int_t getCommonAncsCount(int_t k, treeList_t *treeList)
Definition: supernodal_etree.c:305
int_t psymbfact_prLUXpand(int_t, int_t, int, Llu_symbfact_t *, psymbfact_stat_t *)
Definition: psymbfact_util.c:519
commRequests_t ** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t *grid)
Definition: treeFactorization.c:44
int * getBmod3d_newsolve(int_t nlb, int_t nsupers, int *supernodeMask, int_t *xsup, int_t **Ufstnz_br_ptr, gridinfo_t *grid)
Definition: supernodalForest.c:1639
int64_t int_t
Definition: superlu_defs.h:119
int_t initFactNodelists(int_t, int_t, int_t, factNodelists_t *)
Definition: treeFactorization.c:113
void Destroy_SuperMatrix_Store_dist(SuperMatrix *)
Deallocate the structure pointing to the actual storage of the matrix.
Definition: util.c:30
int * getBmod3d(int_t treeId, int_t nlb, sForest_t *sforest, int_t *xsup, int_t **Ufstnz_br_ptr, int_t *supernode2treeMap, gridinfo_t *grid)
Definition: supernodalForest.c:1576
void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper)
Definition: supernodal_etree.c:1013
void quickSortM(int_t *, int_t, int_t, int_t, int_t, int_t)
Definition: util.c:1219
static const int BC_U
Definition: superlu_defs.h:240
void allocBcastLargeArray(void **array, int64_t size, int root, MPI_Comm comm)
Definition: trfAux.c:2724
int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *s, SCT_t *)
Definition: communication_aux.c:32
int_t zAllocBcast(int_t size, void **ptr, gridinfo3d_t *grid3d)
Definition: supernodalForest.c:1298
int_t * intCalloc_dist(int_t)
Definition: memory.c:217
int_t * createSupernode2TreeMap(int_t nsupers, int_t maxLvl, int_t *gNodeCount, int_t **gNodeLists)
Definition: trfAux.c:2813
int Cmpfunc_R_info(const void *a, const void *b)
Definition: sec_structs.c:41
int_t * Etree_LevelBoundry(int_t *perm, int_t *tsort_etree, int_t nsuper)
Definition: supernodal_etree.c:966
int getNrootUsolveTree(int_t *nbrecvmod, sForest_t *sforest, int *brecv, int *bmod, gridinfo_t *grid)
Definition: supernodalForest.c:1098
int * getfrecvLeaf(sForest_t *sforest, int_t nlb, int *fmod, int *mod_bit, gridinfo_t *grid)
Definition: supernodalForest.c:1371
void set_default_options_dist(superlu_dist_options_t *)
Set the default values for the options argument.
Definition: util.c:209
int genmmd_dist_(int_t *, int_t *, int_t *a, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *, int_t *)
Definition: mmd.c:64
void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2)
Definition: util.c:752
int_t * getGlobal_iperm(int_t nsupers, int_t nperms, int_t **perms, int_t *nnodes)
Definition: supernodal_etree.c:708
void getSCUweight_allgrid(int_t nsupers, treeList_t *treeList, int_t *xsup, int_t **Lrowind_bc_ptr, int_t **Ufstnz_br_ptr, gridinfo3d_t *grid3d)
Definition: trfAux.c:2382
void quickSort(int_t *, int_t, int_t, int_t)
Definition: util.c:1154
msgs_t ** initMsgsArr(int_t numLA)
Definition: treeFactorization.c:144
void print_etree_leveled(int_t *setree, int_t *tsort_etree, int_t nsuper)
Definition: supernodal_etree.c:1025
int_t CheckZeroDiagonal(int_t, int_t *, int_t *, int_t *)
Definition: util.c:723
void get_colamd_dist(const int m, const int n, const int nnz, int_t *colptr, int_t *rowind, int_t *perm_c)
Definition: get_perm_c.c:121
int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *)
Definition: communication_aux.c:182
void sp_colorder(superlu_dist_options_t *, SuperMatrix *, int_t *, int_t *, SuperMatrix *)
Definition: sp_colorder.c:81
void check_repfnz_dist(int_t, int_t, int_t, int_t *)
Check whether repfnz[] == SLU_EMPTY after reset.
Definition: util.c:651
int getldu(int_t knsupc, int_t iklrow, int_t *usub)
Definition: supernodalForest.c:1558
void superlu_gridinit(MPI_Comm, int, int, gridinfo_t *)
All processes in the MPI communicator must call this routine.
Definition: superlu_grid.c:37
sForest_t ** getGreedyLoadBalForests(int_t maxLvl, int_t nsupers, int_t *setree, treeList_t *treeList)
Definition: supernodalForest.c:833
int getNfrecvxLeaf(sForest_t *sforest, int_t **Lrowind_bc_ptr, gridinfo_t *grid)
Definition: supernodalForest.c:1454
int get_mpi_process_per_gpu(void)
Definition: util.c:1511
int_t estimate_bigu_size(int_t, int_t **, Glu_persist_t *, gridinfo_t *, int_t *, int_t *)
Definition: util.c:1103
void Destroy_CompRowLoc_Matrix_dist(SuperMatrix *)
Definition: util.c:45
void superlu_gridmap3d(MPI_Comm, int, int, int, int[], gridinfo3d_t *)
All processes in the MPI communicator must call this routine. On output, if a process is not in the S...
Definition: superlu_grid3d.c:73
int freeFactStat(factStat_t *factStat)
Definition: treeFactorization.c:100
int_t * supernodal_etree(int_t nsuper, int_t *etree, int_t *supno, int_t *xsup)
Definition: supernodal_etree.c:32
void printGPUStats(int nsupers, SuperLUStat_t *stat, gridinfo3d_t *)
static const int BC_L
Definition: superlu_defs.h:238
void bcast_tree(void *, int, MPI_Datatype, int, int, gridinfo_t *, int, int *)
Definition: comm.c:72
int_t LDiagBlockRecvWait(int_t k, int *factored_U, MPI_Request *, gridinfo_t *)
Definition: communication_aux.c:216
int getNrootUsolveTree_newsolve(int_t *nbrecvmod, int_t nsupers, int *supernodeMask, int *brecv, int *bmod, gridinfo_t *grid)
Definition: supernodalForest.c:1202
static const int RD_L
Definition: superlu_defs.h:239
enum constants header file
trans_t
Definition: superlu_enum_consts.h:34
milu_t
Definition: superlu_enum_consts.h:46
LU_space_t
Definition: superlu_enum_consts.h:40
DiagScale_t
Definition: superlu_enum_consts.h:35
IterRefine_t
Definition: superlu_enum_consts.h:36
rowperm_t
Definition: superlu_enum_consts.h:31
colperm_t
Definition: superlu_enum_consts.h:32
MemType
Definition: superlu_enum_consts.h:37
fact_t
Definition: superlu_enum_consts.h:30
norm_t
Definition: superlu_enum_consts.h:41
yes_no_t
Definition: superlu_enum_consts.h:29
PhaseType
Definition: superlu_enum_consts.h:66
Matrix type definitions.
Header for utilities.
#define MAX_3D_LEVEL
Definition: util_dist.h:83