SuperLU Distributed 8.2.1
Distributed memory sparse direct solver
util_dist.h
Go to the documentation of this file.
1
15#ifndef __SUPERLU_UTIL /* allow multiple inclusions */
16#define __SUPERLU_UTIL
17
18#include <stdio.h>
19#include <stdlib.h>
20#include <string.h>
21#include <assert.h>
22
23#include "superlu_enum_consts.h"
24
25/*
26 * Macros
27 */
28#ifndef USER_ABORT
29#define USER_ABORT(msg) superlu_abort_and_exit_dist(msg)
30#endif
31
/*
 * ABORT(err_msg): build the message "<err_msg> at line <N> in file <F>\n"
 * in a local 256-byte buffer and pass it to USER_ABORT.
 *
 * err_msg must be a NUL-terminated string; __LINE__ and __FILE__ are those of
 * the expansion site.
 *
 * snprintf bounds the write to the buffer, so an over-long err_msg or file
 * path is truncated rather than written past the end of msg.
 * The expansion is deliberately kept as a plain brace block so that existing
 * call sites (with or without a trailing semicolon) continue to compile.
 */
#define ABORT(err_msg) \
 { char msg[256];\
   snprintf(msg, sizeof(msg), "%s at line %d in file %s\n", err_msg, __LINE__, __FILE__);\
   USER_ABORT(msg); }
36
37
38#ifndef USER_MALLOC
39#define USER_MALLOC(size) superlu_malloc_dist(size)
40#endif
41
42#define SUPERLU_MALLOC(size) USER_MALLOC(size)
43
44#ifndef USER_FREE
45#define USER_FREE(addr) superlu_free_dist(addr)
46#endif
47
48#define SUPERLU_FREE(addr) USER_FREE(addr)
49
/*
 * CHECK_MALLOC(pnum, where): print the externally defined counter
 * superlu_malloc_total, scaled by 1e-6 and labelled "(MB)", prefixed with
 * pnum (printed with %d) and the string `where`, then flush stdout.
 * The extern declaration lives inside the macro's own brace block.
 */
50#define CHECK_MALLOC(pnum, where) { \
51 extern long int superlu_malloc_total; \
52 printf("(%d) %s: superlu_malloc_total (MB) %.6f\n", \
53 pnum, where, superlu_malloc_total*1e-6); \
54 fflush(stdout); \
55}
56
/*
 * Larger / smaller of two values. Each argument appears twice in the
 * expansion, so an argument with side effects (e.g. i++) may be evaluated
 * twice.
 */
57#define SUPERLU_MAX(x, y) ( (x) > (y) ? (x) : (y) )
58#define SUPERLU_MIN(x, y) ( (x) < (y) ? (x) : (y) )
59
60// Typed allocation macros: each expands to SUPERLU_MALLOC for x elements of the
// named type and casts the result to the matching pointer type.
// The product x * sizeof(type) is not checked for overflow here.
61#define MPI_REQ_ALLOC(x) ((MPI_Request *) SUPERLU_MALLOC ( (x) * sizeof (MPI_Request)))
62#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t)))
63#define DOUBLE_ALLOC(x) ((double *) SUPERLU_MALLOC ( (x) * sizeof (double)))
64
65/*
66 * Constants
67 */
68#define EMPTY (-1)
69#ifndef FALSE
70#define FALSE (0)
71#endif
72#ifndef TRUE
73#define TRUE (1)
74#endif
75
76/*==== For 3D code ====*/
77#define MAX_3D_LEVEL 32 /*allows for z dimensions of 2^32*/
78#define CBLOCK 192
79#define CACHE_LINE_SIZE 8
80#define CSTEPPING 8
81/*=====================*/
82
83/*
84 * Type definitions
85 */
86typedef float flops_t;
87typedef unsigned char Logical;
88
89/*
90#ifdef _CRAY
91#define int short
92#endif
93*/
94
95typedef struct {
96 int *panel_histo; /* histogram of panel size distribution */
97 double *utime; /* running time at various phases */
98 flops_t *ops; /* operation count at various phases */
99 int TinyPivots; /* number of tiny pivots */
100 int RefineSteps; /* number of iterative refinement steps */
101 int num_look_aheads; /* number of look ahead */
102 /*-- new --*/
103 float current_buffer; /* bytes allocated for buffer in numerical factorization */
104 float peak_buffer; /* monitor the peak buffer size (bytes) */
105 float gpu_buffer; /* monitor the buffer allocated on GPU (bytes) */
109
110/* Headers for 2 types of dynamically managed memory */
111typedef struct e_node {
112 int size; /* length of the memory that has been used */
113 void *mem; /* pointer to the new malloc'd store */
115
116typedef struct {
117 int size;
118 int used;
119 int top1; /* grow upward, relative to &array[0] */
120 int top2; /* grow downward */
121 void *array;
123
124/* Constants */
125#define SuperLU_GluIntArray(n) (5 * (n) + 5)
126
127#if 0 // defined in superlu_enum_consts.h -- 1/20/2018
128#define SuperLU_NO_MEMTYPE 6 /* 0: lusup;
129 1: ucol;
130 2: lsub;
131 3: usub
132 4: llvl; level number in L for ILU(k)
133 5: ulvl; level number in U for ILU(k)
134 */
135#endif
136
137/* Macros to manipulate stack */
/*
 * Stack / alignment helper macros.
 *
 * Every macro parameter is parenthesized at each use so that compound
 * arguments (e.g. `n + 1`, `a >> 1`, `c ? x : y`) expand with the intended
 * precedence.
 *
 * - SuperLU_StackFull(x): true when x more units would reach or exceed
 *   stack.size; relies on a variable named `stack` (with `used` and `size`
 *   members) being in scope at the expansion site.
 * - SuperLU_NotDoubleAlign(addr): low three bits of the address (non-zero
 *   when addr is not a multiple of 8).
 * - SuperLU_DoubleAlign(addr): addr rounded up to the next multiple of 8,
 *   as a long.
 *   NOTE(review): both alignment macros cast the pointer to long, which is
 *   narrower than a pointer on LLP64 targets -- confirm supported platforms.
 * - SuperLU_TempSpace(n, w): byte count built from w, n, NO_MARKER and `m`.
 *   `m` is NOT a macro parameter; it and NO_MARKER are taken from the scope
 *   where the macro is expanded.
 * - SuperLU_Reduce(alpha): (alpha + 1) / 2.
 */
#define SuperLU_StackFull(x) ( (x) + stack.used >= stack.size )
#define SuperLU_NotDoubleAlign(addr) ( (long)(addr) & 7 )
#define SuperLU_DoubleAlign(addr) ( ((long)(addr) + 7) & ~7L )
#define SuperLU_TempSpace(n, w) ( (2*(w) + 4 + NO_MARKER)*m*sizeof(int) + \
                                  ((w) + 1)*(n)*sizeof(double) )
#define SuperLU_Reduce(alpha) (((alpha) + 1) / 2) /* i.e. (alpha-1)/2 + 1 */
144
145#define SuperLU_FIRSTCOL_OF_SNODE(i) (xsup[i])
146
/*
 * Timing macros. When PROFlevel >= 1, TIC(t) assigns SuperLU_timer_() to t and
 * TOC(t2, t1) assigns SuperLU_timer_() - t1 to t2; otherwise both expand to
 * nothing. The expansions are bare assignments without a trailing semicolon.
 */
147#if ( PROFlevel>=1 )
148#define TIC(t) t = SuperLU_timer_()
149#define TOC(t2, t1) t2 = SuperLU_timer_() - t1
150#else
151#define TIC(t)
152#define TOC(t2, t1)
153#endif
154
155/*********************************************************
156 * Macros used for easy access of sparse matrix entries. *
157 *********************************************************/
/*
 * Each macro indexes one array member of `Lstore` or `Ustore`; a pointer with
 * that name must be in scope wherever the macro is expanded.
 */
158#define SuperLU_L_SUB_START(col) ( Lstore->rowind_colptr[col] )
159#define SuperLU_L_SUB(ptr) ( Lstore->rowind[ptr] )
160#define SuperLU_L_NZ_START(col) ( Lstore->nzval_colptr[col] )
161#define SuperLU_L_FST_SUPC(superno) ( Lstore->sup_to_col[superno] )
162#define SuperLU_U_NZ_START(col) ( Ustore->colptr[col] )
163#define SuperLU_U_SUB(ptr) ( Ustore->rowind[ptr] )
164
165/***********************************************************************
166 * For 3D code */
167/* SCT_t was initially Schur-complement counter to compute different
168 metrics of Schur-complement Update.
169 Later, it includes counters to keep track of many other metrics.
170*/
171typedef struct
172{
182 double GatherTimer ;
183 double GatherMOP ;
201
203 double trf2_time;
204 double offloadable_flops; /*flops that can be done on ACC*/
205 double offloadable_mops; /*mops that can be done on ACC*/
206
210
211#ifdef SCATTER_PROFILE
212 double *Host_TheadScatterMOP ;
213 double *Host_TheadScatterTimer;
214#endif
215
216#ifdef OFFLOAD_PROFILE
217 double *Predicted_acc_scatter_time_strat1;
218 double *Predicted_host_sch_time_strat1;
219 size_t pci_transfer_count[18]; /*number of transfers*/
220 double pci_transfer_time[18]; /*time for each transfer */
221 double pci_transfer_prediction_error[18]; /*error in prediction*/
222 double host_sch_time[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING];
223 double host_sch_flop[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING];
224#endif
225
230
231// new timers for different wait times
232 //convention: tl suffix refers to times measured from rdtsc
233 // td suffix refers to times measured with SuperLU_timer_()
234
235 /* diagonal block factorization; part of pdgstrf2; called from thread*/
236 // double Local_Dgstrf2_tl;
238 /*wait for receiving U diagonal block: part of mpf*/
240 /*wait for receiving L diagonal block: part of mpf*/
242
243
244 /*Wait for U diagonal block to be received; part of pdgstrf2 */
246 /*wait for previous U block send to finish; part of pdgstrf2 */
248 /*after obtaining U block, time spent in calculating L panel*/
250 /*Synchronous Broadcasting L and U panel*/
253 /*Wait for L send to finish */
255
256 /*Wait for U send to finish */
258 /*Wait for U receive */
260 /*Wait for L receive */
262
263 /*time to get lock*/
265
266 /*U_panelupdate*/
268
269 /*profiling by phases */
273
274 /*3D timers*/
275 double ancsReduce; /*timer for reducing ancestors before factorization*/
276 double gatherLUtimer; /*timer for gather LU factors into bottom layer*/
277 double tFactor3D[MAX_3D_LEVEL];
278 double tSchCompUdt3d[MAX_3D_LEVEL];
279
280 /*ASync Profiler timing*/
282
283 /* tStartup: time spent before factorization starts */
284 double tStartup;
285
286 /*keeping track of data sent*/
289
290} SCT_t;
291
292
293#endif /* __SUPERLU_UTIL */
int int_t
Definition: superlu_defs.h:114
#define CBLOCK
Definition: util_dist.h:78
#define MAX_3D_LEVEL
Definition: util_dist.h:77
unsigned char Logical
Definition: util_dist.h:87
float flops_t
Definition: util_dist.h:86
struct e_node SuperLU_ExpHeader
#define CSTEPPING
Definition: util_dist.h:80
Definition: util_dist.h:172
double * Predicted_host_sch_time
Definition: util_dist.h:208
double LookAheadGEMMFlOp
Definition: util_dist.h:177
double Bcast_UPanel_tl
Definition: util_dist.h:251
double LookAheadRowSepMOP
Definition: util_dist.h:185
double PDGSTRS2_tl
Definition: util_dist.h:267
double scatter_mem_op_counter
Definition: util_dist.h:184
double L_PanelUpdate_tl
Definition: util_dist.h:249
double tAsyncPipeTail
Definition: util_dist.h:281
double Recv_UDiagBlock_tl
Definition: util_dist.h:245
double * GetAijLock_Thread_tl
Definition: util_dist.h:264
double commVolFactor
Definition: util_dist.h:287
double * Predicted_acc_sch_time
Definition: util_dist.h:198
double trf2_time
Definition: util_dist.h:203
double LookAheadScatterMOP
Definition: util_dist.h:195
double LookAheadGEMMTimer
Definition: util_dist.h:179
double lookaheadupdatetimer
Definition: util_dist.h:228
double Wait_UDiagBlock_Recv_tl
Definition: util_dist.h:239
double datatransfer_timer
Definition: util_dist.h:194
double Phase_Factor_tl
Definition: util_dist.h:270
double NetSchurUpTimer
Definition: util_dist.h:191
double * SchurCompUdtThreadTime
Definition: util_dist.h:207
double Wait_USend_tl
Definition: util_dist.h:257
double Wait_UDiagBlockSend_tl
Definition: util_dist.h:247
double ancsReduce
Definition: util_dist.h:275
double trf2_flops
Definition: util_dist.h:202
double schur_flop_counter
Definition: util_dist.h:187
double Phase_SC_Update_tl
Definition: util_dist.h:272
double offloadable_mops
Definition: util_dist.h:205
double * Predicted_acc_scatter_time
Definition: util_dist.h:200
double gatherLUtimer
Definition: util_dist.h:276
double Wait_URecv_tl
Definition: util_dist.h:259
double PhiWaitTimer
Definition: util_dist.h:190
double PhiMemCpyTimer
Definition: util_dist.h:193
double scatter_mem_op_timer
Definition: util_dist.h:186
double GatherMOP
Definition: util_dist.h:183
double Wait_LDiagBlock_Recv_tl
Definition: util_dist.h:241
double Wait_LRecv_tl
Definition: util_dist.h:261
int_t datatransfer_count
Definition: util_dist.h:173
double GatherTimer
Definition: util_dist.h:182
double schurPhiCallTimer
Definition: util_dist.h:196
double * Predicted_acc_gemm_time
Definition: util_dist.h:199
double AssemblyTimer
Definition: util_dist.h:192
double PhiWaitTimer_2
Definition: util_dist.h:178
double offloadable_flops
Definition: util_dist.h:204
double LookAheadRowSepTimer
Definition: util_dist.h:180
int_t schurPhiCallCount
Definition: util_dist.h:174
int_t PhiMemCpyCounter
Definition: util_dist.h:175
double Bcast_LPanel_tl
Definition: util_dist.h:252
double Wait_LSend_tl
Definition: util_dist.h:254
double * Measured_host_sch_time
Definition: util_dist.h:209
double pdgstrf2_timer
Definition: util_dist.h:227
double LookAheadScatterTimer
Definition: util_dist.h:181
double autotunetime
Definition: util_dist.h:197
double * Local_Dgstrf2_Thread_tl
Definition: util_dist.h:237
double tStartup
Definition: util_dist.h:284
double schur_flop_timer
Definition: util_dist.h:188
double CPUOffloadTimer
Definition: util_dist.h:189
double Phase_LU_Update_tl
Definition: util_dist.h:271
double pdgstrfTimer
Definition: util_dist.h:229
double acc_load_imbal
Definition: util_dist.h:176
double commVolRed
Definition: util_dist.h:288
double pdgstrs2_timer
Definition: util_dist.h:226
Definition: util_dist.h:95
int num_look_aheads
Definition: util_dist.h:101
float peak_buffer
Definition: util_dist.h:104
int RefineSteps
Definition: util_dist.h:100
flops_t * ops
Definition: util_dist.h:98
float current_buffer
Definition: util_dist.h:103
int * panel_histo
Definition: util_dist.h:96
double * utime
Definition: util_dist.h:97
float gpu_buffer
Definition: util_dist.h:105
int_t MaxActiveRTrees
Definition: util_dist.h:107
int TinyPivots
Definition: util_dist.h:99
int_t MaxActiveBTrees
Definition: util_dist.h:106
Definition: util_dist.h:116
int top2
Definition: util_dist.h:120
void * array
Definition: util_dist.h:121
int size
Definition: util_dist.h:117
int top1
Definition: util_dist.h:119
int used
Definition: util_dist.h:118
Definition: util_dist.h:111
int size
Definition: util_dist.h:112
void * mem
Definition: util_dist.h:113
enum constants header file