SuperLU Distributed 9.0.0
gpu3d
util_dist.h
Go to the documentation of this file.
1
21#ifndef __SUPERLU_DIST_UTIL /* allow multiple inclusions */
22#define __SUPERLU_DIST_UTIL
23
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <assert.h>
28
29#include "superlu_enum_consts.h"
30
31/*
32 * Macros
33 */
/* Abort hook. Define USER_ABORT before including this header to replace
 * the default handler. */
#ifndef USER_ABORT
#define USER_ABORT(msg) superlu_abort_and_exit_dist(msg)
#endif

/*
 * ABORT(err_msg): format "<err_msg> at line <line> in file <file>\n" into a
 * 256-byte local buffer and pass that buffer to USER_ABORT.
 *
 * - snprintf bounds the write, so an over-long message or file path is
 *   truncated instead of overrunning the buffer.
 * - The buffer has a name unlikely to collide with a caller's variable, so
 *   ABORT(msg) formats the caller's msg rather than the macro's own
 *   uninitialized buffer.
 * - The expansion stays a plain brace block so existing call sites written
 *   with or without a trailing semicolon keep compiling.
 */
#define ABORT(err_msg) \
 { char superlu_abort_buf_[256];\
   snprintf(superlu_abort_buf_, sizeof superlu_abort_buf_,\
            "%s at line %d in file %s\n", (err_msg), __LINE__, __FILE__);\
   USER_ABORT(superlu_abort_buf_); }
42
43
/* Allocation hook. Define USER_MALLOC before including this header to
 * replace the default allocator. */
#if !defined(USER_MALLOC)
#define USER_MALLOC(size) superlu_malloc_dist(size)
#endif

/* Library-wide allocation entry point; forwards to USER_MALLOC. */
#define SUPERLU_MALLOC(size) USER_MALLOC(size)

/* Deallocation hook. Define USER_FREE before including this header to
 * replace the default deallocator. */
#if !defined(USER_FREE)
#define USER_FREE(addr) superlu_free_dist(addr)
#endif

/* Library-wide deallocation entry point; forwards to USER_FREE. */
#define SUPERLU_FREE(addr) USER_FREE(addr)
55
/*
 * CHECK_MALLOC(pnum, where): print the current value of the external counter
 * superlu_malloc_total, scaled by 1e-6 and labelled "MB", prefixed with pnum
 * (printed with %d) and the string where, then flush stdout.
 *
 * Expands to a brace block, so it is usable wherever a statement is allowed.
 */
#define CHECK_MALLOC(pnum, where) {                        \
    extern long int superlu_malloc_total;                  \
    printf("(%d) %s: superlu_malloc_total (MB) %.6f\n",    \
           (pnum), (where), superlu_malloc_total*1e-6);    \
    fflush(stdout);                                        \
}
62
/* Larger / smaller of two values. Each argument may be evaluated twice, so
 * do not pass expressions with side effects. */
#define SUPERLU_MAX(x, y) ( (x) > (y) ? (x) : (y) )
#define SUPERLU_MIN(x, y) ( (x) < (y) ? (x) : (y) )
65
/* Typed allocation helpers: each requests room for x elements of the named
 * type through SUPERLU_MALLOC and casts the result to a pointer to that type.
 * The result is whatever SUPERLU_MALLOC returns; callers must check it. */
#define MPI_REQ_ALLOC(x)  ((MPI_Request *) SUPERLU_MALLOC ( (x) * sizeof (MPI_Request)))
#define INT_T_ALLOC(x)  ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t)))
#define DOUBLE_ALLOC(x)  ((double *) SUPERLU_MALLOC ( (x) * sizeof (double)))
70
71/*
72 * Constants
73 */
/* Sentinel value -1. */
#define SLU_EMPTY (-1)

/* Boolean constants, defined only when the including code has not already
 * provided them. */
#if !defined(FALSE)
#define FALSE (0)
#endif
#if !defined(TRUE)
#define TRUE (1)
#endif
81
/*==== For 3D code ====*/
#define MAX_3D_LEVEL 32 /* allows for z dimensions of 2^32 */
#define CBLOCK 192
#define CACHE_LINE_SIZE 8
#define CSTEPPING 8
/*=====================*/
88
89/*
90 * Type definitions
91 */
typedef float flops_t;          /* type of the operation-count entries */
typedef unsigned char Logical;  /* one-byte flag type */
94
95/*
96#ifdef _CRAY
97#define int short
98#endif
99*/
100
101typedef struct {
102 int *panel_histo; /* histogram of panel size distribution */
103 double *utime; /* running time at various phases */
104 flops_t *ops; /* operation count at various phases */
105 int TinyPivots; /* number of tiny pivots */
106 int RefineSteps; /* number of iterative refinement steps */
107 int num_look_aheads; /* number of look ahead */
108 /*-- new --*/
109 float current_buffer; /* bytes allocated for buffer in numerical factorization */
110 float peak_buffer; /* monitor the peak buffer size (bytes) */
111 float gpu_buffer; /* monitor the buffer allocated on GPU (bytes) */
114
115#ifdef GPU_ACC /*-- For GPU --*/
116 double ScatterMOPCounter;
117 double ScatterMOPTimer;
118 double GemmFLOPCounter;
119 double GemmFLOPTimer;
120
121 double cPCIeH2D;
122 double cPCIeD2H;
123 double tHost_PCIeH2D;
124 double tHost_PCIeD2H;
125
126 /*GPU events to measure DGEMM and SCATTER timing */
127 int *isOffloaded; /* record whether any elimination step is offloaded or not */
128 gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*GPU events to store gemm and scatter's begin and end*/
129 gpuEvent_t *ePCIeH2D;
130 gpuEvent_t *ePCIeD2H_Start;
131 gpuEvent_t *ePCIeD2H_End;
132#endif /*-- end for GPU --*/
133
135
136
/* Headers for 2 types of dynamically managed memory */
/* Header describing one expandable memory region. */
typedef struct e_node {
    int size;   /* length of the memory that has been used */
    void *mem;  /* pointer to the new malloc'd store */
} SuperLU_ExpHeader;
142
143typedef struct {
144 int size;
145 int used;
146 int top1; /* grow upward, relative to &array[0] */
147 int top2; /* grow downward */
148 void *array;
150
/* Constants */
/* Evaluates to 5*n + 5. */
#define SuperLU_GluIntArray(n) (5 * (n) + 5)
153
154#if 0 // defined in superlu_enum_consts.h -- 1/20/2018
155#define SuperLU_NO_MEMTYPE 6 /* 0: lusup;
156 1: ucol;
157 2: lsub;
158 3: usub
159 4: llvl; level number in L for ILU(k)
160 5: ulvl; level number in U for ILU(k)
161 */
162#endif
163
/* Macros to manipulate stack */

/* Every parameter is parenthesized so that a compound argument (a | b,
 * c ? x : y, p + 1, ...) is taken as a whole before the cast or arithmetic
 * is applied. */

/* Nonzero when x more units on top of stack.used would reach or exceed
 * stack.size. An object named `stack` must be visible at the point of use. */
#define SuperLU_StackFull(x) ( (x) + stack.used >= stack.size )

/* Low three bits of addr converted to long: nonzero when addr is not a
 * multiple of 8. */
#define SuperLU_NotDoubleAlign(addr) ( (long)(addr) & 7 )

/* addr converted to long and rounded up to the next multiple of 8.
 * NOTE(review): long is narrower than a pointer on LLP64 targets; kept as
 * long here so the result type seen by callers does not change. */
#define SuperLU_DoubleAlign(addr) ( ((long)(addr) + 7) & ~7L )

/* Byte count (2*w + 4 + NO_MARKER)*m*sizeof(int) + (w + 1)*n*sizeof(double).
 * `m` and NO_MARKER are not parameters: both are taken from the scope in
 * which the macro is expanded. */
#define SuperLU_TempSpace(n, w) ( (2*(w) + 4 + NO_MARKER)*m*sizeof(int) + \
                                  ((w) + 1)*(n)*sizeof(double) )

#define SuperLU_Reduce(alpha) (((alpha) + 1) / 2) /* i.e. (alpha-1)/2 + 1 */
171
/* Entry i of the array `xsup`, which must be visible at the point of use. */
#define SuperLU_FIRSTCOL_OF_SNODE(i) (xsup[i])
173
/*
 * Timing helpers. With PROFlevel >= 1, TIC(t) stores SuperLU_timer_() in t
 * and TOC(t2, t1) stores SuperLU_timer_() minus t1 in t2. Otherwise both
 * expand to nothing, so their arguments are never evaluated.
 */
#if defined(PROFlevel) && PROFlevel>=1
#define TIC(t)          (t) = SuperLU_timer_()
#define TOC(t2, t1)     (t2) = SuperLU_timer_() - (t1)
#else
#define TIC(t)
#define TOC(t2, t1)
#endif
181
/*********************************************************
 * Macros used for easy access of sparse matrix entries. *
 *********************************************************/
/* Each macro indexes one array reached through a pointer named Lstore or
 * Ustore, which must be visible at the point of use. */
#define SuperLU_L_SUB_START(col)     ( Lstore->rowind_colptr[col] )
#define SuperLU_L_SUB(ptr)           ( Lstore->rowind[ptr] )
#define SuperLU_L_NZ_START(col)      ( Lstore->nzval_colptr[col] )
#define SuperLU_L_FST_SUPC(superno)  ( Lstore->sup_to_col[superno] )
#define SuperLU_U_NZ_START(col)      ( Ustore->colptr[col] )
#define SuperLU_U_SUB(ptr)           ( Ustore->rowind[ptr] )
191
192/***********************************************************************
193 * For 3D code */
194/* SCT_t was initially Schur-complement counter to compute different
195 metrics of Schur-complement Update.
196 Later, it includes counters to keep track of many other metrics.
197*/
198typedef struct
199{
209 double GatherTimer ;
210 double GatherMOP ;
228
230 double trf2_time;
231 double offloadable_flops; /*flops that can be done on ACC*/
232 double offloadable_mops; /*mops that can be done on ACC*/
233
237
238#ifdef SCATTER_PROFILE
239 double *Host_TheadScatterMOP ;
240 double *Host_TheadScatterTimer;
241#endif
242
243#ifdef OFFLOAD_PROFILE
244 double *Predicted_acc_scatter_time_strat1;
245 double *Predicted_host_sch_time_strat1;
246 size_t pci_transfer_count[18]; /*number of transfers*/
247 double pci_transfer_time[18]; /*time for each transfer */
248 double pci_transfer_prediction_error[18]; /*error in prediction*/
249 double host_sch_time[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING];
250 double host_sch_flop[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING];
251#endif
252
258
259// new timers for different wait times
260 //convention: tl suffix refers to times measured from rdtsc
261 // td : suffix refers to times measured in SuerpLU_timer
262
263 /* diagonal block factorization; part of pdgstrf2; called from thread*/
264 // double Local_Dgstrf2_tl;
266 /*wait for receiving U diagonal block: part of mpf*/
268 /*wait for receiving L diagonal block: part of mpf*/
270
271
272 /*Wait for U diagnal bloc kto receive; part of pdgstrf2 */
274 /*wait for previous U block send to finish; part of pdgstrf2 */
276 /*after obtaining U block, time spent in calculating L panel*/
279 /*Synchronous Broadcasting L and U panel*/
282 /*Wait for L send to finish */
284
285 /*Wait for U send to finish */
287 /*Wait for U receive */
289 /*Wait for L receive */
291
292 /*time to get lock*/
294
295 /*U_panelupdate*/
297
298 /*profiling by phases */
302
303 /*3D timers*/
304 double ancsReduce; /*timer for reducing ancestors before factorization*/
305 double gatherLUtimer; /*timer for gather LU factors into bottom layer*/
306 double tFactor3D[MAX_3D_LEVEL];
307 double tSchCompUdt3d[MAX_3D_LEVEL];
308
309 /*ASync Profiler timing*/
311
312 /*double t_Startup time before factorization starts*/
313 double tStartup;
315
316 /*keeping track of data sent*/
319
320 /*timer for new code */
323
324} SCT_t;
325
326#endif /* __SUPERLU_DIST_UTIL */
327
Definition: util_dist.h:199
double LookAheadGEMMFlOp
Definition: util_dist.h:204
double Bcast_UPanel_tl
Definition: util_dist.h:280
double * Predicted_acc_sch_time
Definition: util_dist.h:225
double LookAheadRowSepMOP
Definition: util_dist.h:212
double PDGSTRS2_tl
Definition: util_dist.h:296
double scatter_mem_op_counter
Definition: util_dist.h:211
double L_PanelUpdate_tl
Definition: util_dist.h:277
double tAsyncPipeTail
Definition: util_dist.h:310
double Recv_UDiagBlock_tl
Definition: util_dist.h:273
double * SchurCompUdtThreadTime
Definition: util_dist.h:234
double commVolFactor
Definition: util_dist.h:317
double * Local_Dgstrf2_Thread_tl
Definition: util_dist.h:265
double * Predicted_acc_scatter_time
Definition: util_dist.h:227
double OffloadSectionTimer
Definition: util_dist.h:256
double trf2_time
Definition: util_dist.h:230
double LookAheadScatterMOP
Definition: util_dist.h:222
double LookAheadGEMMTimer
Definition: util_dist.h:206
double lookaheadupdatetimer
Definition: util_dist.h:255
double Wait_UDiagBlock_Recv_tl
Definition: util_dist.h:267
double datatransfer_timer
Definition: util_dist.h:221
double tStartupGPU
Definition: util_dist.h:314
double tPanelBcast
Definition: util_dist.h:322
double Phase_Factor_tl
Definition: util_dist.h:299
double NetSchurUpTimer
Definition: util_dist.h:218
double Wait_USend_tl
Definition: util_dist.h:286
double Wait_UDiagBlockSend_tl
Definition: util_dist.h:275
double * Predicted_host_sch_time
Definition: util_dist.h:235
double ancsReduce
Definition: util_dist.h:304
double trf2_flops
Definition: util_dist.h:229
double schur_flop_counter
Definition: util_dist.h:214
double * Predicted_acc_gemm_time
Definition: util_dist.h:226
double Phase_SC_Update_tl
Definition: util_dist.h:301
double offloadable_mops
Definition: util_dist.h:232
double gatherLUtimer
Definition: util_dist.h:305
double tDiagFactorPanelSolve
Definition: util_dist.h:321
double Wait_URecv_tl
Definition: util_dist.h:288
double PhiWaitTimer
Definition: util_dist.h:217
double PhiMemCpyTimer
Definition: util_dist.h:220
double scatter_mem_op_timer
Definition: util_dist.h:213
double GatherMOP
Definition: util_dist.h:210
double Wait_LDiagBlock_Recv_tl
Definition: util_dist.h:269
double * Measured_host_sch_time
Definition: util_dist.h:236
double Wait_LRecv_tl
Definition: util_dist.h:290
int_t datatransfer_count
Definition: util_dist.h:200
double GatherTimer
Definition: util_dist.h:209
double schurPhiCallTimer
Definition: util_dist.h:223
double AssemblyTimer
Definition: util_dist.h:219
double PhiWaitTimer_2
Definition: util_dist.h:205
double offloadable_flops
Definition: util_dist.h:231
double U_PanelUpdate_tl
Definition: util_dist.h:278
double LookAheadRowSepTimer
Definition: util_dist.h:207
int_t schurPhiCallCount
Definition: util_dist.h:201
int_t PhiMemCpyCounter
Definition: util_dist.h:202
double Bcast_LPanel_tl
Definition: util_dist.h:281
double Wait_LSend_tl
Definition: util_dist.h:283
double pdgstrf2_timer
Definition: util_dist.h:254
double LookAheadScatterTimer
Definition: util_dist.h:208
double autotunetime
Definition: util_dist.h:224
double tStartup
Definition: util_dist.h:313
double * GetAijLock_Thread_tl
Definition: util_dist.h:293
double schur_flop_timer
Definition: util_dist.h:215
double CPUOffloadTimer
Definition: util_dist.h:216
double Phase_LU_Update_tl
Definition: util_dist.h:300
double pdgstrfTimer
Definition: util_dist.h:257
double acc_load_imbal
Definition: util_dist.h:203
double commVolRed
Definition: util_dist.h:318
double pdgstrs2_timer
Definition: util_dist.h:253
Definition: util_dist.h:101
int num_look_aheads
Definition: util_dist.h:107
float peak_buffer
Definition: util_dist.h:110
int RefineSteps
Definition: util_dist.h:106
double * utime
Definition: util_dist.h:103
float current_buffer
Definition: util_dist.h:109
int * panel_histo
Definition: util_dist.h:102
float gpu_buffer
Definition: util_dist.h:111
int_t MaxActiveRTrees
Definition: util_dist.h:113
int TinyPivots
Definition: util_dist.h:105
int_t MaxActiveBTrees
Definition: util_dist.h:112
flops_t * ops
Definition: util_dist.h:104
Definition: util_dist.h:143
int top2
Definition: util_dist.h:147
void * array
Definition: util_dist.h:148
int size
Definition: util_dist.h:144
int top1
Definition: util_dist.h:146
int used
Definition: util_dist.h:145
Definition: util_dist.h:138
int size
Definition: util_dist.h:139
void * mem
Definition: util_dist.h:140
int64_t int_t
Definition: superlu_defs.h:119
enum constants header file
#define CBLOCK
Definition: util_dist.h:84
#define MAX_3D_LEVEL
Definition: util_dist.h:83
unsigned char Logical
Definition: util_dist.h:93
float flops_t
Definition: util_dist.h:92
struct e_node SuperLU_ExpHeader
#define CSTEPPING
Definition: util_dist.h:86