SuperLU Distributed 8.2.1
Distributed memory sparse direct solver
dlustruct_gpu.h
Go to the documentation of this file.
1
2
16#pragma once // so that this header file is included only once
17
18#include "superlu_ddefs.h"
19
20#ifdef GPU_ACC // enable GPU
21#include "gpu_api_utils.h"
22// #include "mkl.h"
23// #include "sec_structs.h"
24// #include "supernodal_etree.h"
25
26/* Constants */
27//#define SLU_TARGET_GPU 0
28//#define MAX_BLOCK_SIZE 10000
/* Upper bound on the number of GPU streams. In this header it is the fixed
 * length of the per-stream arrays: scubufs, funCallStreams, gpublasHandles
 * and lastOffloadStream. */
29#define MAX_NGPU_STREAMS 32
30
31static
32void check(gpuError_t result, char const *const func, const char *const file, int const line)
33{
34 if (result)
35 {
36 fprintf(stderr, "GPU error at file %s: line %d code=(%s) \"%s\" \n",
37 file, line, gpuGetErrorString(result), func);
38
39 // Make sure we call GPU Device Reset before exiting
40 exit(EXIT_FAILURE);
41 }
42}
43
/* Wrap a GPU call: passes its return value to check() together with the
 * stringified expression and the call site's __FILE__ / __LINE__. */
44#define checkGPUErrors(val) check ( (val), #val, __FILE__, __LINE__ )
45
/* Work buffers for the Schur complement update on the GPU.
 * dLUstruct_gpu_t holds an array of MAX_NGPU_STREAMS of these
 * (presumably one per GPU stream -- TODO confirm).
 * NOTE(review): members with a _host suffix appear to be the host-side
 * counterparts of the like-named members; the existing notes mark the
 * unsuffixed ones as "on GPU". */
46typedef struct //SCUbuf_gpu_
47{
48 /* Information for various buffers */
49 double *bigV;
50 double *bigU;
51 double *bigU_host; /* pinned location */
52 int_t *indirect; /* for indirect address calculations */
53 int_t *indirect2; /* for indirect address calculations */
54
55 double *Remain_L_buff; /* on GPU */
56 double *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? */
57
58 int_t *lsub;
59 int_t *usub;
60
61 int_t *lsub_buf, *usub_buf;
62
63 Ublock_info_t *Ublock_info; /* on GPU */
64 Remain_info_t *Remain_info;
65 Ublock_info_t *Ublock_info_host;
66 Remain_info_t *Remain_info_host;
67
68 int_t* usub_IndirectJ3; /* on GPU */
69 int_t* usub_IndirectJ3_host;
70
71} dSCUbuf_gpu_t;
72
73/* Holds the L & U data structures on the GPU side, plus the Schur
 * complement update buffers, timing counters and GPU events.
 * NOTE(review): each xxxVec / xxxPtr pair looks like one flattened array
 * plus offsets into it -- confirm against the code that fills them. */
74typedef struct //LUstruct_gpu_
75{
76 int_t *LrowindVec; /* A single vector */
77 int_t *LrowindPtr; /* A single vector */
78
79 double *LnzvalVec; /* A single vector */
80 int_t *LnzvalPtr; /* A single vector */
81 int_t *LnzvalPtr_host; /* A single vector */
82
83 int_t *UrowindVec; /* A single vector */
84 int_t *UrowindPtr; /* A single vector */
85 int_t *UrowindPtr_host; /* A single vector */
86 int_t *UnzvalPtr_host;
87
88 double *UnzvalVec; /* A single vector */
89 int_t *UnzvalPtr; /* A single vector */
90
91 /* GPU pointers for easy block accesses */
92 local_l_blk_info_t *local_l_blk_infoVec;
93 int_t *local_l_blk_infoPtr;
94 int_t *jib_lookupVec;
95 int_t *jib_lookupPtr;
96 local_u_blk_info_t *local_u_blk_infoVec;
97
98 int_t *local_u_blk_infoPtr;
99 int_t *ijb_lookupVec;
100 int_t *ijb_lookupPtr;
101
102 // GPU buffers for performing Schur Complement Update on GPU
103 dSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS]; /* fixed-size array, MAX_NGPU_STREAMS entries */
104 double *acc_L_buff, *acc_U_buff;
105
106 /* Information for various buffers */
107 int_t buffer_size;
108 int_t nsupers; /* should have number of supernodes */
109 int_t *xsup;
110 gridinfo_t *grid;
111
 /* Accumulated counters and timers; NOTE(review): names suggest scatter
  * memory operations and GEMM flops -- units are not stated here. */
112 double ScatterMOPCounter;
113 double ScatterMOPTimer;
114 double GemmFLOPCounter;
115 double GemmFLOPTimer;
116
 /* NOTE(review): presumably host<->device (PCIe) transfer counters and
  * host-side timings -- confirm against the code that updates them. */
117 double cPCIeH2D;
118 double cPCIeD2H;
119 double tHost_PCIeH2D;
120 double tHost_PCIeD2H;
121
122 /* GPU events to measure DGEMM and SCATTER timing */
123 int *isOffloaded; /* stores if any iteration is offloaded or not */
124 gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /* GPU events to store gemm and scatter's begin and end */
125 gpuEvent_t *ePCIeH2D;
126 gpuEvent_t *ePCIeD2H_Start;
127 gpuEvent_t *ePCIeD2H_End;
128
129 int_t *xsup_host;
130 int_t* perm_c_supno;
131 int_t first_l_block_gpu, first_u_block_gpu;
132} dLUstruct_gpu_t;
133
/* Top-level GPU handle passed to the routines declared in this header:
 * the device id, the LU structures, the streams and the BLAS handles.
 * The three per-stream arrays have MAX_NGPU_STREAMS slots; nGPUStreams is
 * presumably the number actually in use -- TODO confirm. */
134typedef struct //sluGPU_t_
135{
136 int_t gpuId; // if there are multiple GPUs
137 dLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU
                                   // NOTE(review): how A_gpu and dA_gpu differ is not stated here
138 gpuStream_t funCallStreams[MAX_NGPU_STREAMS], CopyStream;
139 gpublasHandle_t gpublasHandles[MAX_NGPU_STREAMS];
140 int_t lastOffloadStream[MAX_NGPU_STREAMS];
141 int_t nGPUStreams;
142 int* isNodeInMyGrid;
143 double acc_async_cost;
144} dsluGPU_t;
145
146
147#ifdef __cplusplus
148extern "C" {
149#endif
150
152 sForest_t *sforest,
153 commRequests_t **comReqss, // lists of communication requests,
154 // size = maxEtree level
155 dscuBufs_t *scuBufs, // contains buffers for schur complement update
156 packLUInfo_t *packLUInfo,
157 msgs_t **msgss, // size = num Look ahead
158 dLUValSubBuf_t **LUvsbs, // size = num Look ahead
159 ddiagFactBufs_t **dFBufs, // size = maxEtree level
160 factStat_t *factStat,
161 factNodelists_t *fNlists,
162 gEtreeInfo_t *gEtreeInfo, // global etree info
163 superlu_dist_options_t *options,
164 int_t *gIperm_c_supno,
165 int ldt,
166 dsluGPU_t *sluGPU,
167 d2Hreduce_t *d2Hred,
168 HyP_t *HyP,
169 dLUstruct_t *LUstruct, gridinfo3d_t *grid3d,
170 SuperLUStat_t *stat,
171 double thresh, SCT_t *SCT, int tag_ub,
172 int *info);
173
/* Declaration only; defined outside this header.
 * NOTE(review): the name suggests it prepares the d2Hreduce_t state
 * (d2Hred) for step next_k -- confirm against the definition. */
174int dinitD2Hreduce(
175 int next_k,
176 d2Hreduce_t* d2Hred,
177 int last_flag,
178 // int_t *perm_c_supno,
179 HyP_t* HyP,
180 dsluGPU_t *sluGPU,
181 gridinfo_t *grid,
182 dLUstruct_t *LUstruct, SCT_t* SCT
183);
184
/* Declaration only; defined outside this header.
 * NOTE(review): presumably combines the LU data held on the GPU with the
 * host copy described by d2Hred -- confirm against the definition. */
185extern int dreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred,
186 dsluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid,
187 dLUstruct_t *LUstruct);
188
/* Declaration only. NOTE(review): presumably waits for the Schur complement
 * update issued on stream streamId to finish -- confirm. */
189extern int dwaitGPUscu(int streamId, dsluGPU_t *sluGPU, SCT_t *SCT);
/* Declaration only. NOTE(review): the name suggests it copies the LU panel
 * for step k0 from the GPU back to the host -- confirm. */
190extern int dsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, dsluGPU_t *sluGPU);
/* Declaration only. NOTE(review): the name suggests it copies the Schur
 * complement update inputs (lsub, usub, bigU, with the two *_send_size
 * lengths) from the host to the GPU on stream streamId -- confirm. */
191extern int dsendSCUdataHost2GPU(
192 int_t streamId, int_t* lsub, int_t* usub, double* bigU, int_t bigu_send_size,
193 int_t Remain_lbuf_send_size, dsluGPU_t *sluGPU, HyP_t* HyP
194);
195
/* Declaration only. NOTE(review): presumably initializes *sluGPU (streams,
 * handles, device copies of LUstruct) for the 3D process grid -- confirm. */
196extern int dinitSluGPU3D_t(
197 dsluGPU_t *sluGPU,
198 dLUstruct_t *LUstruct,
199 gridinfo3d_t * grid3d,
200 int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt
201);
/* Declaration only; defined outside this header.
 * NOTE(review): presumably launches the Schur complement update on the GPU
 * using stream streamId; the size and index arguments are not described
 * here -- see the definition. */
202int dSchurCompUpdate_GPU(
203 int_t streamId,
204 int_t jj_cpu, int_t nub, int_t klst, int_t knsupc,
205 int_t Rnbrow, int_t RemainBlk,
206 int_t Remain_lbuf_send_size,
207 int_t bigu_send_size, int_t ldu,
208 int_t mcb,
209 int_t buffer_size, int_t lsub_len, int_t usub_len,
210 int_t ldt, int_t k0,
211 dsluGPU_t *sluGPU, gridinfo_t *grid
212);
213
214
/* Declaration only. NOTE(review): the name suggests it copies the host LU
 * data (A_host) into the GPU structures reachable from sluGPU -- confirm. */
215extern void dCopyLUToGPU3D (int* isNodeInMyGrid, dLocalLU_t *A_host,
216 dsluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n,
217 gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt);
218
/* Declaration only. NOTE(review): presumably the GPU variant of the
 * ancestor reduction across the 3D grid at tree level ilvl -- confirm. */
219extern int dreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount,
220 int_t** treePerm, dLUValSubBuf_t*LUvsb,
221 dLUstruct_t* LUstruct, gridinfo3d_t* grid3d,
222 dsluGPU_t *sluGPU, d2Hreduce_t* d2Hred,
223 factStat_t *factStat, HyP_t* HyP, SCT_t* SCT );
224
/* Declaration only. NOTE(review): presumably synchronizes the streams in
 * sluGPU->funCallStreams -- confirm. */
225extern void dsyncAllfunCallStreams(dsluGPU_t* sluGPU, SCT_t* SCT);
/* Declaration only. NOTE(review): presumably releases the resources held by
 * *A_gpu; whether A_gpu itself is freed is not visible here -- confirm. */
226extern int dfree_LUstruct_gpu (dLUstruct_gpu_t *A_gpu);
227
228//int freeSluGPU(dsluGPU_t *sluGPU);
229
/* Declaration only. NOTE(review): presumably prints the m-by-n matrix dA
 * (leading dimension lda) labelled with desc -- confirm. */
230extern void dPrint_matrix( char *desc, int_t m, int_t n, double *dA, int_t lda );
231
232/* Prints out various statistics for A_gpu (declaration only).
 * NOTE(review): presumably the counters and timers stored in
 * dLUstruct_gpu_t -- confirm. */
233void dprintGPUStats(dLUstruct_gpu_t *A_gpu);
234
235#ifdef __cplusplus
236}
237#endif
238
239#endif // matching: enable GPU
int int_t
Definition: superlu_defs.h:114
int dsparseTreeFactor_ASYNC_GPU(sForest_t *sforest, commRequests_t **comReqss, scuBufs_t *scuBufs, packLUInfo_t *packLUInfo, msgs_t **msgss, dLUValSubBuf_t **LUvsbs, diagFactBufs_t **dFBufs, factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t *gEtreeInfo, superlu_dist_options_t *options, int_t *gIperm_c_supno, int_t ldt, sluGPU_t *sluGPU, d2Hreduce_t *d2Hred, HyP_t *HyP, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, double thresh, SCT_t *SCT, int tag_ub, int *info)
Definition: treeFactorizationGPU.c:44
double acc_async_cost
Definition: acc_aux.c:56
integer, parameter, public lsub
Definition: superlupara.f90:35
integer, parameter, public usub
Definition: superlupara.f90:35
Definition: superlu_defs.h:435
Definition: superlu_ddefs.h:329
Definition: superlu_defs.h:770
Definition: util_dist.h:172
Definition: util_dist.h:95
Definition: superlu_defs.h:760
Definition: superlu_defs.h:924
Definition: superlu_defs.h:852
Definition: superlu_ddefs.h:357
Definition: superlu_ddefs.h:254
Definition: superlu_ddefs.h:97
Definition: superlu_ddefs.h:391
Definition: superlu_ddefs.h:385
Definition: superlu_defs.h:937
Definition: superlu_defs.h:839
Definition: superlu_defs.h:890
Definition: superlu_defs.h:398
Definition: superlu_defs.h:388
Definition: superlu_defs.h:815
Definition: superlu_defs.h:822
Definition: superlu_defs.h:947
Definition: superlu_ddefs.h:397
Definition: superlu_defs.h:901
Definition: superlu_defs.h:712
Distributed SuperLU data types and function prototypes.