SuperLU Distributed 9.0.0
gpu3d
slustruct_gpu.h
Go to the documentation of this file.
1
2
16#pragma once // so that this header file is included only once
17
18#include "superlu_sdefs.h"
19
20#ifdef GPU_ACC // enable GPU
21#include "gpu_api_utils.h"
22// #include "mkl.h"
23// #include "sec_structs.h"
24// #include "supernodal_etree.h"
25
26/* Constants */
27//#define SLU_TARGET_GPU 0
28//#define MAX_BLOCK_SIZE 10000
/* Array length of the per-stream members of ssluGPU_t
   (funCallStreams, gpublasHandles, lastOffloadStream). */
29#define MAX_NGPU_STREAMS 32
30
/*
 * Report a failed GPU call and terminate the process.
 *
 * result     : status value of the GPU call; any nonzero value is treated
 *              as a failure.
 * func       : text printed inside the quotes at the end of the message
 *              (checkGPUErrors passes the stringified call expression).
 * file, line : source location printed in the message.
 *
 * When result is nonzero, one line is written to stderr containing the
 * file, the line, gpuGetErrorString(result) and func, and then
 * exit(EXIT_FAILURE) is called. When result is zero the function returns
 * without doing anything.
 */
31static
32void check(gpuError_t result, char const *const func, const char *const file, int const line)
33{
34 if (result)
35 {
36 fprintf(stderr, "GPU error at file %s: line %d code=(%s) \"%s\" \n",
37 file, line, gpuGetErrorString(result), func);
38
39 // NOTE: no GPU device reset is performed here; the process exits directly.
40 exit(EXIT_FAILURE);
41 }
42}
43
/* Wrap a GPU call: forwards its result, the stringified expression (#val),
   and the call site (__FILE__, __LINE__) to check(). */
44#define checkGPUErrors(val) check ( (val), #val, __FILE__, __LINE__ )
45
46typedef struct //SCUbuf_gpu_
47{
48 /* Information about the various buffers */
49 float *bigV;
50 float *bigU;
51 float *bigU_host; /*pinned location*/
52 int_t *indirect; /*for indirect address calculations*/
53 int_t *indirect2; /*for indirect address calculations*/
54
55 float *Remain_L_buff; /* on GPU */
56 float *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? */
57
60
61 int_t *lsub_buf, *usub_buf;
62
67
68 int_t* usub_IndirectJ3; /* on GPU */
70
72
73/* Holds the L & U data structures on the GPU side */
74typedef struct //LUstruct_gpu_
75{
76 int_t *LrowindVec; /* A single vector */
77 int_t *LrowindPtr; /* A single vector */
78
79 float *LnzvalVec; /* A single vector */
80 int_t *LnzvalPtr; /* A single vector */
81 int_t *LnzvalPtr_host; /* A single vector */
82
83 int_t *UrowindVec; /* A single vector */
84 int_t *UrowindPtr; /* A single vector */
85 int_t *UrowindPtr_host; /* A single vector */
87
88 float *UnzvalVec; /* A single vector */
89 int_t *UnzvalPtr; /* A single vector */
90
91 /*gpu pointers for easy block accesses */
94 int_t *jib_lookupVec; /* NOT USED ? */
95 int_t *jib_lookupPtr; /* NOT USED ? */
97
99
100 // GPU buffers for performing Schur Complement Update on GPU
102 float *acc_L_buff, *acc_U_buff;
103
104 /* Information about the various buffers */
106 int_t nsupers; /*should have number of supernodes*/
108 // gridinfo_t *grid; // Sherry: this is not used
109
110#if 0 // Sherry: moved to 'SuperLUStat_t'
111 double ScatterMOPCounter;
112 double ScatterMOPTimer;
113 double GemmFLOPCounter;
114 double GemmFLOPTimer;
115
116 double cPCIeH2D;
117 double cPCIeD2H;
118 double tHost_PCIeH2D;
119 double tHost_PCIeD2H;
120
121 /*GPU events to measure DGEMM and SCATTER timing */
122 int *isOffloaded; /*stores if any iteration is offloaded or not*/
123 gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*GPU events to store gemm and scatter's begin and end*/
124 gpuEvent_t *ePCIeH2D;
125 gpuEvent_t *ePCIeD2H_Start;
126 gpuEvent_t *ePCIeD2H_End;
127#endif
128
131 int_t first_l_block_gpu, first_u_block_gpu;
133
134typedef struct //sluGPU_t_
135{
136 //int gpuId; // if there are multiple GPUs ( NOT USED )
137 sLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU
138 gpuStream_t funCallStreams[MAX_NGPU_STREAMS], CopyStream;
139 gpublasHandle_t gpublasHandles[MAX_NGPU_STREAMS];
140 int lastOffloadStream[MAX_NGPU_STREAMS];
144} ssluGPU_t;
145
146
147#ifdef __cplusplus
148extern "C" {
149#endif
150
152 sForest_t *sforest,
153 commRequests_t **comReqss, // lists of communication requests,
154 // size = maxEtree level
155 sscuBufs_t *scuBufs, // contains buffers for schur complement update
156 packLUInfo_t *packLUInfo,
157 msgs_t **msgss, // size = num Look ahead
158 sLUValSubBuf_t **LUvsbs, // size = num Look ahead
159 sdiagFactBufs_t **dFBufs, // size = maxEtree level
160 factStat_t *factStat,
161 factNodelists_t *fNlists,
162 gEtreeInfo_t *gEtreeInfo, // global etree info
163 superlu_dist_options_t *options,
164 int_t *gIperm_c_supno,
165 int ldt,
166 ssluGPU_t *sluGPU,
167 d2Hreduce_t *d2Hred,
168 HyP_t *HyP,
169 sLUstruct_t *LUstruct, gridinfo3d_t *grid3d,
170 SuperLUStat_t *stat,
171 double thresh, SCT_t *SCT, int tag_ub,
172 int *info);
173
175 int next_k,
176 d2Hreduce_t* d2Hred,
177 int last_flag,
178 // int_t *perm_c_supno,
179 HyP_t* HyP,
180 ssluGPU_t *sluGPU,
181 gridinfo_t *grid,
182 sLUstruct_t *LUstruct, SCT_t* SCT
183);
184
/* Declaration only; the definition is not in this header.
   NOTE(review): the name suggests a reduction of LU data held on the GPU,
   driven by d2Hred -- confirm against the implementation. The meaning of
   last_flag and of the int return value is not visible from here. */
185extern int sreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred,
186 ssluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid,
187 sLUstruct_t *LUstruct);
188
/* Declaration only.
   NOTE(review): presumably waits for the Schur-complement update issued on
   stream `streamId` of sluGPU to finish -- confirm against the implementation. */
189extern int swaitGPUscu(int streamId, ssluGPU_t *sluGPU, SCT_t *SCT);
/* Declaration only; the last parameter (SuperLUStat_t *) is unnamed here.
   NOTE(review): presumably copies the LU panel associated with k0 from the
   GPU back to the host -- confirm against the implementation. */
190extern int ssendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred,
191 ssluGPU_t *sluGPU, SuperLUStat_t *);
193 int_t streamId, int_t* lsub, int_t* usub, float* bigU, int_t bigu_send_size,
194 int_t Remain_lbuf_send_size, ssluGPU_t *sluGPU, HyP_t* HyP
195);
196
198 ssluGPU_t *sluGPU,
199 sLUstruct_t *LUstruct,
200 gridinfo3d_t * grid3d,
201 int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt,
203);
205 int_t streamId,
206 int_t jj_cpu, int_t nub, int_t klst, int_t knsupc,
207 int_t Rnbrow, int_t RemainBlk,
208 int_t Remain_lbuf_send_size,
209 int_t bigu_send_size, int_t ldu,
210 int_t mcb,
211 int_t buffer_size, int_t lsub_len, int_t usub_len,
212 int_t ldt, int_t k0,
213 ssluGPU_t *sluGPU, gridinfo_t *grid,
215);
216
217
218extern void sCopyLUToGPU3D (int* isNodeInMyGrid, sLocalLU_t *A_host,
219 ssluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n,
220 gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt,
222 );
223
224extern int sreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount,
225 int_t** treePerm, sLUValSubBuf_t*LUvsb,
226 sLUstruct_t* LUstruct, gridinfo3d_t* grid3d,
227 ssluGPU_t *sluGPU, d2Hreduce_t* d2Hred,
228 factStat_t *factStat, HyP_t* HyP, SCT_t* SCT,
230 );
231
/* Declaration only.
   NOTE(review): presumably synchronizes the streams stored in
   sluGPU->funCallStreams -- confirm against the implementation. */
232extern void ssyncAllfunCallStreams(ssluGPU_t* sluGPU, SCT_t* SCT);
234
235//int freeSluGPU(ssluGPU_t *sluGPU);
236
/* Declaration only.
   NOTE(review): presumably prints the m-by-n single-precision matrix dA
   (leading dimension lda), labelled with desc -- confirm storage order and
   output destination against the implementation. */
237extern void sPrint_matrix( char *desc, int_t m, int_t n, float *dA, int_t lda );
238
239#ifdef __cplusplus
240}
241#endif
242
243#endif // matching: enable GPU
integer, parameter, public lsub
Definition: superlupara.f90:35
integer, parameter, public usub
Definition: superlupara.f90:35
int sreduceGPUlu(int last_flag, d2Hreduce_t *d2Hred, ssluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, sLUstruct_t *LUstruct)
static void check(gpuError_t result, char const *const func, const char *const file, int const line)
Definition: slustruct_gpu.h:32
void sPrint_matrix(char *desc, int_t m, int_t n, float *dA, int_t lda)
int sinitSluGPU3D_t(ssluGPU_t *sluGPU, sLUstruct_t *LUstruct, gridinfo3d_t *grid3d, int_t *perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt, SuperLUStat_t *)
int swaitGPUscu(int streamId, ssluGPU_t *sluGPU, SCT_t *SCT)
int ssendSCUdataHost2GPU(int_t streamId, int_t *lsub, int_t *usub, float *bigU, int_t bigu_send_size, int_t Remain_lbuf_send_size, ssluGPU_t *sluGPU, HyP_t *HyP)
int sSchurCompUpdate_GPU(int_t streamId, int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, int_t Rnbrow, int_t RemainBlk, int_t Remain_lbuf_send_size, int_t bigu_send_size, int_t ldu, int_t mcb, int_t buffer_size, int_t lsub_len, int_t usub_len, int_t ldt, int_t k0, ssluGPU_t *sluGPU, gridinfo_t *grid, SuperLUStat_t *)
int sinitD2Hreduce(int next_k, d2Hreduce_t *d2Hred, int last_flag, HyP_t *HyP, ssluGPU_t *sluGPU, gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t *SCT)
#define MAX_NGPU_STREAMS
Definition: slustruct_gpu.h:29
int sreduceAllAncestors3d_GPU(int_t ilvl, int_t *myNodeCount, int_t **treePerm, sLUValSubBuf_t *LUvsb, sLUstruct_t *LUstruct, gridinfo3d_t *grid3d, ssluGPU_t *sluGPU, d2Hreduce_t *d2Hred, factStat_t *factStat, HyP_t *HyP, SCT_t *SCT, SuperLUStat_t *)
int ssendLUpanelGPU2HOST(int_t k0, d2Hreduce_t *d2Hred, ssluGPU_t *sluGPU, SuperLUStat_t *)
int sfree_LUstruct_gpu(ssluGPU_t *sluGPU, SuperLUStat_t *)
int ssparseTreeFactor_ASYNC_GPU(sForest_t *sforest, commRequests_t **comReqss, sscuBufs_t *scuBufs, packLUInfo_t *packLUInfo, msgs_t **msgss, sLUValSubBuf_t **LUvsbs, sdiagFactBufs_t **dFBufs, factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t *gEtreeInfo, superlu_dist_options_t *options, int_t *gIperm_c_supno, int ldt, ssluGPU_t *sluGPU, d2Hreduce_t *d2Hred, HyP_t *HyP, sLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, double thresh, SCT_t *SCT, int tag_ub, int *info)
void sCopyLUToGPU3D(int *isNodeInMyGrid, sLocalLU_t *A_host, ssluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt, SuperLUStat_t *)
void ssyncAllfunCallStreams(ssluGPU_t *sluGPU, SCT_t *SCT)
Definition: superlu_defs.h:451
Definition: superlu_defs.h:854
Definition: superlu_defs.h:799
Definition: util_dist.h:199
Definition: util_dist.h:101
Definition: superlu_defs.h:789
Definition: superlu_defs.h:1012
Definition: superlu_defs.h:940
Definition: superlu_defs.h:1025
Definition: superlu_defs.h:927
Definition: superlu_defs.h:978
Definition: superlu_defs.h:414
Definition: superlu_defs.h:404
Definition: superlu_defs.h:903
Definition: superlu_defs.h:910
Definition: superlu_defs.h:1034
Definition: superlu_defs.h:844
Definition: superlu_defs.h:989
Definition: superlu_sdefs.h:310
Definition: slustruct_gpu.h:75
int_t nsupers
Definition: slustruct_gpu.h:106
float * acc_L_buff
Definition: slustruct_gpu.h:102
local_u_blk_info_t * local_u_blk_infoVec
Definition: slustruct_gpu.h:96
local_l_blk_info_t * local_l_blk_infoVec
Definition: slustruct_gpu.h:92
int_t * LrowindPtr
Definition: slustruct_gpu.h:77
int_t * jib_lookupPtr
Definition: slustruct_gpu.h:95
float * LnzvalVec
Definition: slustruct_gpu.h:79
int_t * local_u_blk_infoPtr
Definition: slustruct_gpu.h:98
int_t buffer_size
Definition: slustruct_gpu.h:105
int_t * xsup_host
Definition: slustruct_gpu.h:129
int_t * UrowindPtr
Definition: slustruct_gpu.h:84
int_t * UrowindVec
Definition: slustruct_gpu.h:83
int_t * UrowindPtr_host
Definition: slustruct_gpu.h:85
int_t * LnzvalPtr
Definition: slustruct_gpu.h:80
float * UnzvalVec
Definition: slustruct_gpu.h:88
int_t * LrowindVec
Definition: slustruct_gpu.h:76
int_t * UnzvalPtr
Definition: slustruct_gpu.h:89
int_t * jib_lookupVec
Definition: slustruct_gpu.h:94
int_t * xsup
Definition: slustruct_gpu.h:107
int_t first_l_block_gpu
Definition: slustruct_gpu.h:131
int_t * perm_c_supno
Definition: slustruct_gpu.h:130
int_t * local_l_blk_infoPtr
Definition: slustruct_gpu.h:93
int_t * LnzvalPtr_host
Definition: slustruct_gpu.h:81
int_t * UnzvalPtr_host
Definition: slustruct_gpu.h:86
Definition: superlu_sdefs.h:340
Definition: superlu_sdefs.h:97
Definition: slustruct_gpu.h:47
float * bigU
Definition: slustruct_gpu.h:50
int_t * indirect2
Definition: slustruct_gpu.h:53
Ublock_info_t * Ublock_info_host
Definition: slustruct_gpu.h:65
Ublock_info_t * Ublock_info
Definition: slustruct_gpu.h:63
Remain_info_t * Remain_info_host
Definition: slustruct_gpu.h:66
int_t * indirect
Definition: slustruct_gpu.h:52
int_t * usub_IndirectJ3
Definition: slustruct_gpu.h:68
float * bigU_host
Definition: slustruct_gpu.h:51
int_t * lsub
Definition: slustruct_gpu.h:58
int_t * usub
Definition: slustruct_gpu.h:59
int_t * usub_IndirectJ3_host
Definition: slustruct_gpu.h:69
int_t * lsub_buf
Definition: slustruct_gpu.h:61
float * Remain_L_buff
Definition: slustruct_gpu.h:55
float * bigV
Definition: slustruct_gpu.h:49
Remain_info_t * Remain_info
Definition: slustruct_gpu.h:64
float * Remain_L_buff_host
Definition: slustruct_gpu.h:56
Definition: superlu_sdefs.h:467
Definition: superlu_sdefs.h:461
Definition: slustruct_gpu.h:135
double acc_async_cost
Definition: slustruct_gpu.h:143
gpuStream_t CopyStream
Definition: slustruct_gpu.h:138
int * isNodeInMyGrid
Definition: slustruct_gpu.h:142
sLUstruct_gpu_t * A_gpu
Definition: slustruct_gpu.h:137
int nGPUStreams
Definition: slustruct_gpu.h:141
Definition: superlu_defs.h:728
int64_t int_t
Definition: superlu_defs.h:119
Distributed SuperLU data types and function prototypes.