SuperLU Distributed 9.0.0
gpu3d
pdgstrf3d_upacked_impl.hpp
Go to the documentation of this file.
1#pragma once
2#include "superlu_ddefs.h"
3
4#ifdef MAP_PROFILE
5#include "mapsampler_api.h"
6#endif
7
8#ifdef HAVE_CUDA
9#include "dlustruct_gpu.h"
10// #include "acc_aux.c"
11#endif
12
13
14#include "lupanels.hpp"
15#include "superlu_upacked.h"
17#include "anc25d-GPU_impl.hpp"
19#include "anc25d_impl.hpp"
20#include "dsparseTreeFactorGPU_impl.hpp" //needed???
23#include "l_panels_impl.hpp"
24#include "u_panels_impl.hpp"
25#include "lupanels_impl.hpp"
26#include "lupanels_GPU_impl.hpp"
29// #include "sparseTreeFactor_impl.hpp"
30// pxgstrf3d<double>
31template <typename Ftype>
33 trf3dpartitionType<Ftype> *trf3Dpartition, SCT_t *SCT,
34 LUStruct_type<Ftype> *LUstruct, gridinfo3d_t *grid3d,
35 SuperLUStat_t *stat, int *info)
36{
37 gridinfo_t *grid = &(grid3d->grid2d);
38 LocalLU_type<Ftype> *Llu = LUstruct->Llu;
39
40 // problem specific contants
41 int_t ldt = sp_ienv_dist(3, options); /* Size of maximum supernode */
42 // double s_eps = slamch_ ("Epsilon"); -Sherry
43 AnormType<Ftype> s_eps = smach_dist("Epsilon");
44 AnormType<Ftype> thresh = s_eps * anorm;
45
46#if (DEBUGlevel >= 1)
47 CHECK_MALLOC(grid3d->iam, "Enter pdgstrf3d_upacked()");
48#endif
49
50 // Initilize stat
51 stat->ops[FACT] = 0;
52 stat->current_buffer = 0.0;
53 stat->peak_buffer = 0.0;
54 stat->gpu_buffer = 0.0;
55 // if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt);
56
57 // getting Nsupers
58 int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
59
60 // Grid related Variables
61 int_t iam = grid->iam; // in 2D grid
62 int num_threads = getNumThreads(grid3d->iam);
63
64 SCT->tStartup = SuperLU_timer_();
65
66 // tag_ub initialization
67 int tag_ub = set_tag_ub();
68 int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
69
70#if (PRNTlevel >= 1)
71 if (!iam)
72 {
73 printf("MPI tag upper bound = %d\n", tag_ub);
74 fflush(stdout);
75 }
76#endif
77
78 gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo;
79 int_t *iperm_c_supno = trf3Dpartition->iperm_c_supno;
80 int_t *myNodeCount = trf3Dpartition->myNodeCount;
81 int_t *myTreeIdxs = trf3Dpartition->myTreeIdxs;
82 int_t *myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
83 sForest_t **sForests = trf3Dpartition->sForests;
84 int_t **treePerm = trf3Dpartition->treePerm;
85 LUValSubBuf_type<Ftype> *LUvsb = trf3Dpartition->LUvsb;
86
87 /* Initializing factorization specific buffers */
88
89 int_t numLA = getNumLookAhead(options);
90
91 int_t mxLeafNode = 0;
92 for (int ilvl = 0; ilvl < maxLvl; ++ilvl)
93 {
94 if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode)
95 mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1];
96 }
97 diagFactBufs_type<Ftype> **dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid);
98
99 /*******************************************
100 *
101 * New code starts
102 * ******************************************/
103 // Create the new LU structure
104 int *isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm);
105 int superlu_acc_offload = sp_ienv_dist(10, options); //get_acc_offload();
106 double tConst = SuperLU_timer_();
107 xLUstruct_t<Ftype> LU_packed(nsupers, ldt, trf3Dpartition, LUstruct, grid3d,
108 SCT, options, stat, thresh, info);
109
110 tConst = SuperLU_timer_() - tConst;
111 printf("Time to intialize New DS= %g\n", tConst);
112
113 /*==== starting main factorization loop =====*/
114 MPI_Barrier(grid3d->comm);
115 SCT->tStartup = SuperLU_timer_() - SCT->tStartup;
116#if 1
117 LU_packed.pdgstrf3d();
118#else
120
121 for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)
122 {
123 /* if I participate in this level */
124 if (!myZeroTrIdxs[ilvl])
125 {
126
127 sForest_t *sforest = sForests[myTreeIdxs[ilvl]];
128
129 /* main loop over all the supernodes */
130 if (sforest) /* 2D factorization at individual subtree */
131 {
132 double tilvl = SuperLU_timer_();
133
134 if (superlu_acc_offload)
135 #ifdef HAVE_CUDA
136 LU_packed.dsparseTreeFactorGPU(sforest, dFBufs,
137 &gEtreeInfo,
138 tag_ub);
139 #endif
140 else
141 LU_packed.dsparseTreeFactor(sforest, dFBufs,
142 &gEtreeInfo,
143 tag_ub);
144
145 /*now reduce the updates*/
146 SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl;
147 sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl];
148 }
149
150 if (ilvl < maxLvl - 1) /*then reduce before factorization*/
151 {
152 if (superlu_acc_offload)
153 {
154#ifdef HAVE_CUDA
155 //#define NDEBUG
156#ifndef NDEBUG
157 LU_packed.checkGPU();
158 LU_packed.ancestorReduction3d(ilvl, myNodeCount, treePerm);
159#endif
160 LU_packed.ancestorReduction3dGPU(ilvl, myNodeCount, treePerm);
161#ifndef NDEBUG
162 LU_packed.checkGPU();
163#endif
164#endif
165 }
166
167 else
168 LU_packed.ancestorReduction3d(ilvl, myNodeCount, treePerm);
169 }
170 } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/
171
172 SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
173 : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
174 } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
175
176 MPI_Barrier(grid3d->comm);
178
179#endif /* match line 106 */
180
181 double tXferGpu2Host = SuperLU_timer_();
182 if (superlu_acc_offload)
183 {
184 #ifdef HAVE_CUDA
185 cudaStreamSynchronize(LU_packed.A_gpu.cuStreams[0]); // in theory I don't need it
186 LU_packed.copyLUGPUtoHost();
187 #endif
188 }
189
190 LU_packed.packedU2skyline(LUstruct);
191 tXferGpu2Host = SuperLU_timer_() - tXferGpu2Host;
192 printf("Time to send data back= %g\n", tXferGpu2Host);
193
194 if (!grid3d->zscp.Iam)
195 {
196 // SCT_printSummary(grid, SCT);
197 // if (superlu_acc_offload )
198 // printGPUStats(sluGPU->A_gpu, grid);
199 }
200
201#ifdef ITAC_PROF
202 VT_traceoff();
203#endif
204
205#ifdef MAP_PROFILE
206 allinea_stop_sampling();
207#endif
208
209 reduceStat(FACT, stat, grid3d);
210
211 /* free L panels and U panels. called in LUstruct_v100 destructor */
212 // dfreeDiagFactBufsArr(mxLeafNode, dFBufs);
213
214#if (DEBUGlevel >= 1)
215 CHECK_MALLOC(grid3d->iam, "Exit pdgstrf3d_upacked()");
216#endif
217 return 0;
218
219} /* pdgstrf3d_upacked */
220
221
222/* This can be accessed from the C handle */
223template <typename Ftype>
225{
226 int tag_ub = set_tag_ub();
227 gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo;
228 // int_t *iperm_c_supno = trf3Dpartition->iperm_c_supno;
229 int_t *myNodeCount = trf3Dpartition->myNodeCount;
230 int_t *myTreeIdxs = trf3Dpartition->myTreeIdxs;
231 int_t *myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
232 sForest_t **sForests = trf3Dpartition->sForests;
233 int_t **treePerm = trf3Dpartition->treePerm;
234
235#if (DEBUGlevel >= 1)
236 CHECK_MALLOC(grid3d->iam, "Enter pdgstrf3d()");
237 printf(".maxLvl %d\n", maxLvl);
238#endif
239
240 SCT->pdgstrfTimer = SuperLU_timer_();
241 // get environment variables ANC25D
242 int useAnc25D = 0;
243 if (getenv("ANC25D"))
244 useAnc25D = atoi(getenv("ANC25D"));
245 if (useAnc25D)
246 printf("-- Using ANC25D; ONLY CPU supported \n");
247
248 for (int ilvl = 0; ilvl < maxLvl; ++ilvl) /* maxLvel is the tree level
249 along Z-dim process grid */
250 {
251 if (useAnc25D)
252 {
253 sForest_t *sforest = sForests[myTreeIdxs[ilvl]];
254 if (sforest) /* 2D factorization at individual subtree */
255 {
256 double tilvl = SuperLU_timer_();
257 if (superlu_acc_offload)
258 {
259 printf("-- ANC25D on GPU is not working yet!!!!! \n");
260 if (ilvl == 0)
261 dsparseTreeFactorGPU(sforest, dFBufs,
262 &gEtreeInfo,
263 tag_ub);
264 else
265 dAncestorFactorBaselineGPU(ilvl, sforest, dFBufs,
266 &gEtreeInfo,
267 tag_ub);
268 }
269 else
270 {
271 if (ilvl == 0)
272 dsparseTreeFactor(sforest, dFBufs,
273 &gEtreeInfo,
274 tag_ub);
275 else
276 dAncestorFactorBaseline(ilvl, sforest, dFBufs,
277 &gEtreeInfo,
278 tag_ub);
279 }
280
281 /*now reduce the updates*/
282 SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl;
283 sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl];
284 }
285 }
286 else
287 {
288 /* if I participate in this level */
289 if (!myZeroTrIdxs[ilvl])
290 {
291
292 sForest_t *sforest = sForests[myTreeIdxs[ilvl]];
293
294 /* main loop over all the supernodes */
295 if (sforest) /* 2D factorization at individual subtree */
296 {
297 double tilvl = SuperLU_timer_();
298
299 if ( superlu_acc_offload ) {
300 if ( options->batchCount==0 )
301 dsparseTreeFactorGPU(sforest, dFBufs, &gEtreeInfo, tag_ub);
302 else {
303 printf("Batch ERROR: should not get to this branch!\n");
304 // Sherry commented out the following
305 //dsparseTreeFactorBatchGPU(sforest, dFBufs, &gEtreeInfo, tag_ub);
306 }
307 } else {
308 dsparseTreeFactor(sforest, dFBufs,
309 &gEtreeInfo,
310 tag_ub);
311
312 }
313
314 /*now reduce the updates*/
315 SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl;
316 sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl];
317 }
318
319 if (ilvl < maxLvl - 1) /*then reduce before factorization*/
320 {
321 if (superlu_acc_offload)
322 {
323 //#define NDEBUG
324 // #ifndef NDEBUG
325 // checkGPU();
326 // ancestorReduction3d(ilvl, myNodeCount, treePerm);
327 // #endif
328
329 ancestorReduction3dGPU(ilvl, myNodeCount, treePerm);
330
331 // #ifndef NDEBUG
332 // checkGPU();
333 // #endif
334 }
335
336 else
337 this->ancestorReduction3d(ilvl, myNodeCount, treePerm);
338 }
339 } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/
340 }
341
342 SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
343 : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
344 } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
345
346 MPI_Barrier(grid3d->comm);
347 SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
348
349#if (DEBUGlevel >= 1)
350 CHECK_MALLOC(grid3d->iam, "Exit pdgstrf3d()");
351#endif
352
353 // dfreeDiagFactBufsArr(maxLeafNodes, dFBufs); // called in LUstruct_v100 destructor
354
355 return 0;
356} /* pdgstrf3d */
357
358
359
360// UrowindPtr_host
Descriptions and declarations for structures used in GPU.
typename std::conditional< std::is_same< Ftype, double >::value, dLocalLU_t, typename std::conditional< std::is_same< Ftype, float >::value, sLocalLU_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zLocalLU_t, void >::type >::type >::type LocalLU_type
Definition: luAuxStructTemplated.hpp:117
typename std::conditional< std::is_same< Ftype, double >::value, dLUstruct_t, typename std::conditional< std::is_same< Ftype, float >::value, sLUstruct_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zLUstruct_t, void >::type >::type >::type LUStruct_type
Definition: luAuxStructTemplated.hpp:102
typename std::conditional< std::is_same< Ftype, double >::value, dtrf3Dpartition_t, typename std::conditional< std::is_same< Ftype, float >::value, strf3Dpartition_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, ztrf3Dpartition_t, void >::type >::type >::type trf3dpartitionType
Definition: luAuxStructTemplated.hpp:87
typename std::conditional< std::is_same< Ftype, double >::value, dLUValSubBuf_t, typename std::conditional< std::is_same< Ftype, float >::value, sLUValSubBuf_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zLUValSubBuf_t, void >::type >::type >::type LUValSubBuf_type
Definition: luAuxStructTemplated.hpp:132
typename std::conditional< std::is_same< Ftype, float >::value, float, typename std::conditional< std::is_same< Ftype, double >::value||std::is_same< Ftype, doublecomplex >::value, double, float >::type >::type AnormType
Definition: luAuxStructTemplated.hpp:59
typename std::conditional< std::is_same< Ftype, double >::value, ddiagFactBufs_t, typename std::conditional< std::is_same< Ftype, float >::value, sdiagFactBufs_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zdiagFactBufs_t, void >::type >::type >::type diagFactBufs_type
Definition: luAuxStructTemplated.hpp:147
int_t pdgstrf3d_upacked(superlu_dist_options_t *options, int m, int n, AnormType< Ftype > anorm, trf3dpartitionType< Ftype > *trf3Dpartition, SCT_t *SCT, LUStruct_type< Ftype > *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, int *info)
Definition: pdgstrf3d_upacked_impl.hpp:32
Definition: util_dist.h:199
double NetSchurUpTimer
Definition: util_dist.h:218
double tSchCompUdt3d[MAX_3D_LEVEL]
Definition: util_dist.h:307
double tStartup
Definition: util_dist.h:313
double tFactor3D[MAX_3D_LEVEL]
Definition: util_dist.h:306
double pdgstrfTimer
Definition: util_dist.h:257
Definition: util_dist.h:101
float peak_buffer
Definition: util_dist.h:110
float current_buffer
Definition: util_dist.h:109
float gpu_buffer
Definition: util_dist.h:111
flops_t * ops
Definition: util_dist.h:104
Definition: superlu_defs.h:978
Definition: superlu_defs.h:414
gridinfo_t grid2d
Definition: superlu_defs.h:419
superlu_scope_t zscp
Definition: superlu_defs.h:418
int iam
Definition: superlu_defs.h:420
MPI_Comm comm
Definition: superlu_defs.h:415
Definition: superlu_defs.h:404
int iam
Definition: superlu_defs.h:408
Definition: superlu_defs.h:989
treeTopoInfo_t topoInfo
Definition: superlu_defs.h:999
double cost
Definition: superlu_defs.h:1007
Definition: superlu_defs.h:728
int Np
Definition: superlu_defs.h:399
int Iam
Definition: superlu_defs.h:400
int_t * eTreeTopLims
Definition: superlu_defs.h:972
Definition: xlupanels.hpp:335
xLUstructGPU_t< Ftype > A_gpu
Definition: xlupanels.hpp:412
int_t packedU2skyline(LUStruct_type< Ftype > *LUstruct)
Definition: lupanels_impl.hpp:427
int_t dsparseTreeFactor(sForest_t *sforest, diagFactBufs_type< Ftype > **dFBufs, gEtreeInfo_t *gEtreeInfo, int tag_ub)
Definition: dsparseTreeFactor_upacked_impl.hpp:8
int_t pdgstrf3d()
Definition: pdgstrf3d_upacked_impl.hpp:224
int_t ancestorReduction3d(int_t ilvl, int_t *myNodeCount, int_t **treePerm)
Definition: lupanels_comm3d_impl.hpp:9
Distributed SuperLU data types and function prototypes.
int getNsupers(int, Glu_persist_t *)
Definition: trfAux.c:42
ddiagFactBufs_t ** dinitDiagFactBufsArr(int mxLeafNode, int ldt, gridinfo_t *grid)
int_t dsparseTreeFactor(int_t nnodes, int_t *perm_c_supno, treeTopoInfo_t *treeTopoInfo, commRequests_t *comReqs, dscuBufs_t *scuBufs, packLUInfo_t *packLUInfo, msgs_t *msgs, dLUValSubBuf_t *LUvsb, ddiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, int_t *gIperm_c_supno, int_t ldt, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, double thresh, SCT_t *SCT, int *info)
int_t log2i(int_t index)
Definition: supernodal_etree.c:17
float smach_dist(const char *)
Definition: smach_dist.c:16
int_t reduceStat(PhaseType PHASE, SuperLUStat_t *stat, gridinfo3d_t *grid3d)
Definition: util.c:1366
int set_tag_ub(void)
Definition: trfAux.c:48
int sp_ienv_dist(int, superlu_dist_options_t *)
Definition: sp_ienv.c:80
int_t getNumLookAhead(superlu_dist_options_t *)
Definition: treeFactorization.c:186
int * getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t *myNodeCount, int_t **treePerm)
Definition: supernodalForest.c:307
int getNumThreads(int)
Definition: trfAux.c:61
int64_t int_t
Definition: superlu_defs.h:119
@ FACT
Definition: superlu_enum_consts.h:74
double SuperLU_timer_()
Definition: superlu_timer.c:72
#define CHECK_MALLOC(pnum, where)
Definition: util_dist.h:56