SuperLU Distributed 9.0.0
gpu3d
lupanels_impl.hpp
Go to the documentation of this file.
1#pragma once
2#include <algorithm>
3#include <iostream>
4#include <cassert>
5#include "superlu_defs.h"
7#ifdef HAVE_CUDA
8#include "lupanels_GPU.cuh"
9#include "xlupanels_GPU.cuh"
10#endif
11#include "lupanels.hpp" //unneeded??
12#include "xlupanels.hpp"
13#include "superlu_blas.hpp"
14
// Fills an array of diagonal-factorization buffers and returns it.
//
// For each of the num_bufs entries, two ldt*ldt arrays of Ftype (BlockUFactor and
// BlockLFactor) are obtained with SUPERLU_MALLOC; the returned pointers are not
// checked for NULL here.
//
// NOTE(review): listing lines 16, 20 and 24 are absent from this extract. The
// cross-reference index gives the signature as
//   diagFactBufs_type<Ftype> **initDiagFactBufsArr(int_t mxLeafNode, int_t ldt)
// while the body below uses `num_bufs`; the missing body lines presumably allocate
// dFBufs and dFBufs[i] -- confirm against the original file.
15template <typename Ftype>
17{
18
19 // diagFactBufs_type<Ftype> **dFBufs = new diagFactBufs_type<Ftype> *[num_bufs]; // use SuperLU_MALLOC instead
21 for (int i = 0; i < num_bufs; i++)
22 {
23 // dFBufs[i] = new diagFactBufs_type<Ftype>; // use SuperLU_MALLOC instead
25 dFBufs[i]->BlockUFactor = (Ftype *)SUPERLU_MALLOC(ldt * ldt * sizeof(Ftype));
26 dFBufs[i]->BlockLFactor = (Ftype *)SUPERLU_MALLOC(ldt * ldt * sizeof(Ftype));
27 }
28 return dFBufs;
29}
30
// Releases an array of diagonal-factorization buffers.
//
// For each of the num_bufs entries this frees BlockUFactor, BlockLFactor and then
// the entry itself; the outer dFBufs array is freed only when num_bufs is nonzero.
// Always returns 0.
//
// NOTE(review): listing line 32 (the signature) is absent from this extract;
// `num_bufs` and `dFBufs` are presumably its parameters -- confirm.
31template <typename Ftype>
33{
34 for (int i = 0; i < num_bufs; i++)
35 {
36 SUPERLU_FREE(dFBufs[i]->BlockUFactor);
37 SUPERLU_FREE(dFBufs[i]->BlockLFactor);
38 SUPERLU_FREE(dFBufs[i]);
39 }
40 /* Sherry fix:
41 * mxLeafNode can be 0 for the replicated layers of the processes ?? */
42 if ( num_bufs ) SUPERLU_FREE(dFBufs);
43
44 return 0;
45}
46
47
48#ifdef HAVE_CUDA
// Selects the U panel to use for supernode k (compiled only when HAVE_CUDA is defined).
//
// When myrow == krow(k) the locally stored uPanelVec[g2lRow(k)] is returned.
// Otherwise a temporary xupanel_t<Ftype> is built over the receive buffers at
// position `offset`: the UidxRecvBufs/UvalRecvBufs members and their A_gpu
// counterparts.
//
// NOTE(review): listing line 50 (the signature) is absent from this extract; `k`
// and `offset` are presumably its parameters -- confirm.
49template <typename Ftype>
51{
52 return (
53 myrow == krow(k) ?
54 uPanelVec[g2lRow(k)] :
55 xupanel_t<Ftype>(UidxRecvBufs[offset], UvalRecvBufs[offset],
56 A_gpu.UidxRecvBufs[offset], A_gpu.UvalRecvBufs[offset])
57 );
58}
59
// Selects the L panel to use for supernode k (compiled only when HAVE_CUDA is defined).
//
// When mycol == kcol(k) the locally stored lPanelVec[g2lCol(k)] is returned.
// Otherwise a temporary xlpanel_t<Ftype> is built over the receive buffers at
// position `offset`: the LidxRecvBufs/LvalRecvBufs members and their A_gpu
// counterparts.
//
// NOTE(review): listing line 61 (the signature) is absent from this extract; `k`
// and `offset` are presumably its parameters -- confirm.
60template <typename Ftype>
62{
63 return (
64 mycol == kcol(k) ?
65 lPanelVec[g2lCol(k)] :
66 xlpanel_t<Ftype>(LidxRecvBufs[offset], LvalRecvBufs[offset],
67 A_gpu.LidxRecvBufs[offset], A_gpu.LvalRecvBufs[offset])
68 );
69}
70#endif
71
72template <typename Ftype>
73Ftype* getBigV(int_t ldt, int_t num_threads)
74{
75 Ftype *bigV;
76 if (!(bigV = (Ftype*) SUPERLU_MALLOC (8 * ldt * ldt * num_threads * sizeof(Ftype))))
77 ABORT ("Malloc failed for dgemm buffV");
78 return bigV;
79}
80
81/* Constructor */
// Builds the panel-based view of the distributed LU structure for this process.
//
// Steps visible in this listing:
//   1. cache grid geometry (iam, Pr, Pc, myrow, mycol) and the supernode
//      boundaries xsup;
//   2. wrap every local L block column / U block row that is non-NULL and flagged
//      in isNodeInMyGrid into lPanelVec / uPanelVec, recording value and index sizes;
//   3. exchange those sizes with MPI_Bcast so every process holds the per-supernode
//      L/U value and index counts;
//   4. allocate scratch space (bigV, indirect), the look-ahead receive buffers and
//      the diagonal-factor buffers;
//   5. determine the largest leaf-node count over this process's forests;
//   6. when HAVE_CUDA is defined, set up the GPU copy via setLUstruct_GPU().
//
// NOTE(review): this extract is missing several listing lines (83, 97, 115-116,
// 216, 220-221, 224-227, 237-238, 247, 249, 254, 278, 283). As a result nsupers_,
// ldt_, nThreads, isNodeInMyGrid, lPanelVec/uPanelVec, indirectRow/indirectCol and
// the *RecvBufs vectors are used below without a visible declaration or
// allocation. The cross-reference index gives the signature head as
//   xLUstruct_t(int_t nsupers, int_t ldt_, ...)
82template <typename Ftype>
84 trf3dpartitionType<Ftype> *trf3Dpartition_,
85 LUStruct_type<Ftype> *LUstruct,
86 gridinfo3d_t *grid3d_in,
87 SCT_t *SCT_, superlu_dist_options_t *options_,
88 SuperLUStat_t *stat_,
89 threshPivValType<Ftype> thresh_, int *info_) :
90 nsupers(nsupers_), trf3Dpartition(trf3Dpartition_),
91 ldt(ldt_), /* maximum supernode size */
92 grid3d(grid3d_in), SCT(SCT_),
93 options(options_), stat(stat_),
94 thresh(thresh_), info(info_), anc25d(grid3d_in)
95{
96 maxLvl = log2i(grid3d->zscp.Np) + 1;
98 superlu_acc_offload = sp_ienv_dist(10, options); // get_acc_offload();
99
100#if (DEBUGlevel >= 1)
101 CHECK_MALLOC(grid3d_in->iam, "Enter xLUstruct_t constructor");
102#endif
103 grid = &(grid3d->grid2d);
104 iam = grid->iam;
105 Pc = grid->npcol;
106 Pr = grid->nprow;
107 myrow = MYROW(iam, grid);
108 mycol = MYCOL(iam, grid);
109 xsup = LUstruct->Glu_persist->xsup;
110 int_t **Lrowind_bc_ptr = LUstruct->Llu->Lrowind_bc_ptr;
111 int_t **Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
112 Ftype **Lnzval_bc_ptr = LUstruct->Llu->Lnzval_bc_ptr;
113 Ftype **Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
114
117 // create the lvectors
118 maxLvalCount = 0;
119 maxLidxCount = 0;
120 maxUvalCount = 0;
121 maxUidxCount = 0;
122
// Per-process counts, indexed by local block column (L) or local block row (U);
// entries stay 0 for panels that are skipped below.
123 std::vector<int_t> localLvalSendCounts(CEILING(nsupers, Pc), 0);
124 std::vector<int_t> localUvalSendCounts(CEILING(nsupers, Pr), 0);
125 std::vector<int_t> localLidxSendCounts(CEILING(nsupers, Pc), 0);
126 std::vector<int_t> localUidxSendCounts(CEILING(nsupers, Pr), 0);
127
128 for (int_t i = 0; i < CEILING(nsupers, Pc); ++i)
129 {
// Global supernode index of local block column i.
130 int_t k0 = i * Pc + mycol;
131 if (Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[k0] == 1)
132 {
133 int_t isDiagIncluded = 0;
134
135 if (myrow == krow(k0))
136 isDiagIncluded = 1;
137 xlpanel_t<Ftype> lpanel(k0, Lrowind_bc_ptr[i], Lnzval_bc_ptr[i], xsup, isDiagIncluded);
138 lPanelVec[i] = lpanel;
139 maxLvalCount = std::max(lPanelVec[i].nzvalSize(), maxLvalCount);
140 maxLidxCount = std::max(lPanelVec[i].indexSize(), maxLidxCount);
141 localLvalSendCounts[i] = lPanelVec[i].nzvalSize();
142 localLidxSendCounts[i] = lPanelVec[i].indexSize();
143 }
144 }
145
146 // create the vectors
147 for (int_t i = 0; i < CEILING(nsupers, Pr); ++i)
148 {
149 if (Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1)
150 {
151 int_t globalId = i * Pr + myrow;
152 xupanel_t<Ftype> upanel(globalId, Ufstnz_br_ptr[i], Unzval_br_ptr[i], xsup);
153 uPanelVec[i] = upanel;
154 maxUvalCount = std::max(uPanelVec[i].nzvalSize(), maxUvalCount);
155 maxUidxCount = std::max(uPanelVec[i].indexSize(), maxUidxCount);
156 localUvalSendCounts[i] = uPanelVec[i].nzvalSize();
157 localUidxSendCounts[i] = uPanelVec[i].indexSize();
158 }
159 }
160
161 // compute the send sizes
162 // send and recv count for 2d comm
163 LvalSendCounts.resize(nsupers);
164 UvalSendCounts.resize(nsupers);
165 LidxSendCounts.resize(nsupers);
166 UidxSendCounts.resize(nsupers);
167
168 std::vector<int_t> recvBuf(std::max(CEILING(nsupers, Pr), CEILING(nsupers, Pc)), 0);
169
// Every process loads its own counts into recvBuf; the broadcast rooted at pr then
// overwrites the buffer on the non-root ranks, and entry i is stored as the count
// for global supernode i * Pr + pr.
170 for (int pr = 0; pr < Pr; pr++)
171 {
172 int npr = CEILING(nsupers, Pr);
173 std::copy(localUvalSendCounts.begin(), localUvalSendCounts.end(), recvBuf.begin());
174 // Send the value counts ;
175 MPI_Bcast((void *)recvBuf.data(), npr, mpi_int_t, pr, grid3d->cscp.comm);
176 for (int i = 0; i * Pr + pr < nsupers; i++)
177 {
178 UvalSendCounts[i * Pr + pr] = recvBuf[i];
179 }
180
181 std::copy(localUidxSendCounts.begin(), localUidxSendCounts.end(), recvBuf.begin());
182 // send the index count
183 MPI_Bcast((void *)recvBuf.data(), npr, mpi_int_t, pr, grid3d->cscp.comm);
184 for (int i = 0; i * Pr + pr < nsupers; i++)
185 {
186 UidxSendCounts[i * Pr + pr] = recvBuf[i];
187 }
188 }
189
// Same exchange for the L counts, rooted at each pc in turn over rscp.comm.
190 for (int pc = 0; pc < Pc; pc++)
191 {
192 int npc = CEILING(nsupers, Pc);
193 std::copy(localLvalSendCounts.begin(), localLvalSendCounts.end(), recvBuf.begin());
194 // Send the value counts ;
195 MPI_Bcast((void *)recvBuf.data(), npc, mpi_int_t, pc, grid3d->rscp.comm);
196 for (int i = 0; i * Pc + pc < nsupers; i++)
197 {
198 LvalSendCounts[i * Pc + pc] = recvBuf[i];
199 }
200
201 std::copy(localLidxSendCounts.begin(), localLidxSendCounts.end(), recvBuf.begin());
202 // send the index count
203 MPI_Bcast((void *)recvBuf.data(), npc, mpi_int_t, pc, grid3d->rscp.comm);
204 for (int i = 0; i * Pc + pc < nsupers; i++)
205 {
206 LidxSendCounts[i * Pc + pc] = recvBuf[i];
207 }
208 }
209
// The maxima are recomputed from the exchanged counts, replacing the local-only
// maxima gathered in the two panel loops above.
// NOTE(review): if nsupers is 0 these vectors are empty and dereferencing the
// iterator returned by std::max_element is undefined.
210 maxUvalCount = *std::max_element(UvalSendCounts.begin(), UvalSendCounts.end());
211 maxUidxCount = *std::max_element(UidxSendCounts.begin(), UidxSendCounts.end());
212 maxLvalCount = *std::max_element(LvalSendCounts.begin(), LvalSendCounts.end());
213 maxLidxCount = *std::max_element(LidxSendCounts.begin(), LidxSendCounts.end());
214
215 // Allocate bigV, indirect
217 // bigV = dgetBigV(ldt, nThreads);
218 bigV = getBigV<Ftype>(ldt, nThreads);
// One ldt-long slice of `indirect` per thread; the result is not checked for NULL.
219 indirect = (int_t *)SUPERLU_MALLOC(nThreads * ldt * sizeof(int_t));
222
223 // allocating communication buffers
228 // bcastLval.resize(options->num_lookaheads);
229 // bcastUval.resize(options->num_lookaheads);
230 // bcastLidx.resize(options->num_lookaheads);
231 // bcastUidx.resize(options->num_lookaheads);
232
233 for (int i = 0; i < options->num_lookaheads; i++)
234 {
235 LvalRecvBufs[i] = (Ftype *)SUPERLU_MALLOC(sizeof(Ftype) * maxLvalCount);
236 UvalRecvBufs[i] = (Ftype *)SUPERLU_MALLOC(sizeof(Ftype) * maxUvalCount);
239
240 //TODO: check if setup correctly
// The bcastStruct setup below is compiled out by `#if 0`.
241 #pragma warning disabling bcaststruct
242 #if 0
243 bcastStruct bcLval(grid3d->rscp.comm, MPI_DOUBLE, SYNC);
244 bcastLval[i] = bcLval;
245 bcastStruct bcUval(grid3d->cscp.comm, MPI_DOUBLE, SYNC);
246 bcastUval[i] = bcUval;
248 bcastLidx[i] = bcLidx;
250 bcastUidx[i] = bcUidx;
251 #endif
252 }
253
255 diagFactBufs.resize(numDiagBufs); /* Sherry?? numDiagBufs == 32 hard-coded */
256 // bcastDiagRow.resize(numDiagBufs);
257 // bcastDiagCol.resize(numDiagBufs);
258
259 for (int i = 0; i < numDiagBufs; i++) /* Sherry?? these structures not used */
260 {
261 diagFactBufs[i] = (Ftype *)SUPERLU_MALLOC(sizeof(Ftype) * ldt * ldt);
262 // bcastStruct bcDiagRow(grid3d->rscp.comm, MPI_DOUBLE, SYNC);
263 // bcastDiagRow[i] = bcDiagRow;
264 // bcastStruct bcDiagCol(grid3d->cscp.comm, MPI_DOUBLE, SYNC);
265 // bcastDiagCol[i] = bcDiagCol;
266 }
267
// Largest eTreeTopLims[1] over the non-null forests this process holds, one per level.
268 int mxLeafNode = 0;
269 int_t *myTreeIdxs = trf3Dpartition->myTreeIdxs;
270 // int_t *myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
271 sForest_t **sForests = trf3Dpartition->sForests;
272 for (int ilvl = 0; ilvl < maxLvl; ++ilvl)
273 {
274 if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode)
275 mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1];
276 }
277 //Yang: how is dFBufs being used in the c++ factorization code? Shall we call dinitDiagFactBufsArrMod instead to save memory?
279 maxLeafNodes = mxLeafNode;
280
281
// tGPU measures the wall time of the GPU setup block; it is printed only when
// PRNTlevel >= 1.
282 double tGPU = SuperLU_timer_();
284 {
285 #ifdef HAVE_CUDA
286 setLUstruct_GPU(); /* Set up LU structure and buffers on GPU */
287
288 // TODO: remove it, checking is very slow
289 if(0)
290 checkGPU();
291 #endif
292 }
293
294 tGPU = SuperLU_timer_() -tGPU;
295#if ( PRNTlevel >= 1 )
296 printf("Time to intialize GPU DS= %g\n",tGPU );
297#endif
298
299 // if (superluAccOffload)
300
301 // for(int pc=0;pc<Pc; pc++)
302 // {
303 // MPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
304 // ...
305 // }
306
307#if (DEBUGlevel >= 1)
308 CHECK_MALLOC(grid3d_in->iam, "Exit xLUstruct_t constructor");
309#endif
310
311} /* constructor xLUstruct_t */
312
// Applies blockUpdate to every (L block, U block) pair of the given panels for
// supernode k.
//
// Returns 0 immediately when either panel is empty. When myrow == krow(k) the
// first L block (index 0) is skipped -- presumably the diagonal block; confirm.
// The remaining (nlb - st_lb) x nub pairs are flattened into a single index so one
// OpenMP loop covers them all. Always returns 0.
//
// NOTE(review): listing line 314 (start of the signature) is absent from this
// extract; the cross-reference index gives it as
//   int_t dSchurComplementUpdate(int_t k, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel)
313template <typename Ftype>
315 int_t k, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel)
316{
317 if (lpanel.isEmpty() || upanel.isEmpty())
318 return 0;
319
320 int_t st_lb = 0;
321 if (myrow == krow(k))
322 st_lb = 1;
323
324 int_t nlb = lpanel.nblocks();
325 int_t nub = upanel.nblocks();
326
327#pragma omp parallel for
328 for (size_t ij = 0; ij < (nlb - st_lb) * nub; ij++)
329 {
// Recover the L block index (offset by st_lb) and the U block index from ij.
330 int_t ii = ij / nub + st_lb;
331 int_t jj = ij % nub;
332 blockUpdate(k, ii, jj, lpanel, upanel);
333 }
334
335 return 0;
336}
337
// Maps each entry of srcVec to its position within dstVec.
//
// If dstVec is NULL the destination is treated as uncompressed and srcVec itself
// is returned. Otherwise the calling thread's ldt-long slice of `indirect` is
// filled so that dstIdx[dstVec[i]] == i, and the result RCmap[i] = dstIdx[srcVec[i]]
// is written into the calling thread's slice of indirectRow (direction == ROW_MAP)
// or indirectCol (any other direction). The returned pointer refers to that shared
// per-thread scratch, so a later call on the same thread with the same direction
// overwrites it.
//
// NOTE(review): listing line 340 (start of the signature) is absent from this
// extract; the cross-reference index gives it as
//   int_t *computeIndirectMap(indirectMapType direction, int_t srcLen, int_t *srcVec,
//                             int_t dstLen, int_t *dstVec)
338// should be called from an openMP region
339template <typename Ftype>
341 int_t dstLen, int_t *dstVec)
342{
343 if (dstVec == NULL) /*uncompressed dimension*/
344 {
345 return srcVec;
346 }
347 int_t thread_id;
348#ifdef _OPENMP
349 thread_id = omp_get_thread_num();
350#else
351 thread_id = 0;
352#endif
353 int_t *dstIdx = indirect + thread_id * ldt;
354 for (int_t i = 0; i < dstLen; i++)
355 {
356 // if(thread_id < dstLen)
357 dstIdx[dstVec[i]] = i;
358 }
359
360 int_t *RCmap = (direction == ROW_MAP) ? indirectRow : indirectCol;
361 RCmap += thread_id * ldt;
362
363 for (int_t i = 0; i < srcLen; i++)
364 {
365 RCmap[i] = dstIdx[srcVec[i]];
366 }
367
368 return RCmap;
369}
370
// Subtracts an m-by-n source block from the stored block (gi, gj).
//
// Src is read as Src[i + ldsrc * j]. The destination is looked up in the U panel
// of local row g2lRow(gi) when gj > gi, and in the L panel of local column
// g2lCol(gj) otherwise. In the U case rows are addressed directly (dstRowList is
// NULL) and columns go through the block's colList; in the L case rows go through
// the block's rowList and columns are addressed directly. Always returns 0.
//
// NOTE(review): the block index returned by find() is used without a check
// against GLOBAL_BLOCK_NOT_FOUND -- confirm callers guarantee the block exists.
// NOTE(review): listing line 372 (start of the signature) is absent from this
// extract; the cross-reference index gives it as
//   int_t dScatter(int_t m, int_t n, int_t gi, int_t gj, Ftype *V, int_t ldv,
//                  int_t *srcRowList, int_t *srcColList)
371template <typename Ftype>
373 int_t gi, int_t gj,
374 Ftype *Src, int_t ldsrc,
375 int_t *srcRowList, int_t *srcColList)
376{
377
378 Ftype *Dst;
379 int_t lddst;
380 int_t dstRowLen, dstColLen;
381 int_t *dstRowList;
382 int_t *dstColList;
383 if (gj > gi) // its in upanel
384 {
385 int li = g2lRow(gi);
386 int lj = uPanelVec[li].find(gj);
387 Dst = uPanelVec[li].blkPtr(lj);
388 lddst = supersize(gi);
389 dstRowLen = supersize(gi);
390 dstRowList = NULL;
391 dstColLen = uPanelVec[li].nbcol(lj);
392 dstColList = uPanelVec[li].colList(lj);
393 // std::cout<<li<<" "<<lj<<" Dst[0] is"<<Dst[0] << "\n";
394 }
395 else
396 {
397 int lj = g2lCol(gj);
398 int li = lPanelVec[lj].find(gi);
399 Dst = lPanelVec[lj].blkPtr(li);
400 lddst = lPanelVec[lj].LDA();
401 dstRowLen = lPanelVec[lj].nbrow(li);
402 dstRowList = lPanelVec[lj].rowList(li);
403 dstColLen = supersize(gj);
404 dstColList = NULL;
405 }
406
407 // compute source row to dest row mapping
408 int_t *rowS2D = computeIndirectMap(ROW_MAP, m, srcRowList,
409 dstRowLen, dstRowList);
410
411 // compute source col to dest col mapping
412 int_t *colS2D = computeIndirectMap(COL_MAP, n, srcColList,
413 dstColLen, dstColList);
414
// Subtract the source block from the destination, entry by entry, through the two maps.
415 for (int j = 0; j < n; j++)
416 {
417 for (int i = 0; i < m; i++)
418 {
419 Dst[rowS2D[i] + lddst * colS2D[j]] -= Src[i + ldsrc * j];
420 }
421 }
422
423 return 0;
424}
425
// Calls packed2skyline on every local U panel, passing the matching
// Ufstnz_br_ptr / Unzval_br_ptr arrays of LUstruct.
//
// A local block row i is processed only when Ufstnz_br_ptr[i] is non-NULL and
// isNodeInMyGrid is 1 for its global index i * Pr + myrow. Always returns 0.
//
// NOTE(review): listing line 427 (the signature) is absent from this extract; the
// cross-reference index gives it as
//   int_t packedU2skyline(LUStruct_type<Ftype> *LUstruct)
426template <typename Ftype>
428{
429
430 int_t **Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
431 Ftype **Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
432
433 for (int_t i = 0; i < CEILING(nsupers, Pr); ++i)
434 {
435 if (Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1)
436 {
437 int_t globalId = i * Pr + myrow;
438 uPanelVec[i].packed2skyline(globalId, Ufstnz_br_ptr[i], Unzval_br_ptr[i], xsup);
439 }
440 }
441
442 return 0;
443}
444
// Declaration only: the cross-reference index places the definition at
// lupanels.cpp:397. The commented-out body below is an earlier in-header version,
// which split baseCommunicator with MPI_COMM_TYPE_SHARED and returned the size of
// the resulting communicator.
445int numProcsPerNode(MPI_Comm baseCommunicator);
446// int numProcsPerNode(MPI_Comm baseCommunicator)
447// {
448// MPI_Comm sharedComm;
449// MPI_Comm_split_type(baseCommunicator, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &sharedComm);
450// int count = 0;
451// MPI_Comm_size(sharedComm, &count);
452// return count;
453// }
454
455
// Applies only those block updates of supernode k that touch supernode laIdx.
//
// Returns 0 immediately when either panel is empty. laJLoc / laILoc are the block
// positions of laIdx inside the U and L panels. The first loop updates every L
// block (from st_lb on) against U block laJLoc; the second updates L block laILoc
// against every U block except laJLoc, which the first loop already covers. Each
// loop does nothing when its look-ahead block is GLOBAL_BLOCK_NOT_FOUND. Both loops
// carry `nowait` inside a single parallel region. Always returns 0.
//
// NOTE(review): listing line 457 (start of the signature) is absent from this
// extract; the cross-reference index gives it as
//   int_t lookAheadUpdate(int_t k, int_t laIdx, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel)
456template <typename Ftype>
458 int_t k, int_t laIdx, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel)
459{
460 if (lpanel.isEmpty() || upanel.isEmpty())
461 return 0;
462
463 int_t st_lb = 0;
464 if (myrow == krow(k))
465 st_lb = 1;
466
467 int_t nlb = lpanel.nblocks();
468 int_t laILoc = lpanel.find(laIdx);
469 int_t nub = upanel.nblocks();
470 int_t laJLoc = upanel.find(laIdx);
471
472#pragma omp parallel
473 {
474 /*Next lpanelUpdate*/
475#pragma omp for nowait
476 for (size_t ii = st_lb; ii < nlb; ii++)
477 {
478 int_t jj = laJLoc;
479 if (laJLoc != GLOBAL_BLOCK_NOT_FOUND)
480 blockUpdate(k, ii, jj, lpanel, upanel);
481 }
482
483 /*Next upanelUpdate*/
484#pragma omp for nowait
485 for (size_t jj = 0; jj < nub; jj++)
486 {
487 int_t ii = laILoc;
488 if (laILoc != GLOBAL_BLOCK_NOT_FOUND && jj != laJLoc)
489 blockUpdate(k, ii, jj, lpanel, upanel);
490 }
491 }
492
493 return 0;
494}
495
// Multiplies L block ii by U block jj and scatters the product into block (ib, jb).
//
// The product is formed by superlu_gemm ("N", "N", alpha = one, beta = zero) into
// the calling thread's ldt*ldt slice of bigV, with dimensions
// nbrow(ii) x nbcol(jj) x supersize(k). dScatter then receives that buffer together
// with the block's row and column lists and the global block ids ib = lpanel.gid(ii),
// jb = upanel.gid(jj). Always returns 0.
//
// NOTE(review): listing line 497 (start of the signature) is absent from this
// extract; the cross-reference index gives it as
//   int_t blockUpdate(int_t k, int_t ii, int_t jj, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel)
496template <typename Ftype>
498 int_t ii, int_t jj, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel)
499{
500 int thread_id;
501#ifdef _OPENMP
502 thread_id = omp_get_thread_num();
503#else
504 thread_id = 0;
505#endif
506
// Per-thread scratch tile inside the shared bigV buffer.
507 Ftype *V = bigV + thread_id * ldt * ldt;
508
509 Ftype alpha = one<Ftype>();
510 Ftype beta = zeroT<Ftype>();
511 superlu_gemm<Ftype>("N", "N",
512 lpanel.nbrow(ii), upanel.nbcol(jj), supersize(k), alpha,
513 lpanel.blkPtr(ii), lpanel.LDA(),
514 upanel.blkPtr(jj), upanel.LDA(), beta,
515 V, lpanel.nbrow(ii));
516
517 // now do the scatter
518 int_t ib = lpanel.gid(ii);
519 int_t jb = upanel.gid(jj);
520
521 dScatter(lpanel.nbrow(ii), upanel.nbcol(jj),
522 ib, jb, V, lpanel.nbrow(ii),
523 lpanel.rowList(ii), upanel.colList(jj));
524 return 0;
525}
526
// Applies blockUpdate to every (L block, U block) pair of supernode k except the
// pairs that involve supernode `ex`.
//
// Returns 0 immediately when either panel is empty. exILoc / exJLoc are the block
// positions of `ex` in the L and U panels; a pair is skipped when its L block index
// equals exILoc or its U block index equals exJLoc. When myrow == krow(k) the first
// L block is also skipped (st_lb = 1). Always returns 0.
//
// NOTE(review): listing line 528 (start of the signature) is absent from this
// extract; the cross-reference index gives it as
//   int_t dSchurCompUpdateExcludeOne(int_t k, int_t ex, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel)
527template <typename Ftype>
529 int_t k, int_t ex, // supernodes to be excluded
530 xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel)
531{
532 if (lpanel.isEmpty() || upanel.isEmpty())
533 return 0;
534
535 int_t st_lb = 0;
536 if (myrow == krow(k))
537 st_lb = 1;
538
539 int_t nlb = lpanel.nblocks();
540 int_t nub = upanel.nblocks();
541
542 int_t exILoc = lpanel.find(ex);
543 int_t exJLoc = upanel.find(ex);
544
545#pragma omp parallel for
546 for (size_t ij = 0; ij < (nlb - st_lb) * nub; ij++)
547 {
// Recover the L block index (offset by st_lb) and the U block index from ij.
548 int_t ii = ij / nub + st_lb;
549 int_t jj = ij % nub;
550
551 if (ii != exILoc && jj != exJLoc)
552 blockUpdate(k, ii, jj, lpanel, upanel);
553 }
554 return 0;
555}
556
// Factors the diagonal block of supernode k, broadcasts the factors, and runs the
// panel solves.
//
//   1. The process with iam == procIJ(k, k) calls diagFactor on its L panel with
//      dFBufs[offset]->BlockUFactor and packs the diagonal block into BlockLFactor.
//   2. BlockLFactor is broadcast over grid->rscp.comm from root kcol(k) on the
//      processes with myrow == krow(k); BlockUFactor over grid->cscp.comm from root
//      krow(k) on the processes with mycol == kcol(k).
//   3. Those same processes call panelSolve on their U panel (with BlockLFactor)
//      and on their L panel (with BlockUFactor).
// Always returns 0.
//
// NOTE(review): both broadcasts pass MPI_DOUBLE with a count of ksupc * ksupc,
// although the buffers are Ftype arrays allocated as ldt * ldt * sizeof(Ftype).
// For an Ftype whose size differs from double the byte count on the wire would not
// match the data -- confirm which instantiations reach this code.
// NOTE(review): listing line 558 (the signature) is absent from this extract; the
// cross-reference index gives it as
//   int_t dDiagFactorPanelSolve(int_t k, int_t offset, diagFactBufs_type<Ftype> **dFBufs)
557template <typename Ftype>
559{
560
561 int_t ksupc = SuperSize(k);
562 /*======= Diagonal Factorization ======*/
563 if (iam == procIJ(k, k))
564 {
565 lPanelVec[g2lCol(k)].diagFactor(k, dFBufs[offset]->BlockUFactor, ksupc,
566 thresh, xsup, options, stat, info);
567 lPanelVec[g2lCol(k)].packDiagBlock(dFBufs[offset]->BlockLFactor, ksupc);
568 }
569
570 /*======= Diagonal Broadcast ======*/
571 if (myrow == krow(k))
572 MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc,
573 MPI_DOUBLE, kcol(k), (grid->rscp).comm);
574 if (mycol == kcol(k))
575 MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc,
576 MPI_DOUBLE, krow(k), (grid->cscp).comm);
577
578 /*======= Panel Update ======*/
579 if (myrow == krow(k))
580 uPanelVec[g2lRow(k)].panelSolve(ksupc, dFBufs[offset]->BlockLFactor, ksupc);
581
582 if (mycol == kcol(k))
583 lPanelVec[g2lCol(k)].panelSolve(ksupc, dFBufs[offset]->BlockUFactor, ksupc);
584
585 return 0;
586}
587
// Broadcasts the U and L panels of supernode k within the 2D grid.
//
// k_upanel / k_lpanel start out wrapping the receive buffers at position `offset`.
// On processes with myrow == krow(k) (resp. mycol == kcol(k)) they are replaced by
// the locally stored panel, so those processes pass their own panel's index and val
// arrays to MPI_Bcast. The U panel goes over grid3d->cscp.comm from root krow(k),
// the L panel over grid3d->rscp.comm from root kcol(k). A panel is broadcast only
// when its index count for k is positive. Always returns 0.
//
// NOTE(review): the value broadcasts pass MPI_DOUBLE although `val` is an Ftype
// array and the receive buffers are sized with sizeof(Ftype). For an Ftype whose
// size differs from double the byte count on the wire would not match the data --
// confirm which instantiations reach this code.
// NOTE(review): listing line 589 (the signature) is absent from this extract; the
// cross-reference index gives it as
//   int_t dPanelBcast(int_t k, int_t offset)
588template <typename Ftype>
590{
591 /*======= Panel Broadcast ======*/
592 xupanel_t<Ftype> k_upanel(UidxRecvBufs[offset], UvalRecvBufs[offset]);
593 xlpanel_t<Ftype> k_lpanel(LidxRecvBufs[offset], LvalRecvBufs[offset]);
594 if (myrow == krow(k))
595 k_upanel = uPanelVec[g2lRow(k)];
596
597 if (mycol == kcol(k))
598 k_lpanel = lPanelVec[g2lCol(k)];
599
600 if (UidxSendCounts[k] > 0)
601 {
602 MPI_Bcast(k_upanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm);
603 MPI_Bcast(k_upanel.val, UvalSendCounts[k], MPI_DOUBLE, krow(k), grid3d->cscp.comm);
604 }
605
606 if (LidxSendCounts[k] > 0)
607 {
608 MPI_Bcast(k_lpanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm);
609 MPI_Bcast(k_lpanel.val, LvalSendCounts[k], MPI_DOUBLE, kcol(k), grid3d->rscp.comm);
610 }
611 return 0;
612}
613
Definition: commWrapper.hpp:7
Definition: xlupanels.hpp:22
int_t find(int_t k)
Definition: l_panels_impl.hpp:52
int_t * rowList(int_t k)
Definition: xlupanels.hpp:80
int_t gid(int_t k)
Definition: xlupanels.hpp:63
int_t nblocks()
Definition: xlupanels.hpp:53
Ftype * blkPtr(int_t k)
Definition: xlupanels.hpp:90
int_t * index
Definition: xlupanels.hpp:24
int_t nbrow(int_t k)
Definition: xlupanels.hpp:69
Ftype * val
Definition: xlupanels.hpp:25
int_t isEmpty()
Definition: xlupanels.hpp:107
int_t LDA()
Definition: xlupanels.hpp:100
Definition: xlupanels.hpp:176
int_t find(int_t k)
Definition: u_panels_impl.hpp:112
int_t * index
Definition: xlupanels.hpp:178
int_t isEmpty()
Definition: xlupanels.hpp:258
int_t LDA()
Definition: xlupanels.hpp:214
int_t nbcol(int_t k)
Definition: xlupanels.hpp:223
Ftype * blkPtr(int_t k)
Definition: xlupanels.hpp:238
Ftype * val
Definition: xlupanels.hpp:179
int_t nblocks()
Definition: xlupanels.hpp:203
int_t gid(int_t k)
Definition: xlupanels.hpp:217
int_t * colList(int_t k)
Definition: xlupanels.hpp:228
@ SYNC
Definition: commWrapper.hpp:4
typename std::conditional< std::is_same< Ftype, double >::value, dLUstruct_t, typename std::conditional< std::is_same< Ftype, float >::value, sLUstruct_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zLUstruct_t, void >::type >::type >::type LUStruct_type
Definition: luAuxStructTemplated.hpp:102
typename std::conditional< std::is_same< Ftype, double >::value, dtrf3Dpartition_t, typename std::conditional< std::is_same< Ftype, float >::value, strf3Dpartition_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, ztrf3Dpartition_t, void >::type >::type >::type trf3dpartitionType
Definition: luAuxStructTemplated.hpp:87
typename std::conditional< std::is_same< Ftype, float >::value, float, typename std::conditional< std::is_same< Ftype, double >::value||std::is_same< Ftype, doublecomplex >::value, double, float >::type >::type threshPivValType
Definition: luAuxStructTemplated.hpp:70
typename std::conditional< std::is_same< Ftype, double >::value, ddiagFactBufs_t, typename std::conditional< std::is_same< Ftype, float >::value, sdiagFactBufs_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zdiagFactBufs_t, void >::type >::type >::type diagFactBufs_type
Definition: luAuxStructTemplated.hpp:147
#define GLOBAL_BLOCK_NOT_FOUND
Definition: lupanels.hpp:16
Ftype * getBigV(int_t ldt, int_t num_threads)
Definition: lupanels_impl.hpp:73
int numProcsPerNode(MPI_Comm baseCommunicator)
Definition: lupanels.cpp:397
Definition: util_dist.h:199
Definition: util_dist.h:101
Definition: superlu_defs.h:414
gridinfo_t grid2d
Definition: superlu_defs.h:419
superlu_scope_t zscp
Definition: superlu_defs.h:418
superlu_scope_t rscp
Definition: superlu_defs.h:416
int iam
Definition: superlu_defs.h:420
superlu_scope_t cscp
Definition: superlu_defs.h:417
int_t nprow
Definition: superlu_defs.h:409
int_t npcol
Definition: superlu_defs.h:410
int iam
Definition: superlu_defs.h:408
Definition: superlu_defs.h:989
treeTopoInfo_t topoInfo
Definition: superlu_defs.h:999
Definition: superlu_defs.h:728
int num_lookaheads
Definition: superlu_defs.h:757
int Np
Definition: superlu_defs.h:399
MPI_Comm comm
Definition: superlu_defs.h:398
int_t * eTreeTopLims
Definition: superlu_defs.h:972
Definition: xlupanels.hpp:335
int_t Pr
Definition: xlupanels.hpp:340
int_t dSchurCompUpdateExcludeOne(int_t k, int_t ex, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:528
diagFactBufs_type< Ftype > ** dFBufs
Definition: xlupanels.hpp:363
int_t maxUvalCount
Definition: xlupanels.hpp:375
int_t lookAheadUpdate(int_t k, int_t laIdx, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:457
std::vector< int_t * > LidxRecvBufs
Definition: xlupanels.hpp:382
int_t dDiagFactorPanelSolve(int_t k, int_t offset, diagFactBufs_type< Ftype > **dFBufs)
Definition: lupanels_impl.hpp:558
int_t mycol
Definition: xlupanels.hpp:340
int numDiagBufs
Definition: xlupanels.hpp:351
gridinfo_t * grid
Definition: xlupanels.hpp:339
xlpanel_t< Ftype > * lPanelVec
Definition: xlupanels.hpp:336
int_t blockUpdate(int_t k, int_t ii, int_t jj, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:497
int_t * indirectCol
Definition: xlupanels.hpp:345
int_t myrow
Definition: xlupanels.hpp:340
int_t packedU2skyline(LUStruct_type< Ftype > *LUstruct)
Definition: lupanels_impl.hpp:427
int nThreads
Definition: xlupanels.hpp:344
int_t maxLvl
Definition: xlupanels.hpp:360
superlu_dist_options_t * options
Definition: xlupanels.hpp:355
std::vector< int_t > UvalSendCounts
Definition: xlupanels.hpp:387
int_t * indirect
Definition: xlupanels.hpp:345
Ftype * bigV
Definition: xlupanels.hpp:346
int_t maxUidxCount
Definition: xlupanels.hpp:376
xLUstruct_t(int_t nsupers, int_t ldt_, trf3dpartitionType< Ftype > *trf3Dpartition, LUStruct_type< Ftype > *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT_, superlu_dist_options_t *options_, SuperLUStat_t *stat, threshPivValType< Ftype > thresh_, int *info_)
Definition: lupanels_impl.hpp:83
std::vector< int_t * > UidxRecvBufs
Definition: xlupanels.hpp:383
std::vector< Ftype * > LvalRecvBufs
Definition: xlupanels.hpp:380
int_t maxLidxCount
Definition: xlupanels.hpp:374
diagFactBufs_type< Ftype > ** initDiagFactBufsArr(int_t mxLeafNode, int_t ldt)
Definition: lupanels_impl.hpp:16
int_t nsupers
Definition: xlupanels.hpp:342
int_t ldt
Definition: xlupanels.hpp:340
int_t maxLvalCount
Definition: xlupanels.hpp:373
int_t dScatter(int_t m, int_t n, int_t gi, int_t gj, Ftype *V, int_t ldv, int_t *srcRowList, int_t *srcColList)
Definition: lupanels_impl.hpp:372
trf3dpartitionType< Ftype > * trf3Dpartition
Definition: xlupanels.hpp:359
indirectMapType
Definition: xlupanels.hpp:429
std::vector< int_t > LidxSendCounts
Definition: xlupanels.hpp:388
int superlu_acc_offload
Definition: xlupanels.hpp:364
int * isNodeInMyGrid
Definition: xlupanels.hpp:347
std::vector< int_t > UidxSendCounts
Definition: xlupanels.hpp:389
std::vector< Ftype * > UvalRecvBufs
Definition: xlupanels.hpp:381
int maxLeafNodes
Definition: xlupanels.hpp:361
xupanel_t< Ftype > * uPanelVec
Definition: xlupanels.hpp:337
int_t dPanelBcast(int_t k, int_t offset)
Definition: lupanels_impl.hpp:589
int_t * computeIndirectMap(indirectMapType direction, int_t srcLen, int_t *srcVec, int_t dstLen, int_t *dstVec)
Definition: lupanels_impl.hpp:340
int_t krow(int_t k)
Definition: xlupanels.hpp:402
int_t dSchurComplementUpdate(int_t k, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:314
std::vector< int_t > LvalSendCounts
Definition: xlupanels.hpp:386
int_t * indirectRow
Definition: xlupanels.hpp:345
int_t * xsup
Definition: xlupanels.hpp:341
std::vector< Ftype * > diagFactBufs
Definition: xlupanels.hpp:377
int_t iam
Definition: xlupanels.hpp:340
gridinfo3d_t * grid3d
Definition: xlupanels.hpp:338
int_t Pc
Definition: xlupanels.hpp:340
Definitions which are precision-neutral.
#define CEILING(a, b)
Definition: superlu_defs.h:277
int_t log2i(int_t index)
Definition: supernodal_etree.c:17
#define SuperSize(bnum)
Definition: superlu_defs.h:271
#define mpi_int_t
Definition: superlu_defs.h:120
int sp_ienv_dist(int, superlu_dist_options_t *)
Definition: sp_ienv.c:80
int * getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t *myNodeCount, int_t **treePerm)
Definition: supernodalForest.c:307
int getNumThreads(int)
Definition: trfAux.c:61
#define MYCOL(iam, grid)
Definition: superlu_defs.h:268
int64_t int_t
Definition: superlu_defs.h:119
#define MYROW(iam, grid)
Definition: superlu_defs.h:267
double SuperLU_timer_()
Definition: superlu_timer.c:72
int j
Definition: sutil_dist.c:287
int i
Definition: sutil_dist.c:287
#define SUPERLU_MALLOC(size)
Definition: util_dist.h:48
#define CHECK_MALLOC(pnum, where)
Definition: util_dist.h:56
#define SUPERLU_FREE(addr)
Definition: util_dist.h:54
#define ABORT(err_msg)
Definition: util_dist.h:38