SuperLU Distributed 9.0.0
gpu3d
xlupanels.hpp
Go to the documentation of this file.
1#pragma once
2#include <vector>
3#include <iostream>
4#include "superlu_ddefs.h" // superlu_defs.h ??
5#include "lu_common.hpp"
6#ifdef HAVE_CUDA
7#include "lupanels_GPU.cuh"
8#include "xlupanels_GPU.cuh"
9#include "gpuCommon.hpp"
10#endif
11#include "commWrapper.hpp"
12#include "anc25d.hpp"
14// class lpanelGPU_t;
15// class upanelGPU_t;
16#define GLOBAL_BLOCK_NOT_FOUND -1
17// it can be templatized for Ftype and complex Ftype
18
19
20template <typename Ftype>
22{
23public:
25 Ftype *val;
26 // ifdef GPU acceraleration
27
28#ifdef HAVE_CUDA
29 xlpanelGPU_t<Ftype> gpuPanel;
30#endif
31 // bool isDiagIncluded;
32
33 xlpanel_t(int_t k, int_t *lsub, Ftype *nzval, int_t *xsup, int_t isDiagIncluded);
34
35 // default constuctor
36#ifdef HAVE_CUDA
37 xlpanel_t() : gpuPanel(NULL, NULL)
38 {
39 index = NULL;
40 val = NULL;
41 }
42#else
44 {
45 index = NULL;
46 val = NULL;
47 }
48#endif
49
50 xlpanel_t(int_t *index_, Ftype *val_) : index(index_), val(val_) { return; };
51
52 // index[0] is number of blocks
54 {
55 return index[0];
56 }
57 // number of rows
58 int_t nzrows() { return index[1]; }
59 int_t haveDiag() { return index[2]; }
60 int_t ncols() { return index[3]; }
61
62 // global block id of k-th block in the panel
64 {
65 return index[LPANEL_HEADER_SIZE + k];
66 }
67
68 // number of rows in the k-th block
70 {
71 return index[LPANEL_HEADER_SIZE + nblocks() + k + 1] - index[LPANEL_HEADER_SIZE + nblocks() + k];
72 }
73
74 //
75 int_t stRow(int k)
76 {
77 return index[LPANEL_HEADER_SIZE + nblocks() + k];
78 }
79 // row
81 {
82 // LPANEL_HEADER
83 // nblocks() : blocks list
84 // nblocks()+1 : blocks st_points
85 // index[LPANEL_HEADER_SIZE + nblocks() + k] statrting of the block
87 2 * nblocks() + 1 + index[LPANEL_HEADER_SIZE + nblocks() + k]];
88 }
89
90 Ftype *blkPtr(int_t k)
91 {
92 return &val[index[LPANEL_HEADER_SIZE + nblocks() + k]];
93 }
94
96 {
97 return index[LPANEL_HEADER_SIZE + nblocks() + k];
98 }
99
100 int_t LDA() { return index[1]; }
101 int_t find(int_t k);
102 // for L panel I don't need any special transformation function
103 int_t panelSolve(int_t ksupsz, Ftype *DiagBlk, int_t LDD);
104 int_t diagFactor(int_t k, Ftype *UBlk, int_t LDU, threshPivValType<Ftype> thresh, int_t *xsup,
105 superlu_dist_options_t *options, SuperLUStat_t *stat, int *info);
106 int_t packDiagBlock(Ftype *DiagLBlk, int_t LDD);
107 int_t isEmpty() { return index == NULL; }
109 {
110 if (index == NULL)
111 return 0;
112 return ncols() * nzrows();
113 }
114
116 {
117 if (index == NULL)
118 return 0;
119 return LPANEL_HEADER_SIZE + 2 * nblocks() + 1 + nzrows();
120 }
121
122 size_t totalSize()
123 {
124 return sizeof(int_t) * indexSize() + sizeof(Ftype) * nzvalSize();
125 }
126
127 // return the maximal iEnd such that stRow(iEnd)-stRow(iSt) < maxRow;
128 int getEndBlock(int iSt, int maxRows);
129
130 // ~lpanel_t()
131 // {
132 // SUPERLU_FREE(index);
133 // // SUPERLU_FREE(val);
134 // }
135
136#ifdef HAVE_CUDA
137 xlpanelGPU_t<Ftype> copyToGPU();
138 xlpanelGPU_t<Ftype> copyToGPU(void *basePtr); // when we are doing a single allocation
139 int checkGPU();
140 int copyBackToGPU();
141
142 int_t panelSolveGPU(cublasHandle_t handle, cudaStream_t cuStream,
143 int_t ksupsz,
144 Ftype *DiagBlk, // device pointer
145 int_t LDD);
146
147 int_t diagFactorPackDiagBlockGPU(int_t k,
148 Ftype *UBlk, int_t LDU, // CPU pointers
149 Ftype *DiagLBlk, int_t LDD, // CPU pointers
150 Ftype thresh, int_t *xsup,
151 superlu_dist_options_t *options,
152 SuperLUStat_t *stat, int *info);
153 int_t diagFactorCuSolver(int_t k,
154 cusolverDnHandle_t cusolverH, cudaStream_t cuStream,
155 Ftype *dWork, int *dInfo, // GPU pointers
156 Ftype *dDiagBuf, int_t LDD, // GPU pointers
157 threshPivValType<Ftype> thresh, int_t *xsup,
158 superlu_dist_options_t *options,
159 SuperLUStat_t *stat, int *info);
160
161 Ftype *blkPtrGPU(int k)
162 {
163 return &gpuPanel.val[blkPtrOffset(k)];
164 }
165
166 xlpanel_t(int_t *index_, Ftype *val_, int_t *indexGPU, Ftype *valGPU) : index(index_), val(val_), gpuPanel(indexGPU, valGPU)
167 {
168 return;
169 };
170 int_t copyFromGPU();
171#endif
172};
173
174template <typename Ftype>
176{
177public:
179 Ftype *val;
180#ifdef HAVE_CUDA
181 // xupanelGPU_t<Ftype>* upanelGPU;
182 xupanelGPU_t<Ftype> gpuPanel;
183#endif
184
185 // xupanel_t(int_t *usub, Ftype *uval);
186 xupanel_t(int_t k, int_t *usub, Ftype *uval, int_t *xsup);
187#ifdef HAVE_CUDA
188 xupanel_t() : gpuPanel(NULL, NULL)
189 {
190 index = NULL;
191 val = NULL;
192 }
193#else
195 {
196 index = NULL;
197 val = NULL;
198 }
199#endif
200 // constructing from recevied index and val
201 xupanel_t(int_t *index_, Ftype *val_) : index(index_), val(val_) { return; };
202 // index[0] is number of blocks
204 {
205 return index[0];
206 }
207 // number of rows
209 {
210 if (index == NULL)
211 return 0; /* Sherry added this check. 2/22/2023 */
212 return index[1];
213 }
214 int_t LDA() { return index[2]; } // is also supersize of that coloumn
215
216 // global block id of k-th block in the panel
218 {
219 return index[UPANEL_HEADER_SIZE + k];
220 }
221
222 // number of rows in the k-th block
224 {
225 return index[UPANEL_HEADER_SIZE + nblocks() + k + 1] - index[UPANEL_HEADER_SIZE + nblocks() + k];
226 }
227 // row
229 {
230 // UPANEL_HEADER
231 // nblocks() : blocks list
232 // nblocks()+1 : blocks st_points
233 // index[UPANEL_HEADER_SIZE + nblocks() + k] statrting of the block
234 return &index[UPANEL_HEADER_SIZE +
235 2 * nblocks() + 1 + index[UPANEL_HEADER_SIZE + nblocks() + k]];
236 }
237
238 Ftype *blkPtr(int_t k)
239 {
240 return &val[LDA() * index[UPANEL_HEADER_SIZE + nblocks() + k]];
241 }
242
244 {
245 return LDA() * index[UPANEL_HEADER_SIZE + nblocks() + k];
246 }
247 // for U panel
248 // int_t packed2skyline(int_t* usub, Ftype* uval );
249 int_t packed2skyline(int_t k, int_t *usub, Ftype *uval, int_t *xsup);
250 int_t panelSolve(int_t ksupsz, Ftype *DiagBlk, int_t LDD);
251 int_t diagFactor(int_t k, Ftype *UBlk, int_t LDU, Ftype thresh, int_t *xsup,
252 superlu_dist_options_t *options,
253 SuperLUStat_t *stat, int *info);
254
255 // Ftype* blkPtr(int_t k);
256 // int_t LDA();
257 int_t find(int_t k);
258 int_t isEmpty() { return index == NULL; }
260 {
261 if (index == NULL)
262 return 0;
263 return LDA() * nzcols();
264 }
265
267 {
268 if (index == NULL)
269 return 0;
270 return UPANEL_HEADER_SIZE + 2 * nblocks() + 1 + nzcols();
271 }
272 size_t totalSize()
273 {
274 return sizeof(int_t) * indexSize() + sizeof(Ftype) * nzvalSize();
275 }
277 {
278 if (index == NULL)
279 {
280 std::cout << "## Warning: Empty Panel"
281 << "\n";
282 return 0;
283 }
284 int_t alternateNzcols = index[UPANEL_HEADER_SIZE + 2 * nblocks()];
285 // std::cout<<nblocks()<<" nzcols "<<nzcols()<<" alternate nzcols "<< alternateNzcols << "\n";
286 if (nzcols() != alternateNzcols)
287 {
288 printf("Error 175\n");
289 exit(-1);
290 }
291
292 return UPANEL_HEADER_SIZE + 2 * nblocks() + 1 + nzcols();
293 }
294
296 {
297 return index[UPANEL_HEADER_SIZE + nblocks() + k];
298 }
299 int getEndBlock(int jSt, int maxCols);
300
301 // ~upanel_t()
302 // {
303 // SUPERLU_FREE(index);
304 // SUPERLU_FREE(val);
305 // }
306
307#ifdef HAVE_CUDA
308 xupanelGPU_t<Ftype> copyToGPU();
309 // TODO: implement with baseptr
310 xupanelGPU_t<Ftype> copyToGPU(void *basePtr);
311 int copyBackToGPU();
312
313 int_t panelSolveGPU(cublasHandle_t handle, cudaStream_t cuStream,
314 int_t ksupsz, Ftype *DiagBlk, int_t LDD);
315 int checkGPU();
316
317 Ftype *blkPtrGPU(int k)
318 {
319 return &gpuPanel.val[blkPtrOffset(k)];
320 }
321
322 xupanel_t(int_t *index_, Ftype *val_, int_t *indexGPU, Ftype *valGPU) : index(index_), val(val_), gpuPanel(indexGPU, valGPU)
323 {
324 return;
325 };
326 int_t copyFromGPU();
327#endif
328};
329
330// Defineing GPU data types
331// lapenGPU_t has exact same structure has lapanel_t but
332// the pointers are on GPU
333template <typename Ftype>
335{
343 // variables for scattering ldt*THREAD_Size
346 Ftype *bigV; // size = THREAD_Size*ldt*ldt
349 int *info;
350 // TODO: get it from environment
351 int numDiagBufs = 32; /* Sherry: not fixed yet */
352
353 // Add SCT_t here
357
358 // Adding more variables for factorization
361 int maxLeafNodes; /* Sherry added 12/31/22. Computed in xLUstruct_t constructor */
362
363 diagFactBufs_type<Ftype>** dFBufs; /* stores L and U diagonal blocks */
365 // myNodeCount,
366 // treePerm
367 // myZeroTrIdxs
368 // sForests
369 // myTreeIdxs
370 // gEtreeInfo
371
372 // buffers for communication
377 std::vector<Ftype *> diagFactBufs; /* stores diagonal blocks,
378 each one is a normal dense matrix.
379 Sherry: where are they free'd ?? */
380 std::vector<Ftype *> LvalRecvBufs;
381 std::vector<Ftype *> UvalRecvBufs;
382 std::vector<int_t *> LidxRecvBufs;
383 std::vector<int_t *> UidxRecvBufs;
384
385 // send and recv count for 2d comm
386 std::vector<int_t> LvalSendCounts;
387 std::vector<int_t> UvalSendCounts;
388 std::vector<int_t> LidxSendCounts;
389 std::vector<int_t> UidxSendCounts;
390
391 //
392 #pragma warning disabling bcastStruct
393 #if 0
394 std::vector<bcastStruct> bcastDiagRow;
395 std::vector<bcastStruct> bcastDiagCol;
396 std::vector<bcastStruct> bcastLval;
397 std::vector<bcastStruct> bcastUval;
398 std::vector<bcastStruct> bcastLidx;
399 std::vector<bcastStruct> bcastUidx;
400 #endif
401
402 int_t krow(int_t k) { return k % Pr; }
403 int_t kcol(int_t k) { return k % Pc; }
404 int_t procIJ(int_t i, int_t j) { return PNUM(krow(i), kcol(j), grid); }
405 int_t supersize(int_t k) { return xsup[k + 1] - xsup[k]; }
406 int_t g2lRow(int_t k) { return k / Pr; }
407 int_t g2lCol(int_t k) { return k / Pc; }
408
410 // For GPU acceleration
411 xLUstructGPU_t<Ftype> *dA_gpu; // pointing to memory on GPU
412 xLUstructGPU_t<Ftype> A_gpu; // pointing to memory accessible on CPU
413
415 // Intermediate for flat batched
419
424
427
429 {
431 COL_MAP
432 };
433
440 threshPivValType<Ftype> thresh_, int *info_);
441
443 {
444
445 /* Yang: Deallocate the lPanelVec[i] and uPanelVec[i] here instead of using destructors ~lpanel_t or ~upanel_t,
446 as xlpanel_t/upanel_t is used for holding temporary communication buffers as well. Note that lPanelVec[i].val is not deallocated here as it's pointing to the L data in the C code*/
447
448 for (int_t i = 0; i < CEILING(nsupers, Pc); ++i)
449 if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1)
450 {
451 if (lPanelVec[i].index)
452 SUPERLU_FREE(lPanelVec[i].index);
453 // SUPERLU_FREE(lPanelVec[i].val);
454 }
455
456 for (int_t i = 0; i < CEILING(nsupers, Pr); ++i)
457 if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1)
458 {
459 if (uPanelVec[i].index)
460 SUPERLU_FREE(uPanelVec[i].index);
461 if (uPanelVec[i].val)
463 }
464
465 delete[] lPanelVec;
466 delete[] uPanelVec;
467
468 /* free diagonal L and U blocks */
469 // dfreeDiagFactBufsArr(maxLeafNodes, dFBufs);
470 freeDiagFactBufsArr(numDiagBufs, dFBufs);
471
476
477 int i;
478 for (i = 0; i < options->num_lookaheads; i++)
479 {
484 }
485
486 for (i = 0; i < numDiagBufs; i++)
488
489 /* Sherry added the following, which comes from batch setup */
490 superlu_acc_offload = sp_ienv_dist(10, options); //get_acc_offload();
492 {
493 // printf(".. free batch buffers\n"); fflush(stdout);
494 SUPERLU_FREE(A_gpu.dFBufs);
495 SUPERLU_FREE(A_gpu.gpuGemmBuffs);
496
497 for (int stream = 0; stream < A_gpu.numCudaStreams; stream++)
498 {
499 cusolverDnDestroy(A_gpu.cuSolveHandles[stream]);
500 cublasDestroy(A_gpu.cuHandles[stream]);
501 cublasDestroy(A_gpu.lookAheadLHandle[stream]);
502 cublasDestroy(A_gpu.lookAheadUHandle[stream]);
503 }
504 }
505
507
508 } /* end destructor xLUstruct_t */
509
515 int_t *computeIndirectMap(indirectMapType direction, int_t srcLen, int_t *srcVec,
516 int_t dstLen, int_t *dstVec);
517
519 int_t gi, int_t gj,
520 Ftype *V, int_t ldv,
521 int_t *srcRowList, int_t *srcColList);
522
524 int_t k, int_t laIdx, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel);
526 int_t ii, int_t jj, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel);
528 int_t k, int_t ex, // suypernodes to be excluded
529 xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel);
530
532 sForest_t *sforest,
533 diagFactBufs_type<Ftype>** dFBufs, // size maxEtree level
534 gEtreeInfo_t *gEtreeInfo, // global etree info
535 int tag_ub);
536
538 sForest_t *sforest,
539 diagFactBufs_type<Ftype>** dFBufs, // size maxEtree level
540 gEtreeInfo_t *gEtreeInfo, // global etree info
541 int tag_ub);
542
544
545 // Helper routine to marshall batch LU data into the device data in A_gpu
546 void marshallBatchedLUData(int k_st, int k_end, int_t *perm_c_supno);
547 void marshallBatchedBufferCopyData(int k_st, int k_end, int_t *perm_c_supno);
548 void marshallBatchedTRSMUData(int k_st, int k_end, int_t *perm_c_supno);
549 void marshallBatchedTRSMLData(int k_st, int k_end, int_t *perm_c_supno);
550 void marshallBatchedSCUData(int k_st, int k_end, int_t *perm_c_supno);
551 void initSCUMarshallData(int k_st, int k_end, int_t *perm_c_supno);
552 int marshallSCUBatchedDataInner(int k_st, int k_end, int_t *perm_c_supno);
553 int marshallSCUBatchedDataOuter(int k_st, int k_end, int_t *perm_c_supno);
554 void dFactBatchSolve(int k_st, int k_end, int_t *perm_c_supno);
555
556 //
558 int_t dPanelBcast(int_t k, int_t offset);
560 sForest_t *sforest,
561 diagFactBufs_type<Ftype>** dFBufs, // size maxEtree level
562 gEtreeInfo_t *gEtreeInfo, // global etree info
563 int tag_ub);
564
566
567 int_t ancestorReduction3d(int_t ilvl, int_t *myNodeCount,
568 int_t **treePerm);
569
570 int_t zSendLPanel(int_t k0, int_t receiverGrid);
571 int_t zRecvLPanel(int_t k0, int_t senderGrid, Ftype alpha, Ftype beta);
572 int_t zSendUPanel(int_t k0, int_t receiverGrid);
573 int_t zRecvUPanel(int_t k0, int_t senderGrid, Ftype alpha, Ftype beta);
574
576 int_t alvl,
577 sForest_t *sforest,
578 diagFactBufs_type<Ftype> **dFBufs, // size maxEtree level
579 gEtreeInfo_t *gEtreeInfo, // global etree info
580 int tag_ub);
581
583 int_t alvl,
584 sForest_t *sforest,
585 diagFactBufs_type<Ftype> **dFBufs, // size maxEtree level
586 gEtreeInfo_t *gEtreeInfo, // global etree info
587 int tag_ub);
588 // GPU related functions
589#ifdef HAVE_CUDA
590 int_t setLUstruct_GPU();
591 int_t dsparseTreeFactorGPU(
592 sForest_t *sforest,
593 diagFactBufs_type<Ftype> **dFBufs, // size maxEtree level
594 gEtreeInfo_t *gEtreeInfo, // global etree info
595 int tag_ub);
596 int_t dsparseTreeFactorGPUBaseline(
597 sForest_t *sforest,
598 diagFactBufs_type<Ftype> **dFBufs, // size maxEtree level
599 gEtreeInfo_t *gEtreeInfo, // global etree info
600 int tag_ub);
601
602 int_t dAncestorFactorBaselineGPU(
603 int_t alvl,
604 sForest_t *sforest,
605 diagFactBufs_type<Ftype> **dFBufs, // size maxEtree level
606 gEtreeInfo_t *gEtreeInfo, // global etree info
607 int tag_ub);
608
609 int_t dSchurComplementUpdateGPU(
610 int streamId,
611 int_t k, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel);
612 int_t dSchurCompUpdatePartGPU(
613 int_t iSt, int_t iEnd, int_t jSt, int_t jEnd,
614 int_t k, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel,
615 cublasHandle_t handle, cudaStream_t cuStream,
616 Ftype *gemmBuff);
617 int_t lookAheadUpdateGPU(
618 int streamId,
619 int_t k, int_t laIdx, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel);
620 int_t dSchurCompUpLimitedMem(
621 int streamId,
622 int_t lStart, int_t lEnd,
623 int_t uStart, int_t uEnd,
624 int_t k, xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel);
625 int_t dSchurCompUpdateExcludeOneGPU(
626 int streamId,
627 int_t k, int_t ex, // suypernodes to be excluded
628 xlpanel_t<Ftype> &lpanel, xupanel_t<Ftype> &upanel);
629
630 int_t dDiagFactorPanelSolveGPU(int_t k, int_t offset, diagFactBufs_type<Ftype>** dFBufs);
631 int_t dPanelBcastGPU(int_t k, int_t offset);
632
633 int_t ancestorReduction3dGPU(int_t ilvl, int_t *myNodeCount,
634 int_t **treePerm);
635 int_t zSendLPanelGPU(int_t k0, int_t receiverGrid);
636 int_t zRecvLPanelGPU(int_t k0, int_t senderGrid, Ftype alpha, Ftype beta);
637 int_t zSendUPanelGPU(int_t k0, int_t receiverGrid);
638 int_t zRecvUPanelGPU(int_t k0, int_t senderGrid, Ftype alpha, Ftype beta);
639 int_t copyLUGPUtoHost();
640 int_t copyLUHosttoGPU();
641 int_t checkGPU();
642
643 // some more helper functions
644 xupanel_t<Ftype> getKUpanel(int_t k, int_t offset);
645 xlpanel_t<Ftype> getKLpanel(int_t k, int_t offset);
646 int_t SyncLookAheadUpdate(int streamId);
647
648 Ftype *gpuLvalBasePtr, *gpuUvalBasePtr;
649 int_t *gpuLidxBasePtr, *gpuUidxBasePtr;
650 size_t gpuLvalSize, gpuUvalSize, gpuLidxSize, gpuUidxSize;
651
652 xlpanelGPU_t<Ftype> *copyLpanelsToGPU();
653 xupanelGPU_t<Ftype> *copyUpanelsToGPU();
654
655 int freeDiagFactBufsArr(int_t num_bufs, diagFactBufs_type<Ftype>** dFBufs);
656
657 // to perform diagFactOn GPU
658 int_t dDFactPSolveGPU(int_t k, int_t offset, diagFactBufs_type<Ftype>** dFBufs);
659 int_t dDFactPSolveGPU(int_t k, int_t handle_offset, int buffer_offset, diagFactBufs_type<Ftype>** dFBufs);
660#endif
661};
662
Definition: xlupanels.hpp:22
int_t nzvalSize()
Definition: xlupanels.hpp:108
int_t find(int_t k)
Definition: l_panels_impl.hpp:52
int_t packDiagBlock(Ftype *DiagLBlk, int_t LDD)
Definition: l_panels_impl.hpp:101
int_t * rowList(int_t k)
Definition: xlupanels.hpp:80
int_t haveDiag()
Definition: xlupanels.hpp:59
int_t stRow(int k)
Definition: xlupanels.hpp:75
int_t gid(int_t k)
Definition: xlupanels.hpp:63
int_t diagFactor(int_t k, Ftype *UBlk, int_t LDU, threshPivValType< Ftype > thresh, int_t *xsup, superlu_dist_options_t *options, SuperLUStat_t *stat, int *info)
Definition: l_panels_impl.hpp:86
int_t nblocks()
Definition: xlupanels.hpp:53
size_t totalSize()
Definition: xlupanels.hpp:122
int_t indexSize()
Definition: xlupanels.hpp:115
Ftype * blkPtr(int_t k)
Definition: xlupanels.hpp:90
int_t * index
Definition: xlupanels.hpp:24
int_t nbrow(int_t k)
Definition: xlupanels.hpp:69
Ftype * val
Definition: xlupanels.hpp:25
int_t isEmpty()
Definition: xlupanels.hpp:107
int_t ncols()
Definition: xlupanels.hpp:60
xlpanel_t(int_t *index_, Ftype *val_)
Definition: xlupanels.hpp:50
int_t nzrows()
Definition: xlupanels.hpp:58
int getEndBlock(int iSt, int maxRows)
Definition: l_panels_impl.hpp:114
size_t blkPtrOffset(int_t k)
Definition: xlupanels.hpp:95
int_t LDA()
Definition: xlupanels.hpp:100
int_t panelSolve(int_t ksupsz, Ftype *DiagBlk, int_t LDD)
Definition: l_panels_impl.hpp:64
xlpanel_t()
Definition: xlupanels.hpp:43
Definition: xlupanels.hpp:176
int_t diagFactor(int_t k, Ftype *UBlk, int_t LDU, Ftype thresh, int_t *xsup, superlu_dist_options_t *options, SuperLUStat_t *stat, int *info)
int_t find(int_t k)
Definition: u_panels_impl.hpp:112
int_t indexSize()
Definition: xlupanels.hpp:266
int_t packed2skyline(int_t k, int_t *usub, Ftype *uval, int_t *xsup)
Definition: u_panels_impl.hpp:76
xupanel_t(int_t *index_, Ftype *val_)
Definition: xlupanels.hpp:201
int_t stCol(int k)
Definition: xlupanels.hpp:295
int_t * index
Definition: xlupanels.hpp:178
int_t checkCorrectness()
Definition: xlupanels.hpp:276
int_t isEmpty()
Definition: xlupanels.hpp:258
int_t nzcols()
Definition: xlupanels.hpp:208
int_t nzvalSize()
Definition: xlupanels.hpp:259
int_t LDA()
Definition: xlupanels.hpp:214
int_t nbcol(int_t k)
Definition: xlupanels.hpp:223
Ftype * blkPtr(int_t k)
Definition: xlupanels.hpp:238
Ftype * val
Definition: xlupanels.hpp:179
size_t totalSize()
Definition: xlupanels.hpp:272
int getEndBlock(int jSt, int maxCols)
Definition: u_panels_impl.hpp:134
xupanel_t()
Definition: xlupanels.hpp:194
int_t nblocks()
Definition: xlupanels.hpp:203
int_t gid(int_t k)
Definition: xlupanels.hpp:217
int_t * colList(int_t k)
Definition: xlupanels.hpp:228
int_t panelSolve(int_t ksupsz, Ftype *DiagBlk, int_t LDD)
Definition: u_panels_impl.hpp:124
size_t blkPtrOffset(int_t k)
Definition: xlupanels.hpp:243
typename std::conditional< std::is_same< Ftype, double >::value, dLUstruct_t, typename std::conditional< std::is_same< Ftype, float >::value, sLUstruct_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zLUstruct_t, void >::type >::type >::type LUStruct_type
Definition: luAuxStructTemplated.hpp:102
typename std::conditional< std::is_same< Ftype, double >::value, dtrf3Dpartition_t, typename std::conditional< std::is_same< Ftype, float >::value, strf3Dpartition_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, ztrf3Dpartition_t, void >::type >::type >::type trf3dpartitionType
Definition: luAuxStructTemplated.hpp:87
typename std::conditional< std::is_same< Ftype, float >::value, float, typename std::conditional< std::is_same< Ftype, double >::value||std::is_same< Ftype, doublecomplex >::value, double, float >::type >::type threshPivValType
Definition: luAuxStructTemplated.hpp:70
typename std::conditional< std::is_same< Ftype, double >::value, ddiagFactBufs_t, typename std::conditional< std::is_same< Ftype, float >::value, sdiagFactBufs_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zdiagFactBufs_t, void >::type >::type >::type diagFactBufs_type
Definition: luAuxStructTemplated.hpp:147
#define UPANEL_HEADER_SIZE
Definition: lu_common.hpp:7
#define LPANEL_HEADER_SIZE
Definition: lu_common.hpp:6
integer, parameter, public lsub
Definition: superlupara.f90:35
integer, parameter, public usub
Definition: superlupara.f90:35
Definition: util_dist.h:199
Definition: util_dist.h:101
Definition: anc25d.hpp:13
Definition: superlu_ddefs.h:97
Definition: superlu_defs.h:978
Definition: superlu_defs.h:414
Definition: superlu_defs.h:404
Definition: superlu_defs.h:989
Definition: superlu_defs.h:728
int num_lookaheads
Definition: superlu_defs.h:757
Definition: xlupanels.hpp:335
int_t Pr
Definition: xlupanels.hpp:340
int_t dSchurCompUpdateExcludeOne(int_t k, int_t ex, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:528
int * d_lblock_start_dat
Definition: xlupanels.hpp:421
xLUstructGPU_t< Ftype > A_gpu
Definition: xlupanels.hpp:412
dLocalLU_t d_localLU
Definition: xlupanels.hpp:418
diagFactBufs_type< Ftype > ** dFBufs
Definition: xlupanels.hpp:363
int_t g2lCol(int_t k)
Definition: xlupanels.hpp:407
int marshallSCUBatchedDataOuter(int k_st, int k_end, int_t *perm_c_supno)
dLocalLU_t * host_Llu
Definition: xlupanels.hpp:417
int_t maxUvalCount
Definition: xlupanels.hpp:375
int_t lookAheadUpdate(int_t k, int_t laIdx, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:457
std::vector< int_t * > LidxRecvBufs
Definition: xlupanels.hpp:382
int_t dDiagFactorPanelSolve(int_t k, int_t offset, diagFactBufs_type< Ftype > **dFBufs)
Definition: lupanels_impl.hpp:558
int_t mycol
Definition: xlupanels.hpp:340
SuperLUStat_t * stat
Definition: xlupanels.hpp:356
int numDiagBufs
Definition: xlupanels.hpp:351
gridinfo_t * grid
Definition: xlupanels.hpp:339
~xLUstruct_t()
Definition: xlupanels.hpp:442
void marshallBatchedTRSMUData(int k_st, int k_end, int_t *perm_c_supno)
xlpanel_t< Ftype > * lPanelVec
Definition: xlupanels.hpp:336
int marshallSCUBatchedDataInner(int k_st, int k_end, int_t *perm_c_supno)
void marshallBatchedTRSMLData(int k_st, int k_end, int_t *perm_c_supno)
int_t blockUpdate(int_t k, int_t ii, int_t jj, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:497
int_t * indirectCol
Definition: xlupanels.hpp:345
int_t myrow
Definition: xlupanels.hpp:340
int_t procIJ(int_t i, int_t j)
Definition: xlupanels.hpp:404
void dFactBatchSolve(int k_st, int k_end, int_t *perm_c_supno)
int ** d_lblock_gid_ptrs
Definition: xlupanels.hpp:420
int_t packedU2skyline(LUStruct_type< Ftype > *LUstruct)
Definition: lupanels_impl.hpp:427
int nThreads
Definition: xlupanels.hpp:344
int_t maxLvl
Definition: xlupanels.hpp:360
int_t g2lRow(int_t k)
Definition: xlupanels.hpp:406
superlu_dist_options_t * options
Definition: xlupanels.hpp:355
std::vector< int_t > UvalSendCounts
Definition: xlupanels.hpp:387
int_t * indirect
Definition: xlupanels.hpp:345
Ftype * bigV
Definition: xlupanels.hpp:346
int_t kcol(int_t k)
Definition: xlupanels.hpp:403
int_t maxUidxCount
Definition: xlupanels.hpp:376
void computeLBlockData()
xLUstruct_t(int_t nsupers, int_t ldt_, trf3dpartitionType< Ftype > *trf3Dpartition, LUStruct_type< Ftype > *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT_, superlu_dist_options_t *options_, SuperLUStat_t *stat, threshPivValType< Ftype > thresh_, int *info_)
Definition: lupanels_impl.hpp:83
threshPivValType< Ftype > thresh
Definition: xlupanels.hpp:348
std::vector< int_t * > UidxRecvBufs
Definition: xlupanels.hpp:383
int_t dsparseTreeFactor(sForest_t *sforest, diagFactBufs_type< Ftype > **dFBufs, gEtreeInfo_t *gEtreeInfo, int tag_ub)
Definition: dsparseTreeFactor_upacked_impl.hpp:8
std::vector< Ftype * > LvalRecvBufs
Definition: xlupanels.hpp:380
int64_t total_start_size
Definition: xlupanels.hpp:423
int_t pdgstrf3d()
Definition: pdgstrf3d_upacked_impl.hpp:224
int_t maxLidxCount
Definition: xlupanels.hpp:374
int64_t * d_lblock_gid_offsets
Definition: xlupanels.hpp:422
int_t dAncestorFactorBaseline(int_t alvl, sForest_t *sforest, diagFactBufs_type< Ftype > **dFBufs, gEtreeInfo_t *gEtreeInfo, int tag_ub)
Definition: anc25d_impl.hpp:29
void initSCUMarshallData(int k_st, int k_end, int_t *perm_c_supno)
diagFactBufs_type< Ftype > ** initDiagFactBufsArr(int_t mxLeafNode, int_t ldt)
Definition: lupanels_impl.hpp:16
void marshallBatchedBufferCopyData(int k_st, int k_end, int_t *perm_c_supno)
int_t nsupers
Definition: xlupanels.hpp:342
void marshallBatchedLUData(int k_st, int k_end, int_t *perm_c_supno)
int_t ldt
Definition: xlupanels.hpp:340
int ** d_lblock_start_ptrs
Definition: xlupanels.hpp:421
int_t maxLvalCount
Definition: xlupanels.hpp:373
int_t dScatter(int_t m, int_t n, int_t gi, int_t gj, Ftype *V, int_t ldv, int_t *srcRowList, int_t *srcColList)
Definition: lupanels_impl.hpp:372
SCT_t * SCT
Definition: xlupanels.hpp:354
int_t zSendLPanel(int_t k0, int_t receiverGrid)
Definition: lupanels_comm3d_impl.hpp:62
int64_t total_l_blocks
Definition: xlupanels.hpp:423
trf3dpartitionType< Ftype > * trf3Dpartition
Definition: xlupanels.hpp:359
indirectMapType
Definition: xlupanels.hpp:429
@ ROW_MAP
Definition: xlupanels.hpp:430
@ COL_MAP
Definition: xlupanels.hpp:431
int_t zSendUPanel(int_t k0, int_t receiverGrid)
Definition: lupanels_comm3d_impl.hpp:100
std::vector< int_t > LidxSendCounts
Definition: xlupanels.hpp:388
int superlu_acc_offload
Definition: xlupanels.hpp:364
int * isNodeInMyGrid
Definition: xlupanels.hpp:347
int dsparseTreeFactorBatchGPU(sForest_t *sforest, diagFactBufs_type< Ftype > **dFBufs, gEtreeInfo_t *gEtreeInfo, int tag_ub)
int_t zRecvUPanel(int_t k0, int_t senderGrid, Ftype alpha, Ftype beta)
Definition: lupanels_comm3d_impl.hpp:117
std::vector< int_t > UidxSendCounts
Definition: xlupanels.hpp:389
std::vector< Ftype * > UvalRecvBufs
Definition: xlupanels.hpp:381
int_t ancestorReduction3d(int_t ilvl, int_t *myNodeCount, int_t **treePerm)
Definition: lupanels_comm3d_impl.hpp:9
int_t supersize(int_t k)
Definition: xlupanels.hpp:405
int * d_lblock_gid_dat
Definition: xlupanels.hpp:420
void marshallBatchedSCUData(int k_st, int k_end, int_t *perm_c_supno)
int maxLeafNodes
Definition: xlupanels.hpp:361
xupanel_t< Ftype > * uPanelVec
Definition: xlupanels.hpp:337
int_t dPanelBcast(int_t k, int_t offset)
Definition: lupanels_impl.hpp:589
anc25d_t anc25d
Definition: xlupanels.hpp:409
int_t dAncestorFactor(int_t alvl, sForest_t *sforest, diagFactBufs_type< Ftype > **dFBufs, gEtreeInfo_t *gEtreeInfo, int tag_ub)
Definition: dAncestorFactor_impl.hpp:6
int_t zRecvLPanel(int_t k0, int_t senderGrid, Ftype alpha, Ftype beta)
Definition: lupanels_comm3d_impl.hpp:79
int_t * computeIndirectMap(indirectMapType direction, int_t srcLen, int_t *srcVec, int_t dstLen, int_t *dstVec)
Definition: lupanels_impl.hpp:340
int_t krow(int_t k)
Definition: xlupanels.hpp:402
int64_t * d_lblock_start_offsets
Definition: xlupanels.hpp:422
int_t dSchurComplementUpdate(int_t k, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:314
std::vector< int_t > LvalSendCounts
Definition: xlupanels.hpp:386
int_t * indirectRow
Definition: xlupanels.hpp:345
int_t * xsup
Definition: xlupanels.hpp:341
std::vector< Ftype * > diagFactBufs
Definition: xlupanels.hpp:377
int_t dsparseTreeFactorBaseline(sForest_t *sforest, diagFactBufs_type< Ftype > **dFBufs, gEtreeInfo_t *gEtreeInfo, int tag_ub)
Definition: dsparseTreeFactor_upacked_impl.hpp:220
int_t iam
Definition: xlupanels.hpp:340
gridinfo3d_t * grid3d
Definition: xlupanels.hpp:338
xLUstructGPU_t< Ftype > * dA_gpu
Definition: xlupanels.hpp:411
int_t Pc
Definition: xlupanels.hpp:340
int * info
Definition: xlupanels.hpp:349
Distributed SuperLU data types and function prototypes.
#define CEILING(a, b)
Definition: superlu_defs.h:277
int sp_ienv_dist(int, superlu_dist_options_t *)
Definition: sp_ienv.c:80
int64_t int_t
Definition: superlu_defs.h:119
#define PNUM(i, j, grid)
Definition: superlu_defs.h:276
int j
Definition: sutil_dist.c:287
int i
Definition: sutil_dist.c:287
#define SUPERLU_FREE(addr)
Definition: util_dist.h:54