15template <
typename Ftype>
21 for (
int i = 0;
i < num_bufs;
i++)
25 dFBufs[
i]->BlockUFactor = (Ftype *)
SUPERLU_MALLOC(ldt * ldt *
sizeof(Ftype));
26 dFBufs[
i]->BlockLFactor = (Ftype *)
SUPERLU_MALLOC(ldt * ldt *
sizeof(Ftype));
31template <
typename Ftype>
34 for (
int i = 0;
i < num_bufs;
i++)
49template <
typename Ftype>
54 uPanelVec[g2lRow(k)] :
56 A_gpu.UidxRecvBufs[offset], A_gpu.UvalRecvBufs[offset])
60template <
typename Ftype>
65 lPanelVec[g2lCol(k)] :
67 A_gpu.LidxRecvBufs[offset], A_gpu.LvalRecvBufs[offset])
72template <
typename Ftype>
76 if (!(bigV = (Ftype*)
SUPERLU_MALLOC (8 * ldt * ldt * num_threads *
sizeof(Ftype))))
77 ABORT (
"Malloc failed for dgemm buffV");
82template <
typename Ftype>
90 nsupers(nsupers_), trf3Dpartition(trf3Dpartition_),
92 grid3d(grid3d_in), SCT(SCT_),
93 options(options_), stat(stat_),
94 thresh(thresh_), info(info_), anc25d(grid3d_in)
109 xsup = LUstruct->Glu_persist->xsup;
110 int_t **Lrowind_bc_ptr = LUstruct->Llu->Lrowind_bc_ptr;
111 int_t **Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
112 Ftype **Lnzval_bc_ptr = LUstruct->Llu->Lnzval_bc_ptr;
113 Ftype **Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
133 int_t isDiagIncluded = 0;
170 for (
int pr = 0; pr <
Pr; pr++)
173 std::copy(localUvalSendCounts.begin(), localUvalSendCounts.end(), recvBuf.begin());
181 std::copy(localUidxSendCounts.begin(), localUidxSendCounts.end(), recvBuf.begin());
190 for (
int pc = 0; pc <
Pc; pc++)
193 std::copy(localLvalSendCounts.begin(), localLvalSendCounts.end(), recvBuf.begin());
201 std::copy(localLidxSendCounts.begin(), localLidxSendCounts.end(), recvBuf.begin());
241 #pragma warning disabling bcaststruct
244 bcastLval[
i] = bcLval;
246 bcastUval[
i] = bcUval;
248 bcastLidx[
i] = bcLidx;
250 bcastUidx[
i] = bcUidx;
272 for (
int ilvl = 0; ilvl <
maxLvl; ++ilvl)
274 if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode)
295#if ( PRNTlevel >= 1 )
296 printf(
"Time to intialize GPU DS= %g\n",tGPU );
313template <
typename Ftype>
321 if (myrow == krow(k))
327#pragma omp parallel for
328 for (
size_t ij = 0; ij < (nlb - st_lb) * nub; ij++)
330 int_t ii = ij / nub + st_lb;
332 blockUpdate(k, ii, jj, lpanel, upanel);
339template <
typename Ftype>
349 thread_id = omp_get_thread_num();
353 int_t *dstIdx = indirect + thread_id * ldt;
357 dstIdx[dstVec[
i]] =
i;
360 int_t *RCmap = (direction == ROW_MAP) ? indirectRow : indirectCol;
361 RCmap += thread_id * ldt;
365 RCmap[
i] = dstIdx[srcVec[
i]];
371template <
typename Ftype>
374 Ftype *Src,
int_t ldsrc,
380 int_t dstRowLen, dstColLen;
386 int lj = uPanelVec[li].find(gj);
387 Dst = uPanelVec[li].blkPtr(lj);
388 lddst = supersize(gi);
389 dstRowLen = supersize(gi);
391 dstColLen = uPanelVec[li].nbcol(lj);
392 dstColList = uPanelVec[li].colList(lj);
398 int li = lPanelVec[lj].find(gi);
399 Dst = lPanelVec[lj].blkPtr(li);
400 lddst = lPanelVec[lj].LDA();
401 dstRowLen = lPanelVec[lj].nbrow(li);
402 dstRowList = lPanelVec[lj].rowList(li);
403 dstColLen = supersize(gj);
408 int_t *rowS2D = computeIndirectMap(ROW_MAP, m, srcRowList,
409 dstRowLen, dstRowList);
412 int_t *colS2D = computeIndirectMap(COL_MAP, n, srcColList,
413 dstColLen, dstColList);
415 for (
int j = 0;
j < n;
j++)
417 for (
int i = 0;
i < m;
i++)
419 Dst[rowS2D[
i] + lddst * colS2D[
j]] -= Src[
i + ldsrc *
j];
426template <
typename Ftype>
430 int_t **Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
431 Ftype **Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
435 if (Ufstnz_br_ptr[
i] != NULL && isNodeInMyGrid[
i * Pr + myrow] == 1)
437 int_t globalId =
i * Pr + myrow;
438 uPanelVec[
i].packed2skyline(globalId, Ufstnz_br_ptr[
i], Unzval_br_ptr[
i], xsup);
456template <
typename Ftype>
464 if (myrow == krow(k))
475#pragma omp for nowait
476 for (
size_t ii = st_lb; ii < nlb; ii++)
480 blockUpdate(k, ii, jj, lpanel, upanel);
484#pragma omp for nowait
485 for (
size_t jj = 0; jj < nub; jj++)
489 blockUpdate(k, ii, jj, lpanel, upanel);
496template <
typename Ftype>
502 thread_id = omp_get_thread_num();
507 Ftype *V = bigV + thread_id * ldt * ldt;
509 Ftype alpha = one<Ftype>();
510 Ftype beta = zeroT<Ftype>();
511 superlu_gemm<Ftype>(
"N",
"N",
512 lpanel.
nbrow(ii), upanel.
nbcol(jj), supersize(k), alpha,
515 V, lpanel.
nbrow(ii));
522 ib, jb, V, lpanel.
nbrow(ii),
527template <
typename Ftype>
536 if (myrow == krow(k))
545#pragma omp parallel for
546 for (
size_t ij = 0; ij < (nlb - st_lb) * nub; ij++)
548 int_t ii = ij / nub + st_lb;
551 if (ii != exILoc && jj != exJLoc)
552 blockUpdate(k, ii, jj, lpanel, upanel);
557template <
typename Ftype>
563 if (iam == procIJ(k, k))
565 lPanelVec[g2lCol(k)].diagFactor(k, dFBufs[offset]->BlockUFactor, ksupc,
566 thresh, xsup, options, stat, info);
567 lPanelVec[g2lCol(k)].packDiagBlock(dFBufs[offset]->BlockLFactor, ksupc);
571 if (myrow == krow(k))
572 MPI_Bcast((
void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc,
573 MPI_DOUBLE, kcol(k), (grid->rscp).comm);
574 if (mycol == kcol(k))
575 MPI_Bcast((
void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc,
576 MPI_DOUBLE, krow(k), (grid->cscp).comm);
579 if (myrow == krow(k))
580 uPanelVec[g2lRow(k)].panelSolve(ksupc, dFBufs[offset]->BlockLFactor, ksupc);
582 if (mycol == kcol(k))
583 lPanelVec[g2lCol(k)].panelSolve(ksupc, dFBufs[offset]->BlockUFactor, ksupc);
588template <
typename Ftype>
594 if (myrow == krow(k))
595 k_upanel = uPanelVec[g2lRow(k)];
597 if (mycol == kcol(k))
598 k_lpanel = lPanelVec[g2lCol(k)];
600 if (UidxSendCounts[k] > 0)
602 MPI_Bcast(k_upanel.
index, UidxSendCounts[k],
mpi_int_t, krow(k), grid3d->cscp.comm);
603 MPI_Bcast(k_upanel.
val, UvalSendCounts[k], MPI_DOUBLE, krow(k), grid3d->cscp.comm);
606 if (LidxSendCounts[k] > 0)
608 MPI_Bcast(k_lpanel.
index, LidxSendCounts[k],
mpi_int_t, kcol(k), grid3d->rscp.comm);
609 MPI_Bcast(k_lpanel.
val, LvalSendCounts[k], MPI_DOUBLE, kcol(k), grid3d->rscp.comm);
Definition: commWrapper.hpp:7
Definition: xlupanels.hpp:22
int_t find(int_t k)
Definition: l_panels_impl.hpp:52
int_t * rowList(int_t k)
Definition: xlupanels.hpp:80
int_t gid(int_t k)
Definition: xlupanels.hpp:63
int_t nblocks()
Definition: xlupanels.hpp:53
Ftype * blkPtr(int_t k)
Definition: xlupanels.hpp:90
int_t * index
Definition: xlupanels.hpp:24
int_t nbrow(int_t k)
Definition: xlupanels.hpp:69
Ftype * val
Definition: xlupanels.hpp:25
int_t isEmpty()
Definition: xlupanels.hpp:107
int_t LDA()
Definition: xlupanels.hpp:100
Definition: xlupanels.hpp:176
int_t find(int_t k)
Definition: u_panels_impl.hpp:112
int_t * index
Definition: xlupanels.hpp:178
int_t isEmpty()
Definition: xlupanels.hpp:258
int_t LDA()
Definition: xlupanels.hpp:214
int_t nbcol(int_t k)
Definition: xlupanels.hpp:223
Ftype * blkPtr(int_t k)
Definition: xlupanels.hpp:238
Ftype * val
Definition: xlupanels.hpp:179
int_t nblocks()
Definition: xlupanels.hpp:203
int_t gid(int_t k)
Definition: xlupanels.hpp:217
int_t * colList(int_t k)
Definition: xlupanels.hpp:228
@ SYNC
Definition: commWrapper.hpp:4
typename std::conditional< std::is_same< Ftype, double >::value, dLUstruct_t, typename std::conditional< std::is_same< Ftype, float >::value, sLUstruct_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zLUstruct_t, void >::type >::type >::type LUStruct_type
Definition: luAuxStructTemplated.hpp:102
typename std::conditional< std::is_same< Ftype, double >::value, dtrf3Dpartition_t, typename std::conditional< std::is_same< Ftype, float >::value, strf3Dpartition_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, ztrf3Dpartition_t, void >::type >::type >::type trf3dpartitionType
Definition: luAuxStructTemplated.hpp:87
typename std::conditional< std::is_same< Ftype, float >::value, float, typename std::conditional< std::is_same< Ftype, double >::value||std::is_same< Ftype, doublecomplex >::value, double, float >::type >::type threshPivValType
Definition: luAuxStructTemplated.hpp:70
typename std::conditional< std::is_same< Ftype, double >::value, ddiagFactBufs_t, typename std::conditional< std::is_same< Ftype, float >::value, sdiagFactBufs_t, typename std::conditional< std::is_same< Ftype, doublecomplex >::value, zdiagFactBufs_t, void >::type >::type >::type diagFactBufs_type
Definition: luAuxStructTemplated.hpp:147
#define GLOBAL_BLOCK_NOT_FOUND
Definition: lupanels.hpp:16
Ftype * getBigV(int_t ldt, int_t num_threads)
Definition: lupanels_impl.hpp:73
int numProcsPerNode(MPI_Comm baseCommunicator)
Definition: lupanels.cpp:397
Definition: util_dist.h:199
Definition: util_dist.h:101
Definition: superlu_defs.h:414
gridinfo_t grid2d
Definition: superlu_defs.h:419
superlu_scope_t zscp
Definition: superlu_defs.h:418
superlu_scope_t rscp
Definition: superlu_defs.h:416
int iam
Definition: superlu_defs.h:420
superlu_scope_t cscp
Definition: superlu_defs.h:417
int_t nprow
Definition: superlu_defs.h:409
int_t npcol
Definition: superlu_defs.h:410
int iam
Definition: superlu_defs.h:408
Definition: superlu_defs.h:989
treeTopoInfo_t topoInfo
Definition: superlu_defs.h:999
Definition: superlu_defs.h:728
int num_lookaheads
Definition: superlu_defs.h:757
int Np
Definition: superlu_defs.h:399
MPI_Comm comm
Definition: superlu_defs.h:398
int_t * eTreeTopLims
Definition: superlu_defs.h:972
Definition: xlupanels.hpp:335
int_t Pr
Definition: xlupanels.hpp:340
int_t dSchurCompUpdateExcludeOne(int_t k, int_t ex, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:528
diagFactBufs_type< Ftype > ** dFBufs
Definition: xlupanels.hpp:363
int_t maxUvalCount
Definition: xlupanels.hpp:375
int_t lookAheadUpdate(int_t k, int_t laIdx, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:457
std::vector< int_t * > LidxRecvBufs
Definition: xlupanels.hpp:382
int_t dDiagFactorPanelSolve(int_t k, int_t offset, diagFactBufs_type< Ftype > **dFBufs)
Definition: lupanels_impl.hpp:558
int_t mycol
Definition: xlupanels.hpp:340
int numDiagBufs
Definition: xlupanels.hpp:351
gridinfo_t * grid
Definition: xlupanels.hpp:339
xlpanel_t< Ftype > * lPanelVec
Definition: xlupanels.hpp:336
int_t blockUpdate(int_t k, int_t ii, int_t jj, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:497
int_t * indirectCol
Definition: xlupanels.hpp:345
int_t myrow
Definition: xlupanels.hpp:340
int_t packedU2skyline(LUStruct_type< Ftype > *LUstruct)
Definition: lupanels_impl.hpp:427
int nThreads
Definition: xlupanels.hpp:344
int_t maxLvl
Definition: xlupanels.hpp:360
superlu_dist_options_t * options
Definition: xlupanels.hpp:355
std::vector< int_t > UvalSendCounts
Definition: xlupanels.hpp:387
int_t * indirect
Definition: xlupanels.hpp:345
Ftype * bigV
Definition: xlupanels.hpp:346
int_t maxUidxCount
Definition: xlupanels.hpp:376
xLUstruct_t(int_t nsupers, int_t ldt_, trf3dpartitionType< Ftype > *trf3Dpartition, LUStruct_type< Ftype > *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT_, superlu_dist_options_t *options_, SuperLUStat_t *stat, threshPivValType< Ftype > thresh_, int *info_)
Definition: lupanels_impl.hpp:83
std::vector< int_t * > UidxRecvBufs
Definition: xlupanels.hpp:383
std::vector< Ftype * > LvalRecvBufs
Definition: xlupanels.hpp:380
int_t maxLidxCount
Definition: xlupanels.hpp:374
diagFactBufs_type< Ftype > ** initDiagFactBufsArr(int_t mxLeafNode, int_t ldt)
Definition: lupanels_impl.hpp:16
int_t nsupers
Definition: xlupanels.hpp:342
int_t ldt
Definition: xlupanels.hpp:340
int_t maxLvalCount
Definition: xlupanels.hpp:373
int_t dScatter(int_t m, int_t n, int_t gi, int_t gj, Ftype *V, int_t ldv, int_t *srcRowList, int_t *srcColList)
Definition: lupanels_impl.hpp:372
trf3dpartitionType< Ftype > * trf3Dpartition
Definition: xlupanels.hpp:359
indirectMapType
Definition: xlupanels.hpp:429
std::vector< int_t > LidxSendCounts
Definition: xlupanels.hpp:388
int superlu_acc_offload
Definition: xlupanels.hpp:364
int * isNodeInMyGrid
Definition: xlupanels.hpp:347
std::vector< int_t > UidxSendCounts
Definition: xlupanels.hpp:389
std::vector< Ftype * > UvalRecvBufs
Definition: xlupanels.hpp:381
int maxLeafNodes
Definition: xlupanels.hpp:361
xupanel_t< Ftype > * uPanelVec
Definition: xlupanels.hpp:337
int_t dPanelBcast(int_t k, int_t offset)
Definition: lupanels_impl.hpp:589
int_t * computeIndirectMap(indirectMapType direction, int_t srcLen, int_t *srcVec, int_t dstLen, int_t *dstVec)
Definition: lupanels_impl.hpp:340
int_t krow(int_t k)
Definition: xlupanels.hpp:402
int_t dSchurComplementUpdate(int_t k, xlpanel_t< Ftype > &lpanel, xupanel_t< Ftype > &upanel)
Definition: lupanels_impl.hpp:314
std::vector< int_t > LvalSendCounts
Definition: xlupanels.hpp:386
int_t * indirectRow
Definition: xlupanels.hpp:345
int_t * xsup
Definition: xlupanels.hpp:341
std::vector< Ftype * > diagFactBufs
Definition: xlupanels.hpp:377
int_t iam
Definition: xlupanels.hpp:340
gridinfo3d_t * grid3d
Definition: xlupanels.hpp:338
int_t Pc
Definition: xlupanels.hpp:340
Definitions which are precision-neutral.
#define CEILING(a, b)
Definition: superlu_defs.h:277
int_t log2i(int_t index)
Definition: supernodal_etree.c:17
#define SuperSize(bnum)
Definition: superlu_defs.h:271
#define mpi_int_t
Definition: superlu_defs.h:120
int sp_ienv_dist(int, superlu_dist_options_t *)
Definition: sp_ienv.c:80
int * getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t *myNodeCount, int_t **treePerm)
Definition: supernodalForest.c:307
int getNumThreads(int)
Definition: trfAux.c:61
#define MYCOL(iam, grid)
Definition: superlu_defs.h:268
int64_t int_t
Definition: superlu_defs.h:119
#define MYROW(iam, grid)
Definition: superlu_defs.h:267
double SuperLU_timer_()
Definition: superlu_timer.c:72
int j
Definition: sutil_dist.c:287
int i
Definition: sutil_dist.c:287
#define SUPERLU_MALLOC(size)
Definition: util_dist.h:48
#define CHECK_MALLOC(pnum, where)
Definition: util_dist.h:56
#define SUPERLU_FREE(addr)
Definition: util_dist.h:54
#define ABORT(err_msg)
Definition: util_dist.h:38