8template <
typename Ftype>
13 int_t myGrid = grid3d->zscp.Iam;
16 printf(
".maxLvl %d\n", maxLvl); fflush(stdout);
17 CHECK_MALLOC(grid3d->iam,
"Enter ancestorReduction3dGPU()");
20 int_t sender, receiver;
21 if ((myGrid % (1 << (ilvl + 1))) == 0)
23 sender = myGrid + (1 << ilvl);
29 receiver = myGrid - (1 << ilvl);
33 for (
int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
37 int_t numNodes = myNodeCount[alvl];
38 int_t *nodeList = treePerm[alvl];
43 for (
int_t node = 0; node < numNodes; ++node)
45 int_t k0 = nodeList[node];
49 zSendLPanelGPU(k0, receiver);
50 zSendUPanelGPU(k0, receiver);
54 Ftype alpha = one<Ftype>(); Ftype beta = one<Ftype>();
56 zRecvLPanelGPU(k0, sender, alpha, beta);
57 zRecvUPanelGPU(k0, sender, alpha, beta);
60 cudaStreamSynchronize(A_gpu.cuStreams[0]) ;
66 CHECK_MALLOC(grid3d->iam,
"Exit ancestorReduction3dGPU()");
72template <
typename Ftype>
76 if (mycol == kcol(k0))
78 int_t lk = g2lCol(k0);
79 if (!lPanelVec[lk].isEmpty())
81 MPI_Send(lPanelVec[lk].blkPtrGPU(0), lPanelVec[lk].nzvalSize(),
82 get_mpi_type<Ftype>(), receiverGrid, k0, grid3d->zscp.comm);
83 SCT->commVolRed += lPanelVec[lk].nzvalSize() *
sizeof(Ftype);
89template <
typename Ftype>
92 if (mycol == kcol(k0))
94 int_t lk = g2lCol(k0);
95 if (!lPanelVec[lk].isEmpty())
99 MPI_Recv(A_gpu.LvalRecvBufs[0], lPanelVec[lk].nzvalSize(), get_mpi_type<Ftype>(), senderGrid, k0,
100 grid3d->zscp.comm, &status);
103 cublasHandle_t handle=A_gpu.cuHandles[0];
104 cudaStream_t cuStream = A_gpu.cuStreams[0];
105 cublasSetStream(handle, cuStream);
106 myCublasScal<Ftype>(handle, lPanelVec[lk].nzvalSize(), &alpha, lPanelVec[lk].blkPtrGPU(0), 1);
107 myCublasAxpy<Ftype>(handle, lPanelVec[lk].nzvalSize(), &beta, A_gpu.LvalRecvBufs[0], 1, lPanelVec[lk].blkPtrGPU(0), 1);
108 cudaStreamSynchronize(cuStream);
115template <
typename Ftype>
119 if (myrow == krow(k0))
121 int_t lk = g2lRow(k0);
122 if (!uPanelVec[lk].isEmpty())
124 MPI_Send(uPanelVec[lk].blkPtrGPU(0), uPanelVec[lk].nzvalSize(),
125 get_mpi_type<Ftype>(), receiverGrid, k0, grid3d->zscp.comm);
126 SCT->commVolRed += uPanelVec[lk].nzvalSize() *
sizeof(Ftype);
132template <
typename Ftype>
135 if (myrow == krow(k0))
137 int_t lk = g2lRow(k0);
138 if (!uPanelVec[lk].isEmpty())
142 MPI_Recv(A_gpu.UvalRecvBufs[0], uPanelVec[lk].nzvalSize(), get_mpi_type<Ftype>(), senderGrid, k0,
143 grid3d->zscp.comm, &status);
146 cublasHandle_t handle=A_gpu.cuHandles[0];
147 cudaStream_t cuStream = A_gpu.cuStreams[0];
148 cublasSetStream(handle, cuStream);
149 myCublasScal<Ftype>(handle, uPanelVec[lk].nzvalSize(), &alpha, uPanelVec[lk].blkPtrGPU(0), 1);
150 myCublasAxpy<Ftype>(handle, uPanelVec[lk].nzvalSize(), &beta, A_gpu.UvalRecvBufs[0], 1, uPanelVec[lk].blkPtrGPU(0), 1);
151 cudaStreamSynchronize(cuStream);
Definition: xlupanels.hpp:335
int_t log2i(int_t index)
Definition: supernodal_etree.c:17
int64_t int_t
Definition: superlu_defs.h:119
double SuperLU_timer_()
Definition: superlu_timer.c:72
#define CHECK_MALLOC(pnum, where)
Definition: util_dist.h:56