SuperLU Distributed 9.0.0
gpu3d
lupanelsComm3dGPU_impl.hpp
Go to the documentation of this file.
1#include "mpi.h"
2// #include "cublasDefs.hhandle, "
3#include "lupanels.hpp"
5
6#ifdef HAVE_CUDA
7
8template <typename Ftype>
10 int_t **treePerm)
11{
12 int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
13 int_t myGrid = grid3d->zscp.Iam;
14
15#if (DEBUGlevel >= 1)
16 printf(".maxLvl %d\n", maxLvl); fflush(stdout);
17 CHECK_MALLOC(grid3d->iam, "Enter ancestorReduction3dGPU()");
18#endif
19
20 int_t sender, receiver;
21 if ((myGrid % (1 << (ilvl + 1))) == 0)
22 {
23 sender = myGrid + (1 << ilvl);
24 receiver = myGrid;
25 }
26 else
27 {
28 sender = myGrid;
29 receiver = myGrid - (1 << ilvl);
30 }
31
32 /*Reduce all the ancestors*/
33 for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
34 {
35 /* code */
36 // int_t atree = myTreeIdxs[alvl];
37 int_t numNodes = myNodeCount[alvl];
38 int_t *nodeList = treePerm[alvl];
39 double treduce = SuperLU_timer_();
40
41
42 /*first setting the L blocks to zero*/
43 for (int_t node = 0; node < numNodes; ++node) /* for each block column ... */
44 {
45 int_t k0 = nodeList[node];
46
47 if (myGrid == sender)
48 {
49 zSendLPanelGPU(k0, receiver);
50 zSendUPanelGPU(k0, receiver);
51 }
52 else
53 {
54 Ftype alpha = one<Ftype>(); Ftype beta = one<Ftype>();
55
56 zRecvLPanelGPU(k0, sender, alpha, beta);
57 zRecvUPanelGPU(k0, sender, alpha, beta);
58 }
59 }
60 cudaStreamSynchronize(A_gpu.cuStreams[0]) ;
61 // return 0;
62 SCT->ancsReduce += SuperLU_timer_() - treduce;
63 }
64
65#if (DEBUGlevel >= 1)
66 CHECK_MALLOC(grid3d->iam, "Exit ancestorReduction3dGPU()");
67#endif
68
69 return 0;
70}
71
72template <typename Ftype>
74{
75
76 if (mycol == kcol(k0))
77 {
78 int_t lk = g2lCol(k0);
79 if (!lPanelVec[lk].isEmpty())
80 {
81 MPI_Send(lPanelVec[lk].blkPtrGPU(0), lPanelVec[lk].nzvalSize(),
82 get_mpi_type<Ftype>(), receiverGrid, k0, grid3d->zscp.comm);
83 SCT->commVolRed += lPanelVec[lk].nzvalSize() * sizeof(Ftype);
84 }
85 }
86 return 0;
87}
88
89template <typename Ftype>
90int_t xLUstruct_t<Ftype>::zRecvLPanelGPU(int_t k0, int_t senderGrid, Ftype alpha, Ftype beta)
91{
92 if (mycol == kcol(k0))
93 {
94 int_t lk = g2lCol(k0);
95 if (!lPanelVec[lk].isEmpty())
96 {
97
98 MPI_Status status;
99 MPI_Recv(A_gpu.LvalRecvBufs[0], lPanelVec[lk].nzvalSize(), get_mpi_type<Ftype>(), senderGrid, k0,
100 grid3d->zscp.comm, &status);
101
102 /*reduce the updates*/
103 cublasHandle_t handle=A_gpu.cuHandles[0];
104 cudaStream_t cuStream = A_gpu.cuStreams[0];
105 cublasSetStream(handle, cuStream);
106 myCublasScal<Ftype>(handle, lPanelVec[lk].nzvalSize(), &alpha, lPanelVec[lk].blkPtrGPU(0), 1);
107 myCublasAxpy<Ftype>(handle, lPanelVec[lk].nzvalSize(), &beta, A_gpu.LvalRecvBufs[0], 1, lPanelVec[lk].blkPtrGPU(0), 1);
108 cudaStreamSynchronize(cuStream);
109
110 }
111 }
112 return 0;
113}
114
115template <typename Ftype>
117{
118
119 if (myrow == krow(k0))
120 {
121 int_t lk = g2lRow(k0);
122 if (!uPanelVec[lk].isEmpty())
123 {
124 MPI_Send(uPanelVec[lk].blkPtrGPU(0), uPanelVec[lk].nzvalSize(),
125 get_mpi_type<Ftype>(), receiverGrid, k0, grid3d->zscp.comm);
126 SCT->commVolRed += uPanelVec[lk].nzvalSize() * sizeof(Ftype);
127 }
128 }
129 return 0;
130}
131
132template <typename Ftype>
133int_t xLUstruct_t<Ftype>::zRecvUPanelGPU(int_t k0, int_t senderGrid, Ftype alpha, Ftype beta)
134{
135 if (myrow == krow(k0))
136 {
137 int_t lk = g2lRow(k0);
138 if (!uPanelVec[lk].isEmpty())
139 {
140
141 MPI_Status status;
142 MPI_Recv(A_gpu.UvalRecvBufs[0], uPanelVec[lk].nzvalSize(), get_mpi_type<Ftype>(), senderGrid, k0,
143 grid3d->zscp.comm, &status);
144
145 /*reduce the updates*/
146 cublasHandle_t handle=A_gpu.cuHandles[0];
147 cudaStream_t cuStream = A_gpu.cuStreams[0];
148 cublasSetStream(handle, cuStream);
149 myCublasScal<Ftype>(handle, uPanelVec[lk].nzvalSize(), &alpha, uPanelVec[lk].blkPtrGPU(0), 1);
150 myCublasAxpy<Ftype>(handle, uPanelVec[lk].nzvalSize(), &beta, A_gpu.UvalRecvBufs[0], 1, uPanelVec[lk].blkPtrGPU(0), 1);
151 cudaStreamSynchronize(cuStream);
152 }
153 }
154 return 0;
155}
156
157#endif
Definition: xlupanels.hpp:335
int_t log2i(int_t index)
Definition: supernodal_etree.c:17
int64_t int_t
Definition: superlu_defs.h:119
double SuperLU_timer_()
Definition: superlu_timer.c:72
#define CHECK_MALLOC(pnum, where)
Definition: util_dist.h:56