SuperLU Distributed 9.0.0
dsparseTreeFactorGPU_impl.hpp
1#pragma once
2#include <cstdio>
3#include <cinttypes>
4#include "superlu_ddefs.h"
5#include "lupanels.hpp"
6#include "xlupanels.hpp"
7
8#ifdef HAVE_CUDA
9#include "lupanels_GPU.cuh"
10#include "xlupanels_GPU.cuh"
11#include "batch_block_copy.h"
12
13
14int getBufferOffset(int k0, int k1, int winSize, int winParity, int halfWin)
15{
16 int offset = (k0 - k1) % winSize;
17 if (winParity % 2)
18 offset += halfWin;
19
20 return offset;
21}
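// getBufferOffset maps a node's position within the current lookahead window to one of
// the 2*halfWin lookahead buffers, alternating between the two halves of the buffer pool
// on consecutive windows. For example, with numLA = 4 buffers (halfWin = 2) and
// winSize = 2, an even-parity window maps k0 - k1 = 0,1 to buffers 0,1, the next
// (odd-parity) window maps them to buffers 2,3, and the window after that reuses 0,1,
// so broadcasts for the next window never overwrite buffers still in use by the current one.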
22
23#if 0
24template <typename Ftype>
25xLUMarshallData<Ftype>::xLUMarshallData()
26{
27 dev_diag_ptrs = dev_panel_ptrs = NULL;
28 dev_diag_ld_array = dev_diag_dim_array = dev_info_array = NULL;
29 dev_panel_ld_array = dev_panel_dim_array = NULL;
30}
31
32template <typename Ftype>
33xLUMarshallData<Ftype>::~xLUMarshallData()
34{
35 gpuErrchk(cudaFree(dev_diag_ptrs));
36 gpuErrchk(cudaFree(dev_panel_ptrs));
37 gpuErrchk(cudaFree(dev_diag_ld_array));
38 gpuErrchk(cudaFree(dev_diag_dim_array));
39 gpuErrchk(cudaFree(dev_info_array));
40 gpuErrchk(cudaFree(dev_panel_ld_array));
41 gpuErrchk(cudaFree(dev_panel_dim_array));
42}
43
44template <typename Ftype>
45void xLUMarshallData<Ftype>::setBatchSize(int batch_size)
46{
47 gpuErrchk(cudaMalloc(&dev_diag_ptrs, batch_size * sizeof(Ftype *)));
48 gpuErrchk(cudaMalloc(&dev_panel_ptrs, batch_size * sizeof(Ftype *)));
49
50 gpuErrchk(cudaMalloc(&dev_diag_ld_array, batch_size * sizeof(int)));
51 gpuErrchk(cudaMalloc(&dev_diag_dim_array, (batch_size + 1) * sizeof(int)));
52 gpuErrchk(cudaMalloc(&dev_info_array, batch_size * sizeof(int)));
53 gpuErrchk(cudaMalloc(&dev_panel_ld_array, batch_size * sizeof(int)));
54 gpuErrchk(cudaMalloc(&dev_panel_dim_array, (batch_size + 1) * sizeof(int)));
55
56 host_diag_ptrs.resize(batch_size);
57 host_diag_ld_array.resize(batch_size);
58 host_diag_dim_array.resize(batch_size);
59 host_panel_ptrs.resize(batch_size);
60 host_panel_ld_array.resize(batch_size);
61 host_panel_dim_array.resize(batch_size);
62}
63
64template <typename Ftype>
65void xLUMarshallData<Ftype>::setMaxDiag()
66{
67 max_diag = 0;
68 for (int i = 0; i < batchsize; i++)
69 max_diag = SUPERLU_MAX(max_diag, host_diag_dim_array[i]);
70}
71
72template <typename Ftype>
73void xLUMarshallData<Ftype>::setMaxPanel()
74{
75 max_panel = 0;
76 for (int i = 0; i < batchsize; i++)
77 max_panel = SUPERLU_MAX(max_panel, host_panel_dim_array[i]);
78}
79
80template <typename Ftype>
81xSCUMarshallData<Ftype>::xSCUMarshallData()
82{
83 dev_A_ptrs = dev_B_ptrs = dev_C_ptrs = NULL;
84 dev_lda_array = dev_ldb_array = dev_ldc_array = NULL;
85 dev_m_array = dev_n_array = dev_k_array = NULL;
86 dev_gpu_lpanels = NULL;
87 dev_gpu_upanels = NULL;
88 dev_ist = dev_iend = dev_jst = dev_jend = NULL;
89 dev_maxGemmCols = dev_maxGemmRows = NULL;
90}
91
92template <typename Ftype>
93xSCUMarshallData<Ftype>::~xSCUMarshallData()
94{
95 gpuErrchk(cudaFree(dev_A_ptrs));
96 gpuErrchk(cudaFree(dev_B_ptrs));
97 gpuErrchk(cudaFree(dev_C_ptrs));
98 gpuErrchk(cudaFree(dev_lda_array));
99 gpuErrchk(cudaFree(dev_ldb_array));
100 gpuErrchk(cudaFree(dev_ldc_array));
101 gpuErrchk(cudaFree(dev_m_array));
102 gpuErrchk(cudaFree(dev_n_array));
103 gpuErrchk(cudaFree(dev_k_array));
104 gpuErrchk(cudaFree(dev_gpu_lpanels));
105 gpuErrchk(cudaFree(dev_gpu_upanels));
106 gpuErrchk(cudaFree(dev_ist));
107 gpuErrchk(cudaFree(dev_iend));
108 gpuErrchk(cudaFree(dev_jst));
109 gpuErrchk(cudaFree(dev_jend));
110 gpuErrchk(cudaFree(dev_maxGemmCols));
111 gpuErrchk(cudaFree(dev_maxGemmRows));
112}
113
114template <typename Ftype>
115void xSCUMarshallData<Ftype>::setBatchSize(int batch_size)
116{
117 gpuErrchk(cudaMalloc(&dev_A_ptrs, batch_size * sizeof(Ftype *)));
118 gpuErrchk(cudaMalloc(&dev_B_ptrs, batch_size * sizeof(Ftype *)));
119 gpuErrchk(cudaMalloc(&dev_C_ptrs, batch_size * sizeof(Ftype *)));
120
121 gpuErrchk(cudaMalloc(&dev_lda_array, batch_size * sizeof(int)));
122 gpuErrchk(cudaMalloc(&dev_ldb_array, batch_size * sizeof(int)));
123 gpuErrchk(cudaMalloc(&dev_ldc_array, batch_size * sizeof(int)));
124
125 gpuErrchk(cudaMalloc(&dev_m_array, (batch_size + 1) * sizeof(int)));
126 gpuErrchk(cudaMalloc(&dev_n_array, (batch_size + 1) * sizeof(int)));
127 gpuErrchk(cudaMalloc(&dev_k_array, (batch_size + 1) * sizeof(int)));
128
129 gpuErrchk(cudaMalloc(&dev_ist, batch_size * sizeof(int)));
130 gpuErrchk(cudaMalloc(&dev_iend, batch_size * sizeof(int)));
131 gpuErrchk(cudaMalloc(&dev_jst, batch_size * sizeof(int)));
132 gpuErrchk(cudaMalloc(&dev_jend, batch_size * sizeof(int)));
133
134 gpuErrchk(cudaMalloc(&dev_maxGemmCols, batch_size * sizeof(int)));
135 gpuErrchk(cudaMalloc(&dev_maxGemmRows, batch_size * sizeof(int)));
136
137 gpuErrchk(cudaMalloc(&dev_gpu_lpanels, batch_size * sizeof(lpanelGPU_t)));
138 gpuErrchk(cudaMalloc(&dev_gpu_upanels, batch_size * sizeof(upanelGPU_t)));
139
140 host_A_ptrs.resize(batch_size);
141 host_B_ptrs.resize(batch_size);
142 host_C_ptrs.resize(batch_size);
143 host_lda_array.resize(batch_size);
144 host_ldb_array.resize(batch_size);
145 host_ldc_array.resize(batch_size);
146 host_m_array.resize(batch_size);
147 host_n_array.resize(batch_size);
148 host_k_array.resize(batch_size);
149 upanels.resize(batch_size);
150 lpanels.resize(batch_size);
151 host_gpu_upanels.resize(batch_size);
152 host_gpu_lpanels.resize(batch_size);
153 ist.resize(batch_size);
154 iend.resize(batch_size);
155 jst.resize(batch_size);
156 jend.resize(batch_size);
157 maxGemmRows.resize(batch_size);
158 maxGemmCols.resize(batch_size);
159}
160
161template <typename Ftype>
162void xSCUMarshallData<Ftype>::setMaxDims()
163{
164 max_n = max_k = max_m = max_ilen = max_jlen = 0;
165 for (int i = 0; i < batchsize; i++)
166 {
167 max_m = SUPERLU_MAX(max_m, host_m_array[i]);
168 max_n = SUPERLU_MAX(max_n, host_n_array[i]);
169 max_k = SUPERLU_MAX(max_k, host_k_array[i]);
170 max_ilen = SUPERLU_MAX(max_ilen, iend[i] - ist[i]);
171 max_jlen = SUPERLU_MAX(max_jlen, jend[i] - jst[i]);
172 }
173}
174
175template <typename Ftype>
176void xSCUMarshallData<Ftype>::copyToGPU()
177{
178 gpuErrchk(cudaMemcpy(dev_A_ptrs, host_A_ptrs.data(), batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));
179 gpuErrchk(cudaMemcpy(dev_B_ptrs, host_B_ptrs.data(), batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));
180 gpuErrchk(cudaMemcpy(dev_C_ptrs, host_C_ptrs.data(), batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));
181
182 gpuErrchk(cudaMemcpy(dev_lda_array, host_lda_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
183 gpuErrchk(cudaMemcpy(dev_ldb_array, host_ldb_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
184 gpuErrchk(cudaMemcpy(dev_ldc_array, host_ldc_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
185
186 gpuErrchk(cudaMemcpy(dev_m_array, host_m_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
187 gpuErrchk(cudaMemcpy(dev_n_array, host_n_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
188 gpuErrchk(cudaMemcpy(dev_k_array, host_k_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
189
190 gpuErrchk(cudaMemcpy(dev_ist, ist.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
191 gpuErrchk(cudaMemcpy(dev_iend, iend.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
192 gpuErrchk(cudaMemcpy(dev_jst, jst.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
193 gpuErrchk(cudaMemcpy(dev_jend, jend.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
194}
195
196template <typename Ftype>
197void xSCUMarshallData<Ftype>::copyPanelDataToGPU()
198{
199 gpuErrchk(cudaMemcpy(dev_gpu_lpanels, host_gpu_lpanels.data(), batchsize * sizeof(lpanelGPU_t), cudaMemcpyHostToDevice));
200 gpuErrchk(cudaMemcpy(dev_gpu_upanels, host_gpu_upanels.data(), batchsize * sizeof(upanelGPU_t), cudaMemcpyHostToDevice));
201}
202
203#endif
205
206template <typename Ftype>
207int_t xLUstruct_t<Ftype>::dDFactPSolveGPU(int_t k, int_t offset, diagFactBufs_type<Ftype> **dFBufs)
208{
209 // This is the new version, with the diagonal factorization performed on the GPU;
210 // it differs from dDiagFactorPanelSolveGPU, which performs the diagonal factorization on the CPU.
211
212 /* Sherry: argument dFBufs[] is on CPU, not used in this routine */
213
214 double t0 = SuperLU_timer_();
215 int ksupc = SuperSize(k);
216 cublasHandle_t cubHandle = A_gpu.cuHandles[offset];
217 cusolverDnHandle_t cusolverH = A_gpu.cuSolveHandles[offset];
218 cudaStream_t cuStream = A_gpu.cuStreams[offset];
219
220 /*======= Diagonal Factorization ======*/
221 if (iam == procIJ(k, k))
222 {
223 lPanelVec[g2lCol(k)].diagFactorCuSolver(k,
224 cusolverH, cuStream,
225 A_gpu.diagFactWork[offset], A_gpu.diagFactInfo[offset], // CPU pointers
226 A_gpu.dFBufs[offset], ksupc, // CPU pointers
227 thresh, xsup, options, stat, info);
228 }
229
230 // CHECK_MALLOC(iam, "after diagFactorCuSolver()");
231
232 // TODO: need to synchronize the cuda stream
233 /*======= Diagonal Broadcast ======*/
234 if (myrow == krow(k))
235 MPI_Bcast((void *)A_gpu.dFBufs[offset], ksupc * ksupc,
236 get_mpi_type<Ftype>(), kcol(k), (grid->rscp).comm);
237
238 // CHECK_MALLOC(iam, "after row Bcast");
239
240 if (mycol == kcol(k))
241 MPI_Bcast((void *)A_gpu.dFBufs[offset], ksupc * ksupc,
242 get_mpi_type<Ftype>(), krow(k), (grid->cscp).comm);
243
244 // do the panel solves
245 if (myrow == krow(k))
246 {
247 uPanelVec[g2lRow(k)].panelSolveGPU(
248 cubHandle, cuStream,
249 ksupc, A_gpu.dFBufs[offset], ksupc);
250 cudaStreamSynchronize(cuStream); // synchronize before broadcast
251 }
252
253 if (mycol == kcol(k))
254 {
255 lPanelVec[g2lCol(k)].panelSolveGPU(
256 cubHandle, cuStream,
257 ksupc, A_gpu.dFBufs[offset], ksupc);
258 cudaStreamSynchronize(cuStream);
259 }
260 SCT->tDiagFactorPanelSolve += (SuperLU_timer_() - t0);
261
262 return 0;
263} /* dDFactPSolveGPU */
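// Data flow above: the owner of block (k,k) factors the diagonal block on the GPU into
// A_gpu.dFBufs[offset], which is then broadcast to the processes in row krow(k) and in
// column kcol(k); those processes use it as the triangular factor for the U-panel and
// L-panel solves, each followed by a stream synchronization so the panels are ready for
// the subsequent panel broadcast.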
264
265template <typename Ftype>
266int_t xLUstruct_t<Ftype>::dDFactPSolveGPU(int_t k, int_t handle_offset, int buffer_offset, diagFactBufs_type<Ftype> **dFBufs)
267{
268 // This is the new version, with the diagonal factorization performed on the GPU;
269 // it differs from dDiagFactorPanelSolveGPU, which performs the diagonal factorization on the CPU.
270
271 /* Sherry: argument dFBufs[] is on CPU, not used in this routine */
272
273 double t0 = SuperLU_timer_();
274 int ksupc = SuperSize(k);
275 cublasHandle_t cubHandle = A_gpu.cuHandles[handle_offset];
276 cusolverDnHandle_t cusolverH = A_gpu.cuSolveHandles[handle_offset];
277 cudaStream_t cuStream = A_gpu.cuStreams[handle_offset];
278
279 /*======= Diagonal Factorization ======*/
280 if (iam == procIJ(k, k))
281 {
282 lPanelVec[g2lCol(k)].diagFactorCuSolver(k,
283 cusolverH, cuStream,
284 A_gpu.diagFactWork[handle_offset], A_gpu.diagFactInfo[handle_offset], // CPU pointers
285 A_gpu.dFBufs[buffer_offset], ksupc, // CPU pointers
286 thresh, xsup, options, stat, info);
287 }
288
289 // CHECK_MALLOC(iam, "after diagFactorCuSolver()");
290
291 // TODO: need to synchronize the cuda stream
292 /*======= Diagonal Broadcast ======*/
293 if (myrow == krow(k))
294 MPI_Bcast((void *)A_gpu.dFBufs[buffer_offset], ksupc * ksupc,
295 get_mpi_type<Ftype>(), kcol(k), (grid->rscp).comm);
296
297 // CHECK_MALLOC(iam, "after row Bcast");
298
299 if (mycol == kcol(k))
300 MPI_Bcast((void *)A_gpu.dFBufs[buffer_offset], ksupc * ksupc,
301 get_mpi_type<Ftype>(), krow(k), (grid->cscp).comm);
302
303 // do the panel solves
304 if (myrow == krow(k))
305 {
306 uPanelVec[g2lRow(k)].panelSolveGPU(
307 cubHandle, cuStream,
308 ksupc, A_gpu.dFBufs[buffer_offset], ksupc);
309 cudaStreamSynchronize(cuStream); // synchronize before broadcast
310 }
311
312 if (mycol == kcol(k))
313 {
314 lPanelVec[g2lCol(k)].panelSolveGPU(
315 cubHandle, cuStream,
316 ksupc, A_gpu.dFBufs[buffer_offset], ksupc);
317 cudaStreamSynchronize(cuStream);
318 }
319 SCT->tDiagFactorPanelSolve += (SuperLU_timer_() - t0);
320
321 return 0;
322} /* dDFactPSolveGPU */
323
324/* This performs diag factor on CPU */
325template <typename Ftype>
326int_t xLUstruct_t<Ftype>::dDiagFactorPanelSolveGPU(int_t k, int_t offset, diagFactBufs_type<Ftype> **dFBufs)
327{
328 double t0 = SuperLU_timer_();
329 int_t ksupc = SuperSize(k);
330 cublasHandle_t cubHandle = A_gpu.cuHandles[offset];
331 cudaStream_t cuStream = A_gpu.cuStreams[offset];
332 if (iam == procIJ(k, k))
333 {
334
335 lPanelVec[g2lCol(k)].diagFactorPackDiagBlockGPU(k,
336 dFBufs[offset]->BlockUFactor, ksupc, // CPU pointers
337 dFBufs[offset]->BlockLFactor, ksupc, // CPU pointers
338 thresh, xsup, options, stat, info);
339 }
340
341 /*======= Diagonal Broadcast ======*/
342 if (myrow == krow(k))
343 MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc,
344 get_mpi_type<Ftype>(), kcol(k), (grid->rscp).comm);
345 if (mycol == kcol(k))
346 MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc,
347 get_mpi_type<Ftype>(), krow(k), (grid->cscp).comm);
348
349 /*======= Panel Update ======*/
350 if (myrow == krow(k))
351 {
352
353 cudaMemcpy(A_gpu.dFBufs[offset], dFBufs[offset]->BlockLFactor,
354 ksupc * ksupc * sizeof(Ftype), cudaMemcpyHostToDevice);
355 uPanelVec[g2lRow(k)].panelSolveGPU(
356 cubHandle, cuStream,
357 ksupc, A_gpu.dFBufs[offset], ksupc);
358 cudaStreamSynchronize(cuStream); // synchronize before broadcast
359 }
360
361 if (mycol == kcol(k))
362 {
363 cudaMemcpy(A_gpu.dFBufs[offset], dFBufs[offset]->BlockUFactor,
364 ksupc * ksupc * sizeof(Ftype), cudaMemcpyHostToDevice);
365 lPanelVec[g2lCol(k)].panelSolveGPU(
366 cubHandle, cuStream,
367 ksupc, A_gpu.dFBufs[offset], ksupc);
368 cudaStreamSynchronize(cuStream);
369 }
370 SCT->tDiagFactorPanelSolve += (SuperLU_timer_() - t0);
371
372 return 0;
373}
374
375template <typename Ftype>
376int_t xLUstruct_t<Ftype>::dPanelBcastGPU(int_t k, int_t offset)
377{
378 double t0 = SuperLU_timer_();
379 /*======= Panel Broadcast ======*/
380 // upanel_t k_upanel(UidxRecvBufs[offset], UvalRecvBufs[offset],
381 // A_gpu.UidxRecvBufs[offset], A_gpu.UvalRecvBufs[offset]);
382 // lpanel_t k_lpanel(LidxRecvBufs[offset], LvalRecvBufs[offset],
383 // A_gpu.LidxRecvBufs[offset], A_gpu.LvalRecvBufs[offset]);
384 // if (myrow == krow(k))
385 // {
386 // k_upanel = uPanelVec[g2lRow(k)];
387 // }
388 // if (mycol == kcol(k))
389 // k_lpanel = lPanelVec[g2lCol(k)];
390 xupanel_t<Ftype> k_upanel = getKUpanel(k, offset);
391 xlpanel_t<Ftype> k_lpanel = getKLpanel(k, offset);
392
393 if (UidxSendCounts[k] > 0)
394 {
395 // assuming GPU direct is available
396 MPI_Bcast(k_upanel.gpuPanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm);
397 MPI_Bcast(k_upanel.gpuPanel.val, UvalSendCounts[k], get_mpi_type<Ftype>(), krow(k), grid3d->cscp.comm);
398 // copy the index to cpu
399 gpuErrchk(cudaMemcpy(k_upanel.index, k_upanel.gpuPanel.index,
400 sizeof(int_t) * UidxSendCounts[k], cudaMemcpyDeviceToHost));
401 }
402
403 if (LidxSendCounts[k] > 0)
404 {
405 MPI_Bcast(k_lpanel.gpuPanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm);
406 MPI_Bcast(k_lpanel.gpuPanel.val, LvalSendCounts[k], get_mpi_type<Ftype>(), kcol(k), grid3d->rscp.comm);
407 gpuErrchk(cudaMemcpy(k_lpanel.index, k_lpanel.gpuPanel.index,
408 sizeof(int_t) * LidxSendCounts[k], cudaMemcpyDeviceToHost));
409 }
410 SCT->tPanelBcast += (SuperLU_timer_() - t0);
411 return 0;
412}
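// dPanelBcastGPU broadcasts the k-th U panel along the process column (root krow(k)) and
// the k-th L panel along the process row (root kcol(k)) directly from device memory,
// relying on the GPU-direct MPI assumption noted above; only the panel index arrays are
// copied back to the host afterwards, while the numerical values stay on the GPU.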
413
414template <typename Ftype>
415int_t xLUstruct_t<Ftype>::dsparseTreeFactorGPU(
416 sForest_t *sforest,
417 diagFactBufs_type<Ftype> **dFBufs, // size maxEtree level
418 gEtreeInfo_t *gEtreeInfo, // global etree info
419
420 int tag_ub)
421{
422 int_t nnodes = sforest->nNodes; // number of nodes in the tree
423 if (nnodes < 1)
424 {
425 return 1;
426 }
427
428 int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization
429 treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo;
430 int_t *myIperm = treeTopoInfo->myIperm;
431 int_t maxTopoLevel = treeTopoInfo->numLvl;
432 int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims;
433
434 /*main loop over all the levels*/
435 int_t numLA = SUPERLU_MIN(A_gpu.numCudaStreams, getNumLookAhead(options));
436
437#if (DEBUGlevel >= 1)
438 CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorGPU()");
439
440 printf("Using New code V100 with GPU acceleration\n");
441 fflush(stdout);
442 printf(". lookahead numLA %" PRIdMAX "\n", static_cast<intmax_t>(numLA));
443 fflush(stdout);
444#endif
445 // start the pipeline. Sherry: need to free these 3 arrays
446 int *donePanelBcast = int32Malloc_dist(nnodes);
447 int *donePanelSolve = int32Malloc_dist(nnodes);
448 int *localNumChildrenLeft = int32Malloc_dist(nnodes);
449
450 // TODO: not needed, remove after testing
451 for (int_t i = 0; i < nnodes; i++)
452 {
453 donePanelBcast[i] = 0;
454 donePanelSolve[i] = 0;
455 localNumChildrenLeft[i] = 0;
456 }
457
458 for (int_t k0 = 0; k0 < nnodes; k0++)
459 {
460 int_t k = perm_c_supno[k0];
461 int_t k_parent = gEtreeInfo->setree[k];
462 int_t ik = myIperm[k_parent];
463 if (ik > -1 && ik < nnodes)
464 localNumChildrenLeft[ik]++;
465 }
466
467 // start the pipeline
468 int_t topoLvl = 0;
469 int_t k_st = eTreeTopLims[topoLvl];
470 int_t k_end = eTreeTopLims[topoLvl + 1];
471
472 // TODO: make this asynchronous
473 for (int_t k0 = k_st; k0 < k_end; k0++)
474 {
475 int_t k = perm_c_supno[k0];
476 int_t offset = 0;
477 // dDiagFactorPanelSolveGPU(k, offset, dFBufs);
478 dDFactPSolveGPU(k, offset, dFBufs);
479 donePanelSolve[k0] = 1;
480 }
481
482 // TODO: it's really the panels that need to be doubled;
483 // everything else can remain as it is
484 int_t winSize = SUPERLU_MIN(numLA / 2, eTreeTopLims[1]);
485
486 // printf(". lookahead winSize %d\n", winSize);
487#if ( PRNTlevel >= 1 )
488 printf(". lookahead winSize %" PRIdMAX "\n", static_cast<intmax_t>(winSize));
489 fflush(stdout);
490#endif
491
492 for (int k0 = k_st; k0 < winSize; ++k0)
493 {
494 int_t k = perm_c_supno[k0];
495 int_t offset = k0 % numLA;
496 if (!donePanelBcast[k0])
497 {
498 dPanelBcastGPU(k, offset);
499 donePanelBcast[k0] = 1;
500 }
501 } /*for (int k0 = k_st; k0 < winSize; ++k0)*/
502
503 int_t k1 = 0;
504 int_t winParity = 0;
505 int_t halfWin = numLA / 2;
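// Pipelined main loop over windows of winSize nodes: (1) launch the lookahead updates of
// the parent panels for every node in the window, (2) synchronize those updates,
// (3) factor/solve any parent whose children have all completed and launch the remaining
// Schur-complement update for each node, (4) broadcast the panels of the next window into
// the other half of the lookahead buffers, shrinking the window if a node still has
// unfinished children, and (5) synchronize the streams of the current window before
// advancing k1 and flipping the window parity.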
506 while (k1 < nnodes)
507 {
508 for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1 + winSize); ++k0)
509 {
510 int_t k = perm_c_supno[k0];
511 int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin);
512 xupanel_t<Ftype> k_upanel = getKUpanel(k, offset);
513 xlpanel_t<Ftype> k_lpanel = getKLpanel(k, offset);
514 int_t k_parent = gEtreeInfo->setree[k];
515 /* L o o k A h e a d P a n e l U p d a t e */
516 if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
517 lookAheadUpdateGPU(offset, k, k_parent, k_lpanel, k_upanel);
518 }
519
520 for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1 + winSize); ++k0)
521 {
522 // int_t k = perm_c_supno[k0];
523 int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin);
524 SyncLookAheadUpdate(offset);
525 }
526
527 for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1 + winSize); ++k0)
528 {
529 int_t k = perm_c_supno[k0];
530 int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin);
531 xupanel_t<Ftype> k_upanel = getKUpanel(k, offset);
532 xlpanel_t<Ftype> k_lpanel = getKLpanel(k, offset);
533 int_t k_parent = gEtreeInfo->setree[k];
534 /* Look Ahead Panel Solve */
535 if (k_parent < nsupers)
536 {
537 int_t k0_parent = myIperm[k_parent];
538 if (k0_parent > 0 && k0_parent < nnodes)
539 {
540 localNumChildrenLeft[k0_parent]--;
541 if (topoLvl < maxTopoLevel - 1 && !localNumChildrenLeft[k0_parent])
542 {
543#if (PRNTlevel >= 2)
544 printf("parent %" PRIdMAX " of node %" PRIdMAX " during second phase\n", static_cast<intmax_t>(k0_parent), static_cast<intmax_t>(k0));
545#endif
546 int_t dOffset = 0; // this is wrong
547 // dDiagFactorPanelSolveGPU(k_parent, dOffset,dFBufs);
548 dDFactPSolveGPU(k_parent, dOffset, dFBufs);
549 donePanelSolve[k0_parent] = 1;
550 }
551 }
552 }
553
554 /*proceed with remaining SchurComplement update */
555 if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
556 dSchurCompUpdateExcludeOneGPU(offset, k, k_parent, k_lpanel, k_upanel);
557 }
558
559 int_t k1_next = k1 + winSize;
560 int_t oldWinSize = winSize;
561 for (int_t k0_next = k1_next; k0_next < SUPERLU_MIN(nnodes, k1_next + winSize); ++k0_next)
562 {
563 int k_next = perm_c_supno[k0_next];
564 if (!localNumChildrenLeft[k0_next])
565 {
566 // int offset_next = (k0_next-k1_next)%winSize;
567 // if(!(winParity%2))
568 // offset_next += halfWin;
569
570 int_t offset_next = getBufferOffset(k0_next, k1_next, winSize, winParity + 1, halfWin);
571 dPanelBcastGPU(k_next, offset_next);
572 donePanelBcast[k0_next] = 1;
573 // printf("Trying %d on offset %d\n", k0_next, offset_next);
574 }
575 else
576 {
577 winSize = k0_next - k1_next;
578 break;
579 }
580 }
581
582 for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1 + oldWinSize); ++k0)
583 {
584 int_t k = perm_c_supno[k0];
585 // int_t offset = (k0-k1)%oldWinSize;
586 // if(winParity%2)
587 // offset+= halfWin;
588 int_t offset = getBufferOffset(k0, k1, oldWinSize, winParity, halfWin);
589 // printf("Syncing stream %d on offset %d\n", k0, offset);
590 if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
591 gpuErrchk(cudaStreamSynchronize(A_gpu.cuStreams[offset]));
592 }
593
594 k1 = k1_next;
595 winParity++;
596 }
597
598#if 0
599
600 for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
601 {
602 /* code */
603 int_t k_st = eTreeTopLims[topoLvl];
604 int_t k_end = eTreeTopLims[topoLvl + 1];
605 for (int_t k0 = k_st; k0 < k_end; ++k0)
606 {
607 int_t k = perm_c_supno[k0];
608
609 int_t ksupc = SuperSize(k);
610 cublasHandle_t cubHandle = A_gpu.cuHandles[0];
611 cudaStream_t cuStream = A_gpu.cuStreams[0];
612 dDiagFactorPanelSolveGPU(k, 0, dFBufs);
613 /*======= Panel Broadcast ======*/
614 // panelBcastGPU(k, 0);
615 int_t offset = k0%numLA;
616 dPanelBcastGPU(k, offset);
617
618 /*======= Schurcomplement Update ======*/
619 upanel_t k_upanel(UidxRecvBufs[offset], UvalRecvBufs[offset],
620 A_gpu.UidxRecvBufs[offset], A_gpu.UvalRecvBufs[offset]);
621 lpanel_t k_lpanel(LidxRecvBufs[offset], LvalRecvBufs[offset],
622 A_gpu.LidxRecvBufs[offset], A_gpu.LvalRecvBufs[offset]);
623 if (myrow == krow(k))
624 {
625 k_upanel = uPanelVec[g2lRow(k)];
626 }
627 if (mycol == kcol(k))
628 k_lpanel = lPanelVec[g2lCol(k)];
629
630 if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
631 {
632 int streamId = 0;
633
634#if 0
635
636 dSchurComplementUpdateGPU(
637 streamId,
638 k, k_lpanel, k_upanel);
639#else
640 int_t k_parent = gEtreeInfo->setree[k];
641 lookAheadUpdateGPU(
642 streamId,
643 k, k_parent, k_lpanel, k_upanel);
644 dSchurCompUpdateExcludeOneGPU(
645 streamId,
646 k, k_parent, k_lpanel, k_upanel);
647
648#endif
649
650 }
651 } /*for k0= k_st:k_end */
652 } /*for topoLvl = 0:maxTopoLevel*/
653
654 #endif /* match #if 0 at line 598 before "for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)" */
655
656 /* Sherry added 2/1/23 */
657 SUPERLU_FREE(donePanelBcast);
658 SUPERLU_FREE(donePanelSolve);
659 SUPERLU_FREE(localNumChildrenLeft);
660
661#if (DEBUGlevel >= 1)
662 CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorGPU()");
663#endif
664
665 return 0;
666} /* dsparseTreeFactorGPU */
667
668
669#if 0
670template <typename Ftype>
671void xLUstruct_t<Ftype>::marshallBatchedBufferCopyData(int k_st, int k_end, int_t *perm_c_supno)
672{
673 // First gather up all the pointer and meta data on the host
674 LUMarshallData &mdata = A_gpu.marshall_data;
675 Ftype **panel_ptrs = mdata.host_panel_ptrs.data();
676 int *panel_ld_batch = mdata.host_panel_ld_array.data();
677 int *panel_dim_batch = mdata.host_panel_dim_array.data();
678 Ftype **diag_ptrs = mdata.host_diag_ptrs.data();
679 int *diag_ld_batch = mdata.host_diag_ld_array.data();
680 int *diag_dim_batch = mdata.host_diag_dim_array.data();
681
682 mdata.batchsize = 0;
683
684 for (int_t k0 = k_st; k0 < k_end; k0++)
685 {
686 int_t k = perm_c_supno[k0];
687 int_t buffer_offset = k0 - k_st;
688 int ksupc = SuperSize(k);
689
690 if (iam == procIJ(k, k))
691 {
692 lpanel_t &lpanel = lPanelVec[g2lCol(k)];
693
694 assert(mdata.batchsize < mdata.host_diag_ptrs.size());
695
696 panel_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0);
697 panel_ld_batch[mdata.batchsize] = lpanel.LDA();
698 panel_dim_batch[mdata.batchsize] = ksupc;
699
700 diag_ptrs[mdata.batchsize] = A_gpu.dFBufs[buffer_offset];
701 diag_ld_batch[mdata.batchsize] = ksupc;
702 diag_dim_batch[mdata.batchsize] = ksupc;
703
704 mdata.batchsize++;
705 }
706 }
707
708 mdata.setMaxDiag();
709 mdata.setMaxPanel();
710
711 // Then copy the marshalled data over to the GPU
712 gpuErrchk(cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));
713 gpuErrchk(cudaMemcpy(mdata.dev_diag_ld_array, diag_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice));
714 gpuErrchk(cudaMemcpy(mdata.dev_diag_dim_array, diag_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice));
715 gpuErrchk(cudaMemcpy(mdata.dev_panel_ptrs, panel_ptrs, mdata.batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));
716 gpuErrchk(cudaMemcpy(mdata.dev_panel_ld_array, panel_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice));
717 gpuErrchk(cudaMemcpy(mdata.dev_panel_dim_array, panel_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice));
718}
719
720#endif
722
723template <typename Ftype>
724void xLUstruct_t<Ftype>::dFactBatchSolve(int k_st, int k_end, int_t *perm_c_supno)
725{
726#if 0//def HAVE_MAGMA
727 //#ifdef HAVE_MAGMA
728 // Marshall the data for the leaf level batched LU decomposition
729 LUMarshallData &mdata = A_gpu.marshall_data;
730 cudaStream_t stream = A_gpu.cuStreams[0];
731 marshallBatchedLUData(k_st, k_end, perm_c_supno);
732
733 int info = magma_dgetrf_vbatched(
734 mdata.dev_diag_dim_array, mdata.dev_diag_dim_array,
735 mdata.dev_diag_ptrs, mdata.dev_diag_ld_array,
736 NULL, mdata.dev_info_array, mdata.batchsize,
737 A_gpu.magma_queue);
738
739 // Hackathon change: assuming individual matrices in a local batch are not distributed
740 // so we don't do the buffer copies anymore
741 // // Copy the diagonal block data over to the bcast buffers
742 // marshallBatchedBufferCopyData(k_st, k_end, perm_c_supno);
743
744 // copyBlock_vbatch(
745 // stream, mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, mdata.max_diag, mdata.max_panel,
746 // mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_panel_ptrs, mdata.dev_panel_ld_array,
747 // mdata.batchsize
748 // );
749
750 // Upper panel triangular solves
751 marshallBatchedTRSMUData(k_st, k_end, perm_c_supno);
752
753 magmablas_dtrsm_vbatched(
754 MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
755 mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, 1.0,
756 mdata.dev_diag_ptrs, mdata.dev_diag_ld_array,
757 mdata.dev_panel_ptrs, mdata.dev_panel_ld_array,
758 mdata.batchsize, A_gpu.magma_queue);
759
760 // Lower panel triangular solves
761 marshallBatchedTRSMLData(k_st, k_end, perm_c_supno);
762
763 magmablas_dtrsm_vbatched(
764 MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
765 mdata.dev_panel_dim_array, mdata.dev_diag_dim_array, 1.0,
766 mdata.dev_diag_ptrs, mdata.dev_diag_ld_array,
767 mdata.dev_panel_ptrs, mdata.dev_panel_ld_array,
768 mdata.batchsize, A_gpu.magma_queue);
769
770 // Wajih: This should be converted to a single bcast if possible
771 // Hackathon change: assuming individual matrices in a local batch are not distributed
772 // for (int k0 = k_st; k0 < k_end; k0++)
773 // {
774 // int k = perm_c_supno[k0];
775 // int offset = 0;
776 // /*======= Panel Broadcast ======*/
777 // dPanelBcastGPU(k, offset);
778 // }
779
780 // Initialize the schur complement update marshall data
781 initSCUMarshallData(k_st, k_end, perm_c_supno);
782 xSCUMarshallData &sc_mdata = A_gpu.sc_marshall_data;
783
784 // Keep marshalling while there are batches to be processed
785 int done_i = 0;
786 int total_batches = 0;
787 while (done_i == 0)
788 {
789 done_i = marshallSCUBatchedDataOuter(k_st, k_end, perm_c_supno);
790 int done_j = 0;
791 while (done_i == 0 && done_j == 0)
792 {
793 done_j = marshallSCUBatchedDataInner(k_st, k_end, perm_c_supno);
794 if (done_j != 1)
795 {
796 total_batches++;
797 magmablas_dgemm_vbatched_max_nocheck(
798 MagmaNoTrans, MagmaNoTrans, sc_mdata.dev_m_array, sc_mdata.dev_n_array, sc_mdata.dev_k_array,
799 1.0, sc_mdata.dev_A_ptrs, sc_mdata.dev_lda_array, sc_mdata.dev_B_ptrs, sc_mdata.dev_ldb_array,
800 0.0, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, sc_mdata.batchsize,
801 sc_mdata.max_m, sc_mdata.max_n, sc_mdata.max_k, A_gpu.magma_queue);
802
803 scatterGPU_batchDriver(
804 sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend,
805 sc_mdata.max_ilen, sc_mdata.max_jlen, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array,
806 A_gpu.maxSuperSize, ldt, sc_mdata.dev_gpu_lpanels, sc_mdata.dev_gpu_upanels,
807 dA_gpu, sc_mdata.batchsize, A_gpu.cuStreams[0]);
808 }
809 }
810 }
811 // printf("SCU batches = %d\n", total_batches);
812#endif
813}
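// dFactBatchSolve (its MAGMA-based body is currently compiled out by the "#if 0" above)
// is meant to factor all independent nodes of one elimination-tree level as a single
// variable-size batch: marshall the diagonal blocks, call magma_dgetrf_vbatched, perform
// the two batched triangular solves for the U and L panels, and then repeatedly marshall
// and launch batched Schur-complement GEMMs followed by the batched scatter kernel.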
814
815#if 0
817template <typename Ftype>
818int xLUstruct_t<Ftype>::dsparseTreeFactorBatchGPU(
819 sForest_t *sforest,
820 diagFactBufs_type<Ftype> **dFBufs, // size maxEtree level
821 gEtreeInfo_t *gEtreeInfo, // global etree info
822 int tag_ub)
823{
824#if 0// def HAVE_MAGMA
825 int nnodes = sforest->nNodes; // number of nodes in the tree
826 int topoLvl, k_st, k_end, k0, k, offset, ksupc;
827 if (nnodes < 1)
828 {
829 return 1;
830 }
831
832 int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization
833 treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo;
834 int_t *myIperm = treeTopoInfo->myIperm;
835 int_t maxTopoLevel = treeTopoInfo->numLvl;
836 int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims;
837
838#if (DEBUGlevel >= 1)
839 CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorBatchGPU()");
840#endif
841 printf("Using level-based scheduling on GPU\n");
842 fflush(stdout);
843
844 /* For all the leaves at level 0 */
845 topoLvl = 0;
846 k_st = eTreeTopLims[topoLvl];
847 k_end = eTreeTopLims[topoLvl + 1];
848 printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end);
849 fflush(stdout);
850
851#if 0
852 //ToDo: make this batched
853 for (k0 = k_st; k0 < k_end; k0++)
854 {
855 k = perm_c_supno[k0];
856 offset = k0 - k_st;
857 // dDiagFactorPanelSolveGPU(k, offset, dFBufs);
858 dDFactPSolveGPU(k, 0, offset, dFBufs);
859
860 /*======= Panel Broadcast ======*/
861 dPanelBcastGPU(k, offset); // does this only if (UidxSendCounts[k] > 0)
862 //donePanelSolve[k0]=1;
863
864 /*======= Schurcomplement Update ======*/
865 /* UidxSendCounts are computed in LUstruct_v100 constructor in LUpanels.cpp */
866 if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) {
867 // k_upanel.checkCorrectness();
868 int streamId = 0;
869 upanel_t k_upanel = getKUpanel(k,offset);
870 lpanel_t k_lpanel = getKLpanel(k,offset);
871 dSchurComplementUpdateGPU( streamId,
872 k, k_lpanel, k_upanel);
873 // cudaStreamSynchronize(cuStream); // there is sync inside the kernel
874 }
875 }
876
877 /* Main loop over all the internal levels */
878 for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl) {
879
880 k_st = eTreeTopLims[topoLvl];
881 k_end = eTreeTopLims[topoLvl + 1];
882
883 printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end); fflush(stdout);
884
885 /* loop over all the nodes at level topoLvl */
886 for (k0 = k_st; k0 < k_end; ++k0) { /* ToDo: batch this */
887 k = perm_c_supno[k0];
888 offset = k0 - k_st;
889 // offset = getBufferOffset(k0, k1, winSize, winParity, halfWin);
890 //ksupc = SuperSize(k);
891
892 dDFactPSolveGPU(k, 0, offset, dFBufs);
893
894 /*======= Panel Broadcast ======*/
895 dPanelBcastGPU(k, offset); // does this only if (UidxSendCounts[k] > 0)
896
897 /*======= Schurcomplement Update ======*/
898 if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
899 {
900 // k_upanel.checkCorrectness();
901 int streamId = 0;
902
903 //#define NDEBUG
904#ifndef NDEBUG
905 checkGPU(); // ??
906#endif
907 upanel_t k_upanel = getKUpanel(k,offset);
908 lpanel_t k_lpanel = getKLpanel(k,offset);
909 dSchurComplementUpdateGPU(streamId,
910 k, k_lpanel, k_upanel);
911// cudaStreamSynchronize(cuStream); // there is sync inside the kernel
912#if 0 // Sherry commented out 7/4/23
913#ifndef NDEBUG
914 dSchurComplementUpdate(k, k_lpanel, k_upanel); // ?? why do this on CPU ?
915 cudaStreamSynchronize(cuStream);
916 checkGPU();
917#endif
918#endif
919 }
920 // MPI_Barrier(grid3d->comm);
921 } /* end for k0= k_st:k_end */
922 } /* end for topoLvl = 0:maxTopoLevel */
923#else
924 printf("Using batched code\n");
925 double t0 = SuperLU_timer_();
926
927 // Copy the nodelist to the GPU
928 gpuErrchk(cudaMemcpy(A_gpu.dperm_c_supno, perm_c_supno, sizeof(int) * sforest->nNodes, cudaMemcpyHostToDevice));
929
930 // Solve level by level
931 dFactBatchSolve(k_st, k_end, perm_c_supno);
932 for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl)
933 {
934
935 k_st = eTreeTopLims[topoLvl];
936 k_end = eTreeTopLims[topoLvl + 1];
937 dFactBatchSolve(k_st, k_end, perm_c_supno);
938 }
939 printf("Batch time = %.3f\n", SuperLU_timer_() - t0);
940#endif
941
942#if (DEBUGlevel >= 1)
943 CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorBatchGPU()");
944#endif
945
946#else
947 printf("MAGMA is required for batched execution!\n");
948 fflush(stdout);
949 exit(0);
950
951#endif /* match ifdef have_magma */
952
953 return 0;
954} /* end dsparseTreeFactorBatchGPU */
955
956#endif /* match #if 0 */
958
959
960// TODO: needs to be merged as a single factorization function
961template <typename Ftype>
962int_t xLUstruct_t<Ftype>::dsparseTreeFactorGPUBaseline(
963 sForest_t *sforest,
964 diagFactBufs_type<Ftype> **dFBufs, // size maxEtree level
965 gEtreeInfo_t *gEtreeInfo, // global etree info
966
967 int tag_ub)
968{
969 int_t nnodes = sforest->nNodes; // number of nodes in the tree
970 if (nnodes < 1)
971 {
972 return 1;
973 }
974
975 printf("Using New code V100 with GPU acceleration\n");
976#if (DEBUGlevel >= 1)
977 CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorGPUBaseline()");
978#endif
979
980 int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization
981 treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo;
982 int_t *myIperm = treeTopoInfo->myIperm;
983 int_t maxTopoLevel = treeTopoInfo->numLvl;
984 int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims;
985
986 /*main loop over all the levels*/
987 int_t numLA = getNumLookAhead(options);
988
989 // start the pipeline
990
991 for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
992 {
993 /* code */
994 int_t k_st = eTreeTopLims[topoLvl];
995 int_t k_end = eTreeTopLims[topoLvl + 1];
996 for (int_t k0 = k_st; k0 < k_end; ++k0)
997 {
998 int_t k = perm_c_supno[k0];
999 int_t offset = k0 - k_st;
1000 int_t ksupc = SuperSize(k);
1001 cublasHandle_t cubHandle = A_gpu.cuHandles[0];
1002 cudaStream_t cuStream = A_gpu.cuStreams[0];
1003 /*======= Diagonal Factorization ======*/
1004 if (iam == procIJ(k, k))
1005 {
1006// #define NDEBUG
1007#ifndef NDEBUG
1008 lPanelVec[g2lCol(k)].checkGPU();
1009 lPanelVec[g2lCol(k)].diagFactor(k, dFBufs[offset]->BlockUFactor, ksupc,
1010 thresh, xsup, options, stat, info);
1011 lPanelVec[g2lCol(k)].packDiagBlock(dFBufs[offset]->BlockLFactor, ksupc);
1012#endif
1013 lPanelVec[g2lCol(k)].diagFactorPackDiagBlockGPU(k,
1014 dFBufs[offset]->BlockUFactor, ksupc, // CPU pointers
1015 dFBufs[offset]->BlockLFactor, ksupc, // CPU pointers
1016 thresh, xsup, options, stat, info);
1017// cudaStreamSynchronize(cuStream);
1018#ifndef NDEBUG
1019 cudaStreamSynchronize(cuStream);
1020 lPanelVec[g2lCol(k)].checkGPU();
1021#endif
1022 }
1023
1024 /*======= Diagonal Broadcast ======*/
1025 if (myrow == krow(k))
1026 MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc,
1027 get_mpi_type<Ftype>(), kcol(k), (grid->rscp).comm);
1028 if (mycol == kcol(k))
1029 MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc,
1030 get_mpi_type<Ftype>(), krow(k), (grid->cscp).comm);
1031
1032 /*======= Panel Update ======*/
1033 if (myrow == krow(k))
1034 {
1035#ifndef NDEBUG
1036 uPanelVec[g2lRow(k)].checkGPU();
1037#endif
1038 cudaMemcpy(A_gpu.dFBufs[0], dFBufs[offset]->BlockLFactor,
1039 ksupc * ksupc * sizeof(Ftype), cudaMemcpyHostToDevice);
1040 uPanelVec[g2lRow(k)].panelSolveGPU(
1041 cubHandle, cuStream,
1042 ksupc, A_gpu.dFBufs[0], ksupc);
1043 cudaStreamSynchronize(cuStream); // synchronize before broadcast
1044#ifndef NDEBUG
1045 uPanelVec[g2lRow(k)].panelSolve(ksupc, dFBufs[offset]->BlockLFactor, ksupc);
1046 cudaStreamSynchronize(cuStream);
1047 uPanelVec[g2lRow(k)].checkGPU();
1048#endif
1049 }
1050
1051 if (mycol == kcol(k))
1052 {
1053 cudaMemcpy(A_gpu.dFBufs[0], dFBufs[offset]->BlockUFactor,
1054 ksupc * ksupc * sizeof(Ftype), cudaMemcpyHostToDevice);
1055 lPanelVec[g2lCol(k)].panelSolveGPU(
1056 cubHandle, cuStream,
1057 ksupc, A_gpu.dFBufs[0], ksupc);
1058 cudaStreamSynchronize(cuStream);
1059#ifndef NDEBUG
1060 lPanelVec[g2lCol(k)].panelSolve(ksupc, dFBufs[offset]->BlockUFactor, ksupc);
1061 cudaStreamSynchronize(cuStream);
1062 lPanelVec[g2lCol(k)].checkGPU();
1063#endif
1064 }
1065
1066 /*======= Panel Broadcast ======*/
1067 upanel_t k_upanel(UidxRecvBufs[0], UvalRecvBufs[0],
1068 A_gpu.UidxRecvBufs[0], A_gpu.UvalRecvBufs[0]);
1069 lpanel_t k_lpanel(LidxRecvBufs[0], LvalRecvBufs[0],
1070 A_gpu.LidxRecvBufs[0], A_gpu.LvalRecvBufs[0]);
1071 if (myrow == krow(k))
1072 {
1073 k_upanel = uPanelVec[g2lRow(k)];
1074 }
1075 if (mycol == kcol(k))
1076 k_lpanel = lPanelVec[g2lCol(k)];
1077
1078 if (UidxSendCounts[k] > 0)
1079 {
1080 // assuming GPU direct is available
1081 MPI_Bcast(k_upanel.gpuPanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm);
1082 MPI_Bcast(k_upanel.gpuPanel.val, UvalSendCounts[k], get_mpi_type<Ftype>(), krow(k), grid3d->cscp.comm);
1083 // copy the index to cpu
1084 cudaMemcpy(k_upanel.index, k_upanel.gpuPanel.index,
1085 sizeof(int_t) * UidxSendCounts[k], cudaMemcpyDeviceToHost);
1086
1087#ifndef NDEBUG
1088 MPI_Bcast(k_upanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm);
1089 MPI_Bcast(k_upanel.val, UvalSendCounts[k], get_mpi_type<Ftype>(), krow(k), grid3d->cscp.comm);
1090#endif
1091 }
1092
1093 if (LidxSendCounts[k] > 0)
1094 {
1095 MPI_Bcast(k_lpanel.gpuPanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm);
1096 MPI_Bcast(k_lpanel.gpuPanel.val, LvalSendCounts[k], get_mpi_type<Ftype>(), kcol(k), grid3d->rscp.comm);
1097 cudaMemcpy(k_lpanel.index, k_lpanel.gpuPanel.index,
1098 sizeof(int_t) * LidxSendCounts[k], cudaMemcpyDeviceToHost);
1099
1100#ifndef NDEBUG
1101 MPI_Bcast(k_lpanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm);
1102 MPI_Bcast(k_lpanel.val, LvalSendCounts[k], get_mpi_type<Ftype>(), kcol(k), grid3d->rscp.comm);
1103#endif
1104 }
1105
1106 /*======= Schurcomplement Update ======*/
1107
1108 if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
1109 {
1110 // k_upanel.checkCorrectness();
1111 int streamId = 0;
1112#ifndef NDEBUG
1113 checkGPU();
1114#endif
1115 dSchurComplementUpdateGPU(
1116 streamId,
1117 k, k_lpanel, k_upanel);
1118// cudaStreamSynchronize(cuStream); // there is sync inside the kernel
1119#ifndef NDEBUG
1120 dSchurComplementUpdate(k, k_lpanel, k_upanel);
1121 cudaStreamSynchronize(cuStream);
1122 checkGPU();
1123#endif
1124 }
1125 // MPI_Barrier(grid3d->comm);
1126
1127 } /*for k0= k_st:k_end */
1128
1129 } /*for topoLvl = 0:maxTopoLevel*/
1130
1131#if (DEBUGlevel >= 1)
1132 CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorGPUBaseline()");
1133#endif
1134
1135 return 0;
1136} /* dsparseTreeFactorGPUBaseline */
1137
1138#endif /* match #if HAVE_CUDA */
1139