int getBufferOffset(int k0, int k1, int winSize, int winParity, int halfWin)
{
    int offset = (k0 - k1) % winSize;
    if (winParity % 2)
        offset += halfWin;
    return offset;
}
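// A sketch of the buffer indexing (assuming the caller keeps winSize <= halfWin = numLA / 2):
// within a window, offsets cycle through 0 .. winSize-1, and the winParity term shifts every
// other window into the upper half of the numLA look-ahead buffers, so two consecutive windows
// never reuse the same buffers. E.g. with numLA = 4 and winSize = 2, an even-parity window uses
// offsets {0, 1} and the following odd-parity window uses {2, 3}.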
template <typename Ftype>
xLUMarshallData<Ftype>::xLUMarshallData()
{
    dev_diag_ptrs = dev_panel_ptrs = NULL;
    dev_diag_ld_array = dev_diag_dim_array = dev_info_array = NULL;
    dev_panel_ld_array = dev_panel_dim_array = NULL;
}
template <typename Ftype>
xLUMarshallData<Ftype>::~xLUMarshallData()
{
    gpuErrchk(cudaFree(dev_diag_ptrs));
    gpuErrchk(cudaFree(dev_panel_ptrs));
    gpuErrchk(cudaFree(dev_diag_ld_array));
    gpuErrchk(cudaFree(dev_diag_dim_array));
    gpuErrchk(cudaFree(dev_info_array));
    gpuErrchk(cudaFree(dev_panel_ld_array));
    gpuErrchk(cudaFree(dev_panel_dim_array));
}
template <typename Ftype>
void xLUMarshallData<Ftype>::setBatchSize(int batch_size)
{
    gpuErrchk(cudaMalloc(&dev_diag_ptrs, batch_size * sizeof(Ftype *)));
    gpuErrchk(cudaMalloc(&dev_panel_ptrs, batch_size * sizeof(Ftype *)));

    gpuErrchk(cudaMalloc(&dev_diag_ld_array, batch_size * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_diag_dim_array, (batch_size + 1) * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_info_array, batch_size * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_panel_ld_array, batch_size * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_panel_dim_array, (batch_size + 1) * sizeof(int)));

    host_diag_ptrs.resize(batch_size);
    host_diag_ld_array.resize(batch_size);
    host_diag_dim_array.resize(batch_size);
    host_panel_ptrs.resize(batch_size);
    host_panel_ld_array.resize(batch_size);
    host_panel_dim_array.resize(batch_size);
}
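// Note: the *_dim_array device buffers carry one extra entry (batch_size + 1); MAGMA's
// vbatched routines, which consume these arrays in the batched solve below, expect their
// dimension arrays to have batchCount + 1 entries.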
template <typename Ftype>
void xLUMarshallData<Ftype>::setMaxDiag()
{
    max_diag = 0;
    for (int i = 0; i < batchsize; i++)
        max_diag = SUPERLU_MAX(max_diag, host_diag_dim_array[i]);
}
template <typename Ftype>
void xLUMarshallData<Ftype>::setMaxPanel()
{
    max_panel = 0;
    for (int i = 0; i < batchsize; i++)
        max_panel = SUPERLU_MAX(max_panel, host_panel_dim_array[i]);
}
template <typename Ftype>
xSCUMarshallData<Ftype>::xSCUMarshallData()
{
    dev_A_ptrs = dev_B_ptrs = dev_C_ptrs = NULL;
    dev_lda_array = dev_ldb_array = dev_ldc_array = NULL;
    dev_m_array = dev_n_array = dev_k_array = NULL;
    dev_gpu_lpanels = NULL;
    dev_gpu_upanels = NULL;
    dev_ist = dev_iend = dev_jst = dev_jend = NULL;
    dev_maxGemmCols = dev_maxGemmRows = NULL;
}
template <typename Ftype>
xSCUMarshallData<Ftype>::~xSCUMarshallData()
{
    gpuErrchk(cudaFree(dev_A_ptrs));
    gpuErrchk(cudaFree(dev_B_ptrs));
    gpuErrchk(cudaFree(dev_C_ptrs));
    gpuErrchk(cudaFree(dev_lda_array));
    gpuErrchk(cudaFree(dev_ldb_array));
    gpuErrchk(cudaFree(dev_ldc_array));
    gpuErrchk(cudaFree(dev_m_array));
    gpuErrchk(cudaFree(dev_n_array));
    gpuErrchk(cudaFree(dev_k_array));
    gpuErrchk(cudaFree(dev_gpu_lpanels));
    gpuErrchk(cudaFree(dev_gpu_upanels));
    gpuErrchk(cudaFree(dev_ist));
    gpuErrchk(cudaFree(dev_iend));
    gpuErrchk(cudaFree(dev_jst));
    gpuErrchk(cudaFree(dev_jend));
    gpuErrchk(cudaFree(dev_maxGemmCols));
    gpuErrchk(cudaFree(dev_maxGemmRows));
}
template <typename Ftype>
void xSCUMarshallData<Ftype>::setBatchSize(int batch_size)
{
    gpuErrchk(cudaMalloc(&dev_A_ptrs, batch_size * sizeof(Ftype *)));
    gpuErrchk(cudaMalloc(&dev_B_ptrs, batch_size * sizeof(Ftype *)));
    gpuErrchk(cudaMalloc(&dev_C_ptrs, batch_size * sizeof(Ftype *)));

    gpuErrchk(cudaMalloc(&dev_lda_array, batch_size * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_ldb_array, batch_size * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_ldc_array, batch_size * sizeof(int)));

    gpuErrchk(cudaMalloc(&dev_m_array, (batch_size + 1) * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_n_array, (batch_size + 1) * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_k_array, (batch_size + 1) * sizeof(int)));

    gpuErrchk(cudaMalloc(&dev_ist, batch_size * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_iend, batch_size * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_jst, batch_size * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_jend, batch_size * sizeof(int)));

    gpuErrchk(cudaMalloc(&dev_maxGemmCols, batch_size * sizeof(int)));
    gpuErrchk(cudaMalloc(&dev_maxGemmRows, batch_size * sizeof(int)));

    gpuErrchk(cudaMalloc(&dev_gpu_lpanels, batch_size * sizeof(lpanelGPU_t)));
    gpuErrchk(cudaMalloc(&dev_gpu_upanels, batch_size * sizeof(upanelGPU_t)));

    host_A_ptrs.resize(batch_size);
    host_B_ptrs.resize(batch_size);
    host_C_ptrs.resize(batch_size);
    host_lda_array.resize(batch_size);
    host_ldb_array.resize(batch_size);
    host_ldc_array.resize(batch_size);
    host_m_array.resize(batch_size);
    host_n_array.resize(batch_size);
    host_k_array.resize(batch_size);
    upanels.resize(batch_size);
    lpanels.resize(batch_size);
    host_gpu_upanels.resize(batch_size);
    host_gpu_lpanels.resize(batch_size);
    ist.resize(batch_size);
    iend.resize(batch_size);
    jst.resize(batch_size);
    jend.resize(batch_size);
    maxGemmRows.resize(batch_size);
    maxGemmCols.resize(batch_size);
}
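// xSCUMarshallData gathers, per supernode in a batch, the operands of the Schur-complement
// GEMM (A/B/C pointers with their leading dimensions and m/n/k sizes) together with the
// row/column index ranges (ist/iend, jst/jend) used when scattering the GEMM result back into
// the destination panels.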
template <typename Ftype>
void xSCUMarshallData<Ftype>::setMaxDims()
{
    max_n = max_k = max_m = max_ilen = max_jlen = 0;
    for (int i = 0; i < batchsize; i++)
    {
        max_m = SUPERLU_MAX(max_m, host_m_array[i]);
        max_n = SUPERLU_MAX(max_n, host_n_array[i]);
        max_k = SUPERLU_MAX(max_k, host_k_array[i]);
        max_ilen = SUPERLU_MAX(max_ilen, iend[i] - ist[i]);
        max_jlen = SUPERLU_MAX(max_jlen, jend[i] - jst[i]);
    }
}
template <typename Ftype>
void xSCUMarshallData<Ftype>::copyToGPU()
{
    gpuErrchk(cudaMemcpy(dev_A_ptrs, host_A_ptrs.data(), batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_B_ptrs, host_B_ptrs.data(), batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_C_ptrs, host_C_ptrs.data(), batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));

    gpuErrchk(cudaMemcpy(dev_lda_array, host_lda_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_ldb_array, host_ldb_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_ldc_array, host_ldc_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));

    gpuErrchk(cudaMemcpy(dev_m_array, host_m_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_n_array, host_n_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_k_array, host_k_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));

    gpuErrchk(cudaMemcpy(dev_ist, ist.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_iend, iend.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_jst, jst.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_jend, jend.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice));
}
template <typename Ftype>
void xSCUMarshallData<Ftype>::copyPanelDataToGPU()
{
    gpuErrchk(cudaMemcpy(dev_gpu_lpanels, host_gpu_lpanels.data(), batchsize * sizeof(lpanelGPU_t), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_gpu_upanels, host_gpu_upanels.data(), batchsize * sizeof(upanelGPU_t), cudaMemcpyHostToDevice));
}
template <typename Ftype>
/* ... function signature omitted (diagonal factorization + panel solves for supernode k,
   using the CUDA handles and buffers at slot `offset`) ... */
{
    // ...
    cublasHandle_t cubHandle = A_gpu.cuHandles[offset];
    cusolverDnHandle_t cusolverH = A_gpu.cuSolveHandles[offset];
    cudaStream_t cuStream = A_gpu.cuStreams[offset];

    // Factor the diagonal block on its owner process.
    if (iam == procIJ(k, k))
    {
        lPanelVec[g2lCol(k)].diagFactorCuSolver(k,
                                                cusolverH, cuStream,
                                                A_gpu.diagFactWork[offset], A_gpu.diagFactInfo[offset],
                                                A_gpu.dFBufs[offset], ksupc,
                                                thresh, xsup, options, stat, info);
    }

    // Broadcast the factored diagonal block along the process row and column.
    if (myrow == krow(k))
        MPI_Bcast((void *)A_gpu.dFBufs[offset], ksupc * ksupc,
                  get_mpi_type<Ftype>(), kcol(k), (grid->rscp).comm);

    if (mycol == kcol(k))
        MPI_Bcast((void *)A_gpu.dFBufs[offset], ksupc * ksupc,
                  get_mpi_type<Ftype>(), krow(k), (grid->cscp).comm);

    // Triangular panel solves against the broadcast diagonal factor.
    if (myrow == krow(k))
    {
        uPanelVec[g2lRow(k)].panelSolveGPU(cubHandle, cuStream,
                                           ksupc, A_gpu.dFBufs[offset], ksupc);
        cudaStreamSynchronize(cuStream);
    }

    if (mycol == kcol(k))
    {
        lPanelVec[g2lCol(k)].panelSolveGPU(cubHandle, cuStream,
                                           ksupc, A_gpu.dFBufs[offset], ksupc);
        cudaStreamSynchronize(cuStream);
    }
    // ...
}
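// The same communication pattern recurs in each factor/panel-solve variant in this file:
// the owner of the diagonal block (iam == procIJ(k, k)) factors it, the factor is broadcast
// within the owner's process row ((grid->rscp).comm, root kcol(k)) and process column
// ((grid->cscp).comm, root krow(k)), and every process holding part of panel k then runs its
// triangular solve against the received block.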
template <typename Ftype>
/* ... function signature omitted (variant taking a separate handle_offset for the CUDA
   handles/streams and buffer_offset for the diagonal-factor buffers) ... */
{
    // ...
    cublasHandle_t cubHandle = A_gpu.cuHandles[handle_offset];
    cusolverDnHandle_t cusolverH = A_gpu.cuSolveHandles[handle_offset];
    cudaStream_t cuStream = A_gpu.cuStreams[handle_offset];

    if (iam == procIJ(k, k))
    {
        lPanelVec[g2lCol(k)].diagFactorCuSolver(k,
                                                cusolverH, cuStream,
                                                A_gpu.diagFactWork[handle_offset], A_gpu.diagFactInfo[handle_offset],
                                                A_gpu.dFBufs[buffer_offset], ksupc,
                                                thresh, xsup, options, stat, info);
    }

    if (myrow == krow(k))
        MPI_Bcast((void *)A_gpu.dFBufs[buffer_offset], ksupc * ksupc,
                  get_mpi_type<Ftype>(), kcol(k), (grid->rscp).comm);

    if (mycol == kcol(k))
        MPI_Bcast((void *)A_gpu.dFBufs[buffer_offset], ksupc * ksupc,
                  get_mpi_type<Ftype>(), krow(k), (grid->cscp).comm);

    if (myrow == krow(k))
    {
        uPanelVec[g2lRow(k)].panelSolveGPU(cubHandle, cuStream,
                                           ksupc, A_gpu.dFBufs[buffer_offset], ksupc);
        cudaStreamSynchronize(cuStream);
    }

    if (mycol == kcol(k))
    {
        lPanelVec[g2lCol(k)].panelSolveGPU(cubHandle, cuStream,
                                           ksupc, A_gpu.dFBufs[buffer_offset], ksupc);
        cudaStreamSynchronize(cuStream);
    }
    // ...
}
template <typename Ftype>
/* ... function signature omitted (variant that factors and packs the diagonal block into the
   host-side dFBufs before the panel solves) ... */
{
    // ...
    cublasHandle_t cubHandle = A_gpu.cuHandles[offset];
    cudaStream_t cuStream = A_gpu.cuStreams[offset];
    if (iam == procIJ(k, k))
    {
        lPanelVec[g2lCol(k)].diagFactorPackDiagBlockGPU(k,
                                                        dFBufs[offset]->BlockUFactor, ksupc,
                                                        dFBufs[offset]->BlockLFactor, ksupc,
                                                        thresh, xsup, options, stat, info);
    }

    if (myrow == krow(k))
        MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc,
                  get_mpi_type<Ftype>(), kcol(k), (grid->rscp).comm);
    if (mycol == kcol(k))
        MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc,
                  get_mpi_type<Ftype>(), krow(k), (grid->cscp).comm);

    if (myrow == krow(k))
    {
        cudaMemcpy(A_gpu.dFBufs[offset], dFBufs[offset]->BlockLFactor,
                   ksupc * ksupc * sizeof(Ftype), cudaMemcpyHostToDevice);
        uPanelVec[g2lRow(k)].panelSolveGPU(cubHandle, cuStream,
                                           ksupc, A_gpu.dFBufs[offset], ksupc);
        cudaStreamSynchronize(cuStream);
    }

    if (mycol == kcol(k))
    {
        cudaMemcpy(A_gpu.dFBufs[offset], dFBufs[offset]->BlockUFactor,
                   ksupc * ksupc * sizeof(Ftype), cudaMemcpyHostToDevice);
        lPanelVec[g2lCol(k)].panelSolveGPU(cubHandle, cuStream,
                                           ksupc, A_gpu.dFBufs[offset], ksupc);
        cudaStreamSynchronize(cuStream);
    }
    // ...
}
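// Note: unlike the cuSolver-based variants above, this path keeps the factored diagonal block
// in the host-side dFBufs (BlockLFactor / BlockUFactor), broadcasts it from host memory, and
// stages it onto the device only immediately before each panel solve.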
template <typename Ftype>
/* ... function signature omitted (panel broadcast for supernode k) ... */
{
    // ...
    if (UidxSendCounts[k] > 0)
    {
        // The U panel (index + values) is broadcast down the process column from the owner row.
        MPI_Bcast(k_upanel.gpuPanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm);
        MPI_Bcast(k_upanel.gpuPanel.val, UvalSendCounts[k], get_mpi_type<Ftype>(), krow(k), grid3d->cscp.comm);
        gpuErrchk(cudaMemcpy(k_upanel.index, k_upanel.gpuPanel.index,
                             sizeof(int_t) * UidxSendCounts[k], cudaMemcpyDeviceToHost));
    }

    if (LidxSendCounts[k] > 0)
    {
        // The L panel is broadcast across the process row from the owner column.
        MPI_Bcast(k_lpanel.gpuPanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm);
        MPI_Bcast(k_lpanel.gpuPanel.val, LvalSendCounts[k], get_mpi_type<Ftype>(), kcol(k), grid3d->rscp.comm);
        gpuErrchk(cudaMemcpy(k_lpanel.index, k_lpanel.gpuPanel.index,
                             sizeof(int_t) * LidxSendCounts[k], cudaMemcpyDeviceToHost));
    }
    // ...
}
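// Note: only the panel index (metadata) arrays are mirrored back to the host after the
// device-side broadcasts; the numerical values stay resident on the GPU, where the
// Schur-complement update kernels consume them.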
template <typename Ftype>
/* ... function signature omitted (dsparseTreeFactorGPU: look-ahead factorization of one
   elimination-tree forest) ... */
{
    // ...
    CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorGPU()");
    // ...
    printf("Using New code V100 with GPU acceleration\n");
    // ...
    printf(". lookahead numLA %d\n", numLA);
    // ...

    // Initialize the per-node bookkeeping used by the look-ahead schedule.
    for (int_t i = 0; i < nnodes; i++)
    {
        donePanelBcast[i] = 0;
        donePanelSolve[i] = 0;
        localNumChildrenLeft[i] = 0;
    }
    // ...

    // Count, for every locally numbered node, how many of its children appear in this forest.
    for (int_t k0 = 0; k0 < nnodes; k0++)
    {
        int_t k = perm_c_supno[k0];
        // ...
        int_t ik = myIperm[k_parent];
        if (ik > -1 && ik < nnodes)
            localNumChildrenLeft[ik]++;
    }
    // ...

    int_t k_st = eTreeTopLims[topoLvl];
    int_t k_end = eTreeTopLims[topoLvl + 1];
    // ...

    // First topological level: factor and panel-solve every leaf supernode.
    for (int_t k0 = k_st; k0 < k_end; k0++)
    {
        int_t k = perm_c_supno[k0];
        // ...
        dDFactPSolveGPU(k, offset, dFBufs);
        donePanelSolve[k0] = 1;
    }
    // ...

#if (PRNTlevel >= 1)
    printf(". lookahead winSize %" PRIdMAX "\n", static_cast<intmax_t>(winSize));
#endif

    // Broadcast the panels of the first look-ahead window.
    for (int k0 = k_st; k0 < winSize; ++k0)
    {
        int_t k = perm_c_supno[k0];
        int_t offset = k0 % numLA;
        if (!donePanelBcast[k0])
        {
            dPanelBcastGPU(k, offset);
            donePanelBcast[k0] = 1;
        }
    }
    // ...

    int_t halfWin = numLA / 2;
    // ... (main loop over look-ahead windows: k1 is the first node of the current window and
    //      winParity is incremented for every successive window) ...

        // Look-ahead update for each node k0 of the current window.
        int_t k = perm_c_supno[k0];
        int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin);
        // ...
        if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
            lookAheadUpdateGPU(offset, k, k_parent, k_lpanel, k_upanel);
        // ...

        int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin);
        SyncLookAheadUpdate(offset);
        // ...

        // Remainder of the Schur-complement update, plus early factorization of a parent
        // whose children have all completed.
        int_t k = perm_c_supno[k0];
        int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin);
        // ...
        if (k_parent < nsupers)
        {
            int_t k0_parent = myIperm[k_parent];
            if (k0_parent > 0 && k0_parent < nnodes)
            {
                localNumChildrenLeft[k0_parent]--;
                if (topoLvl < maxTopoLevel - 1 && !localNumChildrenLeft[k0_parent])
                {
                    // ...
                    printf("parent %d of node %d during second phase\n", k0_parent, k0);
                    // ...
                    dDFactPSolveGPU(k_parent, dOffset, dFBufs);
                    donePanelSolve[k0_parent] = 1;
                }
            }
        }
        // ...
        if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
            dSchurCompUpdateExcludeOneGPU(offset, k, k_parent, k_lpanel, k_upanel);
        // ...

        // Broadcast the panels of the next window while the current one is still being updated.
        int_t k1_next = k1 + winSize;
        int_t oldWinSize = winSize;
        for (int_t k0_next = k1_next; k0_next < SUPERLU_MIN(nnodes, k1_next + winSize); ++k0_next)
        {
            int k_next = perm_c_supno[k0_next];
            if (!localNumChildrenLeft[k0_next])
            {
                // ...
                int_t offset_next = getBufferOffset(k0_next, k1_next, winSize, winParity + 1, halfWin);
                dPanelBcastGPU(k_next, offset_next);
                donePanelBcast[k0_next] = 1;
                // ...
            }
            else
            {
                // Truncate the next window at the first node whose children are not all done.
                winSize = k0_next - k1_next;
                // ...
            }
        }
        // ...

        // Wait for the streams that carried this window's updates before moving on.
        int_t k = perm_c_supno[k0];
        // ...
        int_t offset = getBufferOffset(k0, k1, oldWinSize, winParity, halfWin);
        // ...
        if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
            gpuErrchk(cudaStreamSynchronize(A_gpu.cuStreams[offset]));
    // ... (end of the look-ahead window loop) ...
    // Level-by-level schedule (no look-ahead) over the elimination-tree levels.
    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
    {
        // ...
        int_t k_st = eTreeTopLims[topoLvl];
        int_t k_end = eTreeTopLims[topoLvl + 1];
        for (int_t k0 = k_st; k0 < k_end; ++k0)
        {
            int_t k = perm_c_supno[k0];
            // ...
            cublasHandle_t cubHandle = A_gpu.cuHandles[0];
            cudaStream_t cuStream = A_gpu.cuStreams[0];
            dDiagFactorPanelSolveGPU(k, 0, dFBufs);
            // ...
            int_t offset = k0 % numLA;
            dPanelBcastGPU(k, offset);
            // ...
            upanel_t k_upanel(UidxRecvBufs[offset], UvalRecvBufs[offset],
                              A_gpu.UidxRecvBufs[offset], A_gpu.UvalRecvBufs[offset]);
            lpanel_t k_lpanel(LidxRecvBufs[offset], LvalRecvBufs[offset],
                              A_gpu.LidxRecvBufs[offset], A_gpu.LvalRecvBufs[offset]);
            if (myrow == krow(k))
                k_upanel = uPanelVec[g2lRow(k)];
            if (mycol == kcol(k))
                k_lpanel = lPanelVec[g2lCol(k)];

            if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
            {
                // ...
                dSchurComplementUpdateGPU(
                    /* ... */
                    k, k_lpanel, k_upanel);
                // ...   k, k_parent, k_lpanel, k_upanel);
                dSchurCompUpdateExcludeOneGPU(
                    /* ... */
                    k, k_parent, k_lpanel, k_upanel);
                // ...
            }
        }
    }
    // ...
    CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorGPU()");
    // ...
}
template <typename Ftype>
/* ... function signature omitted (gathers per-supernode diagonal-block and panel
   pointers/dimensions for a batched operation over [k_st, k_end)) ... */
{
    LUMarshallData &mdata = A_gpu.marshall_data;
    Ftype **panel_ptrs = mdata.host_panel_ptrs.data();
    int *panel_ld_batch = mdata.host_panel_ld_array.data();
    int *panel_dim_batch = mdata.host_panel_dim_array.data();
    Ftype **diag_ptrs = mdata.host_diag_ptrs.data();
    int *diag_ld_batch = mdata.host_diag_ld_array.data();
    int *diag_dim_batch = mdata.host_diag_dim_array.data();
    // ...

    for (int_t k0 = k_st; k0 < k_end; k0++)
    {
        int_t k = perm_c_supno[k0];
        int_t buffer_offset = k0 - k_st;
        // ...
        if (iam == procIJ(k, k))
        {
            lpanel_t &lpanel = lPanelVec[g2lCol(k)];
            // ...
            assert(mdata.batchsize < mdata.host_diag_ptrs.size());

            panel_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0);
            panel_ld_batch[mdata.batchsize] = lpanel.LDA();
            panel_dim_batch[mdata.batchsize] = ksupc;

            diag_ptrs[mdata.batchsize] = A_gpu.dFBufs[buffer_offset];
            diag_ld_batch[mdata.batchsize] = ksupc;
            diag_dim_batch[mdata.batchsize] = ksupc;
            // ...
        }
    }
    // ...

    gpuErrchk(cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(mdata.dev_diag_ld_array, diag_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(mdata.dev_diag_dim_array, diag_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(mdata.dev_panel_ptrs, panel_ptrs, mdata.batchsize * sizeof(Ftype *), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(mdata.dev_panel_ld_array, panel_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(mdata.dev_panel_dim_array, panel_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice));
    // ...
}
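// The marshalled batch ends up with one entry per supernode in [k_st, k_end) whose diagonal
// block this process owns; the host-side arrays are populated first and then shipped to the
// device with one cudaMemcpy per array.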
template <typename Ftype>
/* ... function signature omitted (dFactBatchSolve: batched diagonal factorization and panel
   solves for the supernodes in [k_st, k_end)) ... */
{
    LUMarshallData &mdata = A_gpu.marshall_data;
    cudaStream_t stream = A_gpu.cuStreams[0];
    marshallBatchedLUData(k_st, k_end, perm_c_supno);
    // ...

    // Variable-size batched LU of the diagonal blocks (NULL is passed in place of a pivot array).
    int info = magma_dgetrf_vbatched(
        mdata.dev_diag_dim_array, mdata.dev_diag_dim_array,
        mdata.dev_diag_ptrs, mdata.dev_diag_ld_array,
        NULL, mdata.dev_info_array, mdata.batchsize,
        A_gpu.magma_queue);
    // ...

    // Triangular solves for the U panels: unit lower-triangular L applied from the left.
    marshallBatchedTRSMUData(k_st, k_end, perm_c_supno);
    magmablas_dtrsm_vbatched(
        MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
        mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, 1.0,
        mdata.dev_diag_ptrs, mdata.dev_diag_ld_array,
        mdata.dev_panel_ptrs, mdata.dev_panel_ld_array,
        mdata.batchsize, A_gpu.magma_queue);
    // ...

    // Triangular solves for the L panels: non-unit upper-triangular U applied from the right.
    marshallBatchedTRSMLData(k_st, k_end, perm_c_supno);
    magmablas_dtrsm_vbatched(
        MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
        mdata.dev_panel_dim_array, mdata.dev_diag_dim_array, 1.0,
        mdata.dev_diag_ptrs, mdata.dev_diag_ld_array,
        mdata.dev_panel_ptrs, mdata.dev_panel_ld_array,
        mdata.batchsize, A_gpu.magma_queue);
    // ...
    // Batched Schur-complement update: marshall GEMM operands, run the variable-size
    // batched GEMM, then scatter the products back into the destination panels.
    initSCUMarshallData(k_st, k_end, perm_c_supno);
    xSCUMarshallData &sc_mdata = A_gpu.sc_marshall_data;
    // ...
    int total_batches = 0;
    // ...
    done_i = marshallSCUBatchedDataOuter(k_st, k_end, perm_c_supno);
    // ...
    while (done_i == 0 && done_j == 0)
    {
        done_j = marshallSCUBatchedDataInner(k_st, k_end, perm_c_supno);
        // ...
        magmablas_dgemm_vbatched_max_nocheck(
            MagmaNoTrans, MagmaNoTrans, sc_mdata.dev_m_array, sc_mdata.dev_n_array, sc_mdata.dev_k_array,
            1.0, sc_mdata.dev_A_ptrs, sc_mdata.dev_lda_array, sc_mdata.dev_B_ptrs, sc_mdata.dev_ldb_array,
            0.0, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, sc_mdata.batchsize,
            sc_mdata.max_m, sc_mdata.max_n, sc_mdata.max_k, A_gpu.magma_queue);

        scatterGPU_batchDriver(
            sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend,
            sc_mdata.max_ilen, sc_mdata.max_jlen, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array,
            A_gpu.maxSuperSize, ldt, sc_mdata.dev_gpu_lpanels, sc_mdata.dev_gpu_upanels,
            dA_gpu, sc_mdata.batchsize, A_gpu.cuStreams[0]);
        // ...
    }
    // ...
}
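// Each marshalled batch is consumed by one vbatched GEMM followed by one batched scatter; the
// precomputed max_m/max_n/max_k and max_ilen/max_jlen values serve as launch bounds for the
// variable-size batched kernels.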
template <typename Ftype>
/* ... qualified signature omitted:
   int dsparseTreeFactorBatchGPU(sForest_t *sforest, diagFactBufs_type<Ftype> **dFBufs,
                                 gEtreeInfo_t *gEtreeInfo, int tag_ub) ... */
{
    int nnodes = sforest->nNodes;
    int topoLvl, k_st, k_end, k0, k, offset, ksupc;
    // ...
    CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorBatchGPU()");
    // ...
    printf("Using level-based scheduling on GPU\n");
    // ...

    // First topological level: factor, panel-solve, broadcast, and update each supernode.
    k_st = eTreeTopLims[topoLvl];
    k_end = eTreeTopLims[topoLvl + 1];
    printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end);
    // ...
    for (k0 = k_st; k0 < k_end; k0++)
    {
        k = perm_c_supno[k0];
        // ...
        dDFactPSolveGPU(k, 0, offset, dFBufs);
        // ...
        dPanelBcastGPU(k, offset);
        // ...
        if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
        {
            // ...
            upanel_t k_upanel = getKUpanel(k, offset);
            lpanel_t k_lpanel = getKLpanel(k, offset);
            dSchurComplementUpdateGPU(streamId,
                                      k, k_lpanel, k_upanel);
            // ...
        }
    }
    // ...

    // Remaining levels, processed one topological level at a time.
    for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl)
    {
        k_st = eTreeTopLims[topoLvl];
        k_end = eTreeTopLims[topoLvl + 1];
        printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end);
        fflush(stdout);
        // ...
        for (k0 = k_st; k0 < k_end; ++k0)
        {
            k = perm_c_supno[k0];
            // ...
            dDFactPSolveGPU(k, 0, offset, dFBufs);
            // ...
            dPanelBcastGPU(k, offset);
            // ...
            if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
            {
                // ...
                upanel_t k_upanel = getKUpanel(k, offset);
                lpanel_t k_lpanel = getKLpanel(k, offset);
                dSchurComplementUpdateGPU(streamId,
                                          k, k_lpanel, k_upanel);
                // ...
                dSchurComplementUpdate(k, k_lpanel, k_upanel);
                cudaStreamSynchronize(cuStream);
                // ...
            }
        }
    }
    // ...

    // Batched alternative (requires MAGMA): whole levels go through dFactBatchSolve.
    printf("Using batched code\n");
    // ...
    gpuErrchk(cudaMemcpy(A_gpu.dperm_c_supno, perm_c_supno,
                         sizeof(int) * sforest->nNodes, cudaMemcpyHostToDevice));
    // ...
    dFactBatchSolve(k_st, k_end, perm_c_supno);
    for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl)
    {
        k_st = eTreeTopLims[topoLvl];
        k_end = eTreeTopLims[topoLvl + 1];
        dFactBatchSolve(k_st, k_end, perm_c_supno);
    }
    // ...
    CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorBatchGPU()");
    // ...
    printf("MAGMA is required for batched execution!\n");
    // ...
}
template <typename Ftype>
/* ... function signature omitted (dsparseTreeFactorGPUBaseline: level-by-level reference
   implementation; host and GPU variants of each step appear side by side below) ... */
{
    // ...
    printf("Using New code V100 with GPU acceleration\n");
    // ...
    CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorGPUBaseline()");
    // ...

    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
    {
        // ...
        int_t k_st = eTreeTopLims[topoLvl];
        int_t k_end = eTreeTopLims[topoLvl + 1];
        for (int_t k0 = k_st; k0 < k_end; ++k0)
        {
            int_t k = perm_c_supno[k0];
            int_t offset = k0 - k_st;
            // ...
            cublasHandle_t cubHandle = A_gpu.cuHandles[0];
            cudaStream_t cuStream = A_gpu.cuStreams[0];
            // ...
            if (iam == procIJ(k, k))
            {
                // ...
                lPanelVec[g2lCol(k)].checkGPU();
                lPanelVec[g2lCol(k)].diagFactor(k, dFBufs[offset]->BlockUFactor, ksupc,
                                                thresh, xsup, options, stat, info);
                lPanelVec[g2lCol(k)].packDiagBlock(dFBufs[offset]->BlockLFactor, ksupc);
                // ...
                lPanelVec[g2lCol(k)].diagFactorPackDiagBlockGPU(k,
                                                                dFBufs[offset]->BlockUFactor, ksupc,
                                                                dFBufs[offset]->BlockLFactor, ksupc,
                                                                thresh, xsup, options, stat, info);
                // ...
                cudaStreamSynchronize(cuStream);
                lPanelVec[g2lCol(k)].checkGPU();
                // ...
            }

            // Broadcast the packed diagonal factors along the process row and column.
            if (myrow == krow(k))
                MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc,
                          get_mpi_type<Ftype>(), kcol(k), (grid->rscp).comm);
            if (mycol == kcol(k))
                MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc,
                          get_mpi_type<Ftype>(), krow(k), (grid->cscp).comm);

            // U panel solve on the owner process row.
            if (myrow == krow(k))
            {
                // ...
                uPanelVec[g2lRow(k)].checkGPU();
                // ...
                cudaMemcpy(A_gpu.dFBufs[0], dFBufs[offset]->BlockLFactor,
                           ksupc * ksupc * sizeof(Ftype), cudaMemcpyHostToDevice);
                uPanelVec[g2lRow(k)].panelSolveGPU(
                    cubHandle, cuStream,
                    ksupc, A_gpu.dFBufs[0], ksupc);
                cudaStreamSynchronize(cuStream);
                // ...
                uPanelVec[g2lRow(k)].panelSolve(ksupc, dFBufs[offset]->BlockLFactor, ksupc);
                cudaStreamSynchronize(cuStream);
                uPanelVec[g2lRow(k)].checkGPU();
                // ...
            }

            // L panel solve on the owner process column.
            if (mycol == kcol(k))
            {
                // ...
                cudaMemcpy(A_gpu.dFBufs[0], dFBufs[offset]->BlockUFactor,
                           ksupc * ksupc * sizeof(Ftype), cudaMemcpyHostToDevice);
                lPanelVec[g2lCol(k)].panelSolveGPU(
                    cubHandle, cuStream,
                    ksupc, A_gpu.dFBufs[0], ksupc);
                cudaStreamSynchronize(cuStream);
                // ...
                lPanelVec[g2lCol(k)].panelSolve(ksupc, dFBufs[offset]->BlockUFactor, ksupc);
                cudaStreamSynchronize(cuStream);
                lPanelVec[g2lCol(k)].checkGPU();
                // ...
            }

            // Receive buffers double as the panel containers on non-owner processes.
            upanel_t k_upanel(UidxRecvBufs[0], UvalRecvBufs[0],
                              A_gpu.UidxRecvBufs[0], A_gpu.UvalRecvBufs[0]);
            lpanel_t k_lpanel(LidxRecvBufs[0], LvalRecvBufs[0],
                              A_gpu.LidxRecvBufs[0], A_gpu.LvalRecvBufs[0]);
            if (myrow == krow(k))
                k_upanel = uPanelVec[g2lRow(k)];
            if (mycol == kcol(k))
                k_lpanel = lPanelVec[g2lCol(k)];

            if (UidxSendCounts[k] > 0)
            {
                // ...
                MPI_Bcast(k_upanel.gpuPanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm);
                MPI_Bcast(k_upanel.gpuPanel.val, UvalSendCounts[k], get_mpi_type<Ftype>(), krow(k), grid3d->cscp.comm);
                // ...
                cudaMemcpy(k_upanel.index, k_upanel.gpuPanel.index,
                           sizeof(int_t) * UidxSendCounts[k], cudaMemcpyDeviceToHost);
                // ...
                MPI_Bcast(k_upanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm);
                MPI_Bcast(k_upanel.val, UvalSendCounts[k], get_mpi_type<Ftype>(), krow(k), grid3d->cscp.comm);
                // ...
            }

            if (LidxSendCounts[k] > 0)
            {
                // ...
                MPI_Bcast(k_lpanel.gpuPanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm);
                MPI_Bcast(k_lpanel.gpuPanel.val, LvalSendCounts[k], get_mpi_type<Ftype>(), kcol(k), grid3d->rscp.comm);
                cudaMemcpy(k_lpanel.index, k_lpanel.gpuPanel.index,
                           sizeof(int_t) * LidxSendCounts[k], cudaMemcpyDeviceToHost);
                // ...
                MPI_Bcast(k_lpanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm);
                MPI_Bcast(k_lpanel.val, LvalSendCounts[k], get_mpi_type<Ftype>(), kcol(k), grid3d->rscp.comm);
                // ...
            }

            if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0)
            {
                // ...
                dSchurComplementUpdateGPU(
                    /* ... */
                    k, k_lpanel, k_upanel);
                // ...
                dSchurComplementUpdate(k, k_lpanel, k_upanel);
                cudaStreamSynchronize(cuStream);
                // ...
            }
        }
    }
    // ...
#if (DEBUGlevel >= 1)
    CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorGPUBaseline()");
#endif
    // ...
}