10#include <cuda_runtime.h>
25 for (
int i = 0;
i < n;
i++) {
29 nrmA = std::sqrt(nrmA);
31 for (
int i = 0;
i < n;
i++) {
45 size_t idxSize =
sizeof(
int_t) * indexSize();
46 size_t valSize =
sizeof(T) * nzvalSize();
48 gpuErrchk(cudaMalloc(&gpuPanel.index, idxSize));
49 gpuErrchk(cudaMalloc(&gpuPanel.val, valSize));
51 gpuErrchk(cudaMemcpy(gpuPanel.index, index, idxSize, cudaMemcpyHostToDevice));
52 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
62 size_t idxSize =
sizeof(
int_t) * indexSize();
63 size_t valSize =
sizeof(T) * nzvalSize();
66 gpuErrchk(cudaMemcpy(gpuPanel.index, index, idxSize, cudaMemcpyHostToDevice));
68 basePtr = (
char *)basePtr+ idxSize;
69 gpuPanel.val = (T *) basePtr;
71 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
81 size_t valSize =
sizeof(T) * nzvalSize();
82 gpuErrchk(cudaMemcpy(val, gpuPanel.val, valSize, cudaMemcpyDeviceToHost));
91 size_t valSize =
sizeof(T) * nzvalSize();
92 gpuErrchk(cudaMemcpy(val, gpuPanel.val, valSize, cudaMemcpyDeviceToHost));
101 size_t valSize =
sizeof(T) * nzvalSize();
102 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
110 size_t valSize =
sizeof(T) * nzvalSize();
111 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
119 size_t idxSize =
sizeof(
int_t) * indexSize();
120 size_t valSize =
sizeof(T) * nzvalSize();
122 gpuErrchk(cudaMalloc(&gpuPanel.index, idxSize));
123 gpuErrchk(cudaMalloc(&gpuPanel.val, valSize));
125 gpuErrchk(cudaMemcpy(gpuPanel.index, index, idxSize, cudaMemcpyHostToDevice));
126 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
135 size_t idxSize =
sizeof(
int_t) * indexSize();
136 size_t valSize =
sizeof(T) * nzvalSize();
139 gpuErrchk(cudaMemcpy(gpuPanel.index, index, idxSize, cudaMemcpyHostToDevice));
141 basePtr = (
char *)basePtr+ idxSize;
142 gpuPanel.val = (T *) basePtr;
144 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
152 assert(isEmpty() == gpuPanel.isEmpty());
157 size_t valSize =
sizeof(T) * nzvalSize();
159 std::vector<T> tmpArr(nzvalSize());
160 gpuErrchk(cudaMemcpy(tmpArr.data(), gpuPanel.val, valSize, cudaMemcpyDeviceToHost));
162 int out =
checkArr(tmpArr.data(), val, nzvalSize());
175 T *lPanelStPtr = blkPtrGPU(0);
176 int_t len = nzrows();
179 lPanelStPtr = blkPtrGPU(1);
185 cublasSetStream(handle, cuStream);
186 cublasStatus_t cbstatus =
187 myCublasTrsm<T>(handle,
188 CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER,
189 CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
190 len, ksupsz, &alpha, DiagBlk, LDD,
199 T *DiagLBlk,
int_t LDD,
200 T thresh,
int_t *xsup,
205 size_t dpitch = LDD *
sizeof(T);
206 size_t spitch = LDA() *
sizeof(T);
207 size_t width = kSupSize *
sizeof(T);
208 size_t height = kSupSize;
209 T *val = blkPtrGPU(0);
211 gpuErrchk(cudaMemcpy2D(DiagLBlk, dpitch, val, spitch,
212 width, height, cudaMemcpyDeviceToHost));
215 dgstrf2(k, DiagLBlk, LDD, UBlk, LDU,
216 thresh, xsup, options, stat, info);
219 gpuErrchk(cudaMemcpy2D(val, spitch, DiagLBlk, dpitch,
220 width, height, cudaMemcpyHostToDevice));
227 cusolverDnHandle_t cusolverH, cudaStream_t cuStream,
228 T *dWork,
int* dInfo,
229 T *dDiagBuf,
int_t LDD,
236 size_t dpitch = LDD *
sizeof(T);
237 size_t spitch = LDA() *
sizeof(T);
238 size_t width = kSupSize *
sizeof(T);
239 size_t height = kSupSize;
240 T *val = blkPtrGPU(0);
242 gpuCusolverErrchk(cusolverDnSetStream(cusolverH, cuStream));
243 gpuCusolverErrchk(myCusolverGetrf<T>(cusolverH, kSupSize, kSupSize, val, LDA(), dWork, NULL, dInfo));
245 gpuErrchk(cudaMemcpy2DAsync(dDiagBuf, dpitch, val, spitch,
246 width, height, cudaMemcpyDeviceToDevice, cuStream));
247 gpuErrchk(cudaStreamSynchronize(cuStream));
260 cublasSetStream(handle, cuStream);
261 cublasStatus_t cbstatus =
262 myCublasTrsm<T>(handle,
263 CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
264 CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
265 ksupsz, nzcols(), &alpha, DiagBlk, LDD,
266 blkPtrGPU(0), LDA());
274 assert(isEmpty() == gpuPanel.isEmpty());
279 size_t valSize =
sizeof(T) * nzvalSize();
281 std::vector<T> tmpArr(nzvalSize());
282 gpuErrchk(cudaMemcpy(tmpArr.data(), gpuPanel.val, valSize, cudaMemcpyDeviceToHost));
284 int out =
checkArr(tmpArr.data(), val, nzvalSize());
Definition: xlupanels.hpp:22
int_t * index
Definition: xlupanels.hpp:24
Definition: xlupanels.hpp:176
int_t * index
Definition: xlupanels.hpp:178
typename std::conditional< std::is_same< Ftype, float >::value, float, typename std::conditional< std::is_same< Ftype, double >::value||std::is_same< Ftype, doublecomplex >::value, double, float >::type >::type threshPivValType
Definition: luAuxStructTemplated.hpp:70
double sqnorm(float value)
Definition: luAuxStructTemplated.hpp:275
#define EPSILON
Definition: lupanels_GPU_impl.hpp:9
int checkArr(const T *A, const T *B, int n)
Definition: lupanels_GPU_impl.hpp:22
Definition: util_dist.h:101
Definition: superlu_defs.h:728
void dgstrf2(int_t k, double *diagBlk, int_t LDA, double *BlockUfactor, int_t LDU, double thresh, int_t *xsup, superlu_dist_options_t *options, SuperLUStat_t *stat, int *info)
Definition: pdgstrf2.c:404
Definitions which are precision-neutral.
#define SuperSize(bnum)
Definition: superlu_defs.h:271
int64_t int_t
Definition: superlu_defs.h:119
int i
Definition: sutil_dist.c:287