SuperLU Distributed 9.0.0
gpu3d
lupanels_GPU_impl.hpp
Go to the documentation of this file.
1#pragma once
2#include <cassert>
3#include <algorithm>
4#include <cmath>
5#include "superlu_defs.h"
7
8// #ifdef HAVE_CUDA
9#define EPSILON 1e-3
10#include <cuda_runtime.h>
11
12#include "cublas_v2.h"
13
14
15#include "lupanels.hpp"
16
17#include <cmath>
18#include <complex>
19#include <cassert>
20
21template<typename T>
22int checkArr(const T *A, const T *B, int n)
23{
24 double nrmA = 0;
25 for (int i = 0; i < n; i++) {
26 // For complex numbers, std::norm gives the squared magnitude.
27 nrmA += sqnorm(A[i]);
28 }
29 nrmA = std::sqrt(nrmA);
30
31 for (int i = 0; i < n; i++) {
32 // Use std::abs for both real and complex numbers to get the magnitude.
33 // assert(std::abs(A[i] - B[i]) <= EPSILON * nrmA / n);
34 assert(std::sqrt(sqnorm(A[i] - B[i])) <= EPSILON * nrmA / n);
35 }
36
37 return 0;
38}
39
40template <typename T>
41xlpanelGPU_t<T> xlpanel_t<T>::copyToGPU()
42{
43 if (isEmpty())
44 return gpuPanel;
45 size_t idxSize = sizeof(int_t) * indexSize();
46 size_t valSize = sizeof(T) * nzvalSize();
47
48 gpuErrchk(cudaMalloc(&gpuPanel.index, idxSize));
49 gpuErrchk(cudaMalloc(&gpuPanel.val, valSize));
50
51 gpuErrchk(cudaMemcpy(gpuPanel.index, index, idxSize, cudaMemcpyHostToDevice));
52 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
53
54 return gpuPanel;
55}
56
57template <typename T>
58xlpanelGPU_t<T> xlpanel_t<T>::copyToGPU(void* basePtr)
59{
60 if (isEmpty())
61 return gpuPanel;
62 size_t idxSize = sizeof(int_t) * indexSize();
63 size_t valSize = sizeof(T) * nzvalSize();
64
65 gpuPanel.index = (int_t*) basePtr;
66 gpuErrchk(cudaMemcpy(gpuPanel.index, index, idxSize, cudaMemcpyHostToDevice));
67
68 basePtr = (char *)basePtr+ idxSize;
69 gpuPanel.val = (T *) basePtr;
70
71 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
72
73 return gpuPanel;
74}
75
76template <typename T>
78{
79 if(isEmpty())
80 return 0;
81 size_t valSize = sizeof(T) * nzvalSize();
82 gpuErrchk(cudaMemcpy(val, gpuPanel.val, valSize, cudaMemcpyDeviceToHost));
83 return 0;
84}
85
86template <typename T>
88{
89 if(isEmpty())
90 return 0;
91 size_t valSize = sizeof(T) * nzvalSize();
92 gpuErrchk(cudaMemcpy(val, gpuPanel.val, valSize, cudaMemcpyDeviceToHost));
93 return 0;
94}
95
96template <typename T>
98{
99 if(isEmpty())
100 return 0;
101 size_t valSize = sizeof(T) * nzvalSize();
102 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
103}
104
105template <typename T>
107{
108 if(isEmpty())
109 return 0;
110 size_t valSize = sizeof(T) * nzvalSize();
111 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
112}
113
114template <typename T>
115xupanelGPU_t<T> xupanel_t<T>::copyToGPU()
116{
117 if (isEmpty())
118 return gpuPanel;
119 size_t idxSize = sizeof(int_t) * indexSize();
120 size_t valSize = sizeof(T) * nzvalSize();
121
122 gpuErrchk(cudaMalloc(&gpuPanel.index, idxSize));
123 gpuErrchk(cudaMalloc(&gpuPanel.val, valSize));
124
125 gpuErrchk(cudaMemcpy(gpuPanel.index, index, idxSize, cudaMemcpyHostToDevice));
126 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
127 return gpuPanel;
128}
129
130template <typename T>
131xupanelGPU_t<T> xupanel_t<T>::copyToGPU(void* basePtr)
132{
133 if (isEmpty())
134 return gpuPanel;
135 size_t idxSize = sizeof(int_t) * indexSize();
136 size_t valSize = sizeof(T) * nzvalSize();
137
138 gpuPanel.index = (int_t*) basePtr;
139 gpuErrchk(cudaMemcpy(gpuPanel.index, index, idxSize, cudaMemcpyHostToDevice));
140
141 basePtr = (char *)basePtr+ idxSize;
142 gpuPanel.val = (T *) basePtr;
143
144 gpuErrchk(cudaMemcpy(gpuPanel.val, val, valSize, cudaMemcpyHostToDevice));
145
146 return gpuPanel;
147}
148
149template <typename T>
151{
152 assert(isEmpty() == gpuPanel.isEmpty());
153
154 if (isEmpty())
155 return 0;
156
157 size_t valSize = sizeof(T) * nzvalSize();
158
159 std::vector<T> tmpArr(nzvalSize());
160 gpuErrchk(cudaMemcpy(tmpArr.data(), gpuPanel.val, valSize, cudaMemcpyDeviceToHost));
161
162 int out = checkArr(tmpArr.data(), val, nzvalSize());
163
164 return 0;
165}
166
167template <typename T>
168int_t xlpanel_t<T>::panelSolveGPU(cublasHandle_t handle, cudaStream_t cuStream,
169 int_t ksupsz,
170 T *DiagBlk, // device pointer
171 int_t LDD)
172{
173 if (isEmpty())
174 return 0;
175 T *lPanelStPtr = blkPtrGPU(0); // &val[blkPtrOffset(0)];
176 int_t len = nzrows();
177 if (haveDiag())
178 {
179 lPanelStPtr = blkPtrGPU(1); // &val[blkPtrOffset(1)];
180 len -= nbrow(0);
181 }
182
183 T alpha = one<T>();
184
185 cublasSetStream(handle, cuStream);
186 cublasStatus_t cbstatus =
187 myCublasTrsm<T>(handle,
188 CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER,
189 CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
190 len, ksupsz, &alpha, DiagBlk, LDD,
191 lPanelStPtr, LDA());
192
193 return 0;
194}
195
196template <typename T>
198 T *UBlk, int_t LDU, // CPU pointers
199 T *DiagLBlk, int_t LDD, // CPU pointers
200 T thresh, int_t *xsup,
201 superlu_dist_options_t *options,
202 SuperLUStat_t *stat, int *info)
203{
204 int kSupSize = SuperSize(k);
205 size_t dpitch = LDD * sizeof(T);
206 size_t spitch = LDA() * sizeof(T);
207 size_t width = kSupSize * sizeof(T);
208 size_t height = kSupSize;
209 T *val = blkPtrGPU(0);
210
211 gpuErrchk(cudaMemcpy2D(DiagLBlk, dpitch, val, spitch,
212 width, height, cudaMemcpyDeviceToHost));
213
214 // call dgetrf2
215 dgstrf2(k, DiagLBlk, LDD, UBlk, LDU,
216 thresh, xsup, options, stat, info);
217
218 //copy back to device
219 gpuErrchk(cudaMemcpy2D(val, spitch, DiagLBlk, dpitch,
220 width, height, cudaMemcpyHostToDevice));
221
222 return 0;
223}
224
225template <typename T>
227 cusolverDnHandle_t cusolverH, cudaStream_t cuStream,
228 T *dWork, int* dInfo, // GPU pointers
229 T *dDiagBuf, int_t LDD, // GPU pointers
230 threshPivValType<T> thresh, int_t *xsup,
231 superlu_dist_options_t *options,
232 SuperLUStat_t *stat, int *info)
233{
234 // cudaStream_t stream = NULL;
235 int kSupSize = SuperSize(k);
236 size_t dpitch = LDD * sizeof(T);
237 size_t spitch = LDA() * sizeof(T);
238 size_t width = kSupSize * sizeof(T);
239 size_t height = kSupSize;
240 T *val = blkPtrGPU(0);
241
242 gpuCusolverErrchk(cusolverDnSetStream(cusolverH, cuStream));
243 gpuCusolverErrchk(myCusolverGetrf<T>(cusolverH, kSupSize, kSupSize, val, LDA(), dWork, NULL, dInfo));
244
245 gpuErrchk(cudaMemcpy2DAsync(dDiagBuf, dpitch, val, spitch,
246 width, height, cudaMemcpyDeviceToDevice, cuStream));
247 gpuErrchk(cudaStreamSynchronize(cuStream));
248 return 0;
249}
250
251template <typename T>
252int_t xupanel_t<T>::panelSolveGPU(cublasHandle_t handle, cudaStream_t cuStream,
253 int_t ksupsz, T *DiagBlk, int_t LDD)
254{
255 if (isEmpty())
256 return 0;
257
258 T alpha = one<T>();
259
260 cublasSetStream(handle, cuStream);
261 cublasStatus_t cbstatus =
262 myCublasTrsm<T>(handle,
263 CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
264 CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
265 ksupsz, nzcols(), &alpha, DiagBlk, LDD,
266 blkPtrGPU(0), LDA());
267
268 return 0;
269}
270
271template <typename T>
273{
274 assert(isEmpty() == gpuPanel.isEmpty());
275
276 if (isEmpty())
277 return 0;
278
279 size_t valSize = sizeof(T) * nzvalSize();
280
281 std::vector<T> tmpArr(nzvalSize());
282 gpuErrchk(cudaMemcpy(tmpArr.data(), gpuPanel.val, valSize, cudaMemcpyDeviceToHost));
283
284 int out = checkArr(tmpArr.data(), val, nzvalSize());
285
286 return 0;
287}
Definition: xlupanels.hpp:22
int_t * index
Definition: xlupanels.hpp:24
Definition: xlupanels.hpp:176
int_t * index
Definition: xlupanels.hpp:178
typename std::conditional< std::is_same< Ftype, float >::value, float, typename std::conditional< std::is_same< Ftype, double >::value||std::is_same< Ftype, doublecomplex >::value, double, float >::type >::type threshPivValType
Definition: luAuxStructTemplated.hpp:70
double sqnorm(float value)
Definition: luAuxStructTemplated.hpp:275
#define EPSILON
Definition: lupanels_GPU_impl.hpp:9
int checkArr(const T *A, const T *B, int n)
Definition: lupanels_GPU_impl.hpp:22
Definition: util_dist.h:101
Definition: superlu_defs.h:728
void dgstrf2(int_t k, double *diagBlk, int_t LDA, double *BlockUfactor, int_t LDU, double thresh, int_t *xsup, superlu_dist_options_t *options, SuperLUStat_t *stat, int *info)
Definition: pdgstrf2.c:404
Definitions which are precision-neutral.
#define SuperSize(bnum)
Definition: superlu_defs.h:271
int64_t int_t
Definition: superlu_defs.h:119
int i
Definition: sutil_dist.c:287