superlu/superlu_dist_code_html/lupanels__GPU__impl_8hpp_source.html

#pragma once

#include <cassert>

#include <algorithm>

#include <cmath>

#include "superlu_defs.h"

#include "superlu_dist_config.h"


// #ifdef HAVE_CUDA

#define EPSILON 1e-3

#include <cuda_runtime.h>


#include "cublas_v2.h"


#include "lupanels.hpp"


#include <cmath>

#include <complex>

#include <cassert>


/**
 * Debug-time element-wise comparison of two arrays of length n.
 *
 * First accumulates sqnorm() over every entry of A and takes the square
 * root, giving the 2-norm of A.  Then asserts, for every index, that the
 * magnitude of A[i] - B[i] (computed as sqrt(sqnorm(.))) does not exceed
 * EPSILON * ||A|| / n.
 *
 * The comparison is performed with assert(), so it disappears entirely
 * when the translation unit is compiled with NDEBUG.
 *
 * @param A  first array (also the one used to compute the reference norm)
 * @param B  second array
 * @param n  number of entries compared
 * @return   always 0
 */
template <typename T>
int checkArr(const T *A, const T *B, int n)
{
    // Sum of squared magnitudes of A, turned into the 2-norm afterwards.
    double normA = 0;
    for (int k = 0; k < n; ++k)
    {
        normA += sqnorm(A[k]);
    }
    normA = std::sqrt(normA);

    // Each entry must agree to within a per-entry share of the tolerance.
    for (int k = 0; k < n; ++k)
    {
        assert(std::sqrt(sqnorm(A[k] - B[k])) <= EPSILON * normA / n);
    }

    return 0;
}


/**
 * Allocate device buffers for this L panel and upload its contents.
 *
 * For a non-empty panel, two separate device allocations are made (one for
 * the index array, one for the numerical values), stored in gpuPanel, and
 * filled from the host arrays `index` and `val` with blocking copies.
 * An empty panel performs no allocation or copy.
 *
 * @return a copy of gpuPanel (the device-side descriptor).
 */
template <typename T>
xlpanelGPU_t<T> xlpanel_t<T>::copyToGPU()
{
    if (!isEmpty())
    {
        const size_t indexBytes = indexSize() * sizeof(int_t);
        const size_t valueBytes = nzvalSize() * sizeof(T);

        // Reserve device storage for both arrays first ...
        gpuErrchk(cudaMalloc(&gpuPanel.index, indexBytes));
        gpuErrchk(cudaMalloc(&gpuPanel.val, valueBytes));

        // ... then mirror the host data into it.
        gpuErrchk(cudaMemcpy(gpuPanel.index, index, indexBytes, cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(gpuPanel.val, val, valueBytes, cudaMemcpyHostToDevice));
    }

    return gpuPanel;
}


/**
 * Upload this L panel into caller-provided device storage.
 *
 * No allocation is done here.  The index array is placed at basePtr and
 * the value array immediately after it (basePtr + index size in bytes);
 * both are filled from the host arrays with blocking copies and the
 * resulting pointers are recorded in gpuPanel.
 * An empty panel leaves gpuPanel untouched.
 *
 * @param basePtr  destination of the host-to-device copies; assumed to
 *                 have room for the index bytes followed by the value
 *                 bytes -- the caller is responsible for that.
 * @return a copy of gpuPanel (the device-side descriptor).
 *
 * NOTE(review): the value array starts right after the index array with
 * no padding; whether that is always suitably aligned for T depends on
 * sizeof(int_t) and indexSize() -- confirm against the allocation code.
 */
template <typename T>
xlpanelGPU_t<T> xlpanel_t<T>::copyToGPU(void* basePtr)
{
    if (!isEmpty())
    {
        const size_t indexBytes = indexSize() * sizeof(int_t);
        const size_t valueBytes = nzvalSize() * sizeof(T);

        // Byte cursor walking through the caller's buffer.
        char *cursor = static_cast<char *>(basePtr);

        gpuPanel.index = reinterpret_cast<int_t *>(cursor);
        gpuErrchk(cudaMemcpy(gpuPanel.index, index, indexBytes, cudaMemcpyHostToDevice));

        cursor += indexBytes;
        gpuPanel.val = reinterpret_cast<T *>(cursor);
        gpuErrchk(cudaMemcpy(gpuPanel.val, val, valueBytes, cudaMemcpyHostToDevice));
    }

    return gpuPanel;
}


/**
 * Download the numerical values of this L panel from the device.
 *
 * Copies nzvalSize() entries from gpuPanel.val into the host array `val`
 * with a blocking copy.  The index array is not transferred.
 * Nothing is done for an empty panel.
 *
 * @return always 0.
 */
template <typename T>
int_t xlpanel_t<T>::copyFromGPU()
{
    if (!isEmpty())
    {
        const size_t valueBytes = nzvalSize() * sizeof(T);
        gpuErrchk(cudaMemcpy(val, gpuPanel.val, valueBytes, cudaMemcpyDeviceToHost));
    }
    return 0;
}


/**
 * Download the numerical values of this U panel from the device.
 *
 * Copies nzvalSize() entries from gpuPanel.val into the host array `val`
 * with a blocking copy.  The index array is not transferred.
 * Nothing is done for an empty panel.
 *
 * @return always 0.
 */
template <typename T>
int_t xupanel_t<T>::copyFromGPU()
{
    if (!isEmpty())
    {
        const size_t valueBytes = nzvalSize() * sizeof(T);
        gpuErrchk(cudaMemcpy(val, gpuPanel.val, valueBytes, cudaMemcpyDeviceToHost));
    }
    return 0;
}


/**
 * Re-upload the host values of this U panel into its existing device buffer.
 *
 * Copies nzvalSize() entries from the host array `val` into gpuPanel.val
 * with a blocking copy.  No allocation is performed and the index array
 * is not transferred.  Nothing is done for an empty panel.
 *
 * @return always 0.
 */
template <typename T>
int xupanel_t<T>::copyBackToGPU()
{
    if(isEmpty())
        return 0;

    size_t valSize = sizeof(T) * nzvalSize();
    gpuErrchk(cudaMemcpy(gpuPanel.val, val,  valSize, cudaMemcpyHostToDevice));

    // The function is declared to return int; falling off the end of a
    // value-returning function is undefined behaviour, so report success
    // explicitly, matching the empty-panel path above.
    return 0;
}


/**
 * Re-upload the host values of this L panel into its existing device buffer.
 *
 * Copies nzvalSize() entries from the host array `val` into gpuPanel.val
 * with a blocking copy.  No allocation is performed and the index array
 * is not transferred.  Nothing is done for an empty panel.
 *
 * @return always 0.
 */
template <typename T>
int xlpanel_t<T>::copyBackToGPU()
{
    if(isEmpty())
        return 0;

    size_t valSize = sizeof(T) * nzvalSize();
    gpuErrchk(cudaMemcpy(gpuPanel.val, val,  valSize, cudaMemcpyHostToDevice));

    // The function is declared to return int; falling off the end of a
    // value-returning function is undefined behaviour, so report success
    // explicitly, matching the empty-panel path above.
    return 0;
}


/**
 * Allocate device buffers for this U panel and upload its contents.
 *
 * For a non-empty panel, two separate device allocations are made (one for
 * the index array, one for the numerical values), stored in gpuPanel, and
 * filled from the host arrays `index` and `val` with blocking copies.
 * An empty panel performs no allocation or copy.
 *
 * @return a copy of gpuPanel (the device-side descriptor).
 */
template <typename T>
xupanelGPU_t<T> xupanel_t<T>::copyToGPU()
{
    if (!isEmpty())
    {
        const size_t indexBytes = indexSize() * sizeof(int_t);
        const size_t valueBytes = nzvalSize() * sizeof(T);

        // Reserve device storage for both arrays first ...
        gpuErrchk(cudaMalloc(&gpuPanel.index, indexBytes));
        gpuErrchk(cudaMalloc(&gpuPanel.val, valueBytes));

        // ... then mirror the host data into it.
        gpuErrchk(cudaMemcpy(gpuPanel.index, index, indexBytes, cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(gpuPanel.val, val, valueBytes, cudaMemcpyHostToDevice));
    }

    return gpuPanel;
}


/**
 * Upload this U panel into caller-provided device storage.
 *
 * No allocation is done here.  The index array is placed at basePtr and
 * the value array immediately after it (basePtr + index size in bytes);
 * both are filled from the host arrays with blocking copies and the
 * resulting pointers are recorded in gpuPanel.
 * An empty panel leaves gpuPanel untouched.
 *
 * @param basePtr  destination of the host-to-device copies; assumed to
 *                 have room for the index bytes followed by the value
 *                 bytes -- the caller is responsible for that.
 * @return a copy of gpuPanel (the device-side descriptor).
 *
 * NOTE(review): the value array starts right after the index array with
 * no padding; whether that is always suitably aligned for T depends on
 * sizeof(int_t) and indexSize() -- confirm against the allocation code.
 */
template <typename T>
xupanelGPU_t<T> xupanel_t<T>::copyToGPU(void* basePtr)
{
    if (!isEmpty())
    {
        const size_t indexBytes = indexSize() * sizeof(int_t);
        const size_t valueBytes = nzvalSize() * sizeof(T);

        // Byte cursor walking through the caller's buffer.
        char *cursor = static_cast<char *>(basePtr);

        gpuPanel.index = reinterpret_cast<int_t *>(cursor);
        gpuErrchk(cudaMemcpy(gpuPanel.index, index, indexBytes, cudaMemcpyHostToDevice));

        cursor += indexBytes;
        gpuPanel.val = reinterpret_cast<T *>(cursor);
        gpuErrchk(cudaMemcpy(gpuPanel.val, val, valueBytes, cudaMemcpyHostToDevice));
    }

    return gpuPanel;
}


/**
 * Debug check that the device copy of this L panel matches the host copy.
 *
 * Asserts that host and device agree on whether the panel is empty, then
 * (for a non-empty panel) downloads the device values into a temporary
 * host buffer and compares them against `val` with checkArr().
 * Only the numerical values are compared, not the index array.
 *
 * @return the value returned by checkArr() (0), or 0 for an empty panel.
 */
template <typename T>
int xlpanel_t<T>::checkGPU()
{
    assert(isEmpty() == gpuPanel.isEmpty());

    if (isEmpty())
        return 0;

    size_t valSize = sizeof(T) * nzvalSize();

    // Stage the device values on the host so they can be compared.
    std::vector<T> tmpArr(nzvalSize());
    gpuErrchk(cudaMemcpy(tmpArr.data(), gpuPanel.val, valSize, cudaMemcpyDeviceToHost));

    // Hand the checker's status back to the caller instead of parking it
    // in an unused local and discarding it.
    return checkArr(tmpArr.data(), val, nzvalSize());
}


/**
 * Triangular solve of the off-diagonal part of this L panel on the device.
 *
 * Issues a TRSM through myCublasTrsm with side = right, fill = upper,
 * no transpose, non-unit diagonal, i.e. (assuming myCublasTrsm forwards to
 * cuBLAS trsm) it overwrites the len x ksupsz panel block B with the
 * solution X of X * U = B, where U is the upper triangle of DiagBlk.
 * When the panel holds the diagonal block (haveDiag()), that first block
 * is skipped: the solve starts at block 1 and covers nzrows() - nbrow(0)
 * rows.
 *
 * @param handle    cuBLAS handle; its stream is set to cuStream.
 * @param cuStream  stream the solve is enqueued on.
 * @param ksupsz    number of columns of the panel block / order of U.
 * @param DiagBlk   device pointer to the diagonal block.
 * @param LDD       leading dimension of DiagBlk.
 * @return 0 on success (including an empty panel); otherwise the cuBLAS
 *         status code of the call that failed.
 */
template <typename T>
int_t xlpanel_t<T>::panelSolveGPU(cublasHandle_t handle, cudaStream_t cuStream,
                              int_t ksupsz,
                              T *DiagBlk, // device pointer
                              int_t LDD)
{
    if (isEmpty())
        return 0;

    T *lPanelStPtr = blkPtrGPU(0); // &val[blkPtrOffset(0)];
    int_t len = nzrows();
    if (haveDiag())
    {
        // Skip the diagonal block: it is the triangular factor, not a
        // right-hand side.
        lPanelStPtr = blkPtrGPU(1); // &val[blkPtrOffset(1)];
        len -= nbrow(0);
    }

    T alpha = one<T>();

    // Do not enqueue the solve if the handle could not be bound to the
    // requested stream; report the failure instead of ignoring it.
    cublasStatus_t cbstatus = cublasSetStream(handle, cuStream);
    if (cbstatus != CUBLAS_STATUS_SUCCESS)
        return static_cast<int_t>(cbstatus);

    cbstatus =
        myCublasTrsm<T>(handle,
                    CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER,
                    CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                    len, ksupsz, &alpha, DiagBlk, LDD,
                    lPanelStPtr, LDA());

    // CUBLAS_STATUS_SUCCESS is 0, so the success path still returns 0.
    return static_cast<int_t>(cbstatus);
}


/**
 * Factor the diagonal block of supernode k on the host and write it back.
 *
 * Steps:
 *   1. copy the SuperSize(k) x SuperSize(k) diagonal block from the device
 *      panel (start of block 0, leading dimension LDA()) into the host
 *      buffer DiagLBlk (leading dimension LDD);
 *   2. factor it on the host with dgstrf2(), which also receives UBlk/LDU;
 *   3. copy the contents of DiagLBlk back over the device block.
 * Both transfers are blocking 2-D copies.
 *
 * @param k         supernode number.
 * @param UBlk      host buffer passed to dgstrf2 together with LDU.
 * @param LDU       leading dimension of UBlk.
 * @param DiagLBlk  host staging buffer for the diagonal block.
 * @param LDD       leading dimension of DiagLBlk.
 * @param thresh, xsup, options, stat, info  forwarded to dgstrf2.
 * @return always 0.
 *
 * NOTE(review): the only dgstrf2 signature listed in this file's reference
 * section takes double pointers and a double threshold; confirm how this
 * template is meant to behave for T other than double.
 */
template <typename T>
int_t xlpanel_t<T>::diagFactorPackDiagBlockGPU(int_t k,
                                           T *UBlk, int_t LDU,     // CPU pointers
                                           T *DiagLBlk, int_t LDD, // CPU pointers
                                           T thresh, int_t *xsup,
                                           superlu_dist_options_t *options,
                                           SuperLUStat_t *stat, int *info)
{
    const int diagOrder = SuperSize(k);

    // Pitches are the byte distance between consecutive columns.
    const size_t hostPitch = LDD * sizeof(T);
    const size_t devPitch = LDA() * sizeof(T);
    const size_t colBytes = diagOrder * sizeof(T);
    const size_t numCols = diagOrder;

    // Device address of the diagonal block (start of block 0).
    T *dDiag = blkPtrGPU(0);

    // Device -> host staging buffer.
    gpuErrchk(cudaMemcpy2D(DiagLBlk, hostPitch, dDiag, devPitch,
                 colBytes, numCols, cudaMemcpyDeviceToHost));

    // Host-side factorization of the staged block.
    dgstrf2(k, DiagLBlk, LDD, UBlk, LDU,
            thresh, xsup, options, stat, info);

    // Host -> device: overwrite the panel's diagonal block.
    gpuErrchk(cudaMemcpy2D(dDiag, devPitch, DiagLBlk, hostPitch,
                 colBytes, numCols, cudaMemcpyHostToDevice));

    return 0;
}


/**
 * Factor the diagonal block of supernode k in place on the device.
 *
 * Binds cusolverH to cuStream, runs myCusolverGetrf on the
 * SuperSize(k) x SuperSize(k) block at the start of block 0 of the device
 * panel (leading dimension LDA()), passing NULL as the pivot array, then
 * copies the factored block into dDiagBuf (leading dimension LDD) with an
 * asynchronous device-to-device 2-D copy and waits for cuStream to drain.
 *
 * @param k          supernode number.
 * @param cusolverH  cuSOLVER dense handle.
 * @param cuStream   stream used for the factorization and the copy.
 * @param dWork      device workspace handed to myCusolverGetrf.
 * @param dInfo      device status word handed to myCusolverGetrf.
 * @param dDiagBuf   device buffer receiving a copy of the factored block.
 * @param LDD        leading dimension of dDiagBuf.
 * @param thresh, options, stat, info  not referenced directly in this body.
 * @param xsup       not referenced directly; SuperSize(k) is a macro and
 *                   may expand to use it -- TODO confirm.
 * @return always 0.
 *
 * NOTE(review): dInfo is never copied back to the host and *info is never
 * written here, so a failed factorization is not reported by this
 * function -- confirm that callers inspect dInfo themselves.
 */
template <typename T>
int_t xlpanel_t<T>::diagFactorCuSolver(int_t k,
                                     cusolverDnHandle_t cusolverH, cudaStream_t cuStream,
                                    T *dWork, int* dInfo,  // GPU pointers
                                    T *dDiagBuf, int_t LDD, // GPU pointers
                                    threshPivValType<T> thresh, int_t *xsup,
                                    superlu_dist_options_t *options,
                                    SuperLUStat_t *stat, int *info)
{
    const int diagOrder = SuperSize(k);

    // Pitches are the byte distance between consecutive columns.
    const size_t dstPitch = LDD * sizeof(T);
    const size_t srcPitch = LDA() * sizeof(T);
    const size_t colBytes = diagOrder * sizeof(T);
    const size_t numCols = diagOrder;

    // Device address of the diagonal block (start of block 0).
    T *dDiag = blkPtrGPU(0);

    // Factor in place on the requested stream.
    gpuCusolverErrchk(cusolverDnSetStream(cusolverH, cuStream));
    gpuCusolverErrchk(myCusolverGetrf<T>(cusolverH, diagOrder, diagOrder, dDiag, LDA(), dWork, NULL, dInfo));

    // Publish the factored block into the separate diagonal buffer and
    // block until the stream has finished.
    gpuErrchk(cudaMemcpy2DAsync(dDiagBuf, dstPitch, dDiag, srcPitch,
                 colBytes, numCols, cudaMemcpyDeviceToDevice, cuStream));
    gpuErrchk(cudaStreamSynchronize(cuStream));

    return 0;
}


/**
 * Triangular solve of this U panel on the device.
 *
 * Issues a TRSM through myCublasTrsm with side = left, fill = lower,
 * no transpose, unit diagonal, i.e. (assuming myCublasTrsm forwards to
 * cuBLAS trsm) it overwrites the ksupsz x nzcols() panel B, starting at
 * block 0 with leading dimension LDA(), with the solution X of
 * L * X = B, where L is the unit lower triangle of DiagBlk.
 *
 * @param handle    cuBLAS handle; its stream is set to cuStream.
 * @param cuStream  stream the solve is enqueued on.
 * @param ksupsz    number of rows of the panel / order of L.
 * @param DiagBlk   diagonal block passed to TRSM (presumably a device
 *                  pointer, as in the L-panel counterpart -- confirm).
 * @param LDD       leading dimension of DiagBlk.
 * @return 0 on success (including an empty panel); otherwise the cuBLAS
 *         status code of the call that failed.
 */
template <typename T>
int_t xupanel_t<T>::panelSolveGPU(cublasHandle_t handle, cudaStream_t cuStream,
                              int_t ksupsz, T *DiagBlk, int_t LDD)
{
    if (isEmpty())
        return 0;

    T alpha = one<T>();

    // Do not enqueue the solve if the handle could not be bound to the
    // requested stream; report the failure instead of ignoring it.
    cublasStatus_t cbstatus = cublasSetStream(handle, cuStream);
    if (cbstatus != CUBLAS_STATUS_SUCCESS)
        return static_cast<int_t>(cbstatus);

    cbstatus =
        myCublasTrsm<T>(handle,
                    CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                    CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
                    ksupsz, nzcols(), &alpha, DiagBlk, LDD,
                    blkPtrGPU(0), LDA());

    // CUBLAS_STATUS_SUCCESS is 0, so the success path still returns 0.
    return static_cast<int_t>(cbstatus);
}


/**
 * Debug check that the device copy of this U panel matches the host copy.
 *
 * Asserts that host and device agree on whether the panel is empty, then
 * (for a non-empty panel) downloads the device values into a temporary
 * host buffer and compares them against `val` with checkArr().
 * Only the numerical values are compared, not the index array.
 *
 * @return the value returned by checkArr() (0), or 0 for an empty panel.
 */
template <typename T>
int xupanel_t<T>::checkGPU()
{
    assert(isEmpty() == gpuPanel.isEmpty());

    if (isEmpty())
        return 0;

    size_t valSize = sizeof(T) * nzvalSize();

    // Stage the device values on the host so they can be compared.
    std::vector<T> tmpArr(nzvalSize());
    gpuErrchk(cudaMemcpy(tmpArr.data(), gpuPanel.val, valSize, cudaMemcpyDeviceToHost));

    // Hand the checker's status back to the caller instead of parking it
    // in an unused local and discarding it.
    return checkArr(tmpArr.data(), val, nzvalSize());
}

xlpanel_t
Definition: xlupanels.hpp:22

xlpanel_t::index
int_t * index
Definition: xlupanels.hpp:24

xupanel_t
Definition: xlupanels.hpp:176

xupanel_t::index
int_t * index
Definition: xlupanels.hpp:178

threshPivValType
typename std::conditional< std::is_same< Ftype, float >::value, float, typename std::conditional< std::is_same< Ftype, double >::value||std::is_same< Ftype, doublecomplex >::value, double, float >::type >::type threshPivValType
Definition: luAuxStructTemplated.hpp:70

sqnorm
double sqnorm(float value)
Definition: luAuxStructTemplated.hpp:275

lupanels.hpp

EPSILON
#define EPSILON
Definition: lupanels_GPU_impl.hpp:9

checkArr
int checkArr(const T *A, const T *B, int n)
Definition: lupanels_GPU_impl.hpp:22

SuperLUStat_t
Definition: util_dist.h:101

superlu_dist_options_t
Definition: superlu_defs.h:728

dgstrf2
void dgstrf2(int_t k, double *diagBlk, int_t LDA, double *BlockUfactor, int_t LDU, double thresh, int_t *xsup, superlu_dist_options_t *options, SuperLUStat_t *stat, int *info)
Definition: pdgstrf2.c:404

superlu_defs.h
Definitions which are precision-neutral.

SuperSize
#define SuperSize(bnum)
Definition: superlu_defs.h:271

int_t
int64_t int_t
Definition: superlu_defs.h:119

superlu_dist_config.h

i
int i
Definition: sutil_dist.c:287