34 #ifndef DISTRIBUTED_MATRIX_HPP 
   35 #define DISTRIBUTED_MATRIX_HPP 
   43 #include "misc/RandomWrapper.hpp" 
   49 #ifndef DOXYGEN_SHOULD_SKIP_THIS 
   50   inline int indxl2g(
int INDXLOC, 
int NB,
 
   51                      int IPROC, 
int ISRCPROC, 
int NPROCS)  {
 
   52     return NPROCS*NB*((INDXLOC-1)/NB) + (INDXLOC-1) % NB +
 
   53       ((NPROCS+IPROC-ISRCPROC) % NPROCS)*NB + 1;
 
   55   inline int indxg2l(
int INDXGLOB, 
int NB,
 
   56                      int IPROC, 
int ISRCPROC, 
int NPROCS) {
 
   57     return NB*((INDXGLOB-1)/(NB*NPROCS)) + (INDXGLOB-1) % NB + 1;
 
   59   inline int indxg2p(
int INDXGLOB, 
int NB,
 
   60                      int IPROC, 
int ISRCPROC, 
int NPROCS) {
 
   61     return ( ISRCPROC + (INDXGLOB - 1) / NB ) % NPROCS;
 
   85     using real_t = 
typename RealType<scalar_t>::value_type;
 
  120                       const std::function<scalar_t(std::size_t,
 
  220     const int* 
desc()
 const { 
return desc_; }
 
  268     virtual int rows()
 const { 
return desc_[2]; }
 
  273     virtual int cols()
 const { 
return desc_[3]; }
 
  278     int lrows()
 const { 
return lrows_; }
 
  283     int lcols()
 const { 
return lcols_; }
 
  288     int ld()
 const { 
return lrows_; }
 
  294     int MB()
 const { 
return desc_[4]; }
 
  299     int NB()
 const { 
return desc_[5]; }
 
  318     virtual int I()
 const { 
return 1; }
 
  325     virtual int J()
 const { 
return 1; }
 
  340     virtual void lranges(
int& rlo, 
int& rhi, 
int& clo, 
int& chi) 
const;
 
  346     const scalar_t* 
data()
 const { 
return data_; }
 
  352     scalar_t* 
data() { 
return data_; }
 
  363     const scalar_t& 
operator()(
int r, 
int c)
 const { 
return data_[r+
ld()*c]; }
 
  502 #ifndef DOXYGEN_SHOULD_SKIP_THIS 
  504     int rowl2g_fixed(
int row)
 const {
 
  505       assert(
grid() && fixed());
 
  508     int coll2g_fixed(
int col)
 const {
 
  509       assert(
grid() && fixed());
 
  512     int rowg2l_fixed(
int row)
 const {
 
  513       assert(
grid() && fixed());
 
  516     int colg2l_fixed(
int col)
 const {
 
  517       assert(
grid() && fixed());
 
  520     int rowg2p_fixed(
int row)
 const {
 
  521       assert(
grid() && fixed());
 
  524     int colg2p_fixed(
int col)
 const {
 
  525       assert(
grid() && fixed());
 
  528     int rank_fixed(
int r, 
int c)
 const {
 
  529       assert(
grid() && fixed()); 
return rowg2p_fixed(r) + colg2p_fixed(c) * 
nprows();
 
  531     bool is_local_fixed(
int r, 
int c)
 const {
 
  532       assert(
grid() && fixed());
 
  533       return rowg2p_fixed(r) == 
prow() && colg2p_fixed(c) == 
pcol();
 
  535     scalar_t& global_fixed(
int r, 
int c) {
 
  536       assert(
is_local(r, c)); assert(fixed());
 
  537       return operator()(rowg2l_fixed(r),colg2l_fixed(c));
 
  551     const scalar_t& 
global(
int r, 
int c)
 const {
 
  594     void print()
 const { print(
"A"); }
 
  595     void print(std::string name, 
int precision=15) 
const;
 
  596     void print_to_file(std::string name, std::string filename,
 
  598     void print_to_files(std::string name, 
int precision=16) 
const;
 
  603     void fill(scalar_t a);
 
  604     void fill(
const std::function<scalar_t(std::size_t,
 
  607     void shift(scalar_t sigma);
 
  608     scalar_t trace() 
const;
 
  610     virtual void resize(std::size_t m, std::size_t n);
 
  617     void laswp(
const std::vector<int>& P, 
bool fwd);
 
  620     extract_rows(
const std::vector<std::size_t>& Ir) 
const;
 
  622     extract_cols(
const std::vector<std::size_t>& Ic) 
const;
 
  625     extract(
const std::vector<std::size_t>& 
I,
 
  626             const std::vector<std::size_t>& 
J) 
const;
 
  634     real_t normF() 
const;
 
  635     real_t norm1() 
const;
 
  636     real_t normI() 
const;
 
  638     virtual std::size_t memory()
 const {
 
  639       return sizeof(scalar_t)*std::size_t(
lrows())*std::size_t(
lcols());
 
  641     virtual std::size_t total_memory()
 const {
 
  642       return sizeof(scalar_t)*std::size_t(
rows())*std::size_t(
cols());
 
  644     virtual std::size_t nonzeros()
 const {
 
  645       return std::size_t(
lrows())*std::size_t(
lcols());
 
  647     virtual std::size_t total_nonzeros()
 const {
 
  648       return std::size_t(
rows())*std::size_t(
cols());
 
  651     void scatter(
const DenseMatrix<scalar_t>& a);
 
  652     DenseMatrix<scalar_t> gather() 
const;
 
  653     DenseMatrix<scalar_t> all_gather() 
const;
 
  655     DenseMatrix<scalar_t> dense_and_clear();
 
  656     DenseMatrix<scalar_t> dense() 
const;
 
  657     DenseMatrixWrapper<scalar_t> dense_wrapper();
 
  659     std::vector<int> LU();
 
  660     int LU(std::vector<int>&);
 
  662     DistributedMatrix<scalar_t>
 
  663     solve(
const DistributedMatrix<scalar_t>& b,
 
  664           const std::vector<int>& piv) 
const;
 
  666     void LQ(DistributedMatrix<scalar_t>& L,
 
  667             DistributedMatrix<scalar_t>& Q) 
const;
 
  669     void orthogonalize(scalar_t& r_max, scalar_t& r_min);
 
  671     void ID_column(DistributedMatrix<scalar_t>& X, std::vector<int>& piv,
 
  672                    std::vector<std::size_t>& ind,
 
  673                    real_t rel_tol, real_t abs_tol, 
int max_rank);
 
  674     void ID_row(DistributedMatrix<scalar_t>& X, std::vector<int>& piv,
 
  675                 std::vector<std::size_t>& ind, real_t rel_tol, real_t abs_tol,
 
  676                 int max_rank, 
const BLACSGrid* grid_T);
 
  701     scalar_t* data_ = 
nullptr;
 
  711   template<
typename scalar_t> 
void 
  714        int dest, 
int context_all);
 
  716   template<
typename scalar_t> 
void 
  722   template<
typename scalar_t> 
void 
  725        std::size_t ib, std::size_t jb, 
int context_all);
 
  732   template<
typename scalar_t>
 
  735     int _rows, _cols, _i, _j;
 
  738       _rows(0), _cols(0), _i(0), _j(0) {}
 
  745                              std::size_t i, std::size_t j);
 
  749                              int MB, 
int NB, scalar_t* A);
 
  760     int rows()
 const override { 
return _rows; }
 
  761     int cols()
 const override { 
return _cols; }
 
  762     int I()
 const override { 
return _i+1; }
 
  763     int J()
 const override { 
return _j+1; }
 
  764     void lranges(
int& rlo, 
int& rhi, 
int& clo, 
int& chi) 
const override;
 
  766     void resize(std::size_t m, std::size_t n)
 override { assert(1); }
 
  769     { this->data_ = 
nullptr; DistributedMatrix<scalar_t>::clear(); }
 
  770     std::size_t memory()
 const override { 
return 0; }
 
  771     std::size_t total_memory()
 const override { 
return 0; }
 
  772     std::size_t nonzeros()
 const override { 
return 0; }
 
  773     std::size_t total_nonzeros()
 const override { 
return 0; }
 
  775     DenseMatrix<scalar_t> dense_and_clear() = 
delete;
 
  776     DenseMatrixWrapper<scalar_t> dense_wrapper() = 
delete;
 
  777     DistributedMatrixWrapper<scalar_t>&
 
  778     operator=(
const DistributedMatrix<scalar_t>&) = 
delete;
 
  779     DistributedMatrixWrapper<scalar_t>&
 
  780     operator=(DistributedMatrix<scalar_t>&&) = 
delete;
 
  784   template<
typename scalar_t> 
long long int 
  785   LU_flops(
const DistributedMatrix<scalar_t>& a) {
 
  786     if (!a.active()) 
return 0;
 
  787     return (is_complex<scalar_t>() ? 4:1) *
 
  788       blas::getrf_flops(a.rows(), a.cols()) /
 
  792   template<
typename scalar_t> 
long long int 
  793   solve_flops(
const DistributedMatrix<scalar_t>& b) {
 
  794     if (!b.active()) 
return 0;
 
  795     return (is_complex<scalar_t>() ? 4:1) *
 
  796       blas::getrs_flops(b.rows(), b.cols()) /
 
  800   template<
typename scalar_t> 
long long int 
  801   LQ_flops(
const DistributedMatrix<scalar_t>& a) {
 
  802     if (!a.active()) 
return 0;
 
  803     auto minrc = std::min(a.rows(), a.cols());
 
  804     return (is_complex<scalar_t>() ? 4:1) *
 
  805       (blas::gelqf_flops(a.rows(), a.cols()) +
 
  806        blas::xxglq_flops(a.cols(), a.cols(), minrc)) /
 
  810   template<
typename scalar_t> 
long long int 
  811   ID_row_flops(
const DistributedMatrix<scalar_t>& a, 
int rank) {
 
  812     if (!a.active()) 
return 0;
 
  813     return (is_complex<scalar_t>() ? 4:1) *
 
  814       (blas::geqp3_flops(a.cols(), a.rows())
 
  815        + blas::trsm_flops(rank, a.cols() - rank, scalar_t(1.), 
'L')) /
 
  819   template<
typename scalar_t> 
long long int 
  820   trsm_flops(
Side s, scalar_t alpha, 
const DistributedMatrix<scalar_t>& a,
 
  821              const DistributedMatrix<scalar_t>& b) {
 
  822     if (!a.active()) 
return 0;
 
  823     return (is_complex<scalar_t>() ? 4:1) *
 
  824       blas::trsm_flops(b.rows(), b.cols(), alpha, 
char(s)) /
 
  828   template<
typename scalar_t> 
long long int 
  830              const DistributedMatrix<scalar_t>& a,
 
  831              const DistributedMatrix<scalar_t>& b, scalar_t beta) {
 
  832     if (!a.active()) 
return 0;
 
  833     return (is_complex<scalar_t>() ? 4:1) *
 
  835       ((ta==
Trans::N) ? a.rows() : a.cols(),
 
  836        (tb==
Trans::N) ? b.cols() : b.rows(),
 
  837        (ta==
Trans::N) ? a.cols() : a.rows(), alpha, beta) /
 
  841   template<
typename scalar_t> 
long long int 
  842   gemv_flops(
Trans ta, 
const DistributedMatrix<scalar_t>& a,
 
  843              scalar_t alpha, scalar_t beta) {
 
  844     if (!a.active()) 
return 0;
 
  845     auto m = (ta==
Trans::N) ? a.rows() : a.cols();
 
  846     auto n = (ta==
Trans::N) ? a.cols() : a.rows();
 
  847     return (is_complex<scalar_t>() ? 4:1) *
 
  848       ((alpha != scalar_t(0.)) * m * (n * 2 - 1) +
 
  849        (alpha != scalar_t(1.) && alpha != scalar_t(0.)) * m +
 
  850        (beta != scalar_t(0.) && beta != scalar_t(1.)) * m +
 
  851        (alpha != scalar_t(0.) && beta != scalar_t(0.)) * m) /
 
  855   template<
typename scalar_t> 
long long int 
  857     if (!a.active()) 
return 0;
 
  858     auto minrc = std::min(a.rows(), a.cols());
 
  859     return (is_complex<scalar_t>() ? 4:1) *
 
  860       (blas::geqrf_flops(a.rows(), minrc) +
 
  861        blas::xxgqr_flops(a.rows(), minrc, minrc)) /
 
  866   template<
typename scalar_t>
 
  867   std::unique_ptr<const DistributedMatrixWrapper<scalar_t>>
 
  868   ConstDistributedMatrixWrapperPtr
 
  869   (std::size_t m, std::size_t n, 
const DistributedMatrix<scalar_t>& D,
 
  870    std::size_t i, std::size_t j) {
 
  871     return std::unique_ptr<const DistributedMatrixWrapper<scalar_t>>
 
  872       (
new DistributedMatrixWrapper<scalar_t>
 
  873        (m, n, 
const_cast<DistributedMatrix<scalar_t>&
>(D), i, j));
 
  877   template<
typename scalar_t> 
void gemm 
  878   (
Trans ta, 
Trans tb, scalar_t alpha, 
const DistributedMatrix<scalar_t>& A,
 
  879    const DistributedMatrix<scalar_t>& B, scalar_t beta,
 
  880    DistributedMatrix<scalar_t>& C);
 
  882   template<
typename scalar_t> 
void trsm 
  884    const DistributedMatrix<scalar_t>& A, DistributedMatrix<scalar_t>& B);
 
  886   template<
typename scalar_t> 
void trsv 
  887   (
UpLo ul, 
Trans ta, 
Diag d, 
const DistributedMatrix<scalar_t>& A,
 
  888    DistributedMatrix<scalar_t>& B);
 
  890   template<
typename scalar_t> 
void gemv 
  891   (
Trans ta, scalar_t alpha, 
const DistributedMatrix<scalar_t>& A,
 
  892    const DistributedMatrix<scalar_t>& X, scalar_t beta,
 
  893    DistributedMatrix<scalar_t>& Y);
 
  895   template<
typename scalar_t> DistributedMatrix<scalar_t> 
vconcat 
  896   (
int cols, 
int arows, 
int brows, 
const DistributedMatrix<scalar_t>& a,
 
  897    const DistributedMatrix<scalar_t>& b, 
const BLACSGrid* gnew, 
int cxt_all);
 
  899   template<
typename scalar_t> 
void subgrid_copy_to_buffers
 
  900   (
const DistributedMatrix<scalar_t>& a, 
const DistributedMatrix<scalar_t>& b,
 
  901    int p0, 
int npr, 
int npc, std::vector<std::vector<scalar_t>>& sbuf);
 
  903   template<
typename scalar_t> 
void subproc_copy_to_buffers
 
  904   (
const DenseMatrix<scalar_t>& a, 
const DistributedMatrix<scalar_t>& b,
 
  905    int p0, 
int npr, 
int npc, std::vector<std::vector<scalar_t>>& sbuf);
 
  907   template<
typename scalar_t> 
void subgrid_add_from_buffers
 
  908   (
const BLACSGrid* subg, 
int master, DistributedMatrix<scalar_t>& b,
 
  909    std::vector<scalar_t*>& pbuf);
 
Contains a wrapper class around a BLACS grid/context.
Contains the DenseMatrix and DenseMatrixWrapper classes, simple wrappers around BLAS/LAPACK style den...
Contains some simple C++ MPI wrapper utilities.
This is a small wrapper class around a BLACS grid and a BLACS context.
Definition: BLACSGrid.hpp:66
bool active() const
Definition: BLACSGrid.hpp:264
int npcols() const
Definition: BLACSGrid.hpp:232
int ctxt_all() const
Definition: BLACSGrid.hpp:220
int npactives() const
Definition: BLACSGrid.hpp:257
int pcol() const
Definition: BLACSGrid.hpp:244
const MPIComm & Comm() const
Definition: BLACSGrid.hpp:196
int nprows() const
Definition: BLACSGrid.hpp:226
int ctxt() const
Definition: BLACSGrid.hpp:209
int prow() const
Definition: BLACSGrid.hpp:238
Like DenseMatrix, this class represents a matrix, stored in column major format, to allow direct use ...
Definition: DenseMatrix.hpp:1015
This class represents a matrix, stored in column major format, to allow direct use of BLAS/LAPACK rou...
Definition: DenseMatrix.hpp:138
Definition: DistributedMatrix.hpp:733
int J() const override
Definition: DistributedMatrix.hpp:763
int I() const override
Definition: DistributedMatrix.hpp:762
int rows() const override
Definition: DistributedMatrix.hpp:760
void lranges(int &rlo, int &rhi, int &clo, int &chi) const override
int cols() const override
Definition: DistributedMatrix.hpp:761
2D block cyclicly distributed matrix, as used by ScaLAPACK.
Definition: DistributedMatrix.hpp:84
const scalar_t * data() const
Definition: DistributedMatrix.hpp:346
int prow() const
Definition: DistributedMatrix.hpp:381
DistributedMatrix(const BLACSGrid *g, DenseMatrix< scalar_t > &&m)
DistributedMatrix(const DistributedMatrix< scalar_t > &m)
int ctxt_all() const
Definition: DistributedMatrix.hpp:262
int lcols() const
Definition: DistributedMatrix.hpp:283
virtual int I() const
Definition: DistributedMatrix.hpp:318
scalar_t & global(int r, int c)
Definition: DistributedMatrix.hpp:565
DistributedMatrix< scalar_t > & operator=(const DistributedMatrix< scalar_t > &m)
int rowblocks() const
Definition: DistributedMatrix.hpp:305
bool is_master() const
Definition: DistributedMatrix.hpp:416
DistributedMatrix< scalar_t > & operator=(DistributedMatrix< scalar_t > &&m)
int npactives() const
Definition: DistributedMatrix.hpp:406
const int * desc() const
Definition: DistributedMatrix.hpp:220
const BLACSGrid * grid() const
Definition: DistributedMatrix.hpp:235
int MB() const
Definition: DistributedMatrix.hpp:294
int rowl2g(int row) const
Definition: DistributedMatrix.hpp:425
static const int default_MB
Definition: DistributedMatrix.hpp:690
int pcol() const
Definition: DistributedMatrix.hpp:387
virtual int rows() const
Definition: DistributedMatrix.hpp:268
scalar_t & operator()(int r, int c)
Definition: DistributedMatrix.hpp:374
static const int default_NB
Definition: DistributedMatrix.hpp:697
int npcols() const
Definition: DistributedMatrix.hpp:399
DistributedMatrix(const BLACSGrid *g, int M, int N, const std::function< scalar_t(std::size_t, std::size_t)> &A)
bool is_local(int r, int c) const
Definition: DistributedMatrix.hpp:497
DistributedMatrix(const BLACSGrid *g, DenseMatrixWrapper< scalar_t > &&m)
const scalar_t & operator()(int r, int c) const
Definition: DistributedMatrix.hpp:363
int rank(int r, int c) const
Definition: DistributedMatrix.hpp:488
DistributedMatrix(const BLACSGrid *g, int M, int N, int MB, int NB)
int colblocks() const
Definition: DistributedMatrix.hpp:310
DistributedMatrix(const BLACSGrid *g, int M, int N)
scalar_t all_global(int r, int c) const
int coll2g(int col) const
Definition: DistributedMatrix.hpp:435
DistributedMatrix(const BLACSGrid *g, const DenseMatrix< scalar_t > &m)
int nprows() const
Definition: DistributedMatrix.hpp:393
MPI_Comm comm() const
Definition: DistributedMatrix.hpp:248
void global(int r, int c, scalar_t v)
Definition: DistributedMatrix.hpp:578
bool active() const
Definition: DistributedMatrix.hpp:229
int ld() const
Definition: DistributedMatrix.hpp:288
int colg2p(int col) const
Definition: DistributedMatrix.hpp:476
virtual int cols() const
Definition: DistributedMatrix.hpp:273
int ctxt() const
Definition: DistributedMatrix.hpp:254
DistributedMatrix(const BLACSGrid *g, int M, int N, const DistributedMatrix< scalar_t > &m, int context_all)
scalar_t * data()
Definition: DistributedMatrix.hpp:352
int lrows() const
Definition: DistributedMatrix.hpp:278
virtual int J() const
Definition: DistributedMatrix.hpp:325
const scalar_t & global(int r, int c) const
Definition: DistributedMatrix.hpp:551
int rowg2p(int row) const
Definition: DistributedMatrix.hpp:465
virtual void lranges(int &rlo, int &rhi, int &clo, int &chi) const
int colg2l(int col) const
Definition: DistributedMatrix.hpp:455
int rowg2l(int row) const
Definition: DistributedMatrix.hpp:445
const MPIComm & Comm() const
Definition: DistributedMatrix.hpp:242
virtual ~DistributedMatrix()
DistributedMatrix(DistributedMatrix< scalar_t > &&m)
int NB() const
Definition: DistributedMatrix.hpp:299
Wrapper class around an MPI_Comm object.
Definition: MPIWrapper.hpp:194
MPI_Comm comm() const
Definition: MPIWrapper.hpp:261
class to wrap the C++11 random number generator/distribution
Definition: RandomWrapper.hpp:105
Definition: StrumpackOptions.hpp:43
long long int LU_flops(const DenseMatrix< scalar_t > &a)
Definition: DenseMatrix.hpp:1452
long long int gemm_flops(Trans ta, Trans tb, scalar_t alpha, const DenseMatrix< scalar_t > &a, const DenseMatrix< scalar_t > &b, scalar_t beta)
Definition: DenseMatrix.hpp:1491
UpLo
Definition: DenseMatrix.hpp:83
DenseMatrix< scalar_t > vconcat(const DenseMatrix< scalar_t > &a, const DenseMatrix< scalar_t > &b)
Definition: DenseMatrix.hpp:1280
void trsv(UpLo ul, Trans ta, Diag d, const DenseMatrix< scalar_t > &a, DenseMatrix< scalar_t > &b, int depth=0)
void gemv(Trans ta, scalar_t alpha, const DenseMatrix< scalar_t > &a, const DenseMatrix< scalar_t > &x, scalar_t beta, DenseMatrix< scalar_t > &y, int depth=0)
Trans
Definition: DenseMatrix.hpp:51
void trsm(Side s, UpLo ul, Trans ta, Diag d, scalar_t alpha, const DenseMatrix< scalar_t > &a, DenseMatrix< scalar_t > &b, int depth=0)
long long int orthogonalize_flops(const DenseMatrix< scalar_t > &a)
Definition: DenseMatrix.hpp:1513
long long int trsm_flops(Side s, scalar_t alpha, const DenseMatrix< scalar_t > &a, const DenseMatrix< scalar_t > &b)
Definition: DenseMatrix.hpp:1483
Side
Definition: DenseMatrix.hpp:74
void copy(std::size_t m, std::size_t n, const DenseMatrix< scalar_from_t > &a, std::size_t ia, std::size_t ja, DenseMatrix< scalar_to_t > &b, std::size_t ib, std::size_t jb)
Definition: DenseMatrix.hpp:1231
void gemm(Trans ta, Trans tb, scalar_t alpha, const DenseMatrix< scalar_t > &a, const DenseMatrix< scalar_t > &b, scalar_t beta, DenseMatrix< scalar_t > &c, int depth=0)
long long int ID_row_flops(const DenseMatrix< scalar_t > &a, int rank)
Definition: DenseMatrix.hpp:1475
long long int LQ_flops(const DenseMatrix< scalar_t > &a)
Definition: DenseMatrix.hpp:1466
long long int solve_flops(const DenseMatrix< scalar_t > &b)
Definition: DenseMatrix.hpp:1459
Diag
Definition: DenseMatrix.hpp:92