#ifndef HSS_EXTRA_MPI_HPP
#define HSS_EXTRA_MPI_HPP

#include "HSSExtra.hpp"
#include "dense/DistributedMatrix.hpp"
#include "misc/MPIWrapper.hpp"

namespace strumpack {
  namespace HSS {
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t>
    class WorkCompressMPI : public WorkCompressBase<scalar_t> {
    public:
      std::vector<WorkCompressMPI<scalar_t>> c;
      // random (R) and sample (S) matrices for the randomized
      // compression, 2D block-cyclic distributed
      DistributedMatrix<scalar_t> Rr, Rc, Sr, Sc;
      DistributedMatrix<scalar_t> Qr, Qc;
      std::unique_ptr<WorkCompress<scalar_t>> w_seq;
      void split(const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].offset = this->offset;
          c[1].offset = this->offset + dim;
          c[0].lvl = c[1].lvl = this->lvl + 1;
        }
      }
      void create_sequential() {
        if (!w_seq)
          w_seq = std::unique_ptr<WorkCompress<scalar_t>>
            (new WorkCompress<scalar_t>());
        w_seq->lvl = this->lvl;
      }
    };
#endif // DOXYGEN_SHOULD_SKIP_THIS
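    // Illustration only (not part of the original header): how split()
    // partitions a node's (row, column) offset between its two children.
    // Child 0 keeps the parent's offset, child 1 is shifted by dim, the
    // size of the first child; this assumes the operator+ on size_t
    // pairs from HSSExtra.hpp:
    //
    //   WorkCompressMPI<double> w;
    //   w.offset = {0, 0};
    //   w.split({512, 512});
    //   // now w.c[0].offset == (0,0), w.c[1].offset == (512,512),
    //   // and both children are one level deeper than w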
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t,
             typename real_t=typename RealType<scalar_t>::value_type>
    class WorkCompressMPIANN : public WorkCompressBase<scalar_t> {
    public:
      std::vector<WorkCompressMPIANN<scalar_t>> c;
      DistributedMatrix<scalar_t> S;
      // (index, score) pairs from the approximate nearest
      // neighbor (ANN) search
      std::vector<std::pair<std::size_t,real_t>> ids_scores;
      std::unique_ptr<WorkCompressANN<scalar_t>> w_seq;
      void split(const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].offset = this->offset;
          c[1].offset = this->offset + dim;
          c[0].lvl = c[1].lvl = this->lvl + 1;
        }
      }
      void create_sequential() {
        if (!w_seq)
          w_seq = std::unique_ptr<WorkCompressANN<scalar_t>>
            (new WorkCompressANN<scalar_t>());
        w_seq->lvl = this->lvl;
      }
    };
#endif // DOXYGEN_SHOULD_SKIP_THIS
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkApplyMPI {
    public:
      std::pair<std::size_t,std::size_t> offset;
      std::vector<WorkApplyMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> tmp1, tmp2;
      std::unique_ptr<WorkApply<scalar_t>> w_seq;
    };
#endif // DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class HSSMatrixBase;
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class DistSubLeaf {
    public:
      DistSubLeaf(int cols, const HSSMatrixBase<scalar_t>* H,
                  const BLACSGrid* lg)
        : cols_(cols), hss_(H), grid_loc_(lg) { allocate_block_row(); }
      // dist is expected on the same BLACS context as H
      DistSubLeaf(int cols, const HSSMatrixBase<scalar_t>* H,
                  const BLACSGrid* lg,
                  const DistributedMatrix<scalar_t>& dist)
        : cols_(cols), hss_(H), grid_loc_(lg) { to_block_row(dist); }
      void from_block_row(DistributedMatrix<scalar_t>& dist) const
      { hss_->from_block_row(dist, sub, leaf, grid_loc_); }
      DistributedMatrix<scalar_t> leaf;
      DenseMatrix<scalar_t> sub;
      const BLACSGrid* grid_local() const { return grid_loc_; }
      int cols() const { return cols_; }

    private:
      void allocate_block_row()
      { hss_->allocate_block_row(cols_, sub, leaf); }
      void to_block_row(const DistributedMatrix<scalar_t>& dist)
      { hss_->to_block_row(dist, sub, leaf); }

      const int cols_;
      const HSSMatrixBase<scalar_t>* hss_;
      const BLACSGrid* grid_loc_;
    };
#endif // DOXYGEN_SHOULD_SKIP_THIS
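    // Illustration only: DistSubLeaf bundles the block-row form of a
    // multi-vector during HSS tree traversals: `sub` holds this rank's
    // rows for a sequential subtree, `leaf` the distributed piece for a
    // parallel leaf. A sketch of a typical round trip, with hypothetical
    // names H (an HSSMatrixMPI) and B (a DistributedMatrix on H's grid):
    //
    //   DistSubLeaf<double> blockB(B.cols(), &H, H.grid_local(), B);
    //   // ... per-node work reads/writes blockB.sub and blockB.leaf ...
    //   blockB.from_block_row(B);  // redistribute back to 2D block-cyclic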
    class TreeLocalRanges {
    public:
      TreeLocalRanges() {}
      TreeLocalRanges(int P) : ranges_(5*P) {}
      void print() const {
        std::cout << "ranges=[";
        for (std::size_t p=0; p<ranges_.size()/5; p++)
          std::cout << rlo(p) << "," << rhi(p) << "/"
                    << clo(p) << "," << chi(p) << "/"
                    << leaf_procs(p) << " ";
        std::cout << "];" << std::endl;
      }
      int rlo(int p) const { return ranges_[5*p+0]; }
      int rhi(int p) const { return ranges_[5*p+1]; }
      int clo(int p) const { return ranges_[5*p+2]; }
      int chi(int p) const { return ranges_[5*p+3]; }
      int leaf_procs(int p) const { return ranges_[5*p+4]; }
      int& rlo(int p) { return ranges_[5*p+0]; }
      int& rhi(int p) { return ranges_[5*p+1]; }
      int& clo(int p) { return ranges_[5*p+2]; }
      int& chi(int p) { return ranges_[5*p+3]; }
      int& leaf_procs(int p) { return ranges_[5*p+4]; }

    private:
      // 5 ints per process: the row range (rlo, rhi) and column range
      // (clo, chi) of that process' leaf, and the number of processes
      // assigned to that leaf
      std::vector<int> ranges_;
    };
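    // Illustration only: ranges_ stores 5 ints per process, so for P=2
    // the layout is [rlo0, rhi0, clo0, chi0, leaf_procs0, rlo1, ...].
    // For example:
    //
    //   TreeLocalRanges r(2);
    //   r.rlo(0) = 0;    r.rhi(0) = 500;   // row range of rank 0's leaf
    //   r.clo(0) = 0;    r.chi(0) = 500;   // column range
    //   r.leaf_procs(0) = 1;               // processes on that leaf
    //   r.print();       // ranges=[0,500/0,500/1 0,0/0,0/0 ];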
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkFactorMPI {
    public:
      std::vector<WorkFactorMPI<scalar_t>> c;
      std::unique_ptr<WorkFactor<scalar_t>> w_seq;
    };
#endif // DOXYGEN_SHOULD_SKIP_THIS
    /**
     * \class HSSFactorsMPI
     * \brief Contains data related to the ULV factorization of a
     * distributed HSS matrix, constructed during factorization and
     * passed to the solve routine together with the HSS matrix.
     */
    template<typename scalar_t> class HSSFactorsMPI {
      template<typename T> friend class HSSMatrixMPI;

    public:
      std::size_t memory() {
        return sizeof(*this) + L_.memory() + Vt0_.memory()
          + W1_.memory() + Q_.memory() + D_.memory()
          + sizeof(int)*piv_.size();
      }
      std::size_t nonzeros() const {
        return L_.nonzeros() + Vt0_.nonzeros() + W1_.nonzeros()
          + Q_.nonzeros() + D_.nonzeros();
      }
      const DistributedMatrix<scalar_t>& Vhat() const { return Vt0_; }
      DistributedMatrix<scalar_t>& Vhat() { return Vt0_; }

    private:
      DistributedMatrix<scalar_t> L_, Vt0_, W1_, Q_, D_;
      std::vector<int> piv_;  // pivot indices for the dense root factor
    };
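    // Illustration only, API names assumed rather than confirmed by this
    // header: the factors object reports its own footprint, e.g.
    //
    //   auto ULV = H.factor();   // H an HSSMatrixMPI<double>
    //   std::cout << ULV.memory() / 1.e6 << " MB in ULV factors ("
    //             << ULV.nonzeros() << " nonzeros)" << std::endl;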
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkSolveMPI {
    public:
      std::vector<WorkSolveMPI<scalar_t>> c;
      std::unique_ptr<WorkSolve<scalar_t>> w_seq;
      std::pair<std::size_t,std::size_t> offset;
    };
#endif // DOXYGEN_SHOULD_SKIP_THIS
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkExtractMPI {
    public:
      std::vector<WorkExtractMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> y, z;
      std::vector<std::size_t> I, J, rl2g, cl2g, ycols, zcols;
      std::unique_ptr<WorkExtract<scalar_t>> w_seq;
      void split_extraction_sets
      (const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].I.reserve(I.size());
          c[1].I.reserve(I.size());
          for (auto i : I)
            if (i < dim.first) c[0].I.push_back(i);
            else c[1].I.push_back(i - dim.first);
          c[0].J.reserve(J.size());
          c[1].J.reserve(J.size());
          for (auto j : J)
            if (j < dim.second) c[0].J.push_back(j);
            else c[1].J.push_back(j - dim.second);
        }
      }
      void communicate_child_ycols(const MPIComm& comm, int rch1) {
        // rank rch1 is the root of the second child's communicator; the
        // first child's root is assumed to be rank 0 in comm
        const int rch0 = 0;
        auto c0ycols = c[0].ycols.size();
        auto c1ycols = c[1].ycols.size();
        comm.broadcast_from(c0ycols, rch0);
        comm.broadcast_from(c1ycols, rch1);
        c[0].ycols.resize(c0ycols);
        c[1].ycols.resize(c1ycols);
        comm.broadcast_from(c[0].ycols, rch0);
        comm.broadcast_from(c[1].ycols, rch1);
      }
      void combine_child_ycols() {
        auto c0ycols = c[0].ycols.size();
        auto c1ycols = c[1].ycols.size();
        ycols.resize(c0ycols + c1ycols);
        std::copy(c[0].ycols.begin(), c[0].ycols.end(), ycols.begin());
        std::copy(c[1].ycols.begin(), c[1].ycols.end(),
                  ycols.begin()+c0ycols);
      }
    };
#endif // DOXYGEN_SHOULD_SKIP_THIS
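    // Illustration only: the index-set split above, on a single set.
    // Extraction indices are node-relative; entries below dim.first stay
    // with child 0 unchanged, the rest move to child 1 shifted down:
    //
    //   I = {1, 3, 510, 700}, dim.first = 512
    //   => c[0].I = {1, 3, 510}, c[1].I = {700 - 512} = {188}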
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkExtractBlocksMPI {
    public:
      WorkExtractBlocksMPI(std::size_t nb) {
        y.resize(nb);
        z.resize(nb);
        I.resize(nb);
        J.resize(nb);
        rl2g.resize(nb);
        cl2g.resize(nb);
        ycols.resize(nb);
        zcols.resize(nb);
      }
      std::vector<WorkExtractBlocksMPI<scalar_t>> c;
      std::vector<DistributedMatrix<scalar_t>> y, z;
      std::vector<std::vector<std::size_t>> I, J, rl2g, cl2g, ycols, zcols;
      std::vector<std::unique_ptr<WorkExtract<scalar_t>>> w_seq;
      void split_extraction_sets
      (const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          auto nb = I.size();
          c.reserve(2);
          c.emplace_back(nb);
          c.emplace_back(nb);
          for (std::size_t k=0; k<nb; k++) {
            c[0].I[k].reserve(I[k].size());
            c[1].I[k].reserve(I[k].size());
            for (auto i : I[k])
              if (i < dim.first) c[0].I[k].push_back(i);
              else c[1].I[k].push_back(i - dim.first);
            c[0].J[k].reserve(J[k].size());
            c[1].J[k].reserve(J[k].size());
            for (auto j : J[k])
              if (j < dim.second) c[0].J[k].push_back(j);
              else c[1].J[k].push_back(j - dim.second);
          }
        }
      }
      void communicate_child_ycols(const MPIComm& comm, int rch1) {
        int rank = comm.rank(), P = comm.size();
        int P0total = rch1, P1total = P - rch1;
        // each child uses a prows x pcols subgrid; ranks beyond
        // prows*pcols are idle on that child
        int pcols = std::floor(std::sqrt((float)P0total));
        int prows = P0total / pcols;
        int P0active = prows * pcols;
        pcols = std::floor(std::sqrt((float)P1total));
        prows = P1total / pcols;
        int P1active = prows * pcols;
        std::vector<MPI_Request> sreq;
        std::vector<std::size_t> sbuf0, sbuf1;
        if (rank < P0active) {
          if (rank < (P-P0active)) {
            // I'm one of the first P-P0active ranks active on child 0,
            // so I send to one or more ranks that are not active on
            // child 0, ie, those in [P0active,P)
            std::size_t ssize = 0;
            for (std::size_t k=0; k<I.size(); k++)
              ssize += 1 + c[0].ycols[k].size();
            sbuf0.reserve(ssize);
            for (std::size_t k=0; k<I.size(); k++) {
              sbuf0.push_back(c[0].ycols[k].size());
              for (auto i : c[0].ycols[k])
                sbuf0.push_back(i);
            }
            for (int p=P0active; p<P; p++)
              if (rank == (p - P0active) % P0active) {
                sreq.emplace_back();
                comm.isend(sbuf0, p, 0, &sreq.back());
              }
          }
        }
        if (rank >= rch1 && rank < rch1+P1active) {
          if ((rank-rch1) < (P-P1active)) {
            // I'm active on child 1, so I send to ranks not active on
            // child 1, ie, those in [0,rch1) and in [rch1+P1active,P)
            std::size_t ssize = 0;
            for (std::size_t k=0; k<I.size(); k++)
              ssize += 1 + c[1].ycols[k].size();
            sbuf1.reserve(ssize);
            for (std::size_t k=0; k<I.size(); k++) {
              sbuf1.push_back(c[1].ycols[k].size());
              for (auto i : c[1].ycols[k])
                sbuf1.push_back(i);
            }
            for (int p=0; p<rch1; p++)
              if (rank - rch1 == p % P1active) {
                sreq.emplace_back();
                comm.isend(sbuf1, p, 1, &sreq.back());
              }
            for (int p=rch1+P1active; p<P; p++)
              if (rank - rch1 == (p - P1active) % P1active) {
                sreq.emplace_back();
                comm.isend(sbuf1, p, 1, &sreq.back());
              }
          }
        }
        if (rank >= P0active) {
          // not active on child 0, receive child 0's ycols
          int dest = -1;
          for (int p=0; p<P0active; p++)
            if (p == (rank - P0active) % P0active) { dest = p; break; }
          auto buf = comm.recv<std::size_t>(dest, 0);
          auto ptr = buf.data();
          for (std::size_t k=0; k<I.size(); k++) {
            auto c0ycols = *ptr++;
            c[0].ycols[k].resize(c0ycols);
            for (std::size_t i=0; i<c0ycols; i++)
              c[0].ycols[k][i] = *ptr++;
          }
        }
        if (!(rank >= rch1 && rank < rch1+P1active)) {
          // not active on child 1, receive child 1's ycols
          int dest = -1;
          for (int p=rch1; p<rch1+P1active; p++) {
            if (rank < rch1) {
              if (p - rch1 == rank % P1active) { dest = p; break; }
            } else if (p - rch1 == (rank - P1active) % P1active) {
              dest = p; break;
            }
          }
          auto buf = comm.recv<std::size_t>(dest, 1);
          auto ptr = buf.data();
          for (std::size_t k=0; k<I.size(); k++) {
            auto c1ycols = *ptr++;
            c[1].ycols[k].resize(c1ycols);
            for (std::size_t i=0; i<c1ycols; i++)
              c[1].ycols[k][i] = *ptr++;
          }
        }
        wait_all(sreq);
      }
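      // Illustration only: the idle-to-active pairing used above. With
      // P = 7 ranks on this node and P0active = 4 active on child 0,
      // ranks 4, 5, 6 are idle on child 0 and receive its ycols from
      // active ranks (4-4)%4 = 0, (5-4)%4 = 1 and (6-4)%4 = 2, so the
      // sends are spread round-robin over the active ranks.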
      void combine_child_ycols(const std::vector<bool>& odiag) {
        for (std::size_t k=0; k<I.size(); k++) {
          if (!odiag[k]) {
            ycols[k].clear();
            continue;
          }
          auto c0ycols = c[0].ycols[k].size();
          auto c1ycols = c[1].ycols[k].size();
          ycols[k].resize(c0ycols + c1ycols);
          std::copy
            (c[0].ycols[k].begin(), c[0].ycols[k].end(), ycols[k].begin());
          std::copy
            (c[1].ycols[k].begin(), c[1].ycols[k].end(),
             ycols[k].begin()+c0ycols);
        }
      }
    };
#endif // DOXYGEN_SHOULD_SKIP_THIS

  } // namespace HSS
} // namespace strumpack

#endif // HSS_EXTRA_MPI_HPP