28 #ifndef HSS_EXTRA_MPI_HPP
29 #define HSS_EXTRA_MPI_HPP
31 #include "HSSExtra.hpp"
32 #include "dense/DistributedMatrix.hpp"
37 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node workspace used during distributed-memory (MPI) HSS
// compression.
// NOTE(review): this chunk is a lossy extraction -- original line numbers
// are fused into the text and several lines are missing (access
// specifiers, closing braces, the resize of `c`, ...). Comments only;
// recover the full file before making code changes.
38 template<
typename scalar_t>
39 class WorkCompressMPI :
public WorkCompressBase<scalar_t> {
// Workspaces for the two children (presumably empty on a leaf -- confirm).
41 std::vector<WorkCompressMPI<scalar_t>> c;
// Distributed matrices held per node; presumably random-sample blocks
// (R*/S*) used by the randomized compression -- confirm against callers.
42 DistributedMatrix<scalar_t> Rr, Rc, Sr, Sc;
43 DistributedMatrix<scalar_t> Qr, Qc;
// Sequential workspace, used when this subtree is owned by one process.
45 std::unique_ptr<WorkCompress<scalar_t>> w_seq;
// Propagate offset/level bookkeeping to the two children after splitting
// this node's index range at `dim` (rows, cols of child 0).
46 void split(
const std::pair<std::size_t,std::size_t>& dim) {
49 c[0].offset = this->offset;
50 c[1].offset = this->offset + dim;
51 c[0].lvl = c[1].lvl = this->lvl + 1;
// Lazily create the sequential workspace, mirroring this node's level.
54 void create_sequential() {
56 w_seq = std::unique_ptr<WorkCompress<scalar_t>>
57 (
new WorkCompress<scalar_t>());
58 w_seq->lvl = this->lvl;
61 #endif //DOXYGEN_SHOULD_SKIP_THIS
63 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node workspace for MPI HSS compression driven by approximate
// nearest neighbors (ANN).
// NOTE(review): lossy extraction -- lines missing from this view;
// comments only.
64 template<
typename scalar_t,
65 typename real_t=
typename RealType<scalar_t>::value_type>
66 class WorkCompressMPIANN :
public WorkCompressBase<scalar_t> {
// Workspaces for the two children.
68 std::vector<WorkCompressMPIANN<scalar_t>> c;
69 DistributedMatrix<scalar_t> S;
// (index, score) pairs; presumably neighbor ids with distances/scores
// from the ANN search -- confirm against the compression driver.
70 std::vector<std::pair<std::size_t,real_t>> ids_scores;
// Sequential workspace for single-process subtrees.
71 std::unique_ptr<WorkCompressANN<scalar_t>> w_seq;
// Propagate offset/level bookkeeping to the children (same pattern as
// WorkCompressMPI::split).
73 void split(
const std::pair<std::size_t,std::size_t>& dim) {
76 c[0].offset = this->offset;
77 c[1].offset = this->offset + dim;
78 c[0].lvl = c[1].lvl = this->lvl + 1;
// Lazily create the sequential ANN workspace at this node's level.
81 void create_sequential() {
83 w_seq = std::unique_ptr<WorkCompressANN<scalar_t>>
84 (
new WorkCompressANN<scalar_t>());
85 w_seq->lvl = this->lvl;
88 #endif //DOXYGEN_SHOULD_SKIP_THIS
90 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node workspace for applying a distributed HSS matrix (matvec /
// matmat). NOTE(review): lossy extraction; comments only.
91 template<
typename scalar_t>
class WorkApplyMPI {
// Row/column offset of this node's block -- presumably within the global
// matrix; confirm against the apply routines.
93 std::pair<std::size_t,std::size_t> offset;
// Workspaces for the two children.
94 std::vector<WorkApplyMPI<scalar_t>> c;
// Temporaries reused across the up/down sweeps of the apply.
95 DistributedMatrix<scalar_t> tmp1, tmp2;
// Sequential workspace for subtrees owned by a single process.
96 std::unique_ptr<WorkApply<scalar_t>> w_seq;
98 #endif //DOXYGEN_SHOULD_SKIP_THIS
// Forward declaration; the full class definition lives in another header
// (not in view here). Needed below because DistSubLeaf stores a pointer
// to it and calls its block-row helpers.
101 template<
typename scalar_t>
class HSSMatrixBase;
103 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Helper that holds a matrix redistributed into "block row" form matching
// the local-subtree/leaf layout of an HSSMatrixBase: a purely local part
// (`sub`) plus a part distributed on the local BLACS grid (`leaf`).
// NOTE(review): lossy extraction -- the constructor names and access
// specifiers are missing from this view; comments only.
104 template<
typename scalar_t>
class DistSubLeaf {
// Constructor (name lost in extraction): allocate an empty block row
// with `cols` columns via the HSS matrix's layout information.
107 (
int cols,
const HSSMatrixBase<scalar_t>* H,
const BLACSGrid* lg)
108 : cols_(cols), hss_(H), grid_loc_(lg) { allocate_block_row(); }
// Constructor (name lost in extraction): redistribute the 2D-block-cyclic
// matrix `dist` into block-row form.
111 (
int cols,
const HSSMatrixBase<scalar_t>* H,
const BLACSGrid* lg,
112 const DistributedMatrix<scalar_t>& dist)
113 : cols_(cols), hss_(H), grid_loc_(lg) { to_block_row(dist); }
// Gather the block-row pieces (sub + leaf) back into `dist`; layout
// logic is delegated to the HSS matrix.
114 void from_block_row(DistributedMatrix<scalar_t>& dist)
const
115 { hss_->from_block_row(dist, sub, leaf, grid_loc_); }
// Distributed part, on the grid returned by grid_local().
116 DistributedMatrix<scalar_t> leaf;
// Local (sequential) part.
117 DenseMatrix<scalar_t> sub;
118 const BLACSGrid* grid_local()
const {
return grid_loc_; }
119 int cols()
const {
return cols_; }
// Allocation / redistribution helpers, both delegating to hss_.
122 void allocate_block_row()
123 { hss_->allocate_block_row(cols_, sub, leaf); }
124 void to_block_row(
const DistributedMatrix<scalar_t>& dist)
125 { hss_->to_block_row(dist, sub, leaf); }
// Non-owning pointers: the HSS matrix describing the layout, and the
// local BLACS grid.
128 const HSSMatrixBase<scalar_t>* hss_;
129 const BLACSGrid* grid_loc_;
131 #endif //DOXYGEN_SHOULD_SKIP_THIS
// NOTE(review): the enclosing class definition and the signature of this
// print routine are missing from this view. `_ranges` packs 5 ints per
// entry p: [rlo, rhi, clo, chi, leaf_procs] -- row range, column range,
// and process count of leaf p (as the accessors below show).
139 std::cout <<
"ranges=[";
140 for (std::size_t p=0; p<_ranges.size()/5; p++)
141 std::cout << rlo(p) <<
"," << rhi(p) <<
"/"
142 << clo(p) <<
"," << chi(p) <<
"/" << leaf_procs(p) <<
" ";
143 std::cout <<
"];" << std::endl;
// Const accessors into the packed 5-ints-per-entry `_ranges` table.
145 int rlo(
int p)
const {
return _ranges[5*p+0]; }
146 int rhi(
int p)
const {
return _ranges[5*p+1]; }
147 int clo(
int p)
const {
return _ranges[5*p+2]; }
148 int chi(
int p)
const {
return _ranges[5*p+3]; }
149 int leaf_procs(
int p)
const {
return _ranges[5*p+4]; }
// Mutable accessors (same layout).
150 int& rlo(
int p) {
return _ranges[5*p+0]; }
151 int& rhi(
int p) {
return _ranges[5*p+1]; }
152 int& clo(
int p) {
return _ranges[5*p+2]; }
153 int& chi(
int p) {
return _ranges[5*p+3]; }
154 int& leaf_procs(
int p) {
return _ranges[5*p+4]; }
// Flat storage: 5 ints per range entry (see accessors above).
156 std::vector<int> _ranges;
160 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node workspace for distributed HSS factorization.
// NOTE(review): lossy extraction -- members between the child vector and
// w_seq are missing from this view; comments only.
161 template<
typename scalar_t>
class WorkFactorMPI {
// Workspaces for the two children.
163 std::vector<WorkFactorMPI<scalar_t>> c;
// Sequential workspace for subtrees owned by a single process.
170 std::unique_ptr<WorkFactor<scalar_t>> w_seq;
172 #endif //DOXYGEN_SHOULD_SKIP_THIS
// NOTE(review): the enclosing class (presumably HSSFactorsMPI, given the
// ch_ member type) and the signatures of the memory()/nonzeros() members
// are missing from this view; comments only.
// Total memory footprint in bytes: this object, all stored factor
// matrices (L_, Vt0_, W1_, Q_, D_), the pivot vector, all children,
// and the optional sequential factors.
199 std::size_t mem =
sizeof(*this) + L_.memory() + Vt0_.memory()
200 + W1_.memory() + Q_.memory() + D_.memory()
201 +
sizeof(int)*piv_.size();
202 for (
auto& c : ch_) mem += c.memory();
203 if (factors_seq_) mem += factors_seq_->memory();
// Total stored nonzeros, accumulated over the whole subtree.
215 std::size_t nnz = L_.nonzeros() + Vt0_.nonzeros() + W1_.nonzeros()
216 + Q_.nonzeros() + D_.nonzeros();
217 for (
auto& c : ch_) nnz += c.nonzeros();
218 if (factors_seq_) nnz += factors_seq_->nonzeros();
// Factors of the two children, and the sequential fallback used when a
// subtree is owned by a single process.
256 std::vector<HSSFactorsMPI<scalar_t>> ch_;
257 std::unique_ptr<HSSFactors<scalar_t>> factors_seq_;
// Pivot indices; presumably from the LU factorization of D_ -- confirm.
276 std::vector<int> piv_;
280 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node workspace for the distributed HSS solve phase.
// NOTE(review): lossy extraction -- members between w_seq and offset are
// missing from this view; comments only.
281 template<
typename scalar_t>
class WorkSolveMPI {
// Workspaces for the two children.
283 std::vector<WorkSolveMPI<scalar_t>> c;
// Sequential workspace for subtrees owned by a single process.
284 std::unique_ptr<WorkSolve<scalar_t>> w_seq;
// Row/column offset bookkeeping for this node.
294 std::pair<std::size_t,std::size_t> offset;
296 #endif //DOXYGEN_SHOULD_SKIP_THIS
299 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node workspace for extracting a single sub-block (row set I,
// column set J) from a distributed HSS matrix.
// NOTE(review): lossy extraction -- loop headers and some statements are
// missing from this view; comments only.
300 template<
typename scalar_t>
class WorkExtractMPI {
// Workspaces for the two children.
302 std::vector<WorkExtractMPI<scalar_t>> c;
303 DistributedMatrix<scalar_t> y, z;
// Requested row/col index sets, local-to-global maps, and per-child
// column bookkeeping for the y/z temporaries.
304 std::vector<std::size_t> I, J, rl2g, cl2g, ycols, zcols;
// Sequential workspace for subtrees owned by a single process.
305 std::unique_ptr<WorkExtract<scalar_t>> w_seq;
// Split I and J between the two children at `dim`: indices below the
// split stay with child 0, the rest are shifted and go to child 1.
306 void split_extraction_sets
307 (
const std::pair<std::size_t,std::size_t>& dim) {
310 c[0].I.reserve(I.size());
311 c[1].I.reserve(I.size());
313 if (i < dim.first) c[0].I.push_back(i);
314 else c[1].I.push_back(i - dim.first);
315 c[0].J.reserve(J.size());
316 c[1].J.reserve(J.size());
318 if (j < dim.second) c[0].J.push_back(j);
319 else c[1].J.push_back(j - dim.second);
// Broadcast each child's ycols (sizes first, then contents) from the
// child roots so every rank in `comm` knows both.
// NOTE(review): the root rank of child 0 (`rch0`) is used below but its
// parameter declaration is missing from this view.
322 void communicate_child_ycols(
const MPIComm& comm,
int rch1) {
325 auto c0ycols = c[0].ycols.size();
326 auto c1ycols = c[1].ycols.size();
327 comm.broadcast_from(c0ycols, rch0);
328 comm.broadcast_from(c1ycols, rch1);
329 c[0].ycols.resize(c0ycols);
330 c[1].ycols.resize(c1ycols);
331 comm.broadcast_from(c[0].ycols, rch0);
332 comm.broadcast_from(c[1].ycols, rch1);
// Concatenate the children's ycols into this node's ycols.
334 void combine_child_ycols() {
335 auto c0ycols = c[0].ycols.size();
336 auto c1ycols = c[1].ycols.size();
337 ycols.resize(c0ycols + c1ycols);
338 std::copy(c[0].ycols.begin(), c[0].ycols.end(), ycols.begin());
339 std::copy(c[1].ycols.begin(), c[1].ycols.end(),
340 ycols.begin()+c0ycols);
343 #endif //DOXYGEN_SHOULD_SKIP_THIS
346 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node workspace for extracting MULTIPLE sub-blocks at once from
// a distributed HSS matrix; index sets and temporaries are stored per
// block k (nb blocks total).
// NOTE(review): lossy extraction -- several loop headers, declarations
// (e.g. `sreqs`, `dest`) and braces are missing from this view; comments
// only.
347 template<
typename scalar_t>
class WorkExtractBlocksMPI {
// Constructor: size the per-block containers for nb blocks (body lines
// missing from this view).
349 WorkExtractBlocksMPI(std::size_t nb) {
// Workspaces for the two children.
359 std::vector<WorkExtractBlocksMPI<scalar_t>> c;
// Per-block temporaries.
360 std::vector<DistributedMatrix<scalar_t>> y, z;
// Per-block row/col index sets, local-to-global maps, and column
// bookkeeping.
361 std::vector<std::vector<std::size_t>> I, J, rl2g, cl2g, ycols, zcols;
362 std::vector<std::unique_ptr<WorkExtract<scalar_t>>> w_seq;
// Split every block's I[k]/J[k] between the two children at `dim`;
// same index-shift convention as the single-block version.
364 void split_extraction_sets
365 (
const std::pair<std::size_t,std::size_t>& dim) {
371 for (std::size_t k=0; k<nb; k++) {
372 c[0].I[k].reserve(I[k].size());
373 c[1].I[k].reserve(I[k].size());
375 if (i < dim.first) c[0].I[k].push_back(i);
376 else c[1].I[k].push_back(i - dim.first);
377 c[0].J[k].reserve(J[k].size());
378 c[1].J[k].reserve(J[k].size());
380 if (j < dim.second) c[0].J[k].push_back(j);
381 else c[1].J[k].push_back(j - dim.second);
// Exchange the children's per-block ycols between processes. Child 0
// lives on ranks [0, rch1), child 1 on [rch1, P). Each child only uses
// a prows*pcols sub-grid ("active" ranks); active ranks serialize their
// ycols into a flat size_t buffer (count, then entries, per block) and
// isend it to the inactive ranks assigned to them round-robin; inactive
// ranks recv and unpack.
386 void communicate_child_ycols(
const MPIComm& comm,
int rch1) {
387 int rank = comm.rank(), P = comm.size();
388 int P0total = rch1, P1total = P - rch1;
// Active sub-grid of child 0: prows*pcols with pcols = floor(sqrt(P)).
389 int pcols = std::floor(std::sqrt((
float)P0total));
390 int prows = P0total / pcols;
391 int P0active = prows * pcols;
// Same computation for child 1's sub-grid.
392 pcols = std::floor(std::sqrt((
float)P1total));
393 prows = P1total / pcols;
394 int P1active = prows * pcols;
395 std::vector<MPI_Request> sreq;
398 std::vector<std::size_t> sbuf0, sbuf1;
// --- Senders for child 0: active ranks pack and isend sbuf0 (tag 0).
399 if (rank < P0active) {
400 if (rank < (P-P0active)) {
// Pre-compute the packed size: 1 count + the entries, per block.
404 std::size_t ssize = 0;
405 for (std::size_t k=0; k<I.size(); k++)
406 ssize += 1 + c[0].ycols[k].size();
407 sbuf0.reserve(ssize);
408 for (std::size_t k=0; k<I.size(); k++) {
409 sbuf0.push_back(c[0].ycols[k].size());
410 for (
auto i : c[0].ycols[k])
// Round-robin assignment of inactive ranks to active senders.
413 for (
int p=P0active; p<P; p++)
414 if (rank == (p - P0active) % P0active) {
416 comm.isend(sbuf0, p, 0, &sreq.back());
// --- Senders for child 1: active ranks pack and isend sbuf1 (tag 1).
420 if (rank >= rch1 && rank < rch1+P1active) {
421 if ((rank-rch1) < (P-P1active)) {
426 std::size_t ssize = 0;
427 for (std::size_t k=0; k<I.size(); k++)
428 ssize += 1 + c[1].ycols[k].size();
429 sbuf1.reserve(ssize);
430 for (std::size_t k=0; k<I.size(); k++) {
431 sbuf1.push_back(c[1].ycols[k].size());
432 for (
auto i : c[1].ycols[k])
// Child-1 inactive ranks live both below rch1 and above rch1+P1active.
435 for (
int p=0; p<rch1; p++)
436 if (rank - rch1 == p % P1active) {
438 comm.isend(sbuf1, p, 1, &sreq[sreqs++]);
440 for (
int p=rch1+P1active; p<P; p++)
441 if (rank - rch1 == (p - P1active) % P1active) {
443 comm.isend(sbuf1, p, 1, &sreq.back());
// --- Receivers for child 0: inactive ranks find their assigned sender,
// recv (tag 0) and unpack count+entries per block.
448 if (rank >= P0active) {
451 for (
int p=0; p<P0active; p++)
452 if (p == (rank - P0active) % P0active) { dest = p;
break; }
454 auto buf = comm.recv<std::size_t>(dest, 0);
455 auto ptr = buf.data();
456 for (std::size_t k=0; k<I.size(); k++) {
457 auto c0ycols = *ptr++;
458 c[0].ycols[k].resize(c0ycols);
459 for (std::size_t i=0; i<c0ycols; i++)
460 c[0].ycols[k][i] = *ptr++;
// --- Receivers for child 1: ranks outside [rch1, rch1+P1active).
463 if (!(rank >= rch1 && rank < rch1+P1active)) {
466 for (
int p=rch1; p<rch1+P1active; p++) {
468 if (p - rch1 == rank % P1active) { dest = p;
break; }
469 }
else if (p - rch1 == (rank - P1active) % P1active) {
474 auto buf = comm.recv<std::size_t>(dest, 1);
475 auto ptr = buf.data();
476 for (std::size_t k=0; k<I.size(); k++) {
477 auto c1ycols = *ptr++;
478 c[1].ycols[k].resize(c1ycols);
479 for (std::size_t i=0; i<c1ycols; i++)
480 c[1].ycols[k][i] = *ptr++;
// Concatenate the children's ycols per block; `odiag` presumably flags
// which blocks are off-diagonal and need combining -- its use falls in
// lines missing from this view; confirm.
486 void combine_child_ycols(
const std::vector<bool>& odiag) {
487 for (std::size_t k=0; k<I.size(); k++) {
492 auto c0ycols = c[0].ycols[k].size();
493 auto c1ycols = c[1].ycols[k].size();
494 ycols[k].resize(c0ycols + c1ycols);
496 (c[0].ycols[k].begin(), c[0].ycols[k].end(), ycols[k].begin());
498 (c[1].ycols[k].begin(), c[1].ycols[k].end(),
499 ycols[k].begin()+c0ycols);
503 #endif //DOXYGEN_SHOULD_SKIP_THIS
508 #endif // HSS_EXTRA_MPI_HPP