28 #ifndef HSS_EXTRA_MPI_HPP
29 #define HSS_EXTRA_MPI_HPP
31 #include "HSSExtra.hpp"
32 #include "dense/DistributedMatrix.hpp"
37 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node scratch data for distributed (MPI) HSS compression.
// Mirrors the sequential WorkCompress, but stores the random sample /
// projection matrices as DistributedMatrix.
// NOTE(review): this chunk is truncated -- access specifiers and several
// closing braces of this class are not visible in this view.
38 template<
typename scalar_t>
39 class WorkCompressMPI :
public WorkCompressBase<scalar_t> {
// Work data for the two children of this node.
41 std::vector<WorkCompressMPI<scalar_t>> c;
// Distributed random sample matrices (row/column).
42 DistributedMatrix<scalar_t> Rr, Rc, Sr, Sc;
43 DistributedMatrix<scalar_t> Qr, Qc;
// Sequential fallback work data, created by create_sequential() when a
// subtree is handled by a single process (see w_seq usage below).
45 std::unique_ptr<WorkCompress<scalar_t>> w_seq;
// Propagate offsets/levels to the two children; dim is the (rows,cols)
// split point of this node.  NOTE(review): offset + dim relies on a
// pair-addition operator defined elsewhere -- not visible here.
46 void split(
const std::pair<std::size_t,std::size_t>& dim) {
49 c[0].offset = this->offset;
50 c[1].offset = this->offset + dim;
51 c[0].lvl = c[1].lvl = this->lvl + 1;
// Lazily allocate the sequential work structure, copying this node's level.
54 void create_sequential() {
56 w_seq = std::unique_ptr<WorkCompress<scalar_t>>
57 (
new WorkCompress<scalar_t>());
58 w_seq->lvl = this->lvl;
61 #endif //DOXYGEN_SHOULD_SKIP_THIS
63 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node scratch data for distributed HSS compression using
// approximate nearest neighbors (ANN).  Same shape as WorkCompressMPI,
// plus the (index, score) pairs produced by the neighbor search.
// NOTE(review): chunk is truncated -- access specifiers and closing
// braces are missing from this view.
64 template<
typename scalar_t,
65 typename real_t=
typename RealType<scalar_t>::value_type>
66 class WorkCompressMPIANN :
public WorkCompressBase<scalar_t> {
// Work data for the two children of this node.
68 std::vector<WorkCompressMPIANN<scalar_t>> c;
69 DistributedMatrix<scalar_t> S;
// Neighbor ids with their scores (presumably distances -- confirm in caller).
70 std::vector<std::pair<std::size_t,real_t>> ids_scores;
// Sequential fallback work data for single-process subtrees.
71 std::unique_ptr<WorkCompressANN<scalar_t>> w_seq;
// Propagate offsets/levels to the two children at split point dim.
73 void split(
const std::pair<std::size_t,std::size_t>& dim) {
76 c[0].offset = this->offset;
77 c[1].offset = this->offset + dim;
78 c[0].lvl = c[1].lvl = this->lvl + 1;
// Lazily allocate the sequential ANN work structure at this node's level.
81 void create_sequential() {
83 w_seq = std::unique_ptr<WorkCompressANN<scalar_t>>
84 (
new WorkCompressANN<scalar_t>());
85 w_seq->lvl = this->lvl;
88 #endif //DOXYGEN_SHOULD_SKIP_THIS
90 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node scratch data for distributed HSS matrix-vector/matrix
// application (apply).  NOTE(review): class body is truncated in this view.
91 template<
typename scalar_t>
class WorkApplyMPI {
// (row, col) offset of this node within the global matrix.
93 std::pair<std::size_t,std::size_t> offset;
// Work data for the two children of this node.
94 std::vector<WorkApplyMPI<scalar_t>> c;
// Temporaries used during the up/down sweeps of the apply.
95 DistributedMatrix<scalar_t> tmp1, tmp2;
// Sequential fallback work data for single-process subtrees.
96 std::unique_ptr<WorkApply<scalar_t>> w_seq;
98 #endif //DOXYGEN_SHOULD_SKIP_THIS
// Forward declaration: DistSubLeaf below only holds a pointer to
// HSSMatrixBase and calls its block-row helpers.
101 template<
typename scalar_t>
class HSSMatrixBase;
103 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Helper that stores a matrix in "block-row" form for an HSS (sub)tree:
// a local dense part (sub) plus a distributed leaf part (leaf), with
// redistribution to/from a regular 2D-block-cyclic DistributedMatrix
// delegated to the HSSMatrixBase block-row routines.
// NOTE(review): constructor names and several braces are missing from
// this view -- only the parameter lists and initializer lists survive.
104 template<
typename scalar_t>
class DistSubLeaf {
// Constructor: allocate an empty block-row with 'cols' columns for
// HSS matrix H, using local BLACS grid lg.
107 (
int cols,
const HSSMatrixBase<scalar_t>* H,
const BLACSGrid* lg)
108 : cols_(cols), hss_(H), grid_loc_(lg) { allocate_block_row(); }
// Constructor: additionally redistribute 'dist' into block-row form.
111 (
int cols,
const HSSMatrixBase<scalar_t>* H,
const BLACSGrid* lg,
112 const DistributedMatrix<scalar_t>& dist)
113 : cols_(cols), hss_(H), grid_loc_(lg) { to_block_row(dist); }
// Redistribute the block-row (sub + leaf) back into 'dist'.
114 void from_block_row(DistributedMatrix<scalar_t>& dist)
const
115 { hss_->from_block_row(dist, sub, leaf, grid_loc_); }
// Distributed leaf part of the block-row.
116 DistributedMatrix<scalar_t> leaf;
// Local (sequential) dense part of the block-row.
117 DenseMatrix<scalar_t> sub;
// Local BLACS process grid for this subtree.
118 const BLACSGrid* grid_local()
const {
return grid_loc_; }
119 int cols()
const {
return cols_; }
// Allocation / redistribution helpers, forwarded to the HSS matrix.
122 void allocate_block_row()
123 { hss_->allocate_block_row(cols_, sub, leaf); }
124 void to_block_row(
const DistributedMatrix<scalar_t>& dist)
125 { hss_->to_block_row(dist, sub, leaf); }
// Non-owning pointers: the HSS matrix and grid must outlive this object.
128 const HSSMatrixBase<scalar_t>* hss_;
129 const BLACSGrid* grid_loc_;
131 #endif //DOXYGEN_SHOULD_SKIP_THIS
// NOTE(review): interior of a class whose declaration is outside this
// view.  It stores per-leaf/per-subtree ranges in a flat int vector,
// 5 entries per range: [rlo, rhi, clo, chi, leaf_procs].
// Below: fragment of a print routine dumping all ranges to std::cout.
139 std::cout <<
"ranges=[";
140 for (std::size_t p=0; p<_ranges.size()/5; p++)
141 std::cout << rlo(p) <<
"," << rhi(p) <<
"/"
142 << clo(p) <<
"," << chi(p) <<
"/" << leaf_procs(p) <<
" ";
143 std::cout <<
"];" << std::endl;
// Const accessors for range p: row-low/high, col-low/high, and the
// number of processes assigned to that leaf.
145 int rlo(
int p)
const {
return _ranges[5*p+0]; }
146 int rhi(
int p)
const {
return _ranges[5*p+1]; }
147 int clo(
int p)
const {
return _ranges[5*p+2]; }
148 int chi(
int p)
const {
return _ranges[5*p+3]; }
149 int leaf_procs(
int p)
const {
return _ranges[5*p+4]; }
// Mutable accessors, same layout as above.
150 int& rlo(
int p) {
return _ranges[5*p+0]; }
151 int& rhi(
int p) {
return _ranges[5*p+1]; }
152 int& clo(
int p) {
return _ranges[5*p+2]; }
153 int& chi(
int p) {
return _ranges[5*p+3]; }
154 int& leaf_procs(
int p) {
return _ranges[5*p+4]; }
// Flat storage: 5 ints per range (see accessors above).
156 std::vector<int> _ranges;
159 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node scratch data for distributed HSS factorization.
// NOTE(review): class body is truncated in this view (original lines
// 163-168 are missing).
160 template<
typename scalar_t>
class WorkFactorMPI {
// Work data for the two children of this node.
162 std::vector<WorkFactorMPI<scalar_t>> c;
// Sequential fallback work data for single-process subtrees.
169 std::unique_ptr<WorkFactor<scalar_t>> w_seq;
171 #endif //DOXYGEN_SHOULD_SKIP_THIS
// NOTE(review): interior fragment of what appears to be HSSFactorsMPI
// (see the ch_ member type below); the enclosing class declaration and
// the memory()/nonzeros() signatures are outside this view.
// memory(): bytes used by this node's factor matrices, the pivot
// vector, all children (recursively), and any sequential factors.
198 std::size_t mem =
sizeof(*this) + L_.memory() + Vt0_.memory()
199 + W1_.memory() + Q_.memory() + D_.memory()
200 +
sizeof(int)*piv_.size();
201 for (
auto& c : ch_) mem += c.memory();
202 if (factors_seq_) mem += factors_seq_->memory();
// nonzeros(): total nonzero count over the same set of matrices,
// children, and sequential factors.
214 std::size_t nnz = L_.nonzeros() + Vt0_.nonzeros() + W1_.nonzeros()
215 + Q_.nonzeros() + D_.nonzeros();
216 for (
auto& c : ch_) nnz += c.nonzeros();
217 if (factors_seq_) nnz += factors_seq_->nonzeros();
// Factors of the two children of this node.
255 std::vector<HSSFactorsMPI<scalar_t>> ch_;
// Sequential factors, used when a subtree is owned by one process.
256 std::unique_ptr<HSSFactors<scalar_t>> factors_seq_;
// LU pivot indices (ints, counted in memory() above).
275 std::vector<int> piv_;
279 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node scratch data for the distributed HSS solve phase.
// NOTE(review): class body is truncated in this view (original lines
// 284-292 are missing).
280 template<
typename scalar_t>
class WorkSolveMPI {
// Work data for the two children of this node.
282 std::vector<WorkSolveMPI<scalar_t>> c;
// Sequential fallback work data for single-process subtrees.
283 std::unique_ptr<WorkSolve<scalar_t>> w_seq;
// (row, col) offset of this node within the global matrix.
293 std::pair<std::size_t,std::size_t> offset;
295 #endif //DOXYGEN_SHOULD_SKIP_THIS
298 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node scratch data for distributed extraction of a single
// submatrix A(I,J) from an HSS matrix.
// NOTE(review): chunk is truncated -- loop headers inside
// split_extraction_sets and the rch0 parameter apparently used by
// communicate_child_ycols are missing from this view.
299 template<
typename scalar_t>
class WorkExtractMPI {
301 std::vector<WorkExtractMPI<scalar_t>> c;
302 DistributedMatrix<scalar_t> y, z;
// I, J: requested row/col index sets; rl2g/cl2g: local-to-global maps;
// ycols/zcols: column bookkeeping for the up/down sweeps.
303 std::vector<std::size_t> I, J, rl2g, cl2g, ycols, zcols;
304 std::unique_ptr<WorkExtract<scalar_t>> w_seq;
// Partition I and J between the two children at split point dim:
// indices below the split go to child 0, the rest (shifted) to child 1.
305 void split_extraction_sets
306 (
const std::pair<std::size_t,std::size_t>& dim) {
309 c[0].I.reserve(I.size());
310 c[1].I.reserve(I.size());
312 if (i < dim.first) c[0].I.push_back(i);
313 else c[1].I.push_back(i - dim.first);
314 c[0].J.reserve(J.size());
315 c[1].J.reserve(J.size());
317 if (j < dim.second) c[0].J.push_back(j);
318 else c[1].J.push_back(j - dim.second);
// Broadcast each child's ycols (first its size, then the contents)
// from the child root ranks to all ranks in comm.
// NOTE(review): rch0 is used below but only rch1 is visible in this
// signature -- a parameter line is missing from this view.
321 void communicate_child_ycols(
const MPIComm& comm,
int rch1) {
324 auto c0ycols = c[0].ycols.size();
325 auto c1ycols = c[1].ycols.size();
326 comm.broadcast_from(c0ycols, rch0);
327 comm.broadcast_from(c1ycols, rch1);
328 c[0].ycols.resize(c0ycols);
329 c[1].ycols.resize(c1ycols);
330 comm.broadcast_from(c[0].ycols, rch0);
331 comm.broadcast_from(c[1].ycols, rch1);
// Concatenate the children's ycols into this node's ycols.
333 void combine_child_ycols() {
334 auto c0ycols = c[0].ycols.size();
335 auto c1ycols = c[1].ycols.size();
336 ycols.resize(c0ycols + c1ycols);
337 std::copy(c[0].ycols.begin(), c[0].ycols.end(), ycols.begin());
338 std::copy(c[1].ycols.begin(), c[1].ycols.end(),
339 ycols.begin()+c0ycols);
342 #endif //DOXYGEN_SHOULD_SKIP_THIS
345 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-tree-node scratch data for distributed extraction of MULTIPLE
// submatrix blocks at once: every container below is indexed by block k.
// NOTE(review): chunk is heavily truncated -- constructor body, inner
// loop headers, brace closers and several send/recv bookkeeping lines
// are missing from this view; comments below describe only what the
// visible lines establish.
346 template<
typename scalar_t>
class WorkExtractBlocksMPI {
// Constructor: nb is the number of blocks being extracted.
348 WorkExtractBlocksMPI(std::size_t nb) {
358 std::vector<WorkExtractBlocksMPI<scalar_t>> c;
359 std::vector<DistributedMatrix<scalar_t>> y, z;
// Per-block index sets and bookkeeping (cf. WorkExtractMPI).
360 std::vector<std::vector<std::size_t>> I, J, rl2g, cl2g, ycols, zcols;
361 std::vector<std::unique_ptr<WorkExtract<scalar_t>>> w_seq;
// Partition every block's I[k]/J[k] between the two children at dim.
363 void split_extraction_sets
364 (
const std::pair<std::size_t,std::size_t>& dim) {
370 for (std::size_t k=0; k<nb; k++) {
371 c[0].I[k].reserve(I[k].size());
372 c[1].I[k].reserve(I[k].size());
374 if (i < dim.first) c[0].I[k].push_back(i);
375 else c[1].I[k].push_back(i - dim.first);
376 c[0].J[k].reserve(J[k].size());
377 c[1].J[k].reserve(J[k].size());
379 if (j < dim.second) c[0].J[k].push_back(j);
380 else c[1].J[k].push_back(j - dim.second);
// Redistribute the children's per-block ycols among all ranks.
// Ranks [0, rch1) belong to child 0 and [rch1, P) to child 1; each
// child only uses a near-square prows x pcols subset ("active" ranks),
// so active ranks pack their ycols into a flat size-prefixed buffer
// and isend it to the inactive ranks, which recv and unpack below.
// NOTE(review): rch0 does not appear here; the visible signature has
// only rch1 -- presumably child 0's root is rank 0.  Confirm against
// the complete file.
385 void communicate_child_ycols(
const MPIComm& comm,
int rch1) {
386 int rank = comm.rank(), P = comm.size();
387 int P0total = rch1, P1total = P - rch1;
// Near-square BLACS-style grid: pcols = floor(sqrt(P)), prows = P/pcols.
388 int pcols = std::floor(std::sqrt((
float)P0total));
389 int prows = P0total / pcols;
390 int P0active = prows * pcols;
391 pcols = std::floor(std::sqrt((
float)P1total));
392 prows = P1total / pcols;
393 int P1active = prows * pcols;
394 std::vector<MPI_Request> sreq;
// Flat send buffers: for each block k, [size, ids...] of c[x].ycols[k].
397 std::vector<std::size_t> sbuf0, sbuf1;
398 if (rank < P0active) {
399 if (rank < (P-P0active)) {
// Pack child 0's ycols with a per-block length prefix.
403 std::size_t ssize = 0;
404 for (std::size_t k=0; k<I.size(); k++)
405 ssize += 1 + c[0].ycols[k].size();
406 sbuf0.reserve(ssize);
407 for (std::size_t k=0; k<I.size(); k++) {
408 sbuf0.push_back(c[0].ycols[k].size());
409 for (
auto i : c[0].ycols[k])
// Send to the inactive ranks this active rank is responsible for
// (round-robin assignment, tag 0 = child 0 data).
412 for (
int p=P0active; p<P; p++)
413 if (rank == (p - P0active) % P0active) {
415 comm.isend(sbuf0, p, 0, &sreq.back());
// Same packing/sending for child 1's active ranks (tag 1).
419 if (rank >= rch1 && rank < rch1+P1active) {
420 if ((rank-rch1) < (P-P1active)) {
425 std::size_t ssize = 0;
426 for (std::size_t k=0; k<I.size(); k++)
427 ssize += 1 + c[1].ycols[k].size();
428 sbuf1.reserve(ssize);
429 for (std::size_t k=0; k<I.size(); k++) {
430 sbuf1.push_back(c[1].ycols[k].size());
431 for (
auto i : c[1].ycols[k])
434 for (
int p=0; p<rch1; p++)
435 if (rank - rch1 == p % P1active) {
437 comm.isend(sbuf1, p, 1, &sreq[sreqs++]);
439 for (
int p=rch1+P1active; p<P; p++)
440 if (rank - rch1 == (p - P1active) % P1active) {
442 comm.isend(sbuf1, p, 1, &sreq.back());
// Ranks inactive for child 0: find the active sender (dest) assigned
// to this rank, receive the packed buffer and unpack per block.
447 if (rank >= P0active) {
450 for (
int p=0; p<P0active; p++)
451 if (p == (rank - P0active) % P0active) { dest = p;
break; }
453 auto buf = comm.recv<std::size_t>(dest, 0);
454 auto ptr = buf.data();
455 for (std::size_t k=0; k<I.size(); k++) {
456 auto c0ycols = *ptr++;
457 c[0].ycols[k].resize(c0ycols);
458 for (std::size_t i=0; i<c0ycols; i++)
459 c[0].ycols[k][i] = *ptr++;
// Ranks inactive for child 1: locate the matching active sender,
// receive (tag 1) and unpack.
462 if (!(rank >= rch1 && rank < rch1+P1active)) {
465 for (
int p=rch1; p<rch1+P1active; p++) {
467 if (p - rch1 == rank % P1active) { dest = p;
break; }
468 }
else if (p - rch1 == (rank - P1active) % P1active) {
473 auto buf = comm.recv<std::size_t>(dest, 1);
474 auto ptr = buf.data();
475 for (std::size_t k=0; k<I.size(); k++) {
476 auto c1ycols = *ptr++;
477 c[1].ycols[k].resize(c1ycols);
478 for (std::size_t i=0; i<c1ycols; i++)
479 c[1].ycols[k][i] = *ptr++;
// Concatenate both children's ycols per block; odiag presumably marks
// which blocks touch off-diagonal parts -- its use is not visible here.
485 void combine_child_ycols(
const std::vector<bool>& odiag) {
486 for (std::size_t k=0; k<I.size(); k++) {
491 auto c0ycols = c[0].ycols[k].size();
492 auto c1ycols = c[1].ycols[k].size();
493 ycols[k].resize(c0ycols + c1ycols);
495 (c[0].ycols[k].begin(), c[0].ycols[k].end(), ycols[k].begin());
497 (c[1].ycols[k].begin(), c[1].ycols[k].end(),
498 ycols[k].begin()+c0ycols);
502 #endif //DOXYGEN_SHOULD_SKIP_THIS
507 #endif // HSS_EXTRA_MPI_HPP