HSSExtraMPI.hpp
/*
 * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The
 * Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals
 * from the U.S. Dept. of Energy). All rights reserved.
 *
 * If you have questions about your rights to use or distribute this
 * software, please contact Berkeley Lab's Technology Transfer
 * Department at TTD@lbl.gov.
 *
 * NOTICE. This software is owned by the U.S. Department of Energy. As
 * such, the U.S. Government has been granted for itself and others
 * acting on its behalf a paid-up, nonexclusive, irrevocable,
 * worldwide license in the Software to reproduce, prepare derivative
 * works, and perform publicly and display publicly. Beginning five
 * (5) years after the date permission to assert copyright is obtained
 * from the U.S. Department of Energy, and subject to any subsequent
 * five (5) year renewals, the U.S. Government is granted for itself
 * and others acting on its behalf a paid-up, nonexclusive,
 * irrevocable, worldwide license in the Software to reproduce,
 * prepare derivative works, distribute copies to the public, perform
 * publicly and display publicly, and to permit others to do so.
 *
 * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li.
 * (Lawrence Berkeley National Lab, Computational Research
 * Division).
 */
#ifndef HSS_EXTRA_MPI_HPP
#define HSS_EXTRA_MPI_HPP

#include "HSSExtra.hpp"
#include "dense/DistributedMatrix.hpp"

namespace strumpack {
  namespace HSS {

#ifndef DOXYGEN_SHOULD_SKIP_THIS
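    // Scratch data carried along the HSS tree during distributed
    // randomized compression: random blocks Rr, Rc, sample blocks
    // Sr, Sc, and a sequential work object w_seq for subtrees that
    // are handled by a single process.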
    template<typename scalar_t>
    class WorkCompressMPI : public WorkCompressBase<scalar_t> {
    public:
      std::vector<WorkCompressMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> Rr, Rc, Sr, Sc;
      DistributedMatrix<scalar_t> Qr, Qc;
      int dR = 0, dS = 0;
      std::unique_ptr<WorkCompress<scalar_t>> w_seq;
      void split(const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].offset = this->offset;
          c[1].offset = this->offset + dim;
          c[0].lvl = c[1].lvl = this->lvl + 1;
        }
      }
      void create_sequential() {
        if (!w_seq)
          w_seq = std::unique_ptr<WorkCompress<scalar_t>>
            (new WorkCompress<scalar_t>());
        w_seq->lvl = this->lvl;
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS

#ifndef DOXYGEN_SHOULD_SKIP_THIS
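    // Compression work space for the approximate nearest neighbor
    // (ANN) based compression: S holds the sample block, ids_scores
    // the neighbor indices with their scores.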
    template<typename scalar_t,
             typename real_t=typename RealType<scalar_t>::value_type>
    class WorkCompressMPIANN : public WorkCompressBase<scalar_t> {
    public:
      std::vector<WorkCompressMPIANN<scalar_t>> c;
      DistributedMatrix<scalar_t> S;
      std::vector<std::pair<std::size_t,real_t>> ids_scores;
      std::unique_ptr<WorkCompressANN<scalar_t>> w_seq;

      void split(const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].offset = this->offset;
          c[1].offset = this->offset + dim;
          c[0].lvl = c[1].lvl = this->lvl + 1;
        }
      }
      void create_sequential() {
        if (!w_seq)
          w_seq = std::unique_ptr<WorkCompressANN<scalar_t>>
            (new WorkCompressANN<scalar_t>());
        w_seq->lvl = this->lvl;
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS

#ifndef DOXYGEN_SHOULD_SKIP_THIS
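    // Per-node temporaries used while applying (multiplying) a
    // distributed HSS matrix to a block of vectors.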
    template<typename scalar_t> class WorkApplyMPI {
    public:
      std::pair<std::size_t,std::size_t> offset;
      std::vector<WorkApplyMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> tmp1, tmp2;
      std::unique_ptr<WorkApply<scalar_t>> w_seq;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


    template<typename scalar_t> class HSSMatrixBase;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
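    // Redistributes a 2D block-cyclic matrix into the block-row form
    // used at the HSS leaves: a local dense part (sub) for subtrees
    // handled by a single process and a distributed part (leaf) on
    // the local grid returned by grid_local().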
    template<typename scalar_t> class DistSubLeaf {
    public:
      DistSubLeaf
      (int cols, const HSSMatrixBase<scalar_t>* H, const BLACSGrid* lg)
        : cols_(cols), hss_(H), grid_loc_(lg) { allocate_block_row(); }
      DistSubLeaf
      (int cols, const HSSMatrixBase<scalar_t>* H, const BLACSGrid* lg,
       const DistributedMatrix<scalar_t>& dist)
        : cols_(cols), hss_(H), grid_loc_(lg) { to_block_row(dist); }
      void from_block_row(DistributedMatrix<scalar_t>& dist) const
      { hss_->from_block_row(dist, sub, leaf, grid_loc_); }
      DistributedMatrix<scalar_t> leaf;
      DenseMatrix<scalar_t> sub;
      const BLACSGrid* grid_local() const { return grid_loc_; }
      int cols() const { return cols_; }

    private:
      void allocate_block_row()
      { hss_->allocate_block_row(cols_, sub, leaf); }
      void to_block_row(const DistributedMatrix<scalar_t>& dist)
      { hss_->to_block_row(dist, sub, leaf); }

      const int cols_;
      const HSSMatrixBase<scalar_t>* hss_;
      const BLACSGrid* grid_loc_;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS

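    // For each process p, stores the row range (rlo(p), rhi(p)), the
    // column range (clo(p), chi(p)) and the number of processes
    // assigned to that leaf (leaf_procs(p)), packed as 5 integers per
    // process.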
    class TreeLocalRanges {
    public:
      TreeLocalRanges() {}
      TreeLocalRanges(int P) : _ranges(5*P) {}
      void print() const {
        std::cout << "ranges=[";
        for (std::size_t p=0; p<_ranges.size()/5; p++)
          std::cout << rlo(p) << "," << rhi(p) << "/"
                    << clo(p) << "," << chi(p) << "/" << leaf_procs(p) << " ";
        std::cout << "];" << std::endl;
      }
      int rlo(int p) const { return _ranges[5*p+0]; }
      int rhi(int p) const { return _ranges[5*p+1]; }
      int clo(int p) const { return _ranges[5*p+2]; }
      int chi(int p) const { return _ranges[5*p+3]; }
      int leaf_procs(int p) const { return _ranges[5*p+4]; }
      int& rlo(int p) { return _ranges[5*p+0]; }
      int& rhi(int p) { return _ranges[5*p+1]; }
      int& clo(int p) { return _ranges[5*p+2]; }
      int& chi(int p) { return _ranges[5*p+3]; }
      int& leaf_procs(int p) { return _ranges[5*p+4]; }
    private:
      std::vector<int> _ranges; // rlo, rhi, clo, chi, leaf_procs
      int _ctxt = -1;
    };

#ifndef DOXYGEN_SHOULD_SKIP_THIS
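    // Work space for the ULV factorization of a distributed HSS
    // matrix, one instance per node of the HSS tree.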
    template<typename scalar_t> class WorkFactorMPI {
    public:
      std::vector<WorkFactorMPI<scalar_t>> c;

      // (U.cols x U.cols) \tilde{D}
      DistributedMatrix<scalar_t> Dt;

      // (U.cols x V.cols) bottom part of \tilde{V}
      DistributedMatrix<scalar_t> Vt1;
      std::unique_ptr<WorkFactor<scalar_t>> w_seq;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS

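    /**
     * \class HSSFactorsMPI
     * \brief Contains data related to the ULV factorization of a
     * distributed HSS matrix.
     */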
    template<typename scalar_t> class HSSFactorsMPI {
      template<typename T> friend class HSSMatrixMPI;
      template<typename T> friend class HSSMatrixBase;

    public:

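      /**
       * \brief Memory, in bytes, used on this rank to store these
       * factors, including the factors of the children and of a
       * possible sequential subtree.
       */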
      std::size_t memory() {
        std::size_t mem = sizeof(*this) + L_.memory() + Vt0_.memory()
          + W1_.memory() + Q_.memory() + D_.memory()
          + sizeof(int)*piv_.size();
        for (auto& c : ch_) mem += c.memory();
        if (factors_seq_) mem += factors_seq_->memory();
        return mem;
      }

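      /**
       * \brief Number of nonzero scalar entries stored on this rank,
       * including the children and a possible sequential subtree.
       */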
      std::size_t nonzeros() const {
        std::size_t nnz = L_.nonzeros() + Vt0_.nonzeros() + W1_.nonzeros()
          + Q_.nonzeros() + D_.nonzeros();
        for (auto& c : ch_) nnz += c.nonzeros();
        if (factors_seq_) nnz += factors_seq_->nonzeros();
        return nnz;
      }

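      /**
       * \brief Total memory over all ranks in the communicator c,
       * i.e., memory() summed with an all-reduce.
       */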
      std::size_t total_memory(const MPIComm& c) {
        return c.all_reduce(memory(), MPI_SUM);
      }

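      /**
       * \brief Total number of nonzeros over all ranks in the
       * communicator c, i.e., nonzeros() summed with an all-reduce.
       */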
      std::size_t total_nonzeros(const MPIComm& c) {
        return c.all_reduce(nonzeros(), MPI_SUM);
      }

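      /**
       * \brief Const access to the Vhat matrix, which is stored in
       * Vt0_ at the root of the HSS tree.
       */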
      const DistributedMatrix<scalar_t>& Vhat() const { return Vt0_; }

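      /**
       * \brief Non-const access to the Vhat matrix, which is stored
       * in Vt0_ at the root of the HSS tree.
       */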
      DistributedMatrix<scalar_t>& Vhat() { return Vt0_; }

    private:
      std::vector<HSSFactorsMPI<scalar_t>> ch_;
      std::unique_ptr<HSSFactors<scalar_t>> factors_seq_;

      // (U.rows-U.cols x U.rows-U.cols), empty at the root
      DistributedMatrix<scalar_t> L_;

      // (U.rows-U.cols x V.cols)
      // at the root, Vt0_ stores Vhat
      DistributedMatrix<scalar_t> Vt0_;

      // (U.cols x U.rows) bottom part of W
      // if (U.rows == U.cols) then W == I and is not stored!
      DistributedMatrix<scalar_t> W1_;

      // (U.rows x U.rows) Q from LQ(W0)
      // if (U.rows == U.cols) then Q == I and is not stored!
      DistributedMatrix<scalar_t> Q_;

      // (U.rows x U.rows) at the root holds LU(D), else empty
      DistributedMatrix<scalar_t> D_;
      std::vector<int> piv_; // holds the permutation from LU(D) at the root
    };


#ifndef DOXYGEN_SHOULD_SKIP_THIS
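    // Work space carried along the HSS tree during the forward and
    // backward phases of a distributed ULV solve.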
    template<typename scalar_t> class WorkSolveMPI {
    public:
      std::vector<WorkSolveMPI<scalar_t>> c;
      std::unique_ptr<WorkSolve<scalar_t>> w_seq;

      // do we need all these?? x only used in bwd, y only used in fwd??
      DistributedMatrix<scalar_t> ft1; // TODO document the sizes here

      // DO NOT STORE reduced_rhs here!!!
      DistributedMatrix<scalar_t> reduced_rhs;
      std::pair<std::size_t,std::size_t> offset;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


#ifndef DOXYGEN_SHOULD_SKIP_THIS
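    // Work space for extracting a single sub-block, defined by the
    // index sets I x J, from a distributed HSS matrix.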
    template<typename scalar_t> class WorkExtractMPI {
    public:
      std::vector<WorkExtractMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> y, z;
      std::vector<std::size_t> I, J, rl2g, cl2g, ycols, zcols;
      std::unique_ptr<WorkExtract<scalar_t>> w_seq;
      void split_extraction_sets
      (const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].I.reserve(I.size());
          c[1].I.reserve(I.size());
          for (auto i : I)
            if (i < dim.first) c[0].I.push_back(i);
            else c[1].I.push_back(i - dim.first);
          c[0].J.reserve(J.size());
          c[1].J.reserve(J.size());
          for (auto j : J)
            if (j < dim.second) c[0].J.push_back(j);
            else c[1].J.push_back(j - dim.second);
        }
      }
      void communicate_child_ycols(const MPIComm& comm, int rch1) {
        // TODO optimize these 4 bcasts?
        auto rch0 = 0;
        auto c0ycols = c[0].ycols.size();
        auto c1ycols = c[1].ycols.size();
        comm.broadcast_from(c0ycols, rch0);
        comm.broadcast_from(c1ycols, rch1);
        c[0].ycols.resize(c0ycols);
        c[1].ycols.resize(c1ycols);
        comm.broadcast_from(c[0].ycols, rch0);
        comm.broadcast_from(c[1].ycols, rch1);
      }
      void combine_child_ycols() {
        auto c0ycols = c[0].ycols.size();
        auto c1ycols = c[1].ycols.size();
        ycols.resize(c0ycols + c1ycols);
        std::copy(c[0].ycols.begin(), c[0].ycols.end(), ycols.begin());
        std::copy(c[1].ycols.begin(), c[1].ycols.end(),
                  ycols.begin()+c0ycols);
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


#ifndef DOXYGEN_SHOULD_SKIP_THIS
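    // Work space for extracting several sub-blocks, one pair of index
    // sets I[k] x J[k] per block, from a distributed HSS matrix in a
    // single pass.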
    template<typename scalar_t> class WorkExtractBlocksMPI {
    public:
      WorkExtractBlocksMPI(std::size_t nb) {
        y.resize(nb);
        z.resize(nb);
        I.resize(nb);
        J.resize(nb);
        rl2g.resize(nb);
        cl2g.resize(nb);
        ycols.resize(nb);
        zcols.resize(nb);
      }
      std::vector<WorkExtractBlocksMPI<scalar_t>> c;
      std::vector<DistributedMatrix<scalar_t>> y, z;
      std::vector<std::vector<std::size_t>> I, J, rl2g, cl2g, ycols, zcols;
      std::vector<std::unique_ptr<WorkExtract<scalar_t>>> w_seq;

      void split_extraction_sets
      (const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          auto nb = I.size();
          c.reserve(2);
          c.emplace_back(nb);
          c.emplace_back(nb);
          for (std::size_t k=0; k<nb; k++) {
            c[0].I[k].reserve(I[k].size());
            c[1].I[k].reserve(I[k].size());
            for (auto i : I[k])
              if (i < dim.first) c[0].I[k].push_back(i);
              else c[1].I[k].push_back(i - dim.first);
            c[0].J[k].reserve(J[k].size());
            c[1].J[k].reserve(J[k].size());
            for (auto j : J[k])
              if (j < dim.second) c[0].J[k].push_back(j);
              else c[1].J[k].push_back(j - dim.second);
          }
        }
      }

      void communicate_child_ycols(const MPIComm& comm, int rch1) {
        int rank = comm.rank(), P = comm.size();
        int P0total = rch1, P1total = P - rch1;
        int pcols = std::floor(std::sqrt((float)P0total));
        int prows = P0total / pcols;
        int P0active = prows * pcols;
        pcols = std::floor(std::sqrt((float)P1total));
        prows = P1total / pcols;
        int P1active = prows * pcols;
        std::vector<MPI_Request> sreq;
        sreq.reserve(P);
        int sreqs = 0;
        std::vector<std::size_t> sbuf0, sbuf1;
        if (rank < P0active) {
          if (rank < (P-P0active)) {
            // I'm one of the first P-P0active processes that are active
            // on child0, so I need to send to one or more others which
            // are not active on child0, ie the ones in [P0active,P)
            std::size_t ssize = 0;
            for (std::size_t k=0; k<I.size(); k++)
              ssize += 1 + c[0].ycols[k].size();
            sbuf0.reserve(ssize);
            for (std::size_t k=0; k<I.size(); k++) {
              sbuf0.push_back(c[0].ycols[k].size());
              for (auto i : c[0].ycols[k])
                sbuf0.push_back(i);
            }
            for (int p=P0active; p<P; p++)
              if (rank == (p - P0active) % P0active) {
                sreq.emplace_back();
                comm.isend(sbuf0, p, 0, &sreq.back());
              }
          }
        }
        if (rank >= rch1 && rank < rch1+P1active) {
          if ((rank-rch1) < (P-P1active)) {
            // I'm one of the first P-P1active processes that are active
            // on child1, so I need to send to one or more others which
            // are not active on child1, ie the ones in [0,rch1) union
            // [rch1+P1active,P)
            std::size_t ssize = 0;
            for (std::size_t k=0; k<I.size(); k++)
              ssize += 1 + c[1].ycols[k].size();
            sbuf1.reserve(ssize);
            for (std::size_t k=0; k<I.size(); k++) {
              sbuf1.push_back(c[1].ycols[k].size());
              for (auto i : c[1].ycols[k])
                sbuf1.push_back(i);
            }
            for (int p=0; p<rch1; p++)
              if (rank - rch1 == p % P1active) {
                sreq.emplace_back();
                comm.isend(sbuf1, p, 1, &sreq[sreqs++]);
              }
            for (int p=rch1+P1active; p<P; p++)
              if (rank - rch1 == (p - P1active) % P1active) {
                sreq.emplace_back();
                comm.isend(sbuf1, p, 1, &sreq.back());
              }
          }
        }

        if (rank >= P0active) {
          // I'm not active on child0, so I need to receive
          int dest = -1;
          for (int p=0; p<P0active; p++)
            if (p == (rank - P0active) % P0active) { dest = p; break; }
          assert(dest >= 0);
          auto buf = comm.recv<std::size_t>(dest, 0);
          auto ptr = buf.data();
          for (std::size_t k=0; k<I.size(); k++) {
            auto c0ycols = *ptr++;
            c[0].ycols[k].resize(c0ycols);
            for (std::size_t i=0; i<c0ycols; i++)
              c[0].ycols[k][i] = *ptr++;
          }
        }
        if (!(rank >= rch1 && rank < rch1+P1active)) {
          // I'm not active on child1, so I need to receive
          int dest = -1;
          for (int p=rch1; p<rch1+P1active; p++) {
            if (rank < rch1) {
              if (p - rch1 == rank % P1active) { dest = p; break; }
            } else if (p - rch1 == (rank - P1active) % P1active) {
              dest = p; break;
            }
          }
          assert(dest >= 0);
          auto buf = comm.recv<std::size_t>(dest, 1);
          auto ptr = buf.data();
          for (std::size_t k=0; k<I.size(); k++) {
            auto c1ycols = *ptr++;
            c[1].ycols[k].resize(c1ycols);
            for (std::size_t i=0; i<c1ycols; i++)
              c[1].ycols[k][i] = *ptr++;
          }
        }
        wait_all(sreq);
      }

      void combine_child_ycols(const std::vector<bool>& odiag) {
        for (std::size_t k=0; k<I.size(); k++) {
          if (!odiag[k]) {
            ycols[k].clear();
            continue;
          }
          auto c0ycols = c[0].ycols[k].size();
          auto c1ycols = c[1].ycols[k].size();
          ycols[k].resize(c0ycols + c1ycols);
          std::copy
            (c[0].ycols[k].begin(), c[0].ycols[k].end(), ycols[k].begin());
          std::copy
            (c[1].ycols[k].begin(), c[1].ycols[k].end(),
             ycols[k].begin()+c0ycols);
        }
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS

  } // end namespace HSS
} // end namespace strumpack

#endif // HSS_EXTRA_MPI_HPP