HSSExtraMPI.hpp
/*
 * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The
 * Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals
 * from the U.S. Dept. of Energy). All rights reserved.
 *
 * If you have questions about your rights to use or distribute this
 * software, please contact Berkeley Lab's Technology Transfer
 * Department at TTD@lbl.gov.
 *
 * NOTICE. This software is owned by the U.S. Department of Energy. As
 * such, the U.S. Government has been granted for itself and others
 * acting on its behalf a paid-up, nonexclusive, irrevocable,
 * worldwide license in the Software to reproduce, prepare derivative
 * works, and perform publicly and display publicly. Beginning five
 * (5) years after the date permission to assert copyright is obtained
 * from the U.S. Department of Energy, and subject to any subsequent
 * five (5) year renewals, the U.S. Government is granted for itself
 * and others acting on its behalf a paid-up, nonexclusive,
 * irrevocable, worldwide license in the Software to reproduce,
 * prepare derivative works, distribute copies to the public, perform
 * publicly and display publicly, and to permit others to do so.
 *
 * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li.
 * (Lawrence Berkeley National Lab, Computational Research
 * Division).
 */
#ifndef HSS_EXTRA_MPI_HPP
#define HSS_EXTRA_MPI_HPP

#include "HSSExtra.hpp"
#include "dense/DistributedMatrix.hpp"

namespace strumpack {
  namespace HSS {

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t>
    class WorkCompressMPI : public WorkCompressBase<scalar_t> {
    public:
      std::vector<WorkCompressMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> Rr, Rc, Sr, Sc;
      DistributedMatrix<scalar_t> Qr, Qc;
      int dR = 0, dS = 0;
      std::unique_ptr<WorkCompress<scalar_t>> w_seq;
      void split(const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].offset = this->offset;
          c[1].offset = this->offset + dim;
          c[0].lvl = c[1].lvl = this->lvl + 1;
        }
      }
      void create_sequential() {
        if (!w_seq)
          w_seq = std::unique_ptr<WorkCompress<scalar_t>>
            (new WorkCompress<scalar_t>());
        w_seq->lvl = this->lvl;
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS
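
    // Illustrative note (not part of the original header): split() creates
    // the two children of a compression work node. Assuming a parent at
    // offset (ro, co) whose first child has dimensions dim = (m0, n0):
    //   c[0].offset = (ro, co)            c[0].lvl = lvl + 1
    //   c[1].offset = (ro + m0, co + n0)  c[1].lvl = lvl + 1
    // The pair addition relies on an operator+ overload for std::pair
    // provided elsewhere in the library.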

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t,
             typename real_t=typename RealType<scalar_t>::value_type>
    class WorkCompressMPIANN : public WorkCompressBase<scalar_t> {
    public:
      std::vector<WorkCompressMPIANN<scalar_t>> c;
      DistributedMatrix<scalar_t> S;
      std::vector<std::pair<std::size_t,real_t>> ids_scores;
      std::unique_ptr<WorkCompressANN<scalar_t>> w_seq;

      void split(const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].offset = this->offset;
          c[1].offset = this->offset + dim;
          c[0].lvl = c[1].lvl = this->lvl + 1;
        }
      }
      void create_sequential() {
        if (!w_seq)
          w_seq = std::unique_ptr<WorkCompressANN<scalar_t>>
            (new WorkCompressANN<scalar_t>());
        w_seq->lvl = this->lvl;
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkApplyMPI {
    public:
      std::pair<std::size_t,std::size_t> offset;
      std::vector<WorkApplyMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> tmp1, tmp2;
      std::unique_ptr<WorkApply<scalar_t>> w_seq;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


    template<typename scalar_t> class HSSMatrixBase;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class DistSubLeaf {
    public:
      DistSubLeaf
      (int cols, const HSSMatrixBase<scalar_t>* H, const BLACSGrid* lg)
        : cols_(cols), hss_(H), grid_loc_(lg) { allocate_block_row(); }
      DistSubLeaf
      (int cols, const HSSMatrixBase<scalar_t>* H, const BLACSGrid* lg,
       const DistributedMatrix<scalar_t>& dist)
        : cols_(cols), hss_(H), grid_loc_(lg) { to_block_row(dist); }
      void from_block_row(DistributedMatrix<scalar_t>& dist) const
      { hss_->from_block_row(dist, sub, leaf, grid_loc_); }
      DistributedMatrix<scalar_t> leaf;
      DenseMatrix<scalar_t> sub;
      const BLACSGrid* grid_local() const { return grid_loc_; }
      int cols() const { return cols_; }

    private:
      void allocate_block_row()
      { hss_->allocate_block_row(cols_, sub, leaf); }
      void to_block_row(const DistributedMatrix<scalar_t>& dist)
      { hss_->to_block_row(dist, sub, leaf); }

      const int cols_;
      const HSSMatrixBase<scalar_t>* hss_;
      const BLACSGrid* grid_loc_;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS
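
    // Usage sketch (illustrative only; H, B and lg below are placeholders,
    // not names from this header). A DistSubLeaf redistributes a 2d
    // block-cyclic matrix into the block-row layout that follows the HSS
    // tree, and from_block_row() maps the result back:
    //
    //   // HSSMatrixBase<double>& H;  DistributedMatrix<double> B;
    //   // const BLACSGrid* lg;       // local grid of this subtree
    //   DistSubLeaf<double> br(B.cols(), &H, lg, B); // 2d -> block-row
    //   // ... operate on br.sub (local dense part) and br.leaf ...
    //   br.from_block_row(B);                        // block-row -> 2d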

    class TreeLocalRanges {
    public:
      TreeLocalRanges() {}
      TreeLocalRanges(int P) : _ranges(5*P) {}
      void print() const {
        std::cout << "ranges=[";
        for (std::size_t p=0; p<_ranges.size()/5; p++)
          std::cout << rlo(p) << "," << rhi(p) << "/"
                    << clo(p) << "," << chi(p) << "/" << leaf_procs(p) << " ";
        std::cout << "];" << std::endl;
      }
      int rlo(int p) const { return _ranges[5*p+0]; }
      int rhi(int p) const { return _ranges[5*p+1]; }
      int clo(int p) const { return _ranges[5*p+2]; }
      int chi(int p) const { return _ranges[5*p+3]; }
      int leaf_procs(int p) const { return _ranges[5*p+4]; }
      int& rlo(int p) { return _ranges[5*p+0]; }
      int& rhi(int p) { return _ranges[5*p+1]; }
      int& clo(int p) { return _ranges[5*p+2]; }
      int& chi(int p) { return _ranges[5*p+3]; }
      int& leaf_procs(int p) { return _ranges[5*p+4]; }
    private:
      std::vector<int> _ranges; // rlo, rhi, clo, chi, leaf_procs per process
    };
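
    // Layout note (illustrative): the _ranges vector stores five integers
    // per process, flattened as
    //   _ranges = { rlo(0), rhi(0), clo(0), chi(0), leaf_procs(0),
    //               rlo(1), rhi(1), clo(1), chi(1), leaf_procs(1), ... }
    // i.e. the row range, column range and number of leaf processes
    // recorded for each of the P processes passed to the constructor.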

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkFactorMPI {
    public:
      std::vector<WorkFactorMPI<scalar_t>> c;

      // (U.cols x U.cols) \tilde{D}
      DistributedMatrix<scalar_t> Dt;

      // (U.cols x V.cols) bottom part of \tilde{V}
      DistributedMatrix<scalar_t> Vt1;
      std::unique_ptr<WorkFactor<scalar_t>> w_seq;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


    /**
     * \class HSSFactorsMPI
     * \brief Contains data related to the ULV factorization of a
     * distributed HSS matrix.
     */
    template<typename scalar_t> class HSSFactorsMPI {
      template<typename T> friend class HSSMatrixMPI;
      template<typename T> friend class HSSMatrixBase;

    public:

      /**
       * Return the memory used by this object, in bytes, including
       * the memory of all children and of the sequential factors
       * (if any).
       */
      std::size_t memory() {
        std::size_t mem = sizeof(*this) + L_.memory() + Vt0_.memory()
          + W1_.memory() + Q_.memory() + D_.memory()
          + sizeof(int)*piv_.size();
        for (auto& c : ch_) mem += c.memory();
        if (factors_seq_) mem += factors_seq_->memory();
        return mem;
      }

      /**
       * Return the number of nonzeros stored in this object, including
       * those of all children and of the sequential factors (if any).
       */
      std::size_t nonzeros() const {
        std::size_t nnz = L_.nonzeros() + Vt0_.nonzeros() + W1_.nonzeros()
          + Q_.nonzeros() + D_.nonzeros();
        for (auto& c : ch_) nnz += c.nonzeros();
        if (factors_seq_) nnz += factors_seq_->nonzeros();
        return nnz;
      }

      /**
       * Return the total memory used over all processes in the
       * communicator c, i.e. an all-reduce (sum) of memory().
       */
      std::size_t total_memory(const MPIComm& c) {
        return c.all_reduce(memory(), MPI_SUM);
      }

      /**
       * Return the total number of nonzeros over all processes in the
       * communicator c, i.e. an all-reduce (sum) of nonzeros().
       */
      std::size_t total_nonzeros(const MPIComm& c) {
        return c.all_reduce(nonzeros(), MPI_SUM);
      }

      /**
       * Const access to the Vhat matrix (stored in Vt0_ at the root).
       */
      const DistributedMatrix<scalar_t>& Vhat() const { return Vt0_; }

      /**
       * Non-const access to the Vhat matrix (stored in Vt0_ at the root).
       */
      DistributedMatrix<scalar_t>& Vhat() { return Vt0_; }

    private:
      std::vector<HSSFactorsMPI<scalar_t>> ch_;
      std::unique_ptr<HSSFactors<scalar_t>> factors_seq_;

      // (U.rows-U.cols x U.rows-U.cols), empty at the root
      DistributedMatrix<scalar_t> L_;

      // (U.rows-U.cols x V.cols)
      // at the root, Vt0_ stores Vhat
      DistributedMatrix<scalar_t> Vt0_;

      // (U.cols x U.rows) bottom part of W
      // if (U.rows == U.cols) then W == I and is not stored!
      DistributedMatrix<scalar_t> W1_;

      // (U.rows x U.rows) Q from LQ(W0)
      // if (U.rows == U.cols) then Q == I and is not stored!
      DistributedMatrix<scalar_t> Q_;

      // (U.rows x U.rows) at the root holds LU(D), else empty
      DistributedMatrix<scalar_t> D_;
      std::vector<int> piv_; // holds the permutation from LU(D) at the root
    };
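
    // Usage sketch (hypothetical, not from this header; how an
    // HSSFactorsMPI object is obtained depends on the HSSMatrixMPI
    // interface):
    //
    //   // HSSFactorsMPI<double> f = ...;  // result of a ULV factorization
    //   MPIComm c;                         // assumed to wrap MPI_COMM_WORLD
    //   auto mem = f.total_memory(c);      // bytes, summed over all ranks
    //   auto nnz = f.total_nonzeros(c);    // nonzeros, summed over all ranks
    //   if (!c.rank())
    //     std::cout << mem << " bytes, " << nnz << " nonzeros" << std::endl;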


#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkSolveMPI {
    public:
      std::vector<WorkSolveMPI<scalar_t>> c;
      std::unique_ptr<WorkSolve<scalar_t>> w_seq;

      // do we need all these?? x only used in bwd, y only used in fwd??
      DistributedMatrix<scalar_t> z;
      DistributedMatrix<scalar_t> ft1; // TODO document the sizes here
      DistributedMatrix<scalar_t> y;
      DistributedMatrix<scalar_t> x;

      // DO NOT STORE reduced_rhs here!!!
      DistributedMatrix<scalar_t> reduced_rhs;
      std::pair<std::size_t,std::size_t> offset;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkExtractMPI {
    public:
      std::vector<WorkExtractMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> y, z;
      std::vector<std::size_t> I, J, rl2g, cl2g, ycols, zcols;
      std::unique_ptr<WorkExtract<scalar_t>> w_seq;
      void split_extraction_sets
      (const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].I.reserve(I.size());
          c[1].I.reserve(I.size());
          for (auto i : I)
            if (i < dim.first) c[0].I.push_back(i);
            else c[1].I.push_back(i - dim.first);
          c[0].J.reserve(J.size());
          c[1].J.reserve(J.size());
          for (auto j : J)
            if (j < dim.second) c[0].J.push_back(j);
            else c[1].J.push_back(j - dim.second);
        }
      }
      void communicate_child_ycols(const MPIComm& comm, int rch1) {
        // TODO optimize these 4 bcasts?
        auto rch0 = 0;
        auto c0ycols = c[0].ycols.size();
        auto c1ycols = c[1].ycols.size();
        comm.broadcast_from(c0ycols, rch0);
        comm.broadcast_from(c1ycols, rch1);
        c[0].ycols.resize(c0ycols);
        c[1].ycols.resize(c1ycols);
        comm.broadcast_from(c[0].ycols, rch0);
        comm.broadcast_from(c[1].ycols, rch1);
      }
      void combine_child_ycols() {
        auto c0ycols = c[0].ycols.size();
        auto c1ycols = c[1].ycols.size();
        ycols.resize(c0ycols + c1ycols);
        std::copy(c[0].ycols.begin(), c[0].ycols.end(), ycols.begin());
        std::copy(c[1].ycols.begin(), c[1].ycols.end(),
                  ycols.begin()+c0ycols);
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS
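
    // Example (illustrative): with dim = (4, 6), I = {1, 5, 9} and
    // J = {2, 7}, split_extraction_sets() gives
    //   c[0].I = {1}, c[1].I = {1, 5}   (child-1 row indices shifted by 4)
    //   c[0].J = {2}, c[1].J = {1}      (child-1 column indices shifted by 6)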


#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkExtractBlocksMPI {
    public:
      WorkExtractBlocksMPI(std::size_t nb) {
        y.resize(nb);
        z.resize(nb);
        I.resize(nb);
        J.resize(nb);
        rl2g.resize(nb);
        cl2g.resize(nb);
        ycols.resize(nb);
        zcols.resize(nb);
      }
      std::vector<WorkExtractBlocksMPI<scalar_t>> c;
      std::vector<DistributedMatrix<scalar_t>> y, z;
      std::vector<std::vector<std::size_t>> I, J, rl2g, cl2g, ycols, zcols;
      std::vector<std::unique_ptr<WorkExtract<scalar_t>>> w_seq;

      void split_extraction_sets
      (const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          auto nb = I.size();
          c.reserve(2);
          c.emplace_back(nb);
          c.emplace_back(nb);
          for (std::size_t k=0; k<nb; k++) {
            c[0].I[k].reserve(I[k].size());
            c[1].I[k].reserve(I[k].size());
            for (auto i : I[k])
              if (i < dim.first) c[0].I[k].push_back(i);
              else c[1].I[k].push_back(i - dim.first);
            c[0].J[k].reserve(J[k].size());
            c[1].J[k].reserve(J[k].size());
            for (auto j : J[k])
              if (j < dim.second) c[0].J[k].push_back(j);
              else c[1].J[k].push_back(j - dim.second);
          }
        }
      }

      void communicate_child_ycols(const MPIComm& comm, int rch1) {
        int rank = comm.rank(), P = comm.size();
        int P0total = rch1, P1total = P - rch1;
        int pcols = std::floor(std::sqrt((float)P0total));
        int prows = P0total / pcols;
        int P0active = prows * pcols;
        pcols = std::floor(std::sqrt((float)P1total));
        prows = P1total / pcols;
        int P1active = prows * pcols;
        std::vector<MPI_Request> sreq;
        sreq.reserve(P);
        int sreqs = 0;
        std::vector<std::size_t> sbuf0, sbuf1;
        if (rank < P0active) {
          if (rank < (P-P0active)) {
            // I'm one of the first P-P0active processes that are active
            // on child0, so I need to send to one or more others which
            // are not active on child0, ie the ones in [P0active,P)
            std::size_t ssize = 0;
            for (std::size_t k=0; k<I.size(); k++)
              ssize += 1 + c[0].ycols[k].size();
            sbuf0.reserve(ssize);
            for (std::size_t k=0; k<I.size(); k++) {
              sbuf0.push_back(c[0].ycols[k].size());
              for (auto i : c[0].ycols[k])
                sbuf0.push_back(i);
            }
            for (int p=P0active; p<P; p++)
              if (rank == (p - P0active) % P0active) {
                sreq.emplace_back();
                comm.isend(sbuf0, p, 0, &sreq.back());
              }
          }
        }
        if (rank >= rch1 && rank < rch1+P1active) {
          if ((rank-rch1) < (P-P1active)) {
            // I'm one of the first P-P1active processes that are active
            // on child1, so I need to send to one or more others which
            // are not active on child1, ie the ones in [0,rch1) union
            // [rch1+P1active,P)
            std::size_t ssize = 0;
            for (std::size_t k=0; k<I.size(); k++)
              ssize += 1 + c[1].ycols[k].size();
            sbuf1.reserve(ssize);
            for (std::size_t k=0; k<I.size(); k++) {
              sbuf1.push_back(c[1].ycols[k].size());
              for (auto i : c[1].ycols[k])
                sbuf1.push_back(i);
            }
            for (int p=0; p<rch1; p++)
              if (rank - rch1 == p % P1active) {
                sreq.emplace_back();
                comm.isend(sbuf1, p, 1, &sreq[sreqs++]);
              }
            for (int p=rch1+P1active; p<P; p++)
              if (rank - rch1 == (p - P1active) % P1active) {
                sreq.emplace_back();
                comm.isend(sbuf1, p, 1, &sreq.back());
              }
          }
        }

        if (rank >= P0active) {
          // I'm not active on child0, so I need to receive
          int dest = -1;
          for (int p=0; p<P0active; p++)
            if (p == (rank - P0active) % P0active) { dest = p; break; }
          assert(dest >= 0);
          auto buf = comm.recv<std::size_t>(dest, 0);
          auto ptr = buf.data();
          for (std::size_t k=0; k<I.size(); k++) {
            auto c0ycols = *ptr++;
            c[0].ycols[k].resize(c0ycols);
            for (std::size_t i=0; i<c0ycols; i++)
              c[0].ycols[k][i] = *ptr++;
          }
        }
        if (!(rank >= rch1 && rank < rch1+P1active)) {
          // I'm not active on child1, so I need to receive
          int dest = -1;
          for (int p=rch1; p<rch1+P1active; p++) {
            if (rank < rch1) {
              if (p - rch1 == rank % P1active) { dest = p; break; }
            } else if (p - rch1 == (rank - P1active) % P1active) {
              dest = p; break;
            }
          }
          assert(dest >= 0);
          auto buf = comm.recv<std::size_t>(dest, 1);
          auto ptr = buf.data();
          for (std::size_t k=0; k<I.size(); k++) {
            auto c1ycols = *ptr++;
            c[1].ycols[k].resize(c1ycols);
            for (std::size_t i=0; i<c1ycols; i++)
              c[1].ycols[k][i] = *ptr++;
          }
        }
        wait_all(sreq);
      }

      void combine_child_ycols(const std::vector<bool>& odiag) {
        for (std::size_t k=0; k<I.size(); k++) {
          if (!odiag[k]) {
            ycols[k].clear();
            continue;
          }
          auto c0ycols = c[0].ycols[k].size();
          auto c1ycols = c[1].ycols[k].size();
          ycols[k].resize(c0ycols + c1ycols);
          std::copy
            (c[0].ycols[k].begin(), c[0].ycols[k].end(), ycols[k].begin());
          std::copy
            (c[1].ycols[k].begin(), c[1].ycols[k].end(),
             ycols[k].begin()+c0ycols);
        }
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS
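
    // Note (illustrative): communicate_child_ycols() recomputes how many
    // ranks are active on each child, presumably mirroring the layout of the
    // child process grids: for Ptotal ranks, pcols = floor(sqrt(Ptotal)),
    // prows = Ptotal / pcols, and Pactive = prows * pcols <= Ptotal. For
    // example, Ptotal = 7 gives pcols = 2, prows = 3, Pactive = 6, so the
    // single inactive rank receives that child's ycols from one of the 6
    // active ranks.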

  } // end namespace HSS
} // end namespace strumpack

#endif // HSS_EXTRA_MPI_HPP