// HSSExtraMPI.hpp
1 /*
2  * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The
3  * Regents of the University of California, through Lawrence Berkeley
4  * National Laboratory (subject to receipt of any required approvals
5  * from the U.S. Dept. of Energy). All rights reserved.
6  *
7  * If you have questions about your rights to use or distribute this
8  * software, please contact Berkeley Lab's Technology Transfer
9  * Department at TTD@lbl.gov.
10  *
11  * NOTICE. This software is owned by the U.S. Department of Energy. As
12  * such, the U.S. Government has been granted for itself and others
13  * acting on its behalf a paid-up, nonexclusive, irrevocable,
14  * worldwide license in the Software to reproduce, prepare derivative
15  * works, and perform publicly and display publicly. Beginning five
16  * (5) years after the date permission to assert copyright is obtained
17  * from the U.S. Department of Energy, and subject to any subsequent
18  * five (5) year renewals, the U.S. Government is granted for itself
19  * and others acting on its behalf a paid-up, nonexclusive,
20  * irrevocable, worldwide license in the Software to reproduce,
21  * prepare derivative works, distribute copies to the public, perform
22  * publicly and display publicly, and to permit others to do so.
23  *
24  * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li.
25  * (Lawrence Berkeley National Lab, Computational Research
26  * Division).
27  */
28 #ifndef HSS_EXTRA_MPI_HPP
29 #define HSS_EXTRA_MPI_HPP
30 
31 #include "HSSExtra.hpp"
33 
34 namespace strumpack {
35  namespace HSS {
36 
37 #ifndef DOXYGEN_SHOULD_SKIP_THIS
38  template<typename scalar_t>
39  class WorkCompressMPI : public WorkCompressBase<scalar_t> {
40  public:
41  std::vector<WorkCompressMPI<scalar_t>> c;
42  DistributedMatrix<scalar_t> Rr, Rc, Sr, Sc;
43  DistributedMatrix<scalar_t> Qr, Qc;
44  int dR = 0, dS = 0;
45  std::unique_ptr<WorkCompress<scalar_t>> w_seq;
46  void split(const std::pair<std::size_t,std::size_t>& dim) {
47  if (c.empty()) {
48  c.resize(2);
49  c[0].offset = this->offset;
50  c[1].offset = this->offset + dim;
51  c[0].lvl = c[1].lvl = this->lvl + 1;
52  }
53  }
54  void create_sequential() {
55  if (!w_seq)
56  w_seq = std::unique_ptr<WorkCompress<scalar_t>>
57  (new WorkCompress<scalar_t>());
58  w_seq->lvl = this->lvl;
59  }
60  };
61 #endif //DOXYGEN_SHOULD_SKIP_THIS
62 
63 #ifndef DOXYGEN_SHOULD_SKIP_THIS
64  template<typename scalar_t,
65  typename real_t=typename RealType<scalar_t>::value_type>
66  class WorkCompressMPIANN : public WorkCompressBase<scalar_t> {
67  public:
68  std::vector<WorkCompressMPIANN<scalar_t>> c;
69  DistributedMatrix<scalar_t> S;
70  std::vector<std::pair<std::size_t,real_t>> ids_scores;
71  std::unique_ptr<WorkCompressANN<scalar_t>> w_seq;
72 
73  void split(const std::pair<std::size_t,std::size_t>& dim) {
74  if (c.empty()) {
75  c.resize(2);
76  c[0].offset = this->offset;
77  c[1].offset = this->offset + dim;
78  c[0].lvl = c[1].lvl = this->lvl + 1;
79  }
80  }
81  void create_sequential() {
82  if (!w_seq)
83  w_seq = std::unique_ptr<WorkCompressANN<scalar_t>>
84  (new WorkCompressANN<scalar_t>());
85  w_seq->lvl = this->lvl;
86  }
87  };
88 #endif //DOXYGEN_SHOULD_SKIP_THIS
89 
90 #ifndef DOXYGEN_SHOULD_SKIP_THIS
91  template<typename scalar_t> class WorkApplyMPI {
92  public:
93  std::pair<std::size_t,std::size_t> offset;
94  std::vector<WorkApplyMPI<scalar_t>> c;
95  DistributedMatrix<scalar_t> tmp1, tmp2;
96  std::unique_ptr<WorkApply<scalar_t>> w_seq;
97  };
98 #endif //DOXYGEN_SHOULD_SKIP_THIS
99 
100 
101  template<typename scalar_t> class HSSMatrixBase;
102 
103 #ifndef DOXYGEN_SHOULD_SKIP_THIS
104  template<typename scalar_t> class DistSubLeaf {
105  public:
106  DistSubLeaf
107  (int cols, const HSSMatrixBase<scalar_t>* H, const BLACSGrid* lg)
108  : cols_(cols), hss_(H), grid_loc_(lg) { allocate_block_row(); }
110  DistSubLeaf
111  (int cols, const HSSMatrixBase<scalar_t>* H, const BLACSGrid* lg,
112  const DistributedMatrix<scalar_t>& dist)
113  : cols_(cols), hss_(H), grid_loc_(lg) { to_block_row(dist); }
114  void from_block_row(DistributedMatrix<scalar_t>& dist) const
115  { hss_->from_block_row(dist, sub, leaf, grid_loc_); }
116  DistributedMatrix<scalar_t> leaf;
117  DenseMatrix<scalar_t> sub;
118  const BLACSGrid* grid_local() const { return grid_loc_; }
119  int cols() const { return cols_; }
120 
121  private:
122  void allocate_block_row()
123  { hss_->allocate_block_row(cols_, sub, leaf); }
124  void to_block_row(const DistributedMatrix<scalar_t>& dist)
125  { hss_->to_block_row(dist, sub, leaf); }
126 
127  const int cols_;
128  const HSSMatrixBase<scalar_t>* hss_;
129  const BLACSGrid* grid_loc_;
130  };
131 #endif //DOXYGEN_SHOULD_SKIP_THIS
132 
133 
135  public:
136  TreeLocalRanges() {}
137  TreeLocalRanges(int P) : ranges_(5*P) {}
138  void print() const {
139  std::cout << "ranges=[";
140  for (std::size_t p=0; p<ranges_.size()/5; p++)
141  std::cout << rlo(p) << "," << rhi(p) << "/"
142  << clo(p) << "," << chi(p) << "/" << leaf_procs(p) << " ";
143  std::cout << "];" << std::endl;
144  }
145  int rlo(int p) const { return ranges_[5*p+0]; }
146  int rhi(int p) const { return ranges_[5*p+1]; }
147  int clo(int p) const { return ranges_[5*p+2]; }
148  int chi(int p) const { return ranges_[5*p+3]; }
149  int leaf_procs(int p) const { return ranges_[5*p+4]; }
150  int& rlo(int p) { return ranges_[5*p+0]; }
151  int& rhi(int p) { return ranges_[5*p+1]; }
152  int& clo(int p) { return ranges_[5*p+2]; }
153  int& chi(int p) { return ranges_[5*p+3]; }
154  int& leaf_procs(int p) { return ranges_[5*p+4]; }
155  private:
156  std::vector<int> ranges_; // rlo, rhi, clo, chi, leaf_procs
157  };
158 
159 #ifndef DOXYGEN_SHOULD_SKIP_THIS
160  template<typename scalar_t> class WorkFactorMPI {
161  public:
162  std::vector<WorkFactorMPI<scalar_t>> c;
163 
164  // (U.cols x U.cols) \tilde(D)
166 
167  // (U.cols x V.cols) bottom part of \tilde{V}
169  std::unique_ptr<WorkFactor<scalar_t>> w_seq;
170  };
171 #endif //DOXYGEN_SHOULD_SKIP_THIS
172 
173 
184  template<typename scalar_t> class HSSFactorsMPI {
185  template<typename T> friend class HSSMatrixMPI;
186  template<typename T> friend class HSSMatrixBase;
187 
188  public:
189 
197  std::size_t memory() {
198  return sizeof(*this) + L_.memory() + Vt0_.memory()
199  + W1_.memory() + Q_.memory() + D_.memory()
200  + sizeof(int)*piv_.size();
201  }
202 
210  std::size_t nonzeros() const {
211  return L_.nonzeros() + Vt0_.nonzeros() + W1_.nonzeros()
212  + Q_.nonzeros() + D_.nonzeros();
213  }
214 
218  const DistributedMatrix<scalar_t>& Vhat() const { return Vt0_; }
219 
223  DistributedMatrix<scalar_t>& Vhat() { return Vt0_; }
224 
225  private:
226  // (U.rows-U.cols x U.rows-U.cols), empty at the root
228 
229  // (U.rows-U.cols x V.cols)
230  // at the root, Vt0_ stored Vhat
232 
233  // (U.cols x U.rows) bottom part of W
234  // if (U.rows == U.cols) then W == I and is not stored!
236 
237  // (U.rows x U.rows) Q from LQ(W0)
238  // if (U.rows == U.cols) then Q == I and is not stored!
240 
241  // (U.rows x U.rows) at the root holds LU(D), else empty
243  std::vector<int> piv_; // hold permutation from LU(D) at root
244  };
245 
246 
247 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Per-node workspace for the distributed ULV solve phase.
// NOTE(review): original lines 254 and 256-257 are missing from this
// extraction; they appear to have declared additional
// DistributedMatrix<scalar_t> members (presumably the x/y/z vectors
// referenced in the comment below) -- restore from the upstream
// STRUMPACK sources before relying on this class.
248  template<typename scalar_t> class WorkSolveMPI {
249  public:
// Workspaces for the two children.
250  std::vector<WorkSolveMPI<scalar_t>> c;
// Sequential workspace for subtrees handled by a single process.
251  std::unique_ptr<WorkSolve<scalar_t>> w_seq;
252 
253  // do we need all these?? x only used in bwd, y only used in fwd??
255  DistributedMatrix<scalar_t> ft1; // TODO document the sizes here
258 
259  // DO NOT STORE reduced_rhs here!!!
260  DistributedMatrix<scalar_t> reduced_rhs;
// (row, column) offset of this node in the global matrix.
261  std::pair<std::size_t,std::size_t> offset;
262  };
263 #endif //DOXYGEN_SHOULD_SKIP_THIS
264 
265 
266 #ifndef DOXYGEN_SHOULD_SKIP_THIS
267  template<typename scalar_t> class WorkExtractMPI {
268  public:
269  std::vector<WorkExtractMPI<scalar_t>> c;
270  DistributedMatrix<scalar_t> y, z;
271  std::vector<std::size_t> I, J, rl2g, cl2g, ycols, zcols;
272  std::unique_ptr<WorkExtract<scalar_t>> w_seq;
273  void split_extraction_sets
274  (const std::pair<std::size_t,std::size_t>& dim) {
275  if (c.empty()) {
276  c.resize(2);
277  c[0].I.reserve(I.size());
278  c[1].I.reserve(I.size());
279  for (auto i : I)
280  if (i < dim.first) c[0].I.push_back(i);
281  else c[1].I.push_back(i - dim.first);
282  c[0].J.reserve(J.size());
283  c[1].J.reserve(J.size());
284  for (auto j : J)
285  if (j < dim.second) c[0].J.push_back(j);
286  else c[1].J.push_back(j - dim.second);
287  }
288  }
289  void communicate_child_ycols(const MPIComm& comm, int rch1) {
290  // TODO optimize these 4 bcasts?
291  auto rch0 = 0;
292  auto c0ycols = c[0].ycols.size();
293  auto c1ycols = c[1].ycols.size();
294  comm.broadcast_from(c0ycols, rch0);
295  comm.broadcast_from(c1ycols, rch1);
296  c[0].ycols.resize(c0ycols);
297  c[1].ycols.resize(c1ycols);
298  comm.broadcast_from(c[0].ycols, rch0);
299  comm.broadcast_from(c[1].ycols, rch1);
300  }
301  void combine_child_ycols() {
302  auto c0ycols = c[0].ycols.size();
303  auto c1ycols = c[1].ycols.size();
304  ycols.resize(c0ycols + c1ycols);
305  std::copy(c[0].ycols.begin(), c[0].ycols.end(), ycols.begin());
306  std::copy(c[1].ycols.begin(), c[1].ycols.end(),
307  ycols.begin()+c0ycols);
308  }
309  };
310 #endif //DOXYGEN_SHOULD_SKIP_THIS
311 
312 
313 #ifndef DOXYGEN_SHOULD_SKIP_THIS
314  template<typename scalar_t> class WorkExtractBlocksMPI {
315  public:
316  WorkExtractBlocksMPI(std::size_t nb) {
317  y.resize(nb);
318  z.resize(nb);
319  I.resize(nb);
320  J.resize(nb);
321  rl2g.resize(nb);
322  cl2g.resize(nb);
323  ycols.resize(nb);
324  zcols.resize(nb);
325  }
326  std::vector<WorkExtractBlocksMPI<scalar_t>> c;
327  std::vector<DistributedMatrix<scalar_t>> y, z;
328  std::vector<std::vector<std::size_t>> I, J, rl2g, cl2g, ycols, zcols;
329  std::vector<std::unique_ptr<WorkExtract<scalar_t>>> w_seq;
330 
331  void split_extraction_sets
332  (const std::pair<std::size_t,std::size_t>& dim) {
333  if (c.empty()) {
334  auto nb = I.size();
335  c.reserve(2);
336  c.emplace_back(nb);
337  c.emplace_back(nb);
338  for (std::size_t k=0; k<nb; k++) {
339  c[0].I[k].reserve(I[k].size());
340  c[1].I[k].reserve(I[k].size());
341  for (auto i : I[k])
342  if (i < dim.first) c[0].I[k].push_back(i);
343  else c[1].I[k].push_back(i - dim.first);
344  c[0].J[k].reserve(J[k].size());
345  c[1].J[k].reserve(J[k].size());
346  for (auto j : J[k])
347  if (j < dim.second) c[0].J[k].push_back(j);
348  else c[1].J[k].push_back(j - dim.second);
349  }
350  }
351  }
352 
353  void communicate_child_ycols(const MPIComm& comm, int rch1) {
354  int rank = comm.rank(), P = comm.size();
355  int P0total = rch1, P1total = P - rch1;
356  int pcols = std::floor(std::sqrt((float)P0total));
357  int prows = P0total / pcols;
358  int P0active = prows * pcols;
359  pcols = std::floor(std::sqrt((float)P1total));
360  prows = P1total / pcols;
361  int P1active = prows * pcols;
362  std::vector<MPI_Request> sreq;
363  sreq.reserve(P);
364  int sreqs = 0;
365  std::vector<std::size_t> sbuf0, sbuf1;
366  if (rank < P0active) {
367  if (rank < (P-P0active)) {
368  // I'm one of the first P-P0active processes that are active
369  // on child0, so I need to send to one or more others which
370  // are not active on child0, ie the ones in [P0active,P)
371  std::size_t ssize = 0;
372  for (std::size_t k=0; k<I.size(); k++)
373  ssize += 1 + c[0].ycols[k].size();
374  sbuf0.reserve(ssize);
375  for (std::size_t k=0; k<I.size(); k++) {
376  sbuf0.push_back(c[0].ycols[k].size());
377  for (auto i : c[0].ycols[k])
378  sbuf0.push_back(i);
379  }
380  for (int p=P0active; p<P; p++)
381  if (rank == (p - P0active) % P0active) {
382  sreq.emplace_back();
383  comm.isend(sbuf0, p, 0, &sreq.back());
384  }
385  }
386  }
387  if (rank >= rch1 && rank < rch1+P1active) {
388  if ((rank-rch1) < (P-P1active)) {
389  // I'm one of the first P-P1active processes that are active
390  // on child1, so I need to send to one or more others which
391  // are not active on child1, ie the ones in [0,rch1) union
392  // [rch1+P1active,P)
393  std::size_t ssize = 0;
394  for (std::size_t k=0; k<I.size(); k++)
395  ssize += 1 + c[1].ycols[k].size();
396  sbuf1.reserve(ssize);
397  for (std::size_t k=0; k<I.size(); k++) {
398  sbuf1.push_back(c[1].ycols[k].size());
399  for (auto i : c[1].ycols[k])
400  sbuf1.push_back(i);
401  }
402  for (int p=0; p<rch1; p++)
403  if (rank - rch1 == p % P1active) {
404  sreq.emplace_back();
405  comm.isend(sbuf1, p, 1, &sreq[sreqs++]);
406  }
407  for (int p=rch1+P1active; p<P; p++)
408  if (rank - rch1 == (p - P1active) % P1active) {
409  sreq.emplace_back();
410  comm.isend(sbuf1, p, 1, &sreq.back());
411  }
412  }
413  }
414 
415  if (rank >= P0active) {
416  // I'm not active on child0, so I need to receive
417  int dest = -1;
418  for (int p=0; p<P0active; p++)
419  if (p == (rank - P0active) % P0active) { dest = p; break; }
420  assert(dest >= 0);
421  auto buf = comm.recv<std::size_t>(dest, 0);
422  auto ptr = buf.data();
423  for (std::size_t k=0; k<I.size(); k++) {
424  auto c0ycols = *ptr++;
425  c[0].ycols[k].resize(c0ycols);
426  for (std::size_t i=0; i<c0ycols; i++)
427  c[0].ycols[k][i] = *ptr++;
428  }
429  }
430  if (!(rank >= rch1 && rank < rch1+P1active)) {
431  // I'm not active on child1, so I need to receive
432  int dest = -1;
433  for (int p=rch1; p<rch1+P1active; p++) {
434  if (rank < rch1) {
435  if (p - rch1 == rank % P1active) { dest = p; break; }
436  } else if (p - rch1 == (rank - P1active) % P1active) {
437  dest = p; break;
438  }
439  }
440  assert(dest >= 0);
441  auto buf = comm.recv<std::size_t>(dest, 1);
442  auto ptr = buf.data();
443  for (std::size_t k=0; k<I.size(); k++) {
444  auto c1ycols = *ptr++;
445  c[1].ycols[k].resize(c1ycols);
446  for (std::size_t i=0; i<c1ycols; i++)
447  c[1].ycols[k][i] = *ptr++;
448  }
449  }
450  wait_all(sreq);
451  }
452 
453  void combine_child_ycols(const std::vector<bool>& odiag) {
454  for (std::size_t k=0; k<I.size(); k++) {
455  if (!odiag[k]) {
456  ycols[k].clear();
457  continue;
458  }
459  auto c0ycols = c[0].ycols[k].size();
460  auto c1ycols = c[1].ycols[k].size();
461  ycols[k].resize(c0ycols + c1ycols);
462  std::copy
463  (c[0].ycols[k].begin(), c[0].ycols[k].end(), ycols[k].begin());
464  std::copy
465  (c[1].ycols[k].begin(), c[1].ycols[k].end(),
466  ycols[k].begin()+c0ycols);
467  }
468  }
469  };
470 #endif //DOXYGEN_SHOULD_SKIP_THIS
471 
472  } // end namespace HSS
473 } // end namespace strumpack
474 
475 #endif // HSS_EXTRA_MPI_HPP
/*
 * NOTE(review): the text below is Doxygen-generated cross-reference
 * output that was appended to this file during extraction; it is not
 * part of the header itself. Kept for reference, commented out.
 *
 * Contains the DistributedMatrix and DistributedMatrixWrapper classes, wrappers around ScaLAPACK/PBLAS ...
 * 2D block cyclicly distributed matrix, as used by ScaLAPACK.
 * Definition: DistributedMatrix.hpp:84
 * Contains data related to ULV factorization of a distributed HSS matrix.
 * Definition: HSSExtraMPI.hpp:184
 * std::size_t memory()
 * Definition: HSSExtraMPI.hpp:197
 * std::size_t nonzeros() const
 * Definition: HSSExtraMPI.hpp:210
 * DistributedMatrix< scalar_t > & Vhat()
 * Definition: HSSExtraMPI.hpp:223
 * const DistributedMatrix< scalar_t > & Vhat() const
 * Definition: HSSExtraMPI.hpp:218
 * Abstract base class for Hierarchically Semi-Separable (HSS) matrices.
 * Definition: HSSMatrixBase.hpp:83
 * Distributed memory implementation of the HSS (Hierarchically Semi-Separable) matrix format.
 * Definition: HSSMatrixMPI.hpp:68
 * Definition: HSSExtraMPI.hpp:134
 * Definition: StrumpackOptions.hpp:43
 * void wait_all(std::vector< MPIRequest > &reqs)
 * Definition: MPIWrapper.hpp:173
 * void copy(std::size_t m, std::size_t n, const DenseMatrix< scalar_from_t > &a, std::size_t ia, std::size_t ja, DenseMatrix< scalar_to_t > &b, std::size_t ib, std::size_t jb)
 * Definition: DenseMatrix.hpp:1231
 */