HSSExtraMPI.hpp
/*
 * STRUMPACK -- STRUctured Matrices PACKage, Copyright (c) 2014, The
 * Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals
 * from the U.S. Dept. of Energy). All rights reserved.
 *
 * If you have questions about your rights to use or distribute this
 * software, please contact Berkeley Lab's Technology Transfer
 * Department at TTD@lbl.gov.
 *
 * NOTICE. This software is owned by the U.S. Department of Energy. As
 * such, the U.S. Government has been granted for itself and others
 * acting on its behalf a paid-up, nonexclusive, irrevocable,
 * worldwide license in the Software to reproduce, prepare derivative
 * works, and perform publicly and display publicly. Beginning five
 * (5) years after the date permission to assert copyright is obtained
 * from the U.S. Department of Energy, and subject to any subsequent
 * five (5) year renewals, the U.S. Government is granted for itself
 * and others acting on its behalf a paid-up, nonexclusive,
 * irrevocable, worldwide license in the Software to reproduce,
 * prepare derivative works, distribute copies to the public, perform
 * publicly and display publicly, and to permit others to do so.
 *
 * Developers: Pieter Ghysels, Francois-Henry Rouet, Xiaoye S. Li.
 *             (Lawrence Berkeley National Lab, Computational Research
 *             Division).
 */
#ifndef HSS_EXTRA_MPI_HPP
#define HSS_EXTRA_MPI_HPP

#include "HSSExtra.hpp"
#include "dense/DistributedMatrix.hpp"

namespace strumpack {
  namespace HSS {

#ifndef DOXYGEN_SHOULD_SKIP_THIS
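    // Workspace, one object per node of the HSS tree, used when
    // compressing a distributed HSS matrix: Rr/Rc and Sr/Sc hold the
    // distributed random-vector blocks and the corresponding samples,
    // with dR/dS the number of sample columns currently available;
    // w_seq is the workspace for a subtree that is handled
    // sequentially by a single process.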
    template<typename scalar_t>
    class WorkCompressMPI : public WorkCompressBase<scalar_t> {
    public:
      std::vector<WorkCompressMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> Rr, Rc, Sr, Sc;
      DistributedMatrix<scalar_t> Qr, Qc;
      int dR = 0, dS = 0;
      std::unique_ptr<WorkCompress<scalar_t>> w_seq;
      void split(const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].offset = this->offset;
          c[1].offset = this->offset + dim;
          c[0].lvl = c[1].lvl = this->lvl + 1;
        }
      }
      void create_sequential() {
        if (!w_seq)
          w_seq = std::unique_ptr<WorkCompress<scalar_t>>
            (new WorkCompress<scalar_t>());
        w_seq->lvl = this->lvl;
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t,
             typename real_t=typename RealType<scalar_t>::value_type>
    class WorkCompressMPIANN : public WorkCompressBase<scalar_t> {
    public:
      std::vector<WorkCompressMPIANN<scalar_t>> c;
      DistributedMatrix<scalar_t> S;
      std::vector<std::pair<std::size_t,real_t>> ids_scores;
      std::unique_ptr<WorkCompressANN<scalar_t>> w_seq;

      void split(const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].offset = this->offset;
          c[1].offset = this->offset + dim;
          c[0].lvl = c[1].lvl = this->lvl + 1;
        }
      }
      void create_sequential() {
        if (!w_seq)
          w_seq = std::unique_ptr<WorkCompressANN<scalar_t>>
            (new WorkCompressANN<scalar_t>());
        w_seq->lvl = this->lvl;
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkApplyMPI {
    public:
      std::pair<std::size_t,std::size_t> offset;
      std::vector<WorkApplyMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> tmp1, tmp2;
      std::unique_ptr<WorkApply<scalar_t>> w_seq;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


    template<typename scalar_t> class HSSMatrixBase;

#ifndef DOXYGEN_SHOULD_SKIP_THIS
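    // Helper for the block-row distribution of a matrix with cols()
    // columns over the leaves of an HSS tree: the distributed part is
    // stored in the DistributedMatrix leaf (on the local BLACS grid),
    // the part of a subtree handled by a single process in the
    // DenseMatrix sub. The actual redistribution to/from a regular 2D
    // block-cyclic matrix is delegated to the HSSMatrixBase object.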
    template<typename scalar_t> class DistSubLeaf {
    public:
      DistSubLeaf
      (int cols, const HSSMatrixBase<scalar_t>* H, const BLACSGrid* lg)
        : cols_(cols), hss_(H), grid_loc_(lg) { allocate_block_row(); }
      DistSubLeaf
      (int cols, const HSSMatrixBase<scalar_t>* H, const BLACSGrid* lg,
       const DistributedMatrix<scalar_t>& dist)
        : cols_(cols), hss_(H), grid_loc_(lg) { to_block_row(dist); }
      void from_block_row(DistributedMatrix<scalar_t>& dist) const
      { hss_->from_block_row(dist, sub, leaf, grid_loc_); }
      DistributedMatrix<scalar_t> leaf;
      DenseMatrix<scalar_t> sub;
      const BLACSGrid* grid_local() const { return grid_loc_; }
      int cols() const { return cols_; }

    private:
      void allocate_block_row()
      { hss_->allocate_block_row(cols_, sub, leaf); }
      void to_block_row(const DistributedMatrix<scalar_t>& dist)
      { hss_->to_block_row(dist, sub, leaf); }

      const int cols_;
      const HSSMatrixBase<scalar_t>* hss_;
      const BLACSGrid* grid_loc_;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


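    // For each of the P processes, packs five integers into the flat
    // vector ranges_: the row range (rlo, rhi) and column range
    // (clo, chi) of the leaf assigned to process p, and leaf_procs,
    // the number of processes working on that leaf, stored at
    // ranges_[5*p+0 .. 5*p+4].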
    class TreeLocalRanges {
    public:
      TreeLocalRanges() {}
      TreeLocalRanges(int P) : ranges_(5*P) {}
      void print() const {
        std::cout << "ranges=[";
        for (std::size_t p=0; p<ranges_.size()/5; p++)
          std::cout << rlo(p) << "," << rhi(p) << "/"
                    << clo(p) << "," << chi(p) << "/" << leaf_procs(p) << " ";
        std::cout << "];" << std::endl;
      }
      int rlo(int p) const { return ranges_[5*p+0]; }
      int rhi(int p) const { return ranges_[5*p+1]; }
      int clo(int p) const { return ranges_[5*p+2]; }
      int chi(int p) const { return ranges_[5*p+3]; }
      int leaf_procs(int p) const { return ranges_[5*p+4]; }
      int& rlo(int p) { return ranges_[5*p+0]; }
      int& rhi(int p) { return ranges_[5*p+1]; }
      int& clo(int p) { return ranges_[5*p+2]; }
      int& chi(int p) { return ranges_[5*p+3]; }
      int& leaf_procs(int p) { return ranges_[5*p+4]; }
    private:
      std::vector<int> ranges_; // rlo, rhi, clo, chi, leaf_procs
    };

#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkFactorMPI {
    public:
      std::vector<WorkFactorMPI<scalar_t>> c;

      // (U.cols x U.cols) \tilde{D}
      DistributedMatrix<scalar_t> Dt;

      // (U.cols x V.cols) bottom part of \tilde{V}
      DistributedMatrix<scalar_t> Vt1;
      std::unique_ptr<WorkFactor<scalar_t>> w_seq;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


    /**
     * \class HSSFactorsMPI
     * \brief Contains data related to ULV factorization of a
     * distributed HSS matrix.
     */
    template<typename scalar_t> class HSSFactorsMPI {
      template<typename T> friend class HSSMatrixMPI;
      template<typename T> friend class HSSMatrixBase;

    public:

      /**
       * Amount of memory (in bytes) used to store these ULV factors.
       */
      std::size_t memory() {
        return sizeof(*this) + L_.memory() + Vt0_.memory()
          + W1_.memory() + Q_.memory() + D_.memory()
          + sizeof(int)*piv_.size();
      }

      /**
       * Number of nonzeros stored in these ULV factors.
       */
      std::size_t nonzeros() const {
        return L_.nonzeros() + Vt0_.nonzeros() + W1_.nonzeros()
          + Q_.nonzeros() + D_.nonzeros();
      }

      /**
       * Access to the Vhat matrix (stored in Vt0_ at the root).
       */
      const DistributedMatrix<scalar_t>& Vhat() const { return Vt0_; }

      DistributedMatrix<scalar_t>& Vhat() { return Vt0_; }

    private:
      // (U.rows-U.cols x U.rows-U.cols), empty at the root
      DistributedMatrix<scalar_t> L_;

      // (U.rows-U.cols x V.cols)
      // at the root, Vt0_ stores Vhat
      DistributedMatrix<scalar_t> Vt0_;

      // (U.cols x U.rows) bottom part of W
      // if (U.rows == U.cols) then W == I and is not stored!
      DistributedMatrix<scalar_t> W1_;

      // (U.rows x U.rows) Q from LQ(W0)
      // if (U.rows == U.cols) then Q == I and is not stored!
      DistributedMatrix<scalar_t> Q_;

      // (U.rows x U.rows) at the root holds LU(D), else empty
      DistributedMatrix<scalar_t> D_;
      std::vector<int> piv_; // holds the permutation from LU(D) at the root
    };


#ifndef DOXYGEN_SHOULD_SKIP_THIS
    template<typename scalar_t> class WorkSolveMPI {
    public:
      std::vector<WorkSolveMPI<scalar_t>> c;
      std::unique_ptr<WorkSolve<scalar_t>> w_seq;

      // do we need all these?? x only used in bwd, y only used in fwd??
      DistributedMatrix<scalar_t> z;
      DistributedMatrix<scalar_t> ft1; // TODO document the sizes here
      DistributedMatrix<scalar_t> y;
      DistributedMatrix<scalar_t> x;

      // DO NOT STORE reduced_rhs here!!!
      DistributedMatrix<scalar_t> reduced_rhs;
      std::pair<std::size_t,std::size_t> offset;
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


#ifndef DOXYGEN_SHOULD_SKIP_THIS
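    // Per-node workspace used when extracting a sub-block A(I,J) from a
    // distributed HSS matrix: I and J are the requested row and column
    // indices (relative to this node), rl2g/cl2g are local-to-global
    // index maps, and y/z (with their column lists ycols/zcols) hold
    // intermediate products passed between the levels of the tree.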
    template<typename scalar_t> class WorkExtractMPI {
    public:
      std::vector<WorkExtractMPI<scalar_t>> c;
      DistributedMatrix<scalar_t> y, z;
      std::vector<std::size_t> I, J, rl2g, cl2g, ycols, zcols;
      std::unique_ptr<WorkExtract<scalar_t>> w_seq;
      void split_extraction_sets
      (const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          c.resize(2);
          c[0].I.reserve(I.size());
          c[1].I.reserve(I.size());
          for (auto i : I)
            if (i < dim.first) c[0].I.push_back(i);
            else c[1].I.push_back(i - dim.first);
          c[0].J.reserve(J.size());
          c[1].J.reserve(J.size());
          for (auto j : J)
            if (j < dim.second) c[0].J.push_back(j);
            else c[1].J.push_back(j - dim.second);
        }
      }
      void communicate_child_ycols(const MPIComm& comm, int rch1) {
        // TODO optimize these 4 bcasts?
        auto rch0 = 0;
        auto c0ycols = c[0].ycols.size();
        auto c1ycols = c[1].ycols.size();
        comm.broadcast_from(c0ycols, rch0);
        comm.broadcast_from(c1ycols, rch1);
        c[0].ycols.resize(c0ycols);
        c[1].ycols.resize(c1ycols);
        comm.broadcast_from(c[0].ycols, rch0);
        comm.broadcast_from(c[1].ycols, rch1);
      }
      void combine_child_ycols() {
        auto c0ycols = c[0].ycols.size();
        auto c1ycols = c[1].ycols.size();
        ycols.resize(c0ycols + c1ycols);
        std::copy(c[0].ycols.begin(), c[0].ycols.end(), ycols.begin());
        std::copy(c[1].ycols.begin(), c[1].ycols.end(),
                  ycols.begin()+c0ycols);
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS


#ifndef DOXYGEN_SHOULD_SKIP_THIS
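    // Variant of WorkExtractMPI for extracting nb sub-blocks
    // A(I[k],J[k]), k = 0,...,nb-1, in one pass: each member of
    // WorkExtractMPI becomes a vector with one entry per block.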
    template<typename scalar_t> class WorkExtractBlocksMPI {
    public:
      WorkExtractBlocksMPI(std::size_t nb) {
        y.resize(nb);
        z.resize(nb);
        I.resize(nb);
        J.resize(nb);
        rl2g.resize(nb);
        cl2g.resize(nb);
        ycols.resize(nb);
        zcols.resize(nb);
      }
      std::vector<WorkExtractBlocksMPI<scalar_t>> c;
      std::vector<DistributedMatrix<scalar_t>> y, z;
      std::vector<std::vector<std::size_t>> I, J, rl2g, cl2g, ycols, zcols;
      std::vector<std::unique_ptr<WorkExtract<scalar_t>>> w_seq;

      void split_extraction_sets
      (const std::pair<std::size_t,std::size_t>& dim) {
        if (c.empty()) {
          auto nb = I.size();
          c.reserve(2);
          c.emplace_back(nb);
          c.emplace_back(nb);
          for (std::size_t k=0; k<nb; k++) {
            c[0].I[k].reserve(I[k].size());
            c[1].I[k].reserve(I[k].size());
            for (auto i : I[k])
              if (i < dim.first) c[0].I[k].push_back(i);
              else c[1].I[k].push_back(i - dim.first);
            c[0].J[k].reserve(J[k].size());
            c[1].J[k].reserve(J[k].size());
            for (auto j : J[k])
              if (j < dim.second) c[0].J[k].push_back(j);
              else c[1].J[k].push_back(j - dim.second);
          }
        }
      }

      void communicate_child_ycols(const MPIComm& comm, int rch1) {
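        // rch1 is the first rank (in comm) working on the second child,
        // so child0 uses ranks [0, rch1) and child1 uses [rch1, P).
        // Each child is only active on a prows x pcols subset of its
        // ranks (the square-ish grid shape computed below); ranks not
        // active on a child receive that child's ycols from one of the
        // active ranks, assigned round-robin, while the active ranks
        // post the matching isends.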
        int rank = comm.rank(), P = comm.size();
        int P0total = rch1, P1total = P - rch1;
        int pcols = std::floor(std::sqrt((float)P0total));
        int prows = P0total / pcols;
        int P0active = prows * pcols;
        pcols = std::floor(std::sqrt((float)P1total));
        prows = P1total / pcols;
        int P1active = prows * pcols;
        std::vector<MPI_Request> sreq;
        sreq.reserve(P);
        int sreqs = 0;
        std::vector<std::size_t> sbuf0, sbuf1;
        if (rank < P0active) {
          if (rank < (P-P0active)) {
            // I'm one of the first P-P0active processes that are active
            // on child0, so I need to send to one or more others which
            // are not active on child0, ie the ones in [P0active,P)
            std::size_t ssize = 0;
            for (std::size_t k=0; k<I.size(); k++)
              ssize += 1 + c[0].ycols[k].size();
            sbuf0.reserve(ssize);
            for (std::size_t k=0; k<I.size(); k++) {
              sbuf0.push_back(c[0].ycols[k].size());
              for (auto i : c[0].ycols[k])
                sbuf0.push_back(i);
            }
            for (int p=P0active; p<P; p++)
              if (rank == (p - P0active) % P0active) {
                sreq.emplace_back();
                comm.isend(sbuf0, p, 0, &sreq.back());
              }
          }
        }
        if (rank >= rch1 && rank < rch1+P1active) {
          if ((rank-rch1) < (P-P1active)) {
            // I'm one of the first P-P1active processes that are active
            // on child1, so I need to send to one or more others which
            // are not active on child1, ie the ones in [0,rch1) union
            // [rch1+P1active,P)
            std::size_t ssize = 0;
            for (std::size_t k=0; k<I.size(); k++)
              ssize += 1 + c[1].ycols[k].size();
            sbuf1.reserve(ssize);
            for (std::size_t k=0; k<I.size(); k++) {
              sbuf1.push_back(c[1].ycols[k].size());
              for (auto i : c[1].ycols[k])
                sbuf1.push_back(i);
            }
            for (int p=0; p<rch1; p++)
              if (rank - rch1 == p % P1active) {
                sreq.emplace_back();
                comm.isend(sbuf1, p, 1, &sreq[sreqs++]);
              }
            for (int p=rch1+P1active; p<P; p++)
              if (rank - rch1 == (p - P1active) % P1active) {
                sreq.emplace_back();
                comm.isend(sbuf1, p, 1, &sreq.back());
              }
          }
        }

        if (rank >= P0active) {
          // I'm not active on child0, so I need to receive
          int dest = -1;
          for (int p=0; p<P0active; p++)
            if (p == (rank - P0active) % P0active) { dest = p; break; }
          assert(dest >= 0);
          auto buf = comm.recv<std::size_t>(dest, 0);
          auto ptr = buf.data();
          for (std::size_t k=0; k<I.size(); k++) {
            auto c0ycols = *ptr++;
            c[0].ycols[k].resize(c0ycols);
            for (std::size_t i=0; i<c0ycols; i++)
              c[0].ycols[k][i] = *ptr++;
          }
        }
        if (!(rank >= rch1 && rank < rch1+P1active)) {
          // I'm not active on child1, so I need to receive
          int dest = -1;
          for (int p=rch1; p<rch1+P1active; p++) {
            if (rank < rch1) {
              if (p - rch1 == rank % P1active) { dest = p; break; }
            } else if (p - rch1 == (rank - P1active) % P1active) {
              dest = p; break;
            }
          }
          assert(dest >= 0);
          auto buf = comm.recv<std::size_t>(dest, 1);
          auto ptr = buf.data();
          for (std::size_t k=0; k<I.size(); k++) {
            auto c1ycols = *ptr++;
            c[1].ycols[k].resize(c1ycols);
            for (std::size_t i=0; i<c1ycols; i++)
              c[1].ycols[k][i] = *ptr++;
          }
        }
        wait_all(sreq);
      }

      void combine_child_ycols(const std::vector<bool>& odiag) {
        for (std::size_t k=0; k<I.size(); k++) {
          if (!odiag[k]) {
            ycols[k].clear();
            continue;
          }
          auto c0ycols = c[0].ycols[k].size();
          auto c1ycols = c[1].ycols[k].size();
          ycols[k].resize(c0ycols + c1ycols);
          std::copy
            (c[0].ycols[k].begin(), c[0].ycols[k].end(), ycols[k].begin());
          std::copy
            (c[1].ycols[k].begin(), c[1].ycols[k].end(),
             ycols[k].begin()+c0ycols);
        }
      }
    };
#endif //DOXYGEN_SHOULD_SKIP_THIS

  } // end namespace HSS
} // end namespace strumpack

#endif // HSS_EXTRA_MPI_HPP