KTH framework for Nek5000 toolboxes; testing version  0.0.1
crs_amg.c
Go to the documentation of this file.
1 #include <stddef.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <math.h>
6 #include <sys/stat.h>
7 #include <limits.h>
8 #include "gslib.h"
9 
10 #define crs_setup PREFIXED_NAME(crs_amg_setup)
11 #define crs_solve PREFIXED_NAME(crs_amg_solve)
12 #define crs_stats PREFIXED_NAME(crs_amg_stats)
13 #define crs_free PREFIXED_NAME(crs_amg_free )
14 
15 #ifndef AMG_BLOCK_ROWS
16 # define AMG_BLOCK_ROWS 2400
17 #endif
18 
19 #ifndef AMG_MAX_ROWS
20 # define AMG_MAX_ROWS 12000
21 #endif
22 
23 #include "crs_amg_io.h"
24 
25 static double get_time(void)
26 {
27 #ifdef GS_TIMING
28  return comm_time();
29 #else
30  return 0;
31 #endif
32 }
33 
34 static void barrier(const struct comm *c)
35 {
36 #ifdef GS_BARRIER
37  comm_barrier(c);
38 #endif
39 }
40 
41 /* sparse matrix, condensed sparse row */
42 struct csr_mat {
43  uint rn, cn, *row_off, *col;
44  double *a;
45 };
46 
47 /* z = alpha y + beta M x */
48 static double apply_M(
49  double *z, const double alpha, const double *y,
50  const double beta, const struct csr_mat *const M, const double *x)
51 {
52  uint i; const uint rn=M->rn;
53  const uint *const row_off = M->row_off, *col = M->col;
54  const double *a = M->a;
55  const double t0 = get_time();
56  for(i=0;i<rn;++i) {
57  uint j; const uint je=row_off[i+1]; double t = 0;
58  for(j=row_off[i]; j<je; ++j) t += (*a++) * x[*col++];
59  *z++ = alpha*(*y++) + beta*t;
60  }
61  return get_time()-t0;
62 }
63 
64 /* z = M^t x */
65 static double apply_Mt(
66  double *const z, const struct csr_mat *const M, const double *x)
67 {
68  uint i; const uint rn=M->rn,cn=M->cn;
69  const uint *const row_off = M->row_off, *col = M->col;
70  const double *a = M->a;
71  const double t0 = get_time();
72  for(i=0;i<cn;++i) z[i]=0;
73  for(i=0;i<rn;++i) {
74  uint j; const uint je=row_off[i+1]; const double xi = *x++;
75  for(j=row_off[i]; j<je; ++j) z[*col++] += (*a++) * xi;
76  }
77  return get_time()-t0;
78 }
79 
80 struct Q { uint nloc; struct gs_data *gsh; };
81 
82 /* ve = Q v */
83 static double apply_Q(
84  double *const ve, const struct Q *const Q, const double *const v,
85  const struct comm *comm)
86 {
87  double t0;
88  memcpy(ve,v,Q->nloc*sizeof(double));
89  barrier(comm); t0=get_time();
90  gs(ve,gs_double,gs_add,0,Q->gsh,0);
91  return get_time()-t0;
92 }
93 
94 /* z := alpha y + beta Q^t x
95  (x := Q Q^t x as a side effect)
96  */
97 static double apply_Qt(
98  double *z, const double alpha, const double *y,
99  const double beta, const struct Q *Q, double *x,
100  const struct comm *comm)
101 {
102  double t0,t1;
103  uint i; const uint nloc=Q->nloc;
104  barrier(comm); t0=get_time();
105  gs(x,gs_double,gs_add,1,Q->gsh,0);
106  t1 = get_time() - t0;
107  for(i=0;i<nloc;++i) *z++ = alpha*(*y++) + beta*(*x++);
108  return t1;
109 }
110 
111 struct crs_data {
112  struct comm comm;
113  struct gs_data *gs_top;
114  uint un, *umap; /* number of unique id's on this proc, map to user ids */
115  double tni; /* 1 / (total number of unique ids) ... for computing mean */
117  unsigned levels;
118  unsigned *cheb_m; /* cheb_m [levels-1] : smoothing steps */
119  double *cheb_rho; /* cheb_rho[levels-1] : spectral radius of smoother */
120  uint *lvl_offset;
121  double *Dff; /* diagonal smoother, F-relaxation */
122  struct Q *Q_W, *Q_AfP, *Q_Aff;
123  struct csr_mat *W, *AfP, *Aff;
124  double *b, *x, *c, *c_old, *r, *buf;
125  double *timing; uint timing_n;
126 };
127 
128 static void amg_exec(struct crs_data *const amg)
129 {
130  unsigned lvl; const unsigned levels=amg->levels;
131  const uint *const off = amg->lvl_offset;
132  double *const b = amg->b, *const x = amg->x;
133  double *c = amg->c, *c_old = amg->c_old, *r = amg->r;
134  double *timing = amg->timing;
135  /* restrict down all levels */
136  for(lvl=0;lvl<levels-1;++lvl,timing+=6) {
137  double *const b_l = b+off[lvl], *const b_lp1 = b+off[lvl+1];
138  /* b_{l+1} += W^t b_l */
139  timing[1]+=apply_Mt(amg->buf, &amg->W[lvl],b_l);
140  timing[0]+=apply_Qt(b_lp1, 1,b_lp1, 1,&amg->Q_W[lvl],amg->buf, &amg->comm);
141  }
142  /* solve bottom equation (1 dof) */
143  { const uint i=off[levels-1];
144  if(off[levels]-i) x[i]=amg->Dff[i]*b[i]; }
145  for(lvl=levels-1;lvl--;) {
146  double *const b_l = b+off[lvl];
147  double *const x_l = x+off[lvl], *const x_lp1 = x+off[lvl+1];
148  const double *const d_l = amg->Dff+off[lvl];
149  const uint n = off[lvl+1]-off[lvl]; uint i;
150  const unsigned m = amg->cheb_m[lvl]; unsigned ci;
151  double alpha, beta, gamma;
152  timing = amg->timing + lvl*6;
153  /* buf = Q x_{l+1} */
154  timing[2]+=apply_Q(amg->buf, &amg->Q_AfP[lvl],x_lp1, &amg->comm);
155  /* x_l = W x_{l+1} */
156  timing[3]+=apply_M(x_l, 0,b_l, 1,&amg->W [lvl], amg->buf);
157  /* b_l -= AfP x_{l+1} */
158  timing[3]+=apply_M(b_l, 1,b_l, -1,&amg->AfP[lvl], amg->buf);
159  /* c_1 = Dff b_l */
160  for(i=0;i<n;++i) c[i]=d_l[i]*b_l[i];
161  if(m>1) {
162  alpha = amg->cheb_rho[lvl]/2, alpha*=alpha;
163  gamma = 2*alpha/(1-2*alpha), beta = 1 + gamma;
164  /* r_1 = b_l - Aff c_1 */
165  timing[4]+=apply_Q(amg->buf, &amg->Q_Aff[lvl],c, &amg->comm);
166  timing[5]+=apply_M(r, 1,b_l, -1,&amg->Aff[lvl],amg->buf);
167  /* c_2 = (1+gamma)(c_1 + Dff r_1) */
168  { double *const temp = c; c=c_old,c_old=temp; }
169  for(i=0;i<n;++i) c[i] = beta*(c_old[i]+d_l[i]*r[i]);
170  }
171  for(ci=3;ci<=m;++ci) {
172  gamma = alpha*beta, gamma = gamma/(1-gamma), beta = 1 + gamma;
173  /* r_i = b_l - Aff c_i */
174  timing[4]+=apply_Q(amg->buf, &amg->Q_Aff[lvl],c, &amg->comm);
175  timing[5]+=apply_M(r, 1,b_l, -1,&amg->Aff[lvl],amg->buf);
176  /* c_{i+1} = (1+gamma)*(c_i+D r_i) - gamma c_{i-1} */
177  { double *const temp = c; c=c_old,c_old=temp; }
178  for(i=0;i<n;++i) c[i] = beta*(c_old[i]+d_l[i]*r[i]) - gamma*c[i];
179  }
180  for(i=0;i<n;++i) x_l[i]+=c[i];
181  }
182  amg->timing_n++;
183 }
184 
185 void crs_solve(double *x, struct crs_data *data, double *b)
186 {
187  uint i; const uint un = data->un; const uint *const umap = data->umap;
188  double *const ub = data->b, *const ux = data->x;
189 
190  gs(b, gs_double,gs_add, 1, data->gs_top, 0);
191  for(i=0;i<un;++i) ub[i]=b[umap[i]];
192 
193  amg_exec(data);
194 
195  if(data->null_space) {
196  const double avg = data->tni*comm_reduce_double(&data->comm, gs_add, ux,un);
197  for(i=0;i<un;++i) ux[i] -= avg;
198  }
199 
200  for(i=0;i<un;++i) x[umap[i]]=ux[i];
201  gs(x, gs_double,gs_add, 0, data->gs_top, 0);
202 }
203 
204 void crs_stats(const struct crs_data *const data)
205 {
206  const unsigned lm1 = data->levels-1;
207  double *avg = tmalloc(double, 2*6*lm1);
208  double ni = 1/((double)data->timing_n * data->comm.np);
209  uint i;
210  for(i=0;i<6*lm1;++i) avg[i] = ni*data->timing[i];
211  comm_allreduce(&data->comm,gs_double,gs_add, avg,6*lm1, avg+6*lm1);
212  if(data->comm.id==0) {
213  double *t = avg; unsigned lvl;
214  printf("AMG stats:\n");
215  for(lvl=0;lvl<lm1;++lvl,t+=6)
216  printf(" [lvl %02u] comm: Wt=%0.3e W,AfP=%0.3e Aff=%0.3e\n"
217  " [lvl %02u] work: Wt=%0.3e W,AfP=%0.3e Aff=%0.3e\n",
218  lvl, t[0],t[2],t[4], lvl, t[1],t[3],t[5]);
219  }
220  free(avg);
221 }
222 
223 /*==========================================================================
224 
225  Setup
226 
227  ==========================================================================*/
228 
229 
230 /* remote id - designates the i-th uknown owned by proc p */
231 struct rid { uint p,i; };
232 
233 /* the data for each id to be read from amg.dat */
234 struct id_data {
235  double D;
236  ulong id;
237  unsigned level;
238 };
239 
240 /* global non-zero (matrix entry) */
241 struct gnz { ulong i,j; double a; };
242 
243 /*
244  The user ids id[n] are assigned to unique procs.
245 
246  Output
247  uid --- ulong array; uid[0 ... uid.n-1] are the ids owned by this proc
248  rid_map --- id[i] is uid[ rid_map[i].i ] on proc rid_map[i].p
249  map = assign_dofs(...) --- uid[i] is id[map[i]]
250 
251  map is returned only when rid_map is null, and vice versa
252 */
253 static uint *assign_dofs(struct array *const uid,
254  struct rid *const rid_map,
255  const ulong *const id, const uint n,
256  const uint p,
257  struct gs_data *gs_top, buffer *const buf)
258 {
259  uint i,count, *map=0;
260  struct rid *const rid = rid_map?rid_map:tmalloc(struct rid,n);
261  for(i=0;i<n;++i) rid[i].p=p, rid[i].i=i;
262  gs_vec(n?&rid[0].p:0,2, gs_sint,gs_add,0, gs_top,buf);
263  for(count=0,i=0;i<n;++i) {
264  if(rid[i].i==i && id[i]!=0 && rid[i].p==p) ++count; }
265  array_init(ulong,uid,count); uid->n=count;
266  if(rid_map==0) {
267  map=tmalloc(uint,count);
268  for(count=0,i=0;i<n;++i) {
269  if(rid[i].i==i && id[i]!=0 && rid[i].p==p) map[count++]=i; }
270  }
271  { ulong *const uid_p = uid->ptr;
272  for(count=0,i=0;i<n;++i) if(rid[i].i==i && id[i]!=0 && rid[i].p==p)
273  uid_p[count]=id[i], rid[i].i=count, ++count;
274  }
275  if(rid_map) gs_vec(n?&rid[0].p:0,2, gs_sint,gs_add,0, gs_top,buf);
276  else free(rid);
277  return map;
278 }
279 
280 static void localize_rows(struct gnz *p, uint nz,
281  const ulong *const uid, const uint *const id_perm)
282 {
283  uint i=0; while(nz--) {
284  const ulong id=p->i;
285  while(uid[i]<id) ++i;
286  (p++)->i=id_perm[i];
287  }
288 }
289 
290 static void find_mat_offs(uint *const off, const int levels,
291  const struct gnz *const p, const uint nz,
292  const struct id_data *const id)
293 {
294  int lvl=-1; uint i; for(i=0;i<nz;++i) {
295  const int lvl_i = id[p[i].i].level;
296  while(lvl<lvl_i) off[++lvl]=i;
297  }
298  while(lvl<levels-1) off[++lvl]=i;
299 }
300 
301 static void compress_mat(
302  uint *const row_off, const int rn, uint *const col, double *const a,
303  const struct gnz *const p, const uint nz)
304 {
305  int r = -1; uint k;
306  for(r=-1, k=0;k<nz;++k) {
307  const int row = p[k].i;
308  while(r<row) row_off[++r]=k;
309  col[k] = p[k].j;
310  a[k] = p[k].a;
311  }
312  while(r<rn) row_off[++r]=k;
313 }
314 
315 static void amg_setup_mat(
316  struct array *ids, const uint nloc, struct csr_mat *const mat,
317  const ulong *uid, const uint uid_n, const uint *id_perm,
318  const uint i0, const uint j0, struct gnz *const p, const uint nz,
319  buffer *buf)
320 {
321  slong *id = ids->ptr; ulong last_id = ULONG_MAX;
322  const uint ne = ids->n;
323  uint k,i=0,ie=nloc;
324  sarray_sort(struct gnz,p,nz, j,1, buf); /* sort by col */
325  for(k=0;k<nz;++k) {
326  const ulong j_id = p[k].j;
327  p[k].i = p[k].i - i0;
328  if(j_id == last_id) { p[k].j = p[k-1].j; continue; }
329  last_id = j_id;
330  /* check local ids */
331  while(i<uid_n && uid[i]<j_id) ++i;
332  if(i<uid_n && uid[i]==j_id) { p[k].j=id_perm[i]-j0; continue; }
333  /* check old group of remote ids */
334  while(ie<ne && (ulong)(-id[ie])<j_id) ++ie;
335  if(ie<ne && (ulong)(-id[ie])==j_id) { p[k].j=ie; continue; }
336  /* not found, add to new group of remote ids */
337  id = array_reserve(slong, ids, ids->n+1);
338  id[ids->n] = -(slong)j_id, p[k].j = ids->n++;
339  }
340  sarray_sort_2(struct gnz,p,nz, i,1, j,1, buf);
341 
342  mat->cn = ids->n;
343  compress_mat(mat->row_off,mat->rn, mat->col,mat->a, p,nz);
344 }
345 
346 static uint amg_setup_mats(
347  struct crs_data *const data,
348  const ulong *const uid, const uint uid_n,
349  const uint *const id_perm,
350  const struct id_data *const id,
351  struct array *mat,
352  buffer *buf)
353 {
354  const uint *const off = data->lvl_offset;
355  struct array ide = null_array; uint max_e=0;
356  const unsigned levels=data->levels;
357  unsigned lvl, m;
358  uint *mat_off[3]; struct csr_mat *csr_mat[3];
359  data->Q_W = tmalloc(struct Q, (levels-1)*3);
360  data->Q_AfP = data->Q_W + (levels-1);
361  data->Q_Aff = data->Q_AfP + (levels-1);
362  csr_mat[0] = data->W = tmalloc(struct csr_mat, (levels-1)*3);
363  csr_mat[1] = data->AfP = data->W + (levels-1);
364  csr_mat[2] = data->Aff = data->AfP + (levels-1);
365  mat_off[0] = tmalloc(uint, levels*3);
366  mat_off[1] = mat_off[0]+levels;
367  mat_off[2] = mat_off[1]+levels;
368  for(m=0;m<3;++m) {
369  /* change row from uid to local index */
370  localize_rows(mat[m].ptr,mat[m].n, uid,id_perm);
371  /* sort by row */
372  sarray_sort(struct gnz,mat[m].ptr,mat[m].n, i,1, buf);
373  /* find offsets of each level */
374  find_mat_offs(mat_off[m],levels, mat[m].ptr,mat[m].n, id);
375  }
376  /* allocate CSR arrays */
377  if(levels>1) {
378  uint *ui = tmalloc(uint, 3*((off[levels-1]-off[0])+(levels-1))
379  +mat[0].n+mat[1].n+mat[2].n);
380  double *a = tmalloc(double, mat[0].n+mat[1].n+mat[2].n);
381  for(m=0;m<3;++m) for(lvl=0;lvl<levels-1;++lvl) {
382  const uint rn = off[lvl+1]-off[lvl];
383  const uint nz = mat_off[m][lvl+1]-mat_off[m][lvl];
384  csr_mat[m][lvl].rn = rn;
385  csr_mat[m][lvl].row_off = ui, ui += rn+1;
386  csr_mat[m][lvl].col = ui, ui += nz;
387  csr_mat[m][lvl].a = a, a += nz;
388  }
389  }
390  for(lvl=0;lvl<levels-1;++lvl) {
391  uint i; const uint j0=off[lvl+1], nloc = off[levels]-j0;
392  slong *p = array_reserve(slong, &ide, nloc);
393  for(i=0;i<nloc;++i) p[i] = id[i+j0].id;
394  data->Q_W[lvl].nloc = data->Q_AfP[lvl].nloc = ide.n = nloc;
395  amg_setup_mat(&ide,nloc, &data->W[lvl],
396  uid, uid_n, id_perm, off[lvl],j0,
397  (struct gnz*)mat[0].ptr + mat_off[0][lvl],
398  mat_off[0][lvl+1]-mat_off[0][lvl], buf);
399  data->Q_W[lvl].gsh = gs_setup(ide.ptr,ide.n, &data->comm, 0,gs_auto,1);
400  amg_setup_mat(&ide,nloc, &data->AfP[lvl],
401  uid, uid_n, id_perm, off[lvl],j0,
402  (struct gnz*)mat[1].ptr + mat_off[1][lvl],
403  mat_off[1][lvl+1]-mat_off[1][lvl], buf);
404  data->Q_AfP[lvl].gsh = gs_setup(ide.ptr,ide.n, &data->comm, 0,gs_auto,1);
405  if(ide.n>max_e) max_e=ide.n;
406  }
407  for(lvl=0;lvl<levels-1;++lvl) {
408  uint i; const uint j0=off[lvl], nloc = off[lvl+1]-j0;
409  slong *p = array_reserve(slong, &ide, nloc);
410  for(i=0;i<nloc;++i) p[i] = id[i+j0].id;
411  data->Q_Aff[lvl].nloc = ide.n = nloc;
412  amg_setup_mat(&ide,nloc, &data->Aff[lvl],
413  uid, uid_n, id_perm, j0,j0,
414  (struct gnz*)mat[2].ptr + mat_off[2][lvl],
415  mat_off[2][lvl+1]-mat_off[2][lvl], buf);
416  data->Q_Aff[lvl].gsh = gs_setup(ide.ptr,ide.n, &data->comm, 0,gs_auto,1);
417  if(ide.n>max_e) max_e=ide.n;
418  }
419  free(mat_off[0]);
420  array_free(&ide);
421  return max_e;
422 }
423 
424 static uint *compute_offsets(
425  const struct id_data *const id, const uint n,
426  const int levels)
427 {
428  uint i, *off = tmalloc(uint, levels+1);
429  int lvl = -1;
430  for(i=0;i<n;++i) {
431  const int lvl_i = id[i].level;
432  while(lvl<lvl_i) off[++lvl]=i;
433  }
434  while(lvl<levels) off[++lvl]=i;
435  return off;
436 }
437 
438 static void read_data(
439  struct crs_data *const data,
440  struct array *ids, struct array mats[3],
441  struct crystal *const cr,
442  const ulong *uid, const uint uid_n, const char *datafname);
443 
444 static void read_data_mpiio(
445  struct crs_data *const data,
446  struct array *ids, struct array mats[3],
447  struct crystal *const cr,
448  const ulong *uid, const uint uid_n, const char *datafname);
449 
450 static void amg_setup_aux(struct crs_data *data, uint n, const ulong *id, const char *datafname)
451 {
452  struct crystal cr;
453  struct array uid; uint *id_perm;
454  struct array ids, mat[3];
455  uint max_e; ulong temp_long;
456 
457  crystal_init(&cr, &data->comm);
458  data->umap = assign_dofs(&uid,0, id,n,data->comm.id,data->gs_top,&cr.data);
459  data->un = uid.n;
460 
461  sortp_long(&cr.data,0, uid.ptr,uid.n,sizeof(ulong));
462  sarray_permute(ulong,uid.ptr ,uid.n, cr.data.ptr, &temp_long);
463  sarray_permute(uint ,data->umap,uid.n, cr.data.ptr, &max_e);
464 
465 #ifdef USEMPIIO
466  read_data_mpiio(data, &ids, mat, &cr, uid.ptr,uid.n, datafname);
467 #else
468  read_data(data, &ids, mat, &cr, uid.ptr,uid.n, datafname);
469 #endif
470 
471  /* we should have data for every uid;
472  if not, then the data is for a smaller problem than we were given */
473  { int not_happy = ids.n==uid.n ? 0 : 1;
474  if(comm_reduce_int(&data->comm, gs_max, &not_happy,1)) {
475  comm_barrier(&data->comm);
476  if(data->comm.id==0)
477  fail(1,__FILE__,__LINE__,"AMG: missing data for some rows");
478  else die(1);
479  }
480  }
481 
482  sarray_sort(struct id_data,ids.ptr,ids.n, id,1, &cr.data);
483  sarray_sort(struct id_data,ids.ptr,ids.n, level,0, &cr.data);
484  sarray_permute(uint,data->umap,ids.n, cr.data.ptr, &max_e);
485  id_perm = tmalloc(uint, uid.n);
486  sarray_perm_invert(id_perm, cr.data.ptr, uid.n);
487  /* the global id uid[i] will have local index id_perm[i]
488  (the local storage is sorted by level) */
489 
490  data->lvl_offset = compute_offsets(ids.ptr,ids.n, data->levels);
491 
492  max_e = amg_setup_mats(data,uid.ptr,uid.n,id_perm,ids.ptr,mat, &cr.data);
493 
494  { const unsigned levels=data->levels;
495  const uint *const off = data->lvl_offset;
496  struct id_data *const id = ids.ptr; const uint n = ids.n;
497  double *d;
498  uint i,max_f=0;
499  for(i=0;i<levels-1;++i) {
500  const uint nf = off[i+1]-off[i];
501  if(nf>max_f) max_f=nf;
502  }
503  d = data->Dff = tmalloc(double, 3*n + 3*max_f + max_e + 6*(levels-1));
504  data->x = data->Dff + n;
505  data->b = data->x + n;
506  data->c = data->b + n;
507  data->c_old = data->c + max_f;
508  data->r = data->c_old + max_f;
509  data->buf = data->r + max_f;
510  data->timing = data->buf + max_e;
511  for(i=0;i<n;++i) d[i]=id[i].D;
512  for(i=0;i<6*(levels-1);++i) data->timing[i]=0;
513  data->timing_n=0;
514  }
515 
516  free(id_perm);
517  array_free(&ids);
518  array_free(&mat[0]);
519  array_free(&mat[1]);
520  array_free(&mat[2]);
521  array_free(&uid);
522  crystal_free(&cr);
523 }
524 
525 static void amg_dump(
526  uint n, const ulong *id,
527  uint nz, const uint *Ai, const uint *Aj, const double *A,
528  struct crs_data *data, const char *datafname);
529 
531  uint n, const ulong *id,
532  uint nz, const uint *Ai, const uint *Aj, const double *A,
533  uint null_space, const struct comm *comm, const char *datafname, uint *ierr)
534 {
535  struct crs_data *data = tmalloc(struct crs_data,1);
536  struct stat info;
537  int dump;
538 
539  comm_dup(&data->comm,comm);
540 
541  dump = 0;
542  if(data->comm.id==0) {
543  char str1[132];
544  char str2[132];
545  char str3[132];
546  char str4[132];
547  sprintf(str1,"%s.amg.dat",datafname);
548  sprintf(str2,"%s.amg_W.dat",datafname);
549  sprintf(str3,"%s.amg_AfP.dat",datafname);
550  sprintf(str4,"%s.amg_Aff.dat",datafname);
551  if(stat(str1,&info)!=0) dump++;
552  if(stat(str2,&info)!=0) dump++;
553  if(stat(str3,&info)!=0) dump++;
554  if(stat(str4,&info)!=0) dump++;
555  if(dump!=0) printf("AMG files not found, dumping data ...\n"), fflush(stdout);
556  }
557  comm_bcast(&data->comm,&dump,sizeof(int),0);
558 
559  data->gs_top = gs_setup((const slong*)id,n, &data->comm, 1,
560  dump?gs_crystal_router:gs_auto, !dump);
561 
562  if(dump) {
563  amg_dump(n,id,nz,Ai,Aj,A,data,datafname);
564  gs_free(data->gs_top);
565 
566  if(data->comm.id==0) {
567  printf("AMG dump successful. Run AMG setup tool to continue.\n");
568  fflush(stdout);
569  }
570  comm_barrier(&data->comm);
571  comm_free(&data->comm);
572  free(data);
573  *ierr=1;
574  } else {
575  data->null_space = null_space;
576  amg_setup_aux(data, n,id, datafname);
577  *ierr=0;
578  }
579  return data;
580 }
581 
582 void crs_free(struct crs_data *data)
583 {
584  const unsigned levels = data->levels;
585 
586  free(data->Dff);
587 
588  if(levels>1) {
589  unsigned lvl;
590  for(lvl=0;lvl<levels-1;++lvl)
591  gs_free(data->Q_Aff[lvl].gsh),
592  gs_free(data->Q_AfP[lvl].gsh),
593  gs_free(data->Q_W[lvl].gsh);
594  free(data->W[0].a);
595  free(data->W[0].row_off);
596  }
597 
598  free(data->W);
599  free(data->Q_W);
600 
601  free(data->lvl_offset);
602 
603  free(data->cheb_m);
604  free(data->cheb_rho);
605 
606  free(data->umap);
607  gs_free(data->gs_top);
608  comm_free(&data->comm);
609 
610  free(data);
611 }
612 
613 /*==========================================================================
614 
615  Find ID
616 
617  As proc 0 reads in the files, it needs to know where to send the data.
618  The find_id function below takes a sorted list of id's (with no repeats)
619  and outputs the corresponding list of owning procs.
620 
621  ==========================================================================*/
622 
623 struct find_id_map { ulong id; uint p; };
624 
625 struct find_id_data {
626  struct array map;
627  struct array work;
628  struct crystal *cr;
629 };
630 
631 static void find_id_setup(
632  struct find_id_data *const data,
633  const ulong *id, uint n,
634  struct crystal *const cr)
635 {
636  const uint np = cr->comm.np;
637  uint i; struct find_id_map *q;
638 
639  data->cr = cr;
640  data->work.ptr=0, data->work.n=0, data->work.max=0;
641  array_init(struct find_id_map, &data->map, n);
642  data->map.n=n;
643  for(q=data->map.ptr,i=0;i<n;++i)
644  q[i].id = id[i], q[i].p = id[i] % np;
645  sarray_transfer(struct find_id_map,&data->map,p,1,cr);
646  sarray_sort(struct find_id_map,data->map.ptr,data->map.n, id,1, &cr->data);
647 }
648 
649 static void find_id_free(struct find_id_data *const data)
650 {
651  array_free(&data->map);
652  array_free(&data->work);
653 }
654 
655 struct find_id_work { ulong id; uint p; uint wp; };
656 
657 static int find_id(
658  uint *p_out, const unsigned p_stride,
659  struct find_id_data *const data,
660  const ulong *id, const unsigned id_stride, const uint n)
661 {
662  struct find_id_work *p, *p_end;
663  const struct find_id_map *q=data->map.ptr, *const q_end = q+data->map.n;
664  const uint np = data->cr->comm.np;
665  int not_found = 0;
666 
667  uint nn;
668  p = array_reserve(struct find_id_work, &data->work, n);
669  for(nn=n;nn;--nn,id=(const ulong*)((const char*)id+id_stride))
670  p->id=*id, p->p=0, p->wp = (*id)%np, ++p;
671  data->work.n=n;
672 
673  /* send to work proc */
674  sarray_transfer(struct find_id_work,&data->work,wp,1,data->cr);
675 
676  /* match id's with rid's */
677  sarray_sort(struct find_id_work,data->work.ptr,data->work.n, id,1,
678  &data->cr->data);
679  for(p=data->work.ptr,p_end=p+data->work.n;p!=p_end;++p) {
680  while(q!=q_end && q->id<p->id) ++q;
681  if(q==q_end) break;
682  if(q->id!=p->id) not_found=1,p->p=UINT_MAX;
683  else p->p=q->p;
684  }
685  for(;p!=p_end;++p) not_found=1,p->p=UINT_MAX;
686 
687  /* send back */
688  sarray_transfer(struct find_id_work,&data->work,wp,0,data->cr);
689 
690  /* map back to user data */
691  sarray_sort(struct find_id_work,data->work.ptr,data->work.n, id,1,
692  &data->cr->data);
693  p = data->work.ptr;
694  for(nn=n;nn;--nn,p_out=(uint*)((char*)p_out+p_stride))
695  *p_out = p->p, ++p;
696 
697  return comm_reduce_int(&data->cr->comm, gs_max, &not_found,1);
698 }
699 
700 
701 /*==========================================================================
702 
703  Read amg.dat, amg_*.dat
704 
705  The function read_data is responsible for reading these files,
706  and creating arrays
707  ids of struct id_data
708  mat[3] of struct gnz (W, AfP, Aff)
709  distributed to the appropriate procs
710 
711  ==========================================================================*/
712 
713 static ulong read_level_data(struct crs_data *data, struct file f)
714 {
715  unsigned i,n; ulong tn; double *buf;
716  int updt=1;
717  int code=0;
718  if(data->comm.id==0) {
719  double hdr; dread(&hdr,1,f,&code);
720  if(code==0) {
721  printf("AMG version %3.2f\n",hdr);
722  if(hdr!=2.01) {
723  printf("Outdates AMG dat files\n");
724  updt=0;
725  }
726  double t; dread(&t,1,f,&code);
727  data->levels = t;
728  printf("AMG: %u levels\n", data->levels);
729  }
730  }
731  comm_bcast(&data->comm, &data->levels,sizeof(unsigned), 0);
732  comm_bcast(&data->comm, &updt,sizeof(int), 0);
733  comm_bcast(&data->comm, &code,sizeof(int), 0);
734  if(!updt||code!=0) die(1);
735 
736  n = data->levels-1;
737  data->cheb_m = tmalloc(unsigned,n);
738  data->cheb_rho = tmalloc(double ,n);
739 
740  buf = tmalloc(double, 2*n+1);
741  if(data->comm.id==0) dread(buf,2*n+1,f,&code);
742  comm_bcast(&data->comm, &code,sizeof(int), 0);
743  if(code!=0) die(1);
744 
745  comm_bcast(&data->comm, buf,(2*n+1)*sizeof(double), 0);
746  for(i=0;i<n;++i) data->cheb_m[i] = buf[i];
747  for(i=0;i<n;++i) data->cheb_rho[i] = buf[n+i];
748  tn = buf[2*n];
749  data->tni = 1/(double)tn;
750  free(buf);
751 
752  if(data->comm.id==0) {
753  printf("AMG Chebyshev smoother data:\n");
754  for(i=0;i<n;++i)
755  printf("AMG level %u: %u iterations with rho = %g\n",
756  i+1, data->cheb_m[i], (double)data->cheb_rho[i]);
757  printf("AMG: %lu rows\n", (unsigned long)tn);
758  }
759 
760  return tn;
761 }
762 
763 static void read_data(
764  struct crs_data *const data,
765  struct array *ids, struct array mat[3],
766  struct crystal *const cr,
767  const ulong *uid, const uint uid_n, const char *datafname)
768 {
769  int code;
770  struct find_id_data fid;
771  const uint pid = data->comm.id;
772  ulong r,tn;
773  struct array read_buffer=null_array;
774  struct array id_buffer = null_array, mat_buffer = null_array;
775  uint *row_lens=0, *id_proc=0, *id_perm=0;
776  struct array mat_proc = null_array;
777  struct file f={0,0}, fm[3]={{0,0},{0,0},{0,0}};
778  unsigned m;
779  ids->ptr=0, ids->n=0, ids->max=0;
780  for(m=0;m<3;++m) mat[m].ptr=0,mat[m].n=0,mat[m].max=0;
781  find_id_setup(&fid, uid,uid_n, cr);
782  code=0;
783  if(pid==0) {
784  char str1[132];
785  char str2[132];
786  char str3[132];
787  char str4[132];
788  sprintf(str1,"%s.amg.dat",datafname);
789  sprintf(str2,"%s.amg_W.dat",datafname);
790  sprintf(str3,"%s.amg_AfP.dat",datafname);
791  sprintf(str4,"%s.amg_Aff.dat",datafname);
792  f = dopen(str1,'r',&code);
793  fm[0] = dopen(str2,'r',&code);
794  fm[1] = dopen(str3,'r',&code);
795  fm[2] = dopen(str4,'r',&code);
796  if(code==0) {
797  array_init(double, &read_buffer, 6*AMG_BLOCK_ROWS);
798  row_lens = tmalloc(uint, 5*AMG_BLOCK_ROWS);
799  id_proc = row_lens + 3*AMG_BLOCK_ROWS;
800  id_perm = id_proc + AMG_BLOCK_ROWS;
801  array_reserve(struct id_data, &id_buffer, AMG_BLOCK_ROWS);
802  }
803  }
804  comm_bcast(&data->comm,&code,sizeof(int),0);
805  if(code!=0) die(1);
806 
807  tn = read_level_data(data,f);
808  for(r=0;r<tn;r+=AMG_BLOCK_ROWS) {
809  unsigned nr = tn-r>AMG_BLOCK_ROWS ? AMG_BLOCK_ROWS : (unsigned)(tn-r);
810  uint mat_size[3]={0,0,0};
811 
812  /* read id data */
813  if(pid==0) {
814  unsigned i;
815  struct id_data *idp = id_buffer.ptr;
816  double *b = read_buffer.ptr;
817  dread(b,nr*6, f,&code);
818  if(code==0) {
819  printf("AMG: reading through row %lu, pass %u/%u\n",
820  (unsigned long)b[(nr-1)*6],
821  (unsigned)(r/AMG_BLOCK_ROWS+1),
822  (unsigned)((tn+AMG_BLOCK_ROWS-1)/AMG_BLOCK_ROWS));
823  for(i=0;i<nr;++i) {
824  idp[i].id = *b++;
825  idp[i].level = (uint)(*b++) - 1;
826  mat_size[0] += row_lens[3*i+0] = *b++; /* W row length */
827  mat_size[1] += row_lens[3*i+1] = *b++; /* AfP row length */
828  mat_size[2] += row_lens[3*i+2] = *b++; /* Aff row length */
829  idp[i].D = *b++;
830  }
831  id_buffer.n=nr;
832 
833  /* sort id_buffer, remember how to undo */
834  sarray_sort(struct id_data,idp,nr, id,1, &cr->data);
835  sarray_perm_invert(id_perm, cr->data.ptr, nr);
836  }
837  } else
838  id_buffer.n=0;
839 
840  comm_bcast(&data->comm,&code,sizeof(int),0);
841  if(code!=0)die(1);
842 
843  /* find who owns each row */
844  if(find_id(id_proc,sizeof(uint), &fid,
845  (const ulong*)((const char*)id_buffer.ptr+offsetof(struct id_data,id)),
846  sizeof(struct id_data), id_buffer.n)) {
847  if(pid==0)
848  fail(1,__FILE__,__LINE__,"AMG: data has more rows than given problem");
849  else die(1);
850  }
851  if(pid==0) { /* undo sorting of id_buffer */
852  buffer_reserve(&cr->data,sizeof(struct id_data)+sizeof(uint));
853  sarray_permute(struct id_data,id_buffer.ptr,nr, id_perm, cr->data.ptr);
854  sarray_permute(uint, id_proc, nr, id_perm, cr->data.ptr);
855  }
856  /* read matrix data */
857  for(m=0;m<3;++m) {
858  if(pid==0) {
859  const struct id_data *const idp = id_buffer.ptr;
860  unsigned i; uint j,rl;
861  struct gnz *p = array_reserve(struct gnz, &mat_buffer, mat_size[m]);
862  double *b = array_reserve(double, &read_buffer, 2*mat_size[m]);
863  uint *proc = array_reserve(uint, &mat_proc, mat_size[m]);
864  dread(b,2*mat_size[m], fm[m],&code);
865  if(code==0) {
866  for(i=0;i<nr;++i) {
867  const ulong i_id = idp[i].id;
868  const uint i_p = id_proc[i];
869  for(rl=row_lens[3*i+m],j=0;j<rl;++j)
870  p->i = i_id, p->j = *b++, p->a = *b++, *proc++ = i_p, ++p;
871  }
872  mat_buffer.n = mat_size[m];
873  }
874  } else
875  mat_buffer.n = 0;
876  comm_bcast(&data->comm,&code,sizeof(int),0);
877  if(code!=0)die(1);
878  sarray_transfer_ext(struct gnz,&mat_buffer,mat_proc.ptr,sizeof(uint),cr);
879  array_cat(struct gnz,&mat[m],mat_buffer.ptr,mat_buffer.n);
880  }
881  /* send id_data to owner */
882  sarray_transfer_ext(struct id_data,&id_buffer,id_proc,sizeof(uint),cr);
883  array_cat(struct id_data,ids,id_buffer.ptr,id_buffer.n);
884  }
885  array_free(&id_buffer);
886  array_free(&mat_buffer);
887  if(pid==0) {
888  array_free(&read_buffer);
889  free(row_lens);
890  array_free(&mat_proc);
891  dclose(fm[2]);
892  dclose(fm[1]);
893  dclose(fm[0]);
894  dclose(f);
895  }
896  find_id_free(&fid);
897 }
898 
899 #ifdef USEMPIIO
900 static void read_data_mpiio(struct crs_data *const data,
901  struct array *ids, struct array mat[3],
902  struct crystal *const cr,
903  const ulong *uid, const uint uid_n, const char *datafname)
904 {
905  int code;
906  struct find_id_data fid;
907  const uint pid = data->comm.id;
908  struct array read_buffer=null_array;
909  struct array id_buffer = null_array, mat_buffer = null_array;
910  uint *row_lens=0, *id_proc=0, *id_perm=0;
911  struct array mat_proc = null_array;
912  ulong scan_in, scan_buf[2];
913  uint mat_size[3] = {0,0,0};
914  unsigned i,m;
915 
916  ids->ptr = 0;
917  ids->n = 0;
918  ids->max = 0;
919  for(m = 0; m < 3; ++m) {
920  mat[m].ptr = 0;
921  mat[m].n = 0;
922  mat[m].max = 0;
923  }
924  find_id_setup(&fid, uid,uid_n, cr);
925  code = 0;
926 
927  struct file f = {0,0};
928  f = dopen("amg.dat", 'r', &code);
929  if(code != 0) die(1);
930  ulong tn = read_level_data(data, f);
931  dclose(f);
932 
933  if(pid==0)
934  printf("AMG: preading through rows ... ");
935 
936  struct sfile fs = {0,0};
937  struct sfile fsm[3] = {{0,0},{0,0},{0,0}};
938  char str1[132];
939  char str2[132];
940  char str3[132];
941  char str4[132];
942  sprintf(str1,"%s.amg.dat",datafname);
943  sprintf(str2,"%s.amg_W.dat",datafname);
944  sprintf(str3,"%s.amg_AfP.dat",datafname);
945  sprintf(str4,"%s.amg_Aff.dat",datafname);
946  fs = dopen_mpi(str1,'r', &data->comm, &code);
947  fsm[0] = dopen_mpi(str2,'r', &data->comm, &code);
948  fsm[1] = dopen_mpi(str3,'r', &data->comm, &code);
949  fsm[2] = dopen_mpi(str4,'r', &data->comm, &code);
950  code = comm_reduce_int(&data->comm, gs_max, &code, 1);
951  if(code != 0)
952  die(1);
953 
954  array_init(double, &read_buffer, 6*AMG_MAX_ROWS);
955  row_lens = tmalloc(uint, 5*AMG_MAX_ROWS);
956  id_proc = row_lens + 3*AMG_MAX_ROWS;
957  id_perm = id_proc + AMG_MAX_ROWS;
958  array_reserve(struct id_data, &id_buffer, AMG_MAX_ROWS);
959 
960  /* assign rows to proc */
961  unsigned nr = tn/data->comm.np;
962  for(i = 0; i < tn%data->comm.np;++i)
963  if(i == pid) nr++;
964 
965  if(nr>AMG_MAX_ROWS) {
966  if(pid==0)
967  fail(1,__FILE__,__LINE__,"AMG: AMG_MAX_ROWS too small!");
968  die(1);
969  }
970 
971  /* read id data */
972  ulong nr_off[2];
973  scan_in = (ulong)nr;
974  comm_scan(nr_off, &data->comm, gs_long, gs_add, &scan_in, 1, scan_buf);
975  const ulong off0 = 1+1+1+2*(data->levels-1)+1;
976  dview_mpi(off0+nr_off[0]*6, fs, &code);
977  code = comm_reduce_int(&data->comm, gs_max, &code,1);
978  if(code != 0)
979  die(1);
980 
981  double *b = read_buffer.ptr;
982  dread_mpi(b, nr*6, fs, &code);
983  code = comm_reduce_int(&data->comm, gs_max, &code,1);
984  if (code != 0)
985  die(1);
986 
987  struct id_data *idp = id_buffer.ptr;
988  for (i = 0; i < nr; ++i) {
989  idp[i].id = *b++;
990  idp[i].level = (uint)(*b++) - 1;
991  mat_size[0] += row_lens[3*i+0] = *b++; /* W row length */
992  mat_size[1] += row_lens[3*i+1] = *b++; /* AfP row length */
993  mat_size[2] += row_lens[3*i+2] = *b++; /* Aff row length */
994  idp[i].D = *b++;
995  }
996  id_buffer.n = nr;
997 
998  /* sort id_buffer, remember how to undo */
999  sarray_sort(struct id_data,idp,nr, id,1, &cr->data);
1000  sarray_perm_invert(id_perm, cr->data.ptr, nr);
1001  if (find_id(id_proc,sizeof(uint), &fid,
1002  (const ulong*)((const char*)id_buffer.ptr+offsetof(struct id_data,id)),
1003  sizeof(struct id_data), id_buffer.n))
1004  code = 1;
1005  if (comm_reduce_int(&data->comm, gs_max, &code, 1)) {
1006  if (pid==0)
1007  fail(1,__FILE__,__LINE__,"AMG: data has more rows than given problem");
1008  die(1);
1009  }
1010 
1011  /* undo sorting of id_buffer */
1012  buffer_reserve(&cr->data,sizeof(struct id_data)+sizeof(uint));
1013  sarray_permute(struct id_data,id_buffer.ptr,nr, id_perm, cr->data.ptr);
1014  sarray_permute(uint, id_proc, nr, id_perm, cr->data.ptr);
1015 
1016  /* read matrix data */
1017  for (m = 0; m < 3; ++m) {
1018  ulong mat_size_off[2];
1019  scan_in = (ulong)mat_size[m];
1020  comm_scan(mat_size_off, &data->comm, gs_long, gs_add, &scan_in, 1, scan_buf);
1021  dview_mpi(1+2*mat_size_off[0], fsm[m], &code);
1022  code = comm_reduce_int(&data->comm, gs_max, &code, 1);
1023  if (code != 0)
1024  die(1);
1025 
1026  double *b = array_reserve(double, &read_buffer, 2*mat_size[m]);
1027  dread_mpi(b, 2*mat_size[m], fsm[m], &code);
1028  code = comm_reduce_int(&data->comm, gs_max, &code, 1);
1029  if (code != 0)
1030  die(1);
1031 
1032  const struct id_data *const idp = id_buffer.ptr;
1033  struct gnz *p = array_reserve(struct gnz, &mat_buffer, mat_size[m]);
1034  uint *proc = array_reserve(uint, &mat_proc, mat_size[m]);
1035  unsigned i;
1036  for (i = 0; i < nr; ++i) {
1037  const ulong i_id = idp[i].id;
1038  const uint i_p = id_proc[i];
1039  unsigned j,rl;
1040  for (rl = row_lens[3*i+m], j=0; j < rl; ++j) {
1041  p->i = i_id;
1042  p->j = *b++;
1043  p->a = *b++;
1044  *proc++ = i_p;
1045  ++p;
1046  }
1047  }
1048  mat_buffer.n = mat_size[m];
1049  sarray_transfer_ext(struct gnz, &mat_buffer, mat_proc.ptr, sizeof(uint), cr);
1050  array_cat(struct gnz, &mat[m], mat_buffer.ptr, mat_buffer.n);
1051  sarray_sort(struct gnz, (struct gnz*)mat[m].ptr, mat[m].n, i, 1, &cr->data);
1052  }
1053 
1054  /* send id_data to owner */
1055  sarray_transfer_ext(struct id_data, &id_buffer, id_proc, sizeof(uint), cr);
1056  array_cat(struct id_data,ids, id_buffer.ptr, id_buffer.n);
1057  sarray_sort(struct id_data, ids->ptr, ids->n, id, 1, &cr->data);
1058 
1059  /* clean-up */
1060  array_free(&id_buffer);
1061  array_free(&mat_buffer);
1062  array_free(&read_buffer);
1063  free(row_lens);
1064  array_free(&mat_proc);
1065  dclose_mpi(fsm[2]);
1066  dclose_mpi(fsm[1]);
1067  dclose_mpi(fsm[0]);
1068  dclose_mpi(fs);
1069  find_id_free(&fid);
1070 
1071  if (pid == 0)
1072  printf("done\n");
1073 }
1074 #endif
1075 
1076 
1077 
1078 /*==========================================================================
1079 
1080  Write amgdmp_*.dat
1081 
1082  The user's matrix is first assembled, then written out.
1083 
1084  ==========================================================================*/
1085 
1086 
1089 
1090 #define rid_equal(a,b) ((a).p==(b).p && (a).i==(b).i)
1091 
1092 /* rnz is a mnemonic for remote non zero */
1093 struct rnz {
1094  double v; struct rid i,j;
1095 };
1096 #define nz_pos_equal(a,b) \
1097  (rid_equal((a).i,(b).i) && rid_equal((a).j,(b).j))
1098 
1099 static void mat_sort(struct array *const mat,
1100  const enum mat_order order, buffer *const buf)
1101 {
1102  switch(order) {
1103  case col_major: sarray_sort_4(struct rnz,mat->ptr,mat->n,
1104  j.p,0,j.i,0, i.p,0,i.i,0, buf); break;
1105  case row_major: sarray_sort_4(struct rnz,mat->ptr,mat->n,
1106  i.p,0,i.i,0, j.p,0,j.i,0, buf); break;
1107  }
1108 }
1109 
1110 /* assumes matrix is already sorted */
1111 static void mat_condense_sorted(struct array *const mat)
1112 {
1113  struct rnz *dst,*src, *const end=(struct rnz*)mat->ptr+mat->n;
1114  if(mat->n<=1) return;
1115  for(dst=mat->ptr;;++dst) {
1116  if(dst+1==end) return;
1117  if(nz_pos_equal(dst[0],dst[1])) break;
1118  }
1119  for(src=dst+1;src!=end;++src) {
1120  if(nz_pos_equal(*dst,*src))
1121  dst->v += src->v;
1122  else
1123  *(++dst) = *src;
1124  }
1125  mat->n = (dst+1)-(struct rnz*)mat->ptr;
1126 }
1127 
1128 static void mat_condense(
1129  struct array *const mat, const enum mat_order order, buffer *const buf)
1130 {
1131  mat_sort(mat,order,buf); mat_condense_sorted(mat);
1132 }
1133 
1134 static void mat_distribute(
1135  struct array *const mat, const enum distr d, const enum mat_order o,
1136  struct crystal *const cr)
1137 {
1138  switch(d) {
1139  case row_distr: mat_condense(mat,row_major,&cr->data);
1140  sarray_transfer(struct rnz,mat, i.p,0, cr); break;
1141  case col_distr: mat_condense(mat,col_major,&cr->data);
1142  sarray_transfer(struct rnz,mat, j.p,0, cr); break;
1143  }
1144  mat_condense(mat,o,&cr->data);
1145 }
1146 
1148  struct rid rid; ulong id;
1149 };
1150 
1151 static void mat_list_nonlocal_sorted(
1152  struct array *const nonlocal_id,
1153  const struct array *const mat, const enum distr d,
1154  const ulong *uid, struct crystal *const cr)
1155 {
1156  const uint pid = cr->comm.id;
1157  struct rid last_rid;
1158  const struct rnz *p, *const e=(const struct rnz*)mat->ptr+mat->n;
1159  uint count; struct labelled_rid *out, *end;
1160  #define BUILD_LIST(k) do { \
1161  last_rid.p=UINT_MAX,last_rid.i=UINT_MAX; \
1162  for(count=0,p=mat->ptr;p!=e;++p) { \
1163  if(p->k.p==pid || rid_equal(last_rid,p->k)) continue; \
1164  last_rid=p->k; ++count; \
1165  } \
1166  array_init(struct labelled_rid, nonlocal_id, count); \
1167  nonlocal_id->n=count; out=nonlocal_id->ptr; \
1168  last_rid.p=UINT_MAX,last_rid.i=UINT_MAX; \
1169  for(p=mat->ptr;p!=e;++p) { \
1170  if(p->k.p==pid || rid_equal(last_rid,p->k)) continue; \
1171  (out++)->rid=last_rid=p->k; \
1172  } \
1173  } while(0)
1174  switch(d) {
1175  case row_distr: BUILD_LIST(j); break;
1176  case col_distr: BUILD_LIST(i); break;
1177  }
1178  #undef BUILD_LIST
1179  sarray_transfer(struct labelled_rid,nonlocal_id,rid.p,1,cr);
1180  for(out=nonlocal_id->ptr,end=out+nonlocal_id->n;out!=end;++out)
1181  out->id=uid[out->rid.i];
1182  sarray_transfer(struct labelled_rid,nonlocal_id,rid.p,1,cr);
1183  sarray_sort_2(struct labelled_rid,nonlocal_id->ptr,nonlocal_id->n,
1184  rid.p,0, rid.i,0, &cr->data);
1185 }
1186 
1187 static void mat_list_nonlocal(
1188  struct array *const nonlocal_id,
1189  struct array *const mat, const enum distr d,
1190  const ulong *uid, struct crystal *const cr)
1191 {
1192  switch(d) {
1193  case row_distr: mat_sort(mat,col_major,&cr->data); break;
1194  case col_distr: mat_sort(mat,row_major,&cr->data); break;
1195  }
1196  mat_list_nonlocal_sorted(nonlocal_id,mat,d,uid,cr);
1197 }
1198 
1199 static uint dump_matrix_setdata(
1200  buffer *const buf, /* output; ok if this is one of cr's buffers */
1201  struct array *const mat, const ulong *const uid,
1202  struct crystal *const cr)
1203 {
1204  const uint pid = cr->comm.id;
1205  struct array nonlocal_id;
1206  double *vi, *vj, *va; uint n;
1207  const struct rnz *nz, *enz;
1208  const struct labelled_rid *rlbl;
1209 
1210  mat_distribute(mat,row_distr,col_major,cr);
1211  n = mat->n;
1212 
1213  mat_list_nonlocal_sorted(&nonlocal_id,mat,row_distr,uid,cr);
1214 
1215  buffer_reserve(buf,3*n*sizeof(double));
1216  vi=buf->ptr, vj=vi+n, va=vj+n;
1217  rlbl = nonlocal_id.ptr;
1218  for(nz=mat->ptr,enz=nz+n;nz!=enz;++nz) {
1219  *vi++ = uid[nz->i.i];
1220  *va++ = nz->v;
1221  if(nz->j.p==pid)
1222  *vj++ = uid[nz->j.i];
1223  else {
1224  const uint jp = nz->j.p, ji = nz->j.i;
1225  while(rlbl->rid.p<jp) ++rlbl;
1226  if(rlbl->rid.p!=jp) printf("dump_matrix: FAIL!!!\n");
1227  while(rlbl->rid.i<ji) ++rlbl;
1228  if(rlbl->rid.i!=ji) printf("dump_matrix: FAIL!!!\n");
1229  *vj++ = rlbl->id;
1230  }
1231  }
1232  array_free(&nonlocal_id);
1233  return n;
1234 }
1235 
1236 static void dump_matrix(
1237  struct array *const mat, const ulong *const uid,
1238  struct crystal *const cr, const char *datafname)
1239 {
1240  const struct comm *comm = &cr->comm;
1241  const uint pid = comm->id, np = comm->np;
1242  buffer *const buf = &cr->data;
1243  struct file fi={0,0}, fj={0,0}, fp={0,0};
1244  uint i,n;
1245  int code;
1246 
1247  n = dump_matrix_setdata(buf, mat,uid,cr);
1248 
1249  code = 0;
1250  if(pid==0) {
1251  char str1[132];
1252  char str2[132];
1253  char str3[132];
1254  sprintf(str1,"%s.amgdmp_i.dat",datafname);
1255  sprintf(str2,"%s.amgdmp_j.dat",datafname);
1256  sprintf(str3,"%s.amgdmp_p.dat",datafname);
1257  fi=dopen(str1,'w',&code);
1258  fj=dopen(str2,'w',&code);
1259  fp=dopen(str3,'w',&code);
1260  }
1261  comm_bcast(comm,&code,sizeof(int),0);
1262  if(code!=0) die(1);
1263 
1264  for(i=0;i<np;++i) {
1265  comm_barrier(comm);
1266  if(pid!=0 && i==pid)
1267  comm_send(comm, &n,sizeof(uint), 0,i),
1268  comm_send(comm, buf->ptr,3*n*sizeof(double), 0,np+i);
1269  else if(pid==0) {
1270  double *v;
1271  printf("AMG writing data from proc %u\n",(unsigned)i),fflush(stdout);
1272  if(i!=0) {
1273  comm_recv(comm, &n,sizeof(uint), i,i);
1274  buffer_reserve(buf,3*n*sizeof(double));
1275  comm_recv(comm, buf->ptr,3*n*sizeof(double), i,np+i);
1276  }
1277  v = buf->ptr;
1278  if(code==0) {
1279  dwrite(fi, v , n,&code);
1280  dwrite(fj, v+ n, n,&code);
1281  dwrite(fp, v+2*n, n,&code);
1282  }
1283  }
1284  }
1285  if(pid==0) dclose(fi),dclose(fj),dclose(fp);
1286  comm_bcast(comm,&code,sizeof(int),0);
1287  if(code!=0) die(1);
1288  comm_barrier(comm);
1289 }
1290 
1291 /* assumes data->comm and data->gs_top are set */
1292 static void amg_dump(
1293  uint n, const ulong *id,
1294  uint nz, const uint *Ai, const uint *Aj, const double *A,
1295  struct crs_data *data, const char *datafname)
1296 {
1297  struct crystal cr;
1298  struct array uid; struct rid *rid_map = tmalloc(struct rid,n);
1299 
1300  struct array mat;
1301  uint k; struct rnz *out;
1302 
1303  crystal_init(&cr, &data->comm);
1304  assign_dofs(&uid,rid_map, id,n,data->comm.id,data->gs_top,&cr.data);
1305 
1306  array_init(struct rnz,&mat,nz);
1307  for(out=mat.ptr,k=0;k<nz;++k) {
1308  uint i=Ai[k], j=Aj[k]; double a=A[k];
1309  if(id[i]==0 || id[j]==0 || fabs(a)==0) continue;
1310  out->v = a, out->i=rid_map[i], out->j=rid_map[j];
1311  ++out;
1312  }
1313  mat.n = out-(struct rnz*)mat.ptr;
1314  free(rid_map);
1315 
1316  dump_matrix(&mat,uid.ptr,&cr,datafname);
1317 
1318  array_free(&uid);
1319  crystal_free(&cr);
1320 }
#define crs_solve
Definition: crs_amg.c:11
#define crs_stats
Definition: crs_amg.c:12
#define AMG_BLOCK_ROWS
Definition: crs_amg.c:16
mat_order
Definition: crs_amg.c:1087
@ row_major
Definition: crs_amg.c:1087
@ col_major
Definition: crs_amg.c:1087
#define crs_setup
Definition: crs_amg.c:10
#define rid_equal(a, b)
Definition: crs_amg.c:1090
#define crs_free
Definition: crs_amg.c:13
#define AMG_MAX_ROWS
Definition: crs_amg.c:20
distr
Definition: crs_amg.c:1088
@ col_distr
Definition: crs_amg.c:1088
@ row_distr
Definition: crs_amg.c:1088
#define BUILD_LIST(k)
#define nz_pos_equal(a, b)
Definition: crs_amg.c:1096
The nomenclature of the interpolating fields saved by Nek into the binary file int_fld is here explained v
Definition: field_list.txt:2
Definition: crs_amg.c:80
struct gs_data * gsh
Definition: crs_amg.c:80
uint nloc
Definition: crs_amg.c:80
double * timing
Definition: crs_amg.c:125
struct gs_data * gs_top
Definition: crs_amg.c:113
uint * lvl_offset
Definition: crs_amg.c:120
struct csr_mat * Aff
Definition: crs_amg.c:123
unsigned * cheb_m
Definition: crs_amg.c:118
uint timing_n
Definition: crs_amg.c:125
struct csr_mat * AfP
Definition: crs_amg.c:123
struct Q * Q_Aff
Definition: crs_amg.c:122
int null_space
Definition: crs_amg.c:116
double * buf
Definition: crs_amg.c:124
struct comm comm
Definition: crs_amg.c:112
uint * umap
Definition: crs_amg.c:114
uint un
Definition: crs_amg.c:114
double * x
Definition: crs_amg.c:124
double * cheb_rho
Definition: crs_amg.c:119
double * Dff
Definition: crs_amg.c:121
double tni
Definition: crs_amg.c:115
struct csr_mat * W
Definition: crs_amg.c:123
struct Q * Q_W
Definition: crs_amg.c:122
double * c_old
Definition: crs_amg.c:124
double * r
Definition: crs_amg.c:124
struct Q * Q_AfP
Definition: crs_amg.c:122
double * b
Definition: crs_amg.c:124
unsigned levels
Definition: crs_amg.c:117
double * c
Definition: crs_amg.c:124
uint n
Definition: crs_xxt.c:204
uint cn
Definition: crs_amg.c:43
double * a
Definition: crs_amg.c:44
uint rn
Definition: crs_amg.c:43
uint * col
Definition: crs_amg.c:43
uint * row_off
Definition: crs_amg.c:43
struct array map
Definition: crs_amg.c:626
struct array work
Definition: crs_amg.c:627
struct crystal * cr
Definition: crs_amg.c:628
uint p
Definition: crs_amg.c:623
ulong id
Definition: crs_amg.c:623
ulong id
Definition: crs_amg.c:655
Definition: crs_amg.c:241
ulong i
Definition: crs_amg.c:241
ulong j
Definition: crs_amg.c:241
double a
Definition: crs_amg.c:241
struct comm comm
Definition: fem_amg_preco.h:43
double D
Definition: crs_amg.c:235
ulong id
Definition: crs_amg.c:236
unsigned level
Definition: crs_amg.c:237
struct rid rid
Definition: crs_amg.c:1148
ulong id
Definition: crs_amg.c:1148
Definition: crs_amg.c:231
uint i
Definition: crs_amg.c:231
uint p
Definition: crs_amg.c:231
Definition: crs_amg.c:1093
struct rid i j
Definition: crs_amg.c:1094
double v
Definition: crs_amg.c:1094