10 #define crs_setup PREFIXED_NAME(crs_amg_setup)
11 #define crs_solve PREFIXED_NAME(crs_amg_solve)
12 #define crs_stats PREFIXED_NAME(crs_amg_stats)
13 #define crs_free PREFIXED_NAME(crs_amg_free )
15 #ifndef AMG_BLOCK_ROWS
16 # define AMG_BLOCK_ROWS 2400
20 # define AMG_MAX_ROWS 12000
25 static double get_time(
void)
34 static void barrier(
const struct comm *c)
48 static double apply_M(
49 double *z,
const double alpha,
const double *y,
50 const double beta,
const struct csr_mat *
const M,
const double *x)
52 uint i;
const uint rn=M->
rn;
53 const uint *
const row_off = M->
row_off, *col = M->
col;
54 const double *a = M->
a;
55 const double t0 = get_time();
57 uint j;
const uint je=row_off[i+1];
double t = 0;
58 for(j=row_off[i]; j<je; ++j) t += (*a++) * x[*col++];
59 *z++ = alpha*(*y++) + beta*t;
65 static double apply_Mt(
66 double *
const z,
const struct csr_mat *
const M,
const double *x)
68 uint i;
const uint rn=M->
rn,cn=M->
cn;
69 const uint *
const row_off = M->
row_off, *col = M->
col;
70 const double *a = M->
a;
71 const double t0 = get_time();
72 for(i=0;i<cn;++i) z[i]=0;
74 uint j;
const uint je=row_off[i+1];
const double xi = *x++;
75 for(j=row_off[i]; j<je; ++j) z[*col++] += (*a++) * xi;
83 static double apply_Q(
84 double *
const ve,
const struct Q *
const Q,
const double *
const v,
88 memcpy(ve,
v,
Q->
nloc*
sizeof(
double));
89 barrier(
comm); t0=get_time();
90 gs(ve,gs_double,gs_add,0,
Q->
gsh,0);
97 static double apply_Qt(
98 double *z,
const double alpha,
const double *y,
99 const double beta,
const struct Q *
Q,
double *x,
103 uint i;
const uint nloc=
Q->
nloc;
104 barrier(
comm); t0=get_time();
105 gs(x,gs_double,gs_add,1,
Q->
gsh,0);
106 t1 = get_time() - t0;
107 for(i=0;i<nloc;++i) *z++ = alpha*(*y++) + beta*(*x++);
128 static void amg_exec(
struct crs_data *
const amg)
130 unsigned lvl;
const unsigned levels=amg->
levels;
132 double *
const b = amg->
b, *
const x = amg->
x;
133 double *c = amg->
c, *c_old = amg->
c_old, *r = amg->
r;
134 double *timing = amg->
timing;
136 for(lvl=0;lvl<levels-1;++lvl,timing+=6) {
137 double *
const b_l = b+off[lvl], *
const b_lp1 = b+off[lvl+1];
139 timing[1]+=apply_Mt(amg->
buf, &amg->
W[lvl],b_l);
140 timing[0]+=apply_Qt(b_lp1, 1,b_lp1, 1,&amg->
Q_W[lvl],amg->
buf, &amg->
comm);
143 {
const uint i=off[levels-1];
144 if(off[levels]-i) x[i]=amg->
Dff[i]*b[i]; }
145 for(lvl=levels-1;lvl--;) {
146 double *
const b_l = b+off[lvl];
147 double *
const x_l = x+off[lvl], *
const x_lp1 = x+off[lvl+1];
148 const double *
const d_l = amg->
Dff+off[lvl];
149 const uint
n = off[lvl+1]-off[lvl]; uint i;
150 const unsigned m = amg->
cheb_m[lvl];
unsigned ci;
151 double alpha, beta, gamma;
152 timing = amg->
timing + lvl*6;
154 timing[2]+=apply_Q(amg->
buf, &amg->
Q_AfP[lvl],x_lp1, &amg->
comm);
156 timing[3]+=apply_M(x_l, 0,b_l, 1,&amg->
W [lvl], amg->
buf);
158 timing[3]+=apply_M(b_l, 1,b_l, -1,&amg->
AfP[lvl], amg->
buf);
160 for(i=0;i<
n;++i) c[i]=d_l[i]*b_l[i];
162 alpha = amg->
cheb_rho[lvl]/2, alpha*=alpha;
163 gamma = 2*alpha/(1-2*alpha), beta = 1 + gamma;
165 timing[4]+=apply_Q(amg->
buf, &amg->
Q_Aff[lvl],c, &amg->
comm);
166 timing[5]+=apply_M(r, 1,b_l, -1,&amg->
Aff[lvl],amg->
buf);
168 {
double *
const temp = c; c=c_old,c_old=temp; }
169 for(i=0;i<
n;++i) c[i] = beta*(c_old[i]+d_l[i]*r[i]);
171 for(ci=3;ci<=m;++ci) {
172 gamma = alpha*beta, gamma = gamma/(1-gamma), beta = 1 + gamma;
174 timing[4]+=apply_Q(amg->
buf, &amg->
Q_Aff[lvl],c, &amg->
comm);
175 timing[5]+=apply_M(r, 1,b_l, -1,&amg->
Aff[lvl],amg->
buf);
177 {
double *
const temp = c; c=c_old,c_old=temp; }
178 for(i=0;i<
n;++i) c[i] = beta*(c_old[i]+d_l[i]*r[i]) - gamma*c[i];
180 for(i=0;i<
n;++i) x_l[i]+=c[i];
187 uint i;
const uint un = data->
un;
const uint *
const umap = data->
umap;
188 double *
const ub = data->
b, *
const ux = data->
x;
190 gs(b, gs_double,gs_add, 1, data->
gs_top, 0);
191 for(i=0;i<un;++i) ub[i]=b[umap[i]];
196 const double avg = data->
tni*comm_reduce_double(&data->
comm, gs_add, ux,un);
197 for(i=0;i<un;++i) ux[i] -= avg;
200 for(i=0;i<un;++i) x[umap[i]]=ux[i];
201 gs(x, gs_double,gs_add, 0, data->
gs_top, 0);
206 const unsigned lm1 = data->
levels-1;
207 double *avg = tmalloc(
double, 2*6*lm1);
210 for(i=0;i<6*lm1;++i) avg[i] = ni*data->
timing[i];
211 comm_allreduce(&data->
comm,gs_double,gs_add, avg,6*lm1, avg+6*lm1);
212 if(data->
comm.id==0) {
213 double *t = avg;
unsigned lvl;
214 printf(
"AMG stats:\n");
215 for(lvl=0;lvl<lm1;++lvl,t+=6)
216 printf(
" [lvl %02u] comm: Wt=%0.3e W,AfP=%0.3e Aff=%0.3e\n"
217 " [lvl %02u] work: Wt=%0.3e W,AfP=%0.3e Aff=%0.3e\n",
218 lvl, t[0],t[2],t[4], lvl, t[1],t[3],t[5]);
253 static uint *assign_dofs(
struct array *
const uid,
254 struct rid *
const rid_map,
255 const ulong *
const id,
const uint n,
257 struct gs_data *gs_top, buffer *
const buf)
259 uint i,count, *map=0;
260 struct rid *
const rid = rid_map?rid_map:tmalloc(
struct rid,n);
262 gs_vec(n?&
rid[0].
p:0,2, gs_sint,gs_add,0, gs_top,buf);
263 for(count=0,
i=0;
i<n;++
i) {
265 array_init(ulong,uid,count); uid->n=count;
267 map=tmalloc(uint,count);
268 for(count=0,
i=0;
i<n;++
i) {
271 { ulong *
const uid_p = uid->ptr;
273 uid_p[count]=
id[
i],
rid[
i].
i=count, ++count;
275 if(rid_map) gs_vec(n?&
rid[0].
p:0,2, gs_sint,gs_add,0, gs_top,buf);
280 static void localize_rows(
struct gnz *
p, uint nz,
281 const ulong *
const uid,
const uint *
const id_perm)
283 uint
i=0;
while(nz--) {
285 while(uid[
i]<
id) ++
i;
290 static void find_mat_offs(uint *
const off,
const int levels,
291 const struct gnz *
const p,
const uint nz,
292 const struct id_data *
const id)
294 int lvl=-1; uint
i;
for(
i=0;
i<nz;++
i) {
295 const int lvl_i =
id[
p[
i].i].level;
296 while(lvl<lvl_i) off[++lvl]=
i;
298 while(lvl<levels-1) off[++lvl]=
i;
301 static void compress_mat(
302 uint *
const row_off,
const int rn, uint *
const col,
double *
const a,
303 const struct gnz *
const p,
const uint nz)
306 for(r=-1, k=0;k<nz;++k) {
307 const int row =
p[k].i;
308 while(r<row) row_off[++r]=k;
312 while(r<rn) row_off[++r]=k;
315 static void amg_setup_mat(
316 struct array *ids,
const uint nloc,
struct csr_mat *
const mat,
317 const ulong *uid,
const uint uid_n,
const uint *id_perm,
318 const uint i0,
const uint j0,
struct gnz *
const p,
const uint nz,
321 slong *
id = ids->ptr; ulong last_id = ULONG_MAX;
322 const uint ne = ids->n;
324 sarray_sort(
struct gnz,
p,nz, j,1, buf);
326 const ulong j_id =
p[k].j;
327 p[k].i =
p[k].i - i0;
328 if(j_id == last_id) {
p[k].j =
p[k-1].j;
continue; }
331 while(
i<uid_n && uid[
i]<j_id) ++
i;
332 if(
i<uid_n && uid[
i]==j_id) {
p[k].j=id_perm[
i]-j0;
continue; }
334 while(ie<ne && (ulong)(-
id[ie])<j_id) ++ie;
335 if(ie<ne && (ulong)(-
id[ie])==j_id) {
p[k].j=ie;
continue; }
337 id = array_reserve(slong, ids, ids->n+1);
338 id[ids->n] = -(slong)j_id,
p[k].j = ids->n++;
340 sarray_sort_2(
struct gnz,
p,nz,
i,1, j,1, buf);
346 static uint amg_setup_mats(
348 const ulong *
const uid,
const uint uid_n,
349 const uint *
const id_perm,
350 const struct id_data *
const id,
355 struct array ide = null_array; uint max_e=0;
356 const unsigned levels=data->
levels;
359 data->
Q_W = tmalloc(
struct Q, (levels-1)*3);
360 data->
Q_AfP = data->
Q_W + (levels-1);
365 mat_off[0] = tmalloc(uint, levels*3);
366 mat_off[1] = mat_off[0]+levels;
367 mat_off[2] = mat_off[1]+levels;
370 localize_rows(mat[m].ptr,mat[m].
n, uid,id_perm);
372 sarray_sort(
struct gnz,mat[m].ptr,mat[m].
n, i,1, buf);
374 find_mat_offs(mat_off[m],levels, mat[m].ptr,mat[m].
n,
id);
378 uint *ui = tmalloc(uint, 3*((off[levels-1]-off[0])+(levels-1))
379 +mat[0].
n+mat[1].
n+mat[2].
n);
380 double *
a = tmalloc(
double, mat[0].
n+mat[1].
n+mat[2].
n);
381 for(m=0;m<3;++m)
for(lvl=0;lvl<levels-1;++lvl) {
382 const uint
rn = off[lvl+1]-off[lvl];
383 const uint nz = mat_off[m][lvl+1]-mat_off[m][lvl];
390 for(lvl=0;lvl<levels-1;++lvl) {
391 uint i;
const uint j0=off[lvl+1], nloc = off[levels]-j0;
392 slong *p = array_reserve(slong, &ide, nloc);
393 for(i=0;i<nloc;++i) p[i] =
id[i+j0].
id;
395 amg_setup_mat(&ide,nloc, &data->
W[lvl],
396 uid, uid_n, id_perm, off[lvl],j0,
397 (
struct gnz*)mat[0].ptr + mat_off[0][lvl],
398 mat_off[0][lvl+1]-mat_off[0][lvl], buf);
399 data->
Q_W[lvl].
gsh = gs_setup(ide.ptr,ide.n, &data->
comm, 0,gs_auto,1);
400 amg_setup_mat(&ide,nloc, &data->
AfP[lvl],
401 uid, uid_n, id_perm, off[lvl],j0,
402 (
struct gnz*)mat[1].ptr + mat_off[1][lvl],
403 mat_off[1][lvl+1]-mat_off[1][lvl], buf);
404 data->
Q_AfP[lvl].
gsh = gs_setup(ide.ptr,ide.n, &data->
comm, 0,gs_auto,1);
405 if(ide.n>max_e) max_e=ide.n;
407 for(lvl=0;lvl<levels-1;++lvl) {
408 uint i;
const uint j0=off[lvl], nloc = off[lvl+1]-j0;
409 slong *p = array_reserve(slong, &ide, nloc);
410 for(i=0;i<nloc;++i) p[i] =
id[i+j0].
id;
412 amg_setup_mat(&ide,nloc, &data->
Aff[lvl],
413 uid, uid_n, id_perm, j0,j0,
414 (
struct gnz*)mat[2].ptr + mat_off[2][lvl],
415 mat_off[2][lvl+1]-mat_off[2][lvl], buf);
416 data->
Q_Aff[lvl].
gsh = gs_setup(ide.ptr,ide.n, &data->
comm, 0,gs_auto,1);
417 if(ide.n>max_e) max_e=ide.n;
424 static uint *compute_offsets(
425 const struct id_data *
const id,
const uint
n,
428 uint i, *off = tmalloc(uint, levels+1);
431 const int lvl_i =
id[i].level;
432 while(lvl<lvl_i) off[++lvl]=i;
434 while(lvl<levels) off[++lvl]=i;
438 static void read_data(
440 struct array *ids,
struct array mats[3],
441 struct crystal *
const cr,
442 const ulong *uid,
const uint uid_n,
const char *datafname);
444 static void read_data_mpiio(
446 struct array *ids,
struct array mats[3],
447 struct crystal *
const cr,
448 const ulong *uid,
const uint uid_n,
const char *datafname);
450 static void amg_setup_aux(
struct crs_data *data, uint
n,
const ulong *
id,
const char *datafname)
453 struct array uid; uint *id_perm;
454 struct array ids, mat[3];
455 uint max_e; ulong temp_long;
457 crystal_init(&cr, &data->
comm);
458 data->
umap = assign_dofs(&uid,0,
id,n,data->
comm.id,data->
gs_top,&cr.data);
461 sortp_long(&cr.data,0, uid.ptr,uid.n,
sizeof(ulong));
462 sarray_permute(ulong,uid.ptr ,uid.n, cr.data.ptr, &temp_long);
463 sarray_permute(uint ,data->
umap,uid.n, cr.data.ptr, &max_e);
466 read_data_mpiio(data, &ids, mat, &cr, uid.ptr,uid.n, datafname);
468 read_data(data, &ids, mat, &cr, uid.ptr,uid.n, datafname);
473 {
int not_happy = ids.n==uid.n ? 0 : 1;
474 if(comm_reduce_int(&data->
comm, gs_max, ¬_happy,1)) {
475 comm_barrier(&data->
comm);
477 fail(1,__FILE__,__LINE__,
"AMG: missing data for some rows");
482 sarray_sort(
struct id_data,ids.ptr,ids.n,
id,1, &cr.data);
483 sarray_sort(
struct id_data,ids.ptr,ids.n, level,0, &cr.data);
484 sarray_permute(uint,data->
umap,ids.n, cr.data.ptr, &max_e);
485 id_perm = tmalloc(uint, uid.n);
486 sarray_perm_invert(id_perm, cr.data.ptr, uid.n);
492 max_e = amg_setup_mats(data,uid.ptr,uid.n,id_perm,ids.ptr,mat, &cr.data);
494 {
const unsigned levels=data->
levels;
496 struct id_data *
const id = ids.ptr;
const uint n = ids.n;
499 for(i=0;i<levels-1;++i) {
500 const uint nf = off[i+1]-off[i];
501 if(nf>max_f) max_f=nf;
503 d = data->
Dff = tmalloc(
double, 3*n + 3*max_f + max_e + 6*(levels-1));
504 data->
x = data->
Dff + n;
505 data->
b = data->
x + n;
506 data->
c = data->
b + n;
507 data->
c_old = data->
c + max_f;
508 data->
r = data->
c_old + max_f;
509 data->
buf = data->
r + max_f;
511 for(i=0;i<n;++i) d[i]=
id[i].
D;
512 for(i=0;i<6*(levels-1);++i) data->
timing[i]=0;
525 static void amg_dump(
526 uint n,
const ulong *
id,
527 uint nz,
const uint *Ai,
const uint *Aj,
const double *A,
528 struct crs_data *data,
const char *datafname);
531 uint n,
const ulong *
id,
532 uint nz,
const uint *Ai,
const uint *Aj,
const double *A,
539 comm_dup(&data->
comm,comm);
542 if(data->
comm.id==0) {
547 sprintf(str1,
"%s.amg.dat",datafname);
548 sprintf(str2,
"%s.amg_W.dat",datafname);
549 sprintf(str3,
"%s.amg_AfP.dat",datafname);
550 sprintf(str4,
"%s.amg_Aff.dat",datafname);
551 if(stat(str1,&info)!=0) dump++;
552 if(stat(str2,&info)!=0) dump++;
553 if(stat(str3,&info)!=0) dump++;
554 if(stat(str4,&info)!=0) dump++;
555 if(dump!=0) printf(
"AMG files not found, dumping data ...\n"), fflush(stdout);
557 comm_bcast(&data->
comm,&dump,
sizeof(
int),0);
559 data->
gs_top = gs_setup((
const slong*)
id,n, &data->
comm, 1,
560 dump?gs_crystal_router:gs_auto, !dump);
563 amg_dump(n,
id,nz,Ai,Aj,A,data,datafname);
566 if(data->
comm.id==0) {
567 printf(
"AMG dump successful. Run AMG setup tool to continue.\n");
570 comm_barrier(&data->
comm);
571 comm_free(&data->
comm);
576 amg_setup_aux(data, n,
id, datafname);
584 const unsigned levels = data->
levels;
590 for(lvl=0;lvl<levels-1;++lvl)
593 gs_free(data->
Q_W[lvl].
gsh);
608 comm_free(&data->
comm);
631 static void find_id_setup(
633 const ulong *
id, uint n,
634 struct crystal *
const cr)
636 const uint np = cr->comm.np;
643 for(q=data->
map.ptr,i=0;i<n;++i)
644 q[i].
id =
id[i], q[i].
p =
id[i] % np;
649 static void find_id_free(
struct find_id_data *
const data)
651 array_free(&data->
map);
652 array_free(&data->
work);
658 uint *p_out,
const unsigned p_stride,
660 const ulong *
id,
const unsigned id_stride,
const uint n)
664 const uint np = data->
cr->comm.np;
669 for(nn=n;nn;--nn,
id=(
const ulong*)((
const char*)
id+id_stride))
670 p->id=*
id,
p->p=0,
p->wp = (*
id)%np, ++
p;
679 for(
p=data->
work.ptr,p_end=
p+data->
work.n;
p!=p_end;++
p) {
680 while(q!=q_end && q->
id<
p->id) ++q;
682 if(q->
id!=
p->id) not_found=1,
p->p=UINT_MAX;
685 for(;
p!=p_end;++
p) not_found=1,
p->p=UINT_MAX;
694 for(nn=n;nn;--nn,p_out=(uint*)((
char*)p_out+p_stride))
697 return comm_reduce_int(&data->
cr->comm, gs_max, ¬_found,1);
713 static ulong read_level_data(
struct crs_data *data,
struct file f)
715 unsigned i,n; ulong tn;
double *buf;
718 if(data->
comm.id==0) {
719 double hdr; dread(&hdr,1,f,&code);
721 printf(
"AMG version %3.2f\n",hdr);
723 printf(
"Outdates AMG dat files\n");
726 double t; dread(&t,1,f,&code);
728 printf(
"AMG: %u levels\n", data->
levels);
731 comm_bcast(&data->
comm, &data->
levels,
sizeof(
unsigned), 0);
732 comm_bcast(&data->
comm, &updt,
sizeof(
int), 0);
733 comm_bcast(&data->
comm, &code,
sizeof(
int), 0);
734 if(!updt||code!=0) die(1);
737 data->
cheb_m = tmalloc(
unsigned,n);
738 data->
cheb_rho = tmalloc(
double ,n);
740 buf = tmalloc(
double, 2*n+1);
741 if(data->
comm.id==0) dread(buf,2*n+1,f,&code);
742 comm_bcast(&data->
comm, &code,
sizeof(
int), 0);
745 comm_bcast(&data->
comm, buf,(2*n+1)*
sizeof(
double), 0);
746 for(i=0;i<n;++i) data->
cheb_m[i] = buf[i];
747 for(i=0;i<n;++i) data->
cheb_rho[i] = buf[n+i];
749 data->
tni = 1/(double)tn;
752 if(data->
comm.id==0) {
753 printf(
"AMG Chebyshev smoother data:\n");
755 printf(
"AMG level %u: %u iterations with rho = %g\n",
757 printf(
"AMG: %lu rows\n", (
unsigned long)tn);
763 static void read_data(
765 struct array *ids,
struct array mat[3],
766 struct crystal *
const cr,
767 const ulong *uid,
const uint uid_n,
const char *datafname)
771 const uint pid = data->
comm.id;
773 struct array read_buffer=null_array;
774 struct array id_buffer = null_array, mat_buffer = null_array;
775 uint *row_lens=0, *id_proc=0, *id_perm=0;
776 struct array mat_proc = null_array;
777 struct file f={0,0}, fm[3]={{0,0},{0,0},{0,0}};
779 ids->ptr=0, ids->n=0, ids->max=0;
780 for(m=0;m<3;++m) mat[m].ptr=0,mat[m].n=0,mat[m].max=0;
781 find_id_setup(&fid, uid,uid_n, cr);
788 sprintf(str1,
"%s.amg.dat",datafname);
789 sprintf(str2,
"%s.amg_W.dat",datafname);
790 sprintf(str3,
"%s.amg_AfP.dat",datafname);
791 sprintf(str4,
"%s.amg_Aff.dat",datafname);
792 f = dopen(str1,
'r',&code);
793 fm[0] = dopen(str2,
'r',&code);
794 fm[1] = dopen(str3,
'r',&code);
795 fm[2] = dopen(str4,
'r',&code);
804 comm_bcast(&data->
comm,&code,
sizeof(
int),0);
807 tn = read_level_data(data,f);
810 uint mat_size[3]={0,0,0};
815 struct id_data *idp = id_buffer.ptr;
816 double *b = read_buffer.ptr;
817 dread(b,nr*6, f,&code);
819 printf(
"AMG: reading through row %lu, pass %u/%u\n",
820 (
unsigned long)b[(nr-1)*6],
825 idp[i].
level = (uint)(*b++) - 1;
826 mat_size[0] += row_lens[3*i+0] = *b++;
827 mat_size[1] += row_lens[3*i+1] = *b++;
828 mat_size[2] += row_lens[3*i+2] = *b++;
834 sarray_sort(
struct id_data,idp,nr,
id,1, &cr->data);
835 sarray_perm_invert(id_perm, cr->data.ptr, nr);
840 comm_bcast(&data->
comm,&code,
sizeof(
int),0);
844 if(find_id(id_proc,
sizeof(uint), &fid,
845 (
const ulong*)((
const char*)id_buffer.ptr+offsetof(
struct id_data,
id)),
846 sizeof(
struct id_data), id_buffer.n)) {
848 fail(1,__FILE__,__LINE__,
"AMG: data has more rows than given problem");
852 buffer_reserve(&cr->data,
sizeof(
struct id_data)+
sizeof(uint));
853 sarray_permute(
struct id_data,id_buffer.ptr,nr, id_perm, cr->data.ptr);
854 sarray_permute(uint, id_proc, nr, id_perm, cr->data.ptr);
859 const struct id_data *
const idp = id_buffer.ptr;
860 unsigned i; uint j,rl;
861 struct gnz *p = array_reserve(
struct gnz, &mat_buffer, mat_size[m]);
862 double *b = array_reserve(
double, &read_buffer, 2*mat_size[m]);
863 uint *proc = array_reserve(uint, &mat_proc, mat_size[m]);
864 dread(b,2*mat_size[m], fm[m],&code);
867 const ulong i_id = idp[
i].
id;
868 const uint i_p = id_proc[
i];
869 for(rl=row_lens[3*
i+m],
j=0;
j<rl;++
j)
870 p->
i = i_id, p->
j = *b++, p->
a = *b++, *proc++ = i_p, ++p;
872 mat_buffer.n = mat_size[m];
876 comm_bcast(&data->
comm,&code,
sizeof(
int),0);
878 sarray_transfer_ext(
struct gnz,&mat_buffer,mat_proc.ptr,
sizeof(uint),cr);
879 array_cat(
struct gnz,&mat[m],mat_buffer.ptr,mat_buffer.n);
882 sarray_transfer_ext(
struct id_data,&id_buffer,id_proc,
sizeof(uint),cr);
883 array_cat(
struct id_data,ids,id_buffer.ptr,id_buffer.n);
885 array_free(&id_buffer);
886 array_free(&mat_buffer);
888 array_free(&read_buffer);
890 array_free(&mat_proc);
900 static void read_data_mpiio(
struct crs_data *
const data,
901 struct array *ids,
struct array mat[3],
902 struct crystal *
const cr,
903 const ulong *uid,
const uint uid_n,
const char *datafname)
907 const uint pid = data->
comm.id;
908 struct array read_buffer=null_array;
909 struct array id_buffer = null_array, mat_buffer = null_array;
910 uint *row_lens=0, *id_proc=0, *id_perm=0;
911 struct array mat_proc = null_array;
912 ulong scan_in, scan_buf[2];
913 uint mat_size[3] = {0,0,0};
919 for(m = 0; m < 3; ++m) {
924 find_id_setup(&fid, uid,uid_n, cr);
927 struct file f = {0,0};
928 f = dopen(
"amg.dat",
'r', &code);
929 if(code != 0) die(1);
930 ulong tn = read_level_data(data, f);
934 printf(
"AMG: preading through rows ... ");
936 struct sfile fs = {0,0};
937 struct sfile fsm[3] = {{0,0},{0,0},{0,0}};
942 sprintf(str1,
"%s.amg.dat",datafname);
943 sprintf(str2,
"%s.amg_W.dat",datafname);
944 sprintf(str3,
"%s.amg_AfP.dat",datafname);
945 sprintf(str4,
"%s.amg_Aff.dat",datafname);
946 fs = dopen_mpi(str1,
'r', &data->
comm, &code);
947 fsm[0] = dopen_mpi(str2,
'r', &data->
comm, &code);
948 fsm[1] = dopen_mpi(str3,
'r', &data->
comm, &code);
949 fsm[2] = dopen_mpi(str4,
'r', &data->
comm, &code);
950 code = comm_reduce_int(&data->
comm, gs_max, &code, 1);
961 unsigned nr = tn/data->
comm.np;
962 for(i = 0; i < tn%data->
comm.np;++i)
967 fail(1,__FILE__,__LINE__,
"AMG: AMG_MAX_ROWS too small!");
974 comm_scan(nr_off, &data->
comm, gs_long, gs_add, &scan_in, 1, scan_buf);
975 const ulong off0 = 1+1+1+2*(data->
levels-1)+1;
976 dview_mpi(off0+nr_off[0]*6, fs, &code);
977 code = comm_reduce_int(&data->
comm, gs_max, &code,1);
981 double *b = read_buffer.ptr;
982 dread_mpi(b, nr*6, fs, &code);
983 code = comm_reduce_int(&data->
comm, gs_max, &code,1);
987 struct id_data *idp = id_buffer.ptr;
988 for (i = 0; i < nr; ++i) {
990 idp[i].
level = (uint)(*b++) - 1;
991 mat_size[0] += row_lens[3*i+0] = *b++;
992 mat_size[1] += row_lens[3*i+1] = *b++;
993 mat_size[2] += row_lens[3*i+2] = *b++;
999 sarray_sort(
struct id_data,idp,nr,
id,1, &cr->data);
1000 sarray_perm_invert(id_perm, cr->data.ptr, nr);
1001 if (find_id(id_proc,
sizeof(uint), &fid,
1002 (
const ulong*)((
const char*)id_buffer.ptr+offsetof(
struct id_data,
id)),
1003 sizeof(
struct id_data), id_buffer.n))
1005 if (comm_reduce_int(&data->
comm, gs_max, &code, 1)) {
1007 fail(1,__FILE__,__LINE__,
"AMG: data has more rows than given problem");
1012 buffer_reserve(&cr->data,
sizeof(
struct id_data)+
sizeof(uint));
1013 sarray_permute(
struct id_data,id_buffer.ptr,nr, id_perm, cr->data.ptr);
1014 sarray_permute(uint, id_proc, nr, id_perm, cr->data.ptr);
1017 for (m = 0; m < 3; ++m) {
1018 ulong mat_size_off[2];
1019 scan_in = (ulong)mat_size[m];
1020 comm_scan(mat_size_off, &data->
comm, gs_long, gs_add, &scan_in, 1, scan_buf);
1021 dview_mpi(1+2*mat_size_off[0], fsm[m], &code);
1022 code = comm_reduce_int(&data->
comm, gs_max, &code, 1);
1026 double *b = array_reserve(
double, &read_buffer, 2*mat_size[m]);
1027 dread_mpi(b, 2*mat_size[m], fsm[m], &code);
1028 code = comm_reduce_int(&data->
comm, gs_max, &code, 1);
1032 const struct id_data *
const idp = id_buffer.ptr;
1033 struct gnz *p = array_reserve(
struct gnz, &mat_buffer, mat_size[m]);
1034 uint *proc = array_reserve(uint, &mat_proc, mat_size[m]);
1036 for (
i = 0;
i < nr; ++
i) {
1037 const ulong i_id = idp[
i].
id;
1038 const uint i_p = id_proc[
i];
1040 for (rl = row_lens[3*
i+m],
j=0;
j < rl; ++
j) {
1048 mat_buffer.n = mat_size[m];
1049 sarray_transfer_ext(
struct gnz, &mat_buffer, mat_proc.ptr,
sizeof(uint), cr);
1050 array_cat(
struct gnz, &mat[m], mat_buffer.ptr, mat_buffer.n);
1051 sarray_sort(
struct gnz, (
struct gnz*)mat[m].ptr, mat[m].n,
i, 1, &cr->data);
1055 sarray_transfer_ext(
struct id_data, &id_buffer, id_proc,
sizeof(uint), cr);
1056 array_cat(
struct id_data,ids, id_buffer.ptr, id_buffer.n);
1057 sarray_sort(
struct id_data, ids->ptr, ids->n,
id, 1, &cr->data);
1060 array_free(&id_buffer);
1061 array_free(&mat_buffer);
1062 array_free(&read_buffer);
1064 array_free(&mat_proc);
1090 #define rid_equal(a,b) ((a).p==(b).p && (a).i==(b).i)
1096 #define nz_pos_equal(a,b) \
1097 (rid_equal((a).i,(b).i) && rid_equal((a).j,(b).j))
1099 static void mat_sort(
struct array *
const mat,
1100 const enum mat_order order, buffer *
const buf)
1103 case col_major: sarray_sort_4(
struct rnz,mat->ptr,mat->n,
1104 j.p,0,j.i,0,
i.p,0,
i.i,0, buf);
break;
1105 case row_major: sarray_sort_4(
struct rnz,mat->ptr,mat->n,
1106 i.p,0,
i.i,0, j.p,0,j.i,0, buf);
break;
1111 static void mat_condense_sorted(
struct array *
const mat)
1113 struct rnz *dst,*src, *
const end=(
struct rnz*)mat->ptr+mat->n;
1114 if(mat->n<=1)
return;
1115 for(dst=mat->ptr;;++dst) {
1116 if(dst+1==end)
return;
1119 for(src=dst+1;src!=end;++src) {
1125 mat->n = (dst+1)-(
struct rnz*)mat->ptr;
1128 static void mat_condense(
1129 struct array *
const mat,
const enum mat_order order, buffer *
const buf)
1131 mat_sort(mat,order,buf); mat_condense_sorted(mat);
1134 static void mat_distribute(
1135 struct array *
const mat,
const enum distr d,
const enum mat_order o,
1136 struct crystal *
const cr)
1140 sarray_transfer(
struct rnz,mat, i.p,0, cr);
break;
1142 sarray_transfer(
struct rnz,mat,
j.
p,0, cr);
break;
1144 mat_condense(mat,o,&cr->data);
1151 static void mat_list_nonlocal_sorted(
1152 struct array *
const nonlocal_id,
1153 const struct array *
const mat,
const enum distr d,
1154 const ulong *uid,
struct crystal *
const cr)
1156 const uint pid = cr->comm.id;
1157 struct rid last_rid;
1158 const struct rnz *p, *
const e=(
const struct rnz*)mat->ptr+mat->n;
1161 last_rid.p=UINT_MAX,last_rid.i=UINT_MAX; \
1162 for(count=0,p=mat->ptr;p!=e;++p) { \
1163 if(p->k.p==pid ||
rid_equal(last_rid,p->k)) continue; \
1164 last_rid=p->k; ++count; \
1167 nonlocal_id->n=count; out=nonlocal_id->ptr; \
1168 last_rid.p=UINT_MAX,last_rid.i=UINT_MAX; \
1169 for(p=mat->ptr;p!=e;++p) { \
1170 if(p->k.p==pid ||
rid_equal(last_rid,p->k)) continue; \
1171 (out++)->
rid=last_rid=p->k; \
1180 for(out=nonlocal_id->ptr,end=out+nonlocal_id->n;out!=end;++out)
1181 out->id=uid[out->rid.i];
1183 sarray_sort_2(
struct labelled_rid,nonlocal_id->ptr,nonlocal_id->n,
1187 static void mat_list_nonlocal(
1188 struct array *
const nonlocal_id,
1189 struct array *
const mat,
const enum distr d,
1190 const ulong *uid,
struct crystal *
const cr)
1196 mat_list_nonlocal_sorted(nonlocal_id,mat,d,uid,cr);
1199 static uint dump_matrix_setdata(
1201 struct array *
const mat,
const ulong *
const uid,
1202 struct crystal *
const cr)
1204 const uint pid = cr->comm.id;
1205 struct array nonlocal_id;
1206 double *vi, *vj, *va; uint n;
1207 const struct rnz *nz, *enz;
1213 mat_list_nonlocal_sorted(&nonlocal_id,mat,
row_distr,uid,cr);
1215 buffer_reserve(buf,3*n*
sizeof(
double));
1216 vi=buf->ptr, vj=vi+n, va=vj+n;
1217 rlbl = nonlocal_id.ptr;
1218 for(nz=mat->ptr,enz=nz+n;nz!=enz;++nz) {
1219 *vi++ = uid[nz->i.i];
1222 *vj++ = uid[nz->
j.
i];
1224 const uint jp = nz->
j.
p, ji = nz->
j.
i;
1225 while(rlbl->
rid.
p<jp) ++rlbl;
1226 if(rlbl->
rid.
p!=jp) printf(
"dump_matrix: FAIL!!!\n");
1227 while(rlbl->
rid.
i<ji) ++rlbl;
1228 if(rlbl->
rid.
i!=ji) printf(
"dump_matrix: FAIL!!!\n");
1232 array_free(&nonlocal_id);
1236 static void dump_matrix(
1237 struct array *
const mat,
const ulong *
const uid,
1238 struct crystal *
const cr,
const char *datafname)
1240 const struct comm *comm = &cr->comm;
1241 const uint pid = comm->id, np = comm->np;
1242 buffer *
const buf = &cr->data;
1243 struct file fi={0,0}, fj={0,0}, fp={0,0};
1247 n = dump_matrix_setdata(buf, mat,uid,cr);
1254 sprintf(str1,
"%s.amgdmp_i.dat",datafname);
1255 sprintf(str2,
"%s.amgdmp_j.dat",datafname);
1256 sprintf(str3,
"%s.amgdmp_p.dat",datafname);
1257 fi=dopen(str1,
'w',&code);
1258 fj=dopen(str2,
'w',&code);
1259 fp=dopen(str3,
'w',&code);
1261 comm_bcast(comm,&code,
sizeof(
int),0);
1266 if(pid!=0 && i==pid)
1267 comm_send(comm, &n,
sizeof(uint), 0,i),
1268 comm_send(comm, buf->ptr,3*n*
sizeof(
double), 0,np+i);
1271 printf(
"AMG writing data from proc %u\n",(
unsigned)i),fflush(stdout);
1273 comm_recv(comm, &n,
sizeof(uint), i,i);
1274 buffer_reserve(buf,3*n*
sizeof(
double));
1275 comm_recv(comm, buf->ptr,3*n*
sizeof(
double), i,np+i);
1279 dwrite(fi,
v , n,&code);
1280 dwrite(fj,
v+ n, n,&code);
1281 dwrite(fp,
v+2*n, n,&code);
1285 if(pid==0) dclose(fi),dclose(fj),dclose(fp);
1286 comm_bcast(comm,&code,
sizeof(
int),0);
1292 static void amg_dump(
1293 uint n,
const ulong *
id,
1294 uint nz,
const uint *Ai,
const uint *Aj,
const double *A,
1295 struct crs_data *data,
const char *datafname)
1298 struct array uid;
struct rid *rid_map = tmalloc(
struct rid,n);
1301 uint k;
struct rnz *out;
1303 crystal_init(&cr, &data->
comm);
1304 assign_dofs(&uid,rid_map,
id,n,data->
comm.id,data->
gs_top,&cr.data);
1306 array_init(
struct rnz,&mat,nz);
1307 for(out=mat.ptr,k=0;k<nz;++k) {
1308 uint i=Ai[k],
j=Aj[k];
double a=A[k];
1309 if(
id[i]==0 ||
id[
j]==0 || fabs(a)==0)
continue;
1310 out->
v = a, out->i=rid_map[i], out->
j=rid_map[
j];
1313 mat.n = out-(
struct rnz*)mat.ptr;
1316 dump_matrix(&mat,uid.ptr,&cr,datafname);
#define nz_pos_equal(a, b)
The nomenclature of the interpolating fields saved by Nek into the binary file int_fld is here explained v