10 #define crs_setup PREFIXED_NAME(crs_xxt_setup)
11 #define crs_solve PREFIXED_NAME(crs_xxt_solve)
12 #define crs_stats PREFIXED_NAME(crs_xxt_stats)
13 #define crs_free PREFIXED_NAME(crs_xxt_free )
26 static unsigned lg(uint
v)
29 #define UINT_BITS (sizeof(uint)*CHAR_BIT)
30 #define BITS(i) ((UINT_BITS+(1<<(i))-1)>>(i))
31 #define MASK(i) ((((uint)1<<BITS(i)) - 1) << BITS(i))
32 #define CHECK(i) if((BITS(i)!=1) && (v&MASK(i))) v>>=BITS(i), r+=BITS(i)
73 static void factor_symbolic(uint n,
const uint *Arp,
const uint *Aj,
76 uint *visit = tmalloc(uint,2*n), *parent = visit+n;
83 uint p=Arp[i], pe=Arp[i+1];
84 visit[i]=i, parent[i]=n;
86 uint j=Aj[p];
if(j>=i)
break;
87 for(;visit[j]!=i;j=parent[j]) {
89 if(parent[j]==n) { parent[j]=i;
break; }
94 Lrp=out->
Lrp=tmalloc(uint,n+1+nz);
99 uint p=Arp[i], pe=Arp[i+1], count=0, *Ljr=&Lj[Lrp[i]];
102 uint j=Aj[p];
if(j>=i)
break;
103 for(;visit[j]!=i;j=parent[j]) Ljr[count++]=j, visit[j]=i;
105 sortv(Ljr, Ljr,count,
sizeof(uint), buf);
106 Lrp[i+1]=Lrp[i]+count;
132 static void factor_numeric(uint n,
const uint *Arp,
const uint *Aj,
135 uint *visit,
double *y)
137 const uint *Lrp=out->
Lrp, *Lj=out->
Lj;
141 D=out->
D=tmalloc(
double,n+Lrp[n]);
145 uint p,pe;
double a=0;
147 for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) {
148 uint j=Lj[p]; y[j]=0, visit[j]=i;
150 for(p=Arp[i],pe=Arp[i+1];p!=pe;++p) {
152 if(j>=i) {
if(j==i) a=A[p];
break; }
155 for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) {
156 uint q,qe,j=Lj[p];
double lij,yj=y[j];
157 for(q=Lrp[j],qe=Lrp[j+1];q!=qe;++q) {
158 uint k=Lj[q];
if(visit[k]==i) yj+=L[q]*y[k];
172 const uint n=fac->
n, *Lrp=fac->
Lrp, *Lj=fac->
Lj;
173 const double *L=fac->
L, *D=fac->
D;
177 for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) xi+=L[p]*x[Lj[p]];
180 for(i=0;i<n;++i) x[i]*=D[i];
183 for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) x[Lj[p]]+=L[p]*xi;
191 const uint n_uints_as_dbls = (n*
sizeof(uint)+
sizeof(
double)-1)/
sizeof(
double);
192 buffer_reserve(buf,(n_uints_as_dbls+n)*
sizeof(
double));
193 factor_symbolic(n,Arp,Aj,out,buf);
194 factor_numeric(n,Arp,Aj,A,out,buf->ptr,n_uints_as_dbls+(
double*)buf->ptr);
200 free(fac->
D); fac->
L =fac->
D =0;
297 static void locate_proc(
struct xxt *data)
299 const uint
id = data->
comm.id;
300 uint
n = data->
comm.np, c=1, odd=0, base=0;
306 if(
id>=base+
n) c|=1, base+=
n,
n+=(odd&1);
309 data->
nsep = level+1;
313 for(level=0;level<data->
plevels;++level) {
315 uint targ =
id - (
n-(odd&1));
316 data->
pother[level]=-(sint)(targ+1);
321 c>>=1,
n=(
n<<1)+(odd&1), odd>>=1;
334 static void discover_sep_sizes(
struct xxt *data,
335 struct array *dofa, buffer *buf)
338 const uint n = dofa->n;
340 unsigned i,lvl; uint j;
341 const struct dof *
dof = dofa->ptr;
343 buffer_reserve(buf,2*ns*
sizeof(
float));
344 v=buf->ptr, recv=
v+ns;
346 for(i=0;i<ns;++i)
v[i]=0;
350 for(lvl=0;lvl<nl;++lvl) {
351 sint other = data->
pother[lvl];
352 unsigned s = ns-(lvl+1);
354 comm_send(&data->
comm,
v +lvl+1,s*
sizeof(
float),-other-1,s);
356 comm_recv(&data->
comm,recv+lvl+1,s*
sizeof(
float),other,s);
357 for(i=lvl+1;i<ns;++i)
v[i]+=recv[i];
362 sint other = data->
pother[--lvl];
363 unsigned s = ns-(lvl+1);
365 comm_recv(&data->
comm,
v+lvl+1,s*
sizeof(
float),-other-1,s);
367 comm_send(&data->
comm,
v+lvl+1,s*
sizeof(
float),other,s);
378 data->
sn=data->
cn-data->
ln;
385 static ulong *unique_nonzero(ulong *A, ulong *Aend)
387 if(Aend==A)
return A;
389 ulong *end = Aend-1, last=*end, *p=A,*q=A,
v=0;
397 if(last!=
v) *p++=last;
402 static void merge_sep_ids(
struct xxt *data, ulong *sep_id, ulong *other,
403 ulong *work,
unsigned s0, buffer *buf)
405 const unsigned ns = data->
nsep;
407 ulong *p=sep_id, *q=other;
411 memcpy(work ,p,size*
sizeof(ulong));
412 memcpy(work+size,q,size*
sizeof(ulong));
413 sortv_long(work, work,2*size,
sizeof(ulong), buf);
414 end = unique_nonzero(work,work+2*size);
415 memcpy(p,work,(end-work)*
sizeof(ulong));
420 static void init_sep_ids(
struct xxt *data,
struct array *dofa, ulong *xid)
422 const unsigned ns=data->
nsep;
423 const uint n=data->
cn, *sep_size=data->
sep_size;
426 const struct dof *
dof = dofa->ptr;
429 for(i=data->
ln;i<n;++i) {
432 memset(xid,0,size*
sizeof(ulong));
434 if(++s != ns) size=data->
sep_size[s];
436 *xid++ =
dof[i].
id, --size;
439 memset(xid,0,size*
sizeof(ulong));
441 if(++s != ns) size=data->
sep_size[s];
445 static void find_perm_x2c(uint ln, uint cn,
const struct array *dofc,
446 uint xn,
const ulong *xid, sint *perm)
448 const struct dof *
dof = dofc->ptr, *dof_end =
dof+cn;
449 const ulong *xid_end = xid+xn; uint i=ln;
451 while(
dof!=dof_end) {
453 while(*xid!=
v) ++xid, *perm++ = -1;
454 *perm++ = i++, ++
dof, ++xid;
456 while(xid!=xid_end) ++xid, *perm++ = -1;
460 static sint *discover_sep_ids(
struct xxt *data,
struct array *dofa, buffer *buf)
463 const uint xn=data->
xn, *sep_size=data->
sep_size;
464 ulong *xid, *recv, *work, *p;
469 size=0;
for(lvl=1;lvl<ns;++lvl)
if(sep_size[lvl]>size) size=sep_size[lvl];
470 xid=tmalloc(ulong,2*xn+2*size), recv=xid+xn, work=recv+xn;
472 init_sep_ids(data,dofa,xid);
477 for(lvl=0;lvl<nl;++lvl) {
478 sint other = data->
pother[lvl];
480 comm_send(&data->
comm,p ,size*
sizeof(ulong),-other-1,size);
482 comm_recv(&data->
comm,recv,size*
sizeof(ulong),other,size);
483 merge_sep_ids(data,p,recv,work,lvl+1,buf);
486 if(ss>=size || lvl==nl-1)
break;
491 sint other = data->
pother[lvl];
493 comm_recv(&data->
comm,p,size*
sizeof(ulong),-other-1,size);
495 comm_send(&data->
comm,p,size*
sizeof(ulong),other,size);
498 p-=ss, size+=ss, --lvl;
502 perm_x2c=tmalloc(sint,xn);
503 find_perm_x2c(data->
ln,data->
cn,dofa, xn,xid, perm_x2c);
509 static void apply_QQt(
struct xxt *data,
double *
v, uint n, uint tag)
511 const unsigned nl=data->
plevels;
512 double *p=
v, *recv=data->
combuf;
513 unsigned lvl, nsend=0;
516 if(n==0 || nl==0)
return;
520 for(lvl=0;lvl<nl;++lvl) {
521 sint other = data->
pother[lvl];
523 comm_send(&data->
comm,p ,size*
sizeof(
double),-other-1,tag);
526 comm_recv(&data->
comm,recv,size*
sizeof(
double),other ,tag);
527 for(i=0;i<size;++i) p[i]+=recv[i];
530 if(ss>=size || lvl==nl-1)
break;
535 sint other = data->
pother[lvl];
537 comm_recv (&data->
comm,p,size*
sizeof(
double),-other-1,tag);
539 comm_isend(&data->
req[nsend++],&data->
comm,
540 p,size*
sizeof(
double),other ,tag);
544 p-=ss, size+=ss, --lvl;
546 if(nsend) comm_wait(data->
req,nsend);
549 static double sum(
struct xxt *data,
double v, uint n, uint tag)
551 const unsigned nl=data->
plevels;
553 unsigned lvl,nsend=0;
557 if(n==0 || nl==0)
return v;
559 for(lvl=0;lvl<nl;++lvl) {
560 sint other = data->
pother[lvl];
562 comm_send(&data->
comm,&
v,
sizeof(
double),-other-1,tag);
564 comm_recv(&data->
comm,&r,
sizeof(
double),other ,tag);
568 if(ss>=size || lvl==nl-1)
break;
573 sint other = data->
pother[lvl];
575 comm_recv (&data->
comm,&
v,
sizeof(
double),-other-1,tag);
577 comm_isend(&data->
req[nsend++],&data->
comm,
578 &
v,
sizeof(
double),other ,tag);
584 if(nsend) comm_wait(data->
req,nsend);
590 static uint unique_ids(uint n,
const ulong *
id, sint *perm, buffer *buf)
592 uint *p, i, un=0; ulong last=0;
593 p = sortp_long(buf,0,
id,n,
sizeof(ulong));
595 uint j = p[i]; ulong
v =
id[j];
598 if(
v!=last) last=
v, ++un;
612 static void discover_dofs(
613 struct xxt *data, uint n,
const ulong *
id,
614 struct array *dofa, buffer *buf,
const struct comm *comm)
616 const uint pcoord = data->
pcoord, ns=data->
nsep;
624 data->
perm_u2c = perm = tmalloc(sint,n);
625 data->
cn = cn = unique_ids(n,
id,perm,buf);
626 array_init(
struct dof,dofa,cn), dofa->n=cn,
dof=dofa->ptr;
627 buffer_reserve(buf,cn*
sizeof(ulong)), bid=buf->ptr;
628 for(i=0;i<n;++i)
if(perm[i]>=0) bid[perm[i]]=
dof[perm[i]].
id=
id[i];
630 gsh = gs_setup((
const slong*)bid,cn,comm,0,gs_crystal_router,0);
631 v = tmalloc(sint,cn);
633 for(i=0;i<cn;++i)
v[i]=pcoord;
634 gs(
v,gs_sint,gs_bpr,0,gsh,buf);
635 for(i=0;i<cn;++i)
dof[i].
level=ns-1-lg((uint)
v[i]);
637 for(i=0;i<cn;++i)
v[i]=1;
638 gs(
v,gs_sint,gs_add,0,gsh,buf);
645 buffer_reserve(buf,2*cn*
sizeof(uint));
646 p = sortp(buf,0, &
dof[0].
level,cn,
sizeof(
struct dof));
647 pi = p+cn;
for(i=0;i<cn;++i) pi[p[i]]=i;
648 for(i=0;i<n;++i)
if(perm[i]>=0) perm[i]=pi[perm[i]];
649 sarray_permute_buf(
struct dof,
dof,cn, buf);
653 static void apply_p_Als(
double *vl,
struct xxt *data,
const double *vs, uint ns)
655 const uint *Arp = data->
A_sl.
Arp,
657 const double *A = data->
A_sl.
A;
660 for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
661 vl[Aj[p]]+=A[p]*vs[i];
665 static void apply_m_Asl(
double *vs, uint ns,
struct xxt *data,
const double *vl)
667 const uint *Arp = data->
A_sl.
Arp,
669 const double *A = data->
A_sl.
A;
672 for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
673 vs[i]-=A[p]*vl[Aj[p]];
677 static void apply_S_col(
double *vs,
struct xxt *data,
678 struct csr_mat *A_ss, uint ei,
double *vl)
680 const uint ln=data->
ln;
681 const uint *Asl_rp = data->
A_sl.
Arp, *Ass_rp = A_ss->
Arp,
682 *Asl_j = data->
A_sl.
Aj, *Ass_j = A_ss->
Aj;
683 const double *Asl = data->
A_sl.
A, *Ass = A_ss->
A;
685 for(i=0;i<ei;++i) vs[i]=0;
686 for(p=Ass_rp[ei],pe=Ass_rp[ei+1];p!=pe;++p) {
691 for(i=0;i<ln;++i) vl[i]=0;
692 for(p=Asl_rp[ei],pe=Asl_rp[ei+1];p!=pe;++p) vl[Asl_j[p]]=-Asl[p];
694 apply_m_Asl(vs,ei,data,vl);
697 static void apply_S(
double *Svs, uint ns,
struct xxt *data,
698 struct csr_mat *A_ss,
const double *vs,
double *vl)
700 const uint ln=data->
ln;
701 const uint *Ass_rp = A_ss->
Arp,
703 const double *Ass = A_ss->
A;
707 for(p=Ass_rp[i],pe=Ass_rp[i+1];p!=pe;++p) {
714 for(i=0;i<ln;++i) vl[i]=0;
715 apply_p_Als(vl,data,vs,ns);
717 apply_m_Asl(Svs,ns,data,vl);
721 static void apply_Xt(
double *vx, uint nx,
const struct xxt *data,
724 const double *X = data->
X;
const uint *Xp = data->
Xp;
725 uint i;
for(i=0;i<nx;++i) vx[i]=tensor_dot(vs,X+Xp[i],Xp[i+1]-Xp[i]);
729 static void apply_X(
double *vs, uint ns,
const struct xxt *data,
730 const double *vx, uint nx)
732 const double *X = data->
X;
const uint *Xp = data->
Xp;
734 for(i=0;i<ns;++i) vs[i]=0;
736 const double v = vx[i];
737 const double *x = X+Xp[i]; uint n=Xp[i+1]-Xp[i];
738 for(j=0;j<n;++j) vs[j]+=x[j]*
v;
742 static void allocate_X(
struct xxt *data, sint *perm_x2c)
747 data->
Xp = tmalloc(uint,xn+1);
750 if(perm_x2c[i]!=-1) ++h;
751 data->
Xp[i+1]=data->
Xp[i]+h;
753 data->
X = tmalloc(
double,data->
Xp[xn]);
756 static void orthogonalize(
struct xxt *data,
struct csr_mat *A_ss,
757 sint *perm_x2c, buffer *buf)
759 uint ln=data->
ln, sn=data->
sn, xn=data->
xn;
760 double *vl, *vs, *vx, *Svs;
763 allocate_X(data,perm_x2c);
765 buffer_reserve(buf,(ln+2*sn+xn)*
sizeof(
double));
766 vl=buf->ptr, vs=vl+ln, Svs=vs+sn, vx=Svs+sn;
770 uint ns=data->
Xp[i+1]-data->
Xp[i];
771 sint ui = perm_x2c[i];
775 for(j=0;j<i;++j) vx[j]=0;
778 apply_S_col(vs, data,A_ss, ui, vl);
779 apply_Xt(vx,i, data, vs);
781 apply_QQt(data,vx,i,xn-i);
782 apply_X(vs,ns, data, vx,i);
784 apply_S(Svs,ns, data,A_ss, vs, vl);
785 ytsy = tensor_dot(vs,Svs,ns);
786 ytsy = sum(data,ytsy,i+1,xn-(i+1));
787 if(ytsy<DBL_EPSILON/128) ytsy=0;
else ytsy = 1/sqrt(ytsy);
788 x=&data->
X[data->
Xp[i]];
789 for(j=0;j<ns;++j) x[j]=ytsy*vs[j];
796 static void condense_matrix(
struct array *mat, uint nr,
797 struct csr_mat *out, buffer *buf)
801 sarray_sort_2(
struct yale_mat,mat->ptr,mat->n,
i,0,
j,0, buf);
804 for(k=0;k+1<nz;++k,++p)
if(p[0].
i==p[1].
i && p[0].
j==p[1].
j)
break;
809 if(
i==q->
i&&
j==q->
j) p->
v += q->
v, --mat->n;
810 else ++p, p->
i=
i=q->
i,p->
j=
j=q->
j, p->
v=q->
v;
816 out->
Arp = tmalloc(uint,nr+1+mat->n);
817 out->
Aj = out->
Arp+nr+1;
818 out->
A = tmalloc(
double,mat->n);
819 for(k=0;k<nr;++k) out->
Arp[k]=0;
820 for(p=mat->ptr,k=0;k<nz;++k,++p)
821 out->
Arp[p->
i]++, out->
Aj[k]=p->
j, out->
A[k]=p->
v;
822 nz=0;
for(k=0;k<=nr;++k) { uint t=out->
Arp[k]; out->
Arp[k]=nz, nz+=t; }
825 static void separate_matrix(
826 uint nz,
const uint *Ai,
const uint *Aj,
const double *A,
827 const sint *perm, uint ln, uint sn,
833 struct array mat_ll, mat_sl, mat_ss;
835 array_init(
struct yale_mat,&mat_ll,2*nz), mll=mat_ll.ptr;
836 array_init(
struct yale_mat,&mat_sl,2*nz), msl=mat_sl.ptr;
837 array_init(
struct yale_mat,&mat_ss,2*nz), mss=mat_ss.ptr;
839 sint
i=perm[Ai[k]],
j=perm[Aj[k]];
840 if(
i<0 ||
j<0 || A[k]==0)
continue;
843 n=mat_ll.n++,mll[n].
i=
i,mll[n].
j=
j,mll[n].
v=A[k];
846 n=mat_sl.n++,msl[n].
i=
i-ln,msl[n].
j=
j,msl[n].
v=A[k];
848 n=mat_ss.n++,mss[n].
i=
i-ln,mss[n].
j=
j-ln,mss[n].
v=A[k];
851 condense_matrix(&mat_ll,ln,out_ll,buf);
852 condense_matrix(&mat_sl,sn,out_sl,buf);
853 condense_matrix(&mat_ss,sn,out_ss,buf);
860 uint n,
const ulong *
id,
861 uint nz,
const uint *Ai,
const uint *Aj,
const double *A,
864 struct xxt *data = tmalloc(
struct xxt,1);
870 comm_dup(&data->
comm,comm);
876 buffer_init(&buf,1024);
878 discover_dofs(data,
n,
id,&dofa,&buf,&data->
comm);
879 discover_sep_sizes(data,&dofa,&buf);
881 perm_x2c = discover_sep_ids(data,&dofa,&buf);
883 uint i;
double count = 0;
struct dof *
dof = dofa.ptr;
887 for(i=0;i<data->
cn;++i)
893 separate_matrix(nz,Ai,Aj,A,data->
perm_u2c,
895 &A_ll,&data->
A_sl,&A_ss,
898 separate_matrix(nz,Ai,Aj,A,data->
perm_u2c,
900 &A_ll,&data->
A_sl,&A_ss,
906 free(A_ll.
Arp); free(A_ll.
A);
908 data->
vl = tmalloc(
double,data->
ln+data->
cn+2*data->
xn);
909 data->
vc = data->
vl+data->
ln;
910 data->
vx = data->
vc+data->
cn;
913 orthogonalize(data,&A_ss,perm_x2c,&buf);
914 free(A_ss.
Arp); free(A_ss.
A);
923 uint cn=data->
cn, un=data->
un, ln=data->
ln, sn=data->
sn, xn=data->
xn;
924 double *vl=data->
vl, *vc=data->
vc, *vx=data->
vx;
926 for(i=0;i<cn;++i) vc[i]=0;
929 if(p>=0) vc[p]+=b[i];
934 apply_m_Asl(vc+ln,sn, data, vc);
935 apply_Xt(vx,xn, data, vc+ln);
936 apply_QQt(data,vx,xn,0);
937 apply_X(vc+ln,sn, data, vx,xn);
938 for(i=0;i<ln;++i) vl[i]=0;
939 apply_p_Als(vl, data, vc+ln,sn);
941 for(i=0;i<ln;++i) vc[i]-=vl[i];
945 if(xn==0) vc[ln-1]=0;
946 else if(sn==1) vc[ln]=0;
952 s = sum(data,s,data->
xn,0);
953 for(i=0;i<cn;++i) vc[i]-=s;
957 x[i] = p>=0 ? vc[p] : 0;
964 if(data->
comm.id==0) {
966 printf(
"xxt: separator sizes on %d =",(
int)data->
comm.id);
967 for(s=0;s<data->
nsep;++s) printf(
" %d",(
int)data->
sep_size[s]);
969 printf(
"xxt: shared dofs on %d = %d\n",(
int)data->
comm.id,(
int)data->
sn);
972 comm_allreduce(&data->
comm,gs_int,gs_max, &a,1, &b);
973 if(data->
comm.id==0) printf(
"xxt: max non-shared dofs = %d\n",a);
975 comm_allreduce(&data->
comm,gs_int,gs_max, &a,1, &b);
976 if(data->
comm.id==0) printf(
"xxt: max shared dofs = %d\n",a);
979 comm_allreduce(&data->
comm,gs_int,gs_max, &a,1, &b);
980 if(data->
comm.id==0) printf(
"xxt: max X cols = %d\n",a);
981 a=data->
Xp[xcol]*
sizeof(double);
982 comm_allreduce(&data->
comm,gs_int,gs_max, &a,1, &b);
983 if(data->
comm.id==0) printf(
"xxt: max X size = %d bytes\n",a);
988 comm_free(&data->
comm);
996 free(data->
Xp); free(data->
X);
void sparse_cholesky_solve(double *x, const struct sparse_cholesky *fac, double *b)
void sparse_cholesky_free(struct sparse_cholesky *fac)
void sparse_cholesky_factor(uint n, const uint *Arp, const uint *Aj, const double *A, struct sparse_cholesky *out, buffer *buf)
The nomenclature of the interpolating fields saved by Nek into the binary file int_fld is here explained v
struct sparse_cholesky fac_A_ll