KTH framework for Nek5000 toolboxes; testing version  0.0.1
crs_xxt.c
Go to the documentation of this file.
1 #include <stddef.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <limits.h>
5 #include <float.h>
6 #include <string.h>
7 #include <math.h>
8 #include "gslib.h"
9 
10 #define crs_setup PREFIXED_NAME(crs_xxt_setup)
11 #define crs_solve PREFIXED_NAME(crs_xxt_solve)
12 #define crs_stats PREFIXED_NAME(crs_xxt_stats)
13 #define crs_free PREFIXED_NAME(crs_xxt_free )
14 
15 /*
16  portable log base 2
17 
18  does a binary search to find leading order bit
19 
20  UINT_BITS = number of bits in a uint
21  BITS(0) = UINT_BITS
22  BITS(i) = half of BITS(i-1), rounded up
23  MASK(i) = bitmask with BITS(i) 1's followed by BITS(i) 0's
24 */
25 
26 static unsigned lg(uint v)
27 {
28  unsigned r = 0;
29 #define UINT_BITS (sizeof(uint)*CHAR_BIT)
30 #define BITS(i) ((UINT_BITS+(1<<(i))-1)>>(i))
31 #define MASK(i) ((((uint)1<<BITS(i)) - 1) << BITS(i))
32 #define CHECK(i) if((BITS(i)!=1) && (v&MASK(i))) v>>=BITS(i), r+=BITS(i)
33  CHECK(1); CHECK(2); CHECK(3); CHECK(4); CHECK(5); CHECK(6); CHECK(7);
34  CHECK(8); CHECK(9); /* this covers up to 1024-bit uints */
35  if(v&2) ++r;
36  return r;
37 #undef UINT_BITS
38 #undef BITS
39 #undef MASK
40 #undef CHECK
41 }
42 
43 /* factors: L is in CSR format
44  D is a diagonal matrix stored as a vector
45  actual factorization is:
46 
47  -1 T
48  A = (I-L) D (I-L)
49 
50  -1 -T -1
51  A = (I-L) D (I-L)
52 
53  (triangular factor is unit diagonal; the diagonal is not stored)
54 */
56  uint n, *Lrp, *Lj;
57  double *L, *D;
58 };
59 
60 /*
61  symbolic factorization: finds the sparsity structure of L
62 
63  uses the concept of elimination tree:
64  the parent of node j is node i when L(i,j) is the first
65  non-zero in column j below the diagonal (i>j)
66  L's structure is discovered row-by-row; the first time
67  an entry in column j is set, it must be the parent
68 
69  the nonzeros in L are the nonzeros in A + paths up the elimination tree
70 
71  linear in the number of nonzeros of L
72 */
73 static void factor_symbolic(uint n, const uint *Arp, const uint *Aj,
74  struct sparse_cholesky *out, buffer *buf)
75 {
76  uint *visit = tmalloc(uint,2*n), *parent = visit+n;
77  uint *Lrp, *Lj;
78  uint i,nz=0;
79 
80  out->n=n;
81 
82  for(i=0;i<n;++i) {
83  uint p=Arp[i], pe=Arp[i+1];
84  visit[i]=i, parent[i]=n;
85  for(;p!=pe;++p) {
86  uint j=Aj[p]; if(j>=i) break;
87  for(;visit[j]!=i;j=parent[j]) {
88  ++nz, visit[j]=i;
89  if(parent[j]==n) { parent[j]=i; break; }
90  }
91  }
92  }
93 
94  Lrp=out->Lrp=tmalloc(uint,n+1+nz);
95  Lj =out->Lj =Lrp+n+1;
96 
97  Lrp[0]=0;
98  for(i=0;i<n;++i) {
99  uint p=Arp[i], pe=Arp[i+1], count=0, *Ljr=&Lj[Lrp[i]];
100  visit[i]=i;
101  for(;p!=pe;++p) {
102  uint j=Aj[p]; if(j>=i) break;
103  for(;visit[j]!=i;j=parent[j]) Ljr[count++]=j, visit[j]=i;
104  }
105  sortv(Ljr, Ljr,count,sizeof(uint), buf);
106  Lrp[i+1]=Lrp[i]+count;
107  }
108  free(visit);
109 }
110 
111 /*
112  numeric factorization:
113 
114  L is built row-by-row, using: ( ' indicates transpose )
115 
116 
117  [ A r ] = [ (I-L) ] [ D^(-1) ] [ (I-L)' -s ]
118  [ r' a ] [ -s' 1 ] [ 1/d ] [ 1 ]
119 
120  = [ A (I-L) D^(-1) (-s) ]
121  [ r' s' D^(-1) s + 1/d ]
122 
123  so, if r' is the next row of A, up to but excluding the diagonal,
124  then the next row of L, s', obeys
125 
126  r = - (I-L) D^(-1) s
127 
128  let y = (I-L)^(-1) (-r)
129  then s = D y, and d = 1/(s' y)
130 
131 */
132 static void factor_numeric(uint n, const uint *Arp, const uint *Aj,
133  const double *A,
134  struct sparse_cholesky *out,
135  uint *visit, double *y)
136 {
137  const uint *Lrp=out->Lrp, *Lj=out->Lj;
138  double *D, *L;
139  uint i;
140 
141  D=out->D=tmalloc(double,n+Lrp[n]);
142  L=out->L=D+n;
143 
144  for(i=0;i<n;++i) {
145  uint p,pe; double a=0;
146  visit[i]=n;
147  for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) {
148  uint j=Lj[p]; y[j]=0, visit[j]=i;
149  }
150  for(p=Arp[i],pe=Arp[i+1];p!=pe;++p) {
151  uint j=Aj[p];
152  if(j>=i) { if(j==i) a=A[p]; break; }
153  y[j]=-A[p];
154  }
155  for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) {
156  uint q,qe,j=Lj[p]; double lij,yj=y[j];
157  for(q=Lrp[j],qe=Lrp[j+1];q!=qe;++q) {
158  uint k=Lj[q]; if(visit[k]==i) yj+=L[q]*y[k];
159  }
160  y[j]=yj;
161  L[p]=lij=D[j]*yj;
162  a-=yj*lij;
163  }
164  D[i]=1/a;
165  }
166 }
167 
168 /* x = A^(-1) b; works when x and b alias */
170  double *x, const struct sparse_cholesky *fac, double *b)
171 {
172  const uint n=fac->n, *Lrp=fac->Lrp, *Lj=fac->Lj;
173  const double *L=fac->L, *D=fac->D;
174  uint i, p,pe;
175  for(i=0;i<n;++i) {
176  double xi = b[i];
177  for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) xi+=L[p]*x[Lj[p]];
178  x[i]=xi;
179  }
180  for(i=0;i<n;++i) x[i]*=D[i];
181  for(i=n;i;) {
182  double xi = x[--i];
183  for(p=Lrp[i],pe=Lrp[i+1];p!=pe;++p) x[Lj[p]]+=L[p]*xi;
184  }
185 }
186 
187 void sparse_cholesky_factor(uint n, const uint *Arp, const uint *Aj,
188  const double *A,
189  struct sparse_cholesky *out, buffer *buf)
190 {
191  const uint n_uints_as_dbls = (n*sizeof(uint)+sizeof(double)-1)/sizeof(double);
192  buffer_reserve(buf,(n_uints_as_dbls+n)*sizeof(double));
193  factor_symbolic(n,Arp,Aj,out,buf);
194  factor_numeric(n,Arp,Aj,A,out,buf->ptr,n_uints_as_dbls+(double*)buf->ptr);
195 }
196 
198 {
199  free(fac->Lrp); fac->Lj=fac->Lrp=0;
200  free(fac->D); fac->L =fac->D =0;
201 }
202 
203 struct csr_mat {
204  uint n, *Arp, *Aj; double *A;
205 };
206 
207 struct xxt {
208 
209  /* communication */
210 
211  struct comm comm;
212  uint pcoord; /* coordinate in communication tree */
213  unsigned plevels; /* # of stages of communication */
214  sint *pother; /* let p = pother[i], then during stage i of fan-in,
215  if p>=0, receive from p
216  if p< 0, send to (-p-1)
217  fan-out is just the reverse ...
218  on proc 0, pother is never negative
219  on others, pother is negative for the last stage only */
220  comm_req *req;
221 
222  /* separators */
223 
224  unsigned nsep; /* number of separators */
225  uint *sep_size; /* # of dofs on each separator,
226  ordered from the bottom to the top of the tree:
227  separator 0 is the bottom-most one (dofs not shared)
228  separator nsep-1 is the root of the tree */
229 
230  unsigned null_space;
231  double *share_weight;
232 
233  /* vector sizes */
234 
235  uint un; /* user's vector size */
236 
237  /* xxt_solve works with "condensed" vectors;
238  same dofs as in user's vectors, but no duplicates and no Dirichlet nodes,
239  and also ordered topologically (children before parents) according to the
240  separator tree */
241 
242  uint cn; /* size of condensed vectors */
243  sint *perm_u2c; /* permutation from user vector to condensed vector,
244  p=perm_u2c[i]; xu[i] = p=-1 ? 0 : xc[p]; */
245  uint ln, sn; /* xc[0 ... ln-1] are not shared (ln=sep_size[0])
246  xc[ln ... ln+sn-1] are shared
247  ln+sn = cn */
248 
249  uint xn; /* # of columns of x = sum_i(sep_size[i]) - sep_size[0] */
250 
251  /* data */
252  struct sparse_cholesky fac_A_ll;
253  struct csr_mat A_sl;
254  uint *Xp; double *X; /* column i of X starts at X[Xp[i]] */
255 
256  /* execution buffers */
257  double *vl, *vc, *vx, *combuf;
258 };
259 
260 
261 /*
262  for the binary communication tree, the procs are divided in half
263  at each level, with the second half always the larger
264 
265  e.g., for np = 13:
266 
267  +------13-------+
268  | |
269  +---6---+ +---7---+
270  | | | |
271  +-3-+ +-3-+ +-3-+ +-4-+
272  1 2 1 2 1 2 2 2
273  1^1 1^1 1^1 1^1 1^1
274 
275  plevels is the number of levels in the tree
276  = np==1 ? 1 : ( lg(np-1)+2 )
277 
278  labelling the nodes with proc id's gives this communication tree:
279 
280  +-------0-------+
281  | |
282  +---0---+ +---6---+
283  | | | |
284  +-0-+ +-3-+ +-6-+ +-9-+
285  0 1 3 4 6 7 9 b
286  1^2 4^5 7^8 9^a b^c
287 
288  consider proc 7 (pid = 7);
289  pcoord gives the position of the leaf labelled 7:
290  Root Right Left Right Left -> RRLRL -> 11010
291  so pcoord = 11010 binary
292  note the parent coordinate can be found by bit shifting right
293  (i.e. dividing by 2)
294 */
295 
296 /* sets: pcoord, nsep, plevels, pother, req */
297 static void locate_proc(struct xxt *data)
298 {
299  const uint id = data->comm.id;
300  uint n = data->comm.np, c=1, odd=0, base=0;
301  unsigned level=0;
302  while(n>1) {
303  ++level;
304  odd=(odd<<1)|(n&1);
305  c<<=1, n>>=1;
306  if(id>=base+n) c|=1, base+=n, n+=(odd&1);
307  }
308  data->pcoord=c;
309  data->nsep = level+1;
310  data->plevels = data->nsep-1;
311  data->pother = tmalloc(sint,data->plevels);
312  data->req = tmalloc(comm_req,data->plevels);
313  for(level=0;level<data->plevels;++level) {
314  if((c&1)==1) {
315  uint targ = id - (n-(odd&1));
316  data->pother[level]=-(sint)(targ+1);
317  data->plevels = level+1;
318  break;
319  } else {
320  data->pother[level]=id+n;
321  c>>=1, n=(n<<1)+(odd&1), odd>>=1;
322  }
323  }
324 }
325 
326 /* the tuple list describing the condensed dofs:
327  [(separator level, share count, global id)] */
328 struct dof { ulong id; uint level, count; };
329 
330 /* determine the size of each separator;
331  sums the separator sizes following the fan-in, fan-out comm. pattern
332  uses the share-counts to avoid counting dofs more than once */
333 /* sets: xn, sep_size, ln, sn */
334 static void discover_sep_sizes(struct xxt *data,
335  struct array *dofa, buffer *buf)
336 {
337  const unsigned ns=data->nsep, nl=data->plevels;
338  const uint n = dofa->n;
339  float *v, *recv;
340  unsigned i,lvl; uint j;
341  const struct dof *dof = dofa->ptr;
342 
343  buffer_reserve(buf,2*ns*sizeof(float));
344  v=buf->ptr, recv=v+ns;
345 
346  for(i=0;i<ns;++i) v[i]=0;
347  for(j=0;j<n;++j) v[dof[j].level]+=1/(float)dof[j].count;
348 
349  /* fan-in */
350  for(lvl=0;lvl<nl;++lvl) {
351  sint other = data->pother[lvl];
352  unsigned s = ns-(lvl+1);
353  if(other<0) {
354  comm_send(&data->comm,v +lvl+1,s*sizeof(float),-other-1,s);
355  } else {
356  comm_recv(&data->comm,recv+lvl+1,s*sizeof(float),other,s);
357  for(i=lvl+1;i<ns;++i) v[i]+=recv[i];
358  }
359  }
360  /* fan-out */
361  for(;lvl;) {
362  sint other = data->pother[--lvl];
363  unsigned s = ns-(lvl+1);
364  if(other<0)
365  comm_recv(&data->comm,v+lvl+1,s*sizeof(float),-other-1,s);
366  else
367  comm_send(&data->comm,v+lvl+1,s*sizeof(float),other,s);
368  }
369 
370  data->xn=0;
371  data->sep_size = tmalloc(uint,ns);
372  for(i=0;i<ns;++i) {
373  uint s=v[i]+.1f;
374  data->sep_size[i]=s;
375  data->xn+=s;
376  }
377  data->ln=data->sep_size[0];
378  data->sn=data->cn-data->ln;
379  data->xn-=data->ln;
380 }
381 
382 /* assuming [A,Aend) is sorted,
383  removes 0's and any duplicate entries,
384  returns new end */
385 static ulong *unique_nonzero(ulong *A, ulong *Aend)
386 {
387  if(Aend==A) return A;
388  else {
389  ulong *end = Aend-1, last=*end, *p=A,*q=A,v=0;
390  *end = 1;
391  while(*q==0) ++q; /* *q==0 => q!=end since *end==0 */
392  *end = 0;
393  while(q!=end) {
394  v=*q++, *p++=v;
395  while(*q==v) ++q; /* *q==v => q!=end since *end==0 */
396  }
397  if(last!=v) *p++=last;
398  return p;
399  }
400 }
401 
402 static void merge_sep_ids(struct xxt *data, ulong *sep_id, ulong *other,
403  ulong *work, unsigned s0, buffer *buf)
404 {
405  const unsigned ns = data->nsep;
406  unsigned s;
407  ulong *p=sep_id, *q=other;
408  for(s=s0;s<ns;++s) {
409  ulong *end;
410  uint size = data->sep_size[s];
411  memcpy(work ,p,size*sizeof(ulong));
412  memcpy(work+size,q,size*sizeof(ulong));
413  sortv_long(work, work,2*size,sizeof(ulong), buf);
414  end = unique_nonzero(work,work+2*size);
415  memcpy(p,work,(end-work)*sizeof(ulong));
416  p+=size, q+=size;
417  }
418 }
419 
420 static void init_sep_ids(struct xxt *data, struct array *dofa, ulong *xid)
421 {
422  const unsigned ns=data->nsep;
423  const uint n=data->cn, *sep_size=data->sep_size;
424  unsigned s=1;
425  uint i, size;
426  const struct dof *dof = dofa->ptr;
427  if(ns==1) return;
428  size=sep_size[s];
429  for(i=data->ln;i<n;++i) {
430  unsigned si = dof[i].level;
431  while(s!=si) {
432  memset(xid,0,size*sizeof(ulong));
433  xid+=size;
434  if(++s != ns) size=data->sep_size[s];
435  }
436  *xid++ = dof[i].id, --size;
437  }
438  while(s!=ns) {
439  memset(xid,0,size*sizeof(ulong));
440  xid+=size;
441  if(++s != ns) size=data->sep_size[s];
442  }
443 }
444 
445 static void find_perm_x2c(uint ln, uint cn, const struct array *dofc,
446  uint xn, const ulong *xid, sint *perm)
447 {
448  const struct dof *dof = dofc->ptr, *dof_end = dof+cn;
449  const ulong *xid_end = xid+xn; uint i=ln;
450  dof+=ln;
451  while(dof!=dof_end) {
452  ulong v=dof->id;
453  while(*xid!=v) ++xid, *perm++ = -1;
454  *perm++ = i++, ++dof, ++xid;
455  }
456  while(xid!=xid_end) ++xid, *perm++ = -1;
457 }
458 
459 /* sets: perm_x2c */
460 static sint *discover_sep_ids(struct xxt *data, struct array *dofa, buffer *buf)
461 {
462  const unsigned ns=data->nsep, nl=data->plevels;
463  const uint xn=data->xn, *sep_size=data->sep_size;
464  ulong *xid, *recv, *work, *p;
465  unsigned lvl;
466  uint size,ss;
467  sint *perm_x2c;
468 
469  size=0; for(lvl=1;lvl<ns;++lvl) if(sep_size[lvl]>size) size=sep_size[lvl];
470  xid=tmalloc(ulong,2*xn+2*size), recv=xid+xn, work=recv+xn;
471 
472  init_sep_ids(data,dofa,xid);
473 
474  if(nl) {
475  /* fan-in */
476  p=xid, size=xn;
477  for(lvl=0;lvl<nl;++lvl) {
478  sint other = data->pother[lvl];
479  if(other<0) {
480  comm_send(&data->comm,p ,size*sizeof(ulong),-other-1,size);
481  } else {
482  comm_recv(&data->comm,recv,size*sizeof(ulong),other,size);
483  merge_sep_ids(data,p,recv,work,lvl+1,buf);
484  }
485  ss=data->sep_size[lvl+1];
486  if(ss>=size || lvl==nl-1) break;
487  p+=ss, size-=ss;
488  }
489  /* fan-out */
490  for(;;) {
491  sint other = data->pother[lvl];
492  if(other<0)
493  comm_recv(&data->comm,p,size*sizeof(ulong),-other-1,size);
494  else
495  comm_send(&data->comm,p,size*sizeof(ulong),other,size);
496  if(lvl==0) break;
497  ss=data->sep_size[lvl];
498  p-=ss, size+=ss, --lvl;
499  }
500  }
501 
502  perm_x2c=tmalloc(sint,xn);
503  find_perm_x2c(data->ln,data->cn,dofa, xn,xid, perm_x2c);
504  free(xid);
505 
506  return perm_x2c;
507 }
508 
509 static void apply_QQt(struct xxt *data, double *v, uint n, uint tag)
510 {
511  const unsigned nl=data->plevels;
512  double *p=v, *recv=data->combuf;
513  unsigned lvl, nsend=0;
514  uint size=n, ss;
515 
516  if(n==0 || nl==0) return;
517 
518  tag=tag*2+0;
519  /* fan-in */
520  for(lvl=0;lvl<nl;++lvl) {
521  sint other = data->pother[lvl];
522  if(other<0) {
523  comm_send(&data->comm,p ,size*sizeof(double),-other-1,tag);
524  } else {
525  uint i;
526  comm_recv(&data->comm,recv,size*sizeof(double),other ,tag);
527  for(i=0;i<size;++i) p[i]+=recv[i];
528  }
529  ss=data->sep_size[lvl+1];
530  if(ss>=size || lvl==nl-1) break;
531  p+=ss, size-=ss;
532  }
533  /* fan-out */
534  for(;;) {
535  sint other = data->pother[lvl];
536  if(other<0) {
537  comm_recv (&data->comm,p,size*sizeof(double),-other-1,tag);
538  } else {
539  comm_isend(&data->req[nsend++],&data->comm,
540  p,size*sizeof(double),other ,tag);
541  }
542  if(lvl==0) break;
543  ss=data->sep_size[lvl];
544  p-=ss, size+=ss, --lvl;
545  }
546  if(nsend) comm_wait(data->req,nsend);
547 }
548 
549 static double sum(struct xxt *data, double v, uint n, uint tag)
550 {
551  const unsigned nl=data->plevels;
552  double r;
553  unsigned lvl,nsend=0;
554  uint size=n, ss;
555 
556  tag=tag*2+1;
557  if(n==0 || nl==0) return v;
558  /* fan-in */
559  for(lvl=0;lvl<nl;++lvl) {
560  sint other = data->pother[lvl];
561  if(other<0) {
562  comm_send(&data->comm,&v,sizeof(double),-other-1,tag);
563  } else {
564  comm_recv(&data->comm,&r,sizeof(double),other ,tag);
565  v+=r;
566  }
567  ss=data->sep_size[lvl+1];
568  if(ss>=size || lvl==nl-1) break;
569  size-=ss;
570  }
571  /* fan-out */
572  for(;;) {
573  sint other = data->pother[lvl];
574  if(other<0) {
575  comm_recv (&data->comm,&v,sizeof(double),-other-1,tag);
576  } else {
577  comm_isend(&data->req[nsend++],&data->comm,
578  &v,sizeof(double),other ,tag);
579  }
580  if(lvl==0) break;
581  ss=data->sep_size[lvl];
582  size+=ss, --lvl;
583  }
584  if(nsend) comm_wait(data->req,nsend);
585  return v;
586 }
587 
588 /* sorts an array of ids, removes 0's and duplicates;
589  just returns the permutation */
590 static uint unique_ids(uint n, const ulong *id, sint *perm, buffer *buf)
591 {
592  uint *p, i, un=0; ulong last=0;
593  p = sortp_long(buf,0, id,n,sizeof(ulong));
594  for(i=0;i<n;++i) {
595  uint j = p[i]; ulong v = id[j];
596  if(v==0) perm[j]=-1;
597  else {
598  if(v!=last) last=v, ++un;
599  perm[j]=un-1;
600  }
601  }
602  buf->n=0;
603  return un;
604 }
605 
606 /* given user's list of dofs (as id's)
607  uses gather-scatter to find share-count and separator # for each
608  outputs as a list, sorted topologically (children before parents)
609  according to the sep. tree (and without duplicates),
610  as well as the permutation to get there from the user's list */
611 /* sets: un, cn, perm_u2c */
612 static void discover_dofs(
613  struct xxt *data, uint n, const ulong *id,
614  struct array *dofa, buffer *buf, const struct comm *comm)
615 {
616  const uint pcoord = data->pcoord, ns=data->nsep;
617  sint *perm;
618  uint i, cn, *p, *pi;
619  ulong *bid;
620  struct gs_data *gsh; sint *v;
621  struct dof *dof;
622 
623  data->un = n;
624  data->perm_u2c = perm = tmalloc(sint,n);
625  data->cn = cn = unique_ids(n,id,perm,buf);
626  array_init(struct dof,dofa,cn), dofa->n=cn, dof=dofa->ptr;
627  buffer_reserve(buf,cn*sizeof(ulong)), bid=buf->ptr;
628  for(i=0;i<n;++i) if(perm[i]>=0) bid[perm[i]]=dof[perm[i]].id=id[i];
629 
630  gsh = gs_setup((const slong*)bid,cn,comm,0,gs_crystal_router,0);
631  v = tmalloc(sint,cn);
632 
633  for(i=0;i<cn;++i) v[i]=pcoord;
634  gs(v,gs_sint,gs_bpr,0,gsh,buf);
635  for(i=0;i<cn;++i) dof[i].level=ns-1-lg((uint)v[i]);
636 
637  for(i=0;i<cn;++i) v[i]=1;
638  gs(v,gs_sint,gs_add,0,gsh,buf);
639  for(i=0;i<cn;++i) dof[i].count=v[i];
640 
641  free(v);
642  gs_free(gsh);
643 
644  if(!cn) return;
645  buffer_reserve(buf,2*cn*sizeof(uint));
646  p = sortp(buf,0, &dof[0].level,cn,sizeof(struct dof));
647  pi = p+cn; for(i=0;i<cn;++i) pi[p[i]]=i;
648  for(i=0;i<n;++i) if(perm[i]>=0) perm[i]=pi[perm[i]];
649  sarray_permute_buf(struct dof,dof,cn, buf);
650 }
651 
652 /* vl += A_ls * vs */
653 static void apply_p_Als(double *vl, struct xxt *data, const double *vs, uint ns)
654 {
655  const uint *Arp = data->A_sl.Arp,
656  *Aj = data->A_sl.Aj;
657  const double *A = data->A_sl.A;
658  uint i,p,pe;
659  for(i=0;i<ns;++i)
660  for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
661  vl[Aj[p]]+=A[p]*vs[i];
662 }
663 
664 /* vs -= A_sl * vl */
665 static void apply_m_Asl(double *vs, uint ns, struct xxt *data, const double *vl)
666 {
667  const uint *Arp = data->A_sl.Arp,
668  *Aj = data->A_sl.Aj;
669  const double *A = data->A_sl.A;
670  uint i,p,pe;
671  for(i=0;i<ns;++i)
672  for(p=Arp[i],pe=Arp[i+1];p!=pe;++p)
673  vs[i]-=A[p]*vl[Aj[p]];
674 }
675 
676 /* returns a column of S : vs = -S(0:ei-1,ei) */
677 static void apply_S_col(double *vs, struct xxt *data,
678  struct csr_mat *A_ss, uint ei, double *vl)
679 {
680  const uint ln=data->ln;
681  const uint *Asl_rp = data->A_sl.Arp, *Ass_rp = A_ss->Arp,
682  *Asl_j = data->A_sl.Aj, *Ass_j = A_ss->Aj;
683  const double *Asl = data->A_sl.A, *Ass = A_ss->A;
684  uint i,p,pe;
685  for(i=0;i<ei;++i) vs[i]=0;
686  for(p=Ass_rp[ei],pe=Ass_rp[ei+1];p!=pe;++p) {
687  uint j=Ass_j[p];
688  if(j>=ei) break;
689  vs[j]=-Ass[p];
690  }
691  for(i=0;i<ln;++i) vl[i]=0;
692  for(p=Asl_rp[ei],pe=Asl_rp[ei+1];p!=pe;++p) vl[Asl_j[p]]=-Asl[p];
693  sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
694  apply_m_Asl(vs,ei,data,vl);
695 }
696 
697 static void apply_S(double *Svs, uint ns, struct xxt *data,
698  struct csr_mat *A_ss, const double *vs, double *vl)
699 {
700  const uint ln=data->ln;
701  const uint *Ass_rp = A_ss->Arp,
702  *Ass_j = A_ss->Aj;
703  const double *Ass = A_ss->A;
704  uint i, p,pe;
705  for(i=0;i<ns;++i) {
706  double sum=0;
707  for(p=Ass_rp[i],pe=Ass_rp[i+1];p!=pe;++p) {
708  uint j=Ass_j[p];
709  if(j>=ns) break;
710  sum+=Ass[p]*vs[j];
711  }
712  Svs[i]=sum;
713  }
714  for(i=0;i<ln;++i) vl[i]=0;
715  apply_p_Als(vl,data,vs,ns);
716  sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
717  apply_m_Asl(Svs,ns,data,vl);
718 }
719 
720 /* vx = X' * vs */
721 static void apply_Xt(double *vx, uint nx, const struct xxt *data,
722  const double *vs)
723 {
724  const double *X = data->X; const uint *Xp = data->Xp;
725  uint i; for(i=0;i<nx;++i) vx[i]=tensor_dot(vs,X+Xp[i],Xp[i+1]-Xp[i]);
726 }
727 
728 /* vs = X * vx */
729 static void apply_X(double *vs, uint ns, const struct xxt *data,
730  const double *vx, uint nx)
731 {
732  const double *X = data->X; const uint *Xp = data->Xp;
733  uint i,j;
734  for(i=0;i<ns;++i) vs[i]=0;
735  for(i=0;i<nx;++i) {
736  const double v = vx[i];
737  const double *x = X+Xp[i]; uint n=Xp[i+1]-Xp[i];
738  for(j=0;j<n;++j) vs[j]+=x[j]*v;
739  }
740 }
741 
742 static void allocate_X(struct xxt *data, sint *perm_x2c)
743 {
744  uint xn=data->xn;
745  uint i,h=0;
746  if(data->null_space && xn) --xn;
747  data->Xp = tmalloc(uint,xn+1);
748  data->Xp[0]=0;
749  for(i=0;i<xn;++i) {
750  if(perm_x2c[i]!=-1) ++h;
751  data->Xp[i+1]=data->Xp[i]+h;
752  }
753  data->X = tmalloc(double,data->Xp[xn]);
754 }
755 
756 static void orthogonalize(struct xxt *data, struct csr_mat *A_ss,
757  sint *perm_x2c, buffer *buf)
758 {
759  uint ln=data->ln, sn=data->sn, xn=data->xn;
760  double *vl, *vs, *vx, *Svs;
761  uint i,j;
762 
763  allocate_X(data,perm_x2c);
764 
765  buffer_reserve(buf,(ln+2*sn+xn)*sizeof(double));
766  vl=buf->ptr, vs=vl+ln, Svs=vs+sn, vx=Svs+sn;
767 
768  if(data->null_space && xn) --xn;
769  for(i=0;i<xn;++i) {
770  uint ns=data->Xp[i+1]-data->Xp[i];
771  sint ui = perm_x2c[i];
772  double ytsy, *x;
773 
774  if(ui == -1) {
775  for(j=0;j<i;++j) vx[j]=0;
776  } else {
777  ui-=ln;
778  apply_S_col(vs, data,A_ss, ui, vl);
779  apply_Xt(vx,i, data, vs);
780  }
781  apply_QQt(data,vx,i,xn-i);
782  apply_X(vs,ns, data, vx,i);
783  if(ui!=-1) vs[ui]=1;
784  apply_S(Svs,ns, data,A_ss, vs, vl);
785  ytsy = tensor_dot(vs,Svs,ns);
786  ytsy = sum(data,ytsy,i+1,xn-(i+1));
787  if(ytsy<DBL_EPSILON/128) ytsy=0; else ytsy = 1/sqrt(ytsy);
788  x=&data->X[data->Xp[i]];
789  for(j=0;j<ns;++j) x[j]=ytsy*vs[j];
790  }
791 }
792 
793 struct yale_mat { uint i,j; double v; };
794 
795 /* produces CSR matrix from Yale-like format, summing duplicates */
796 static void condense_matrix(struct array *mat, uint nr,
797  struct csr_mat *out, buffer *buf)
798 {
799  uint k, nz=mat->n;
800  struct yale_mat *p, *q;
801  sarray_sort_2(struct yale_mat,mat->ptr,mat->n, i,0, j,0, buf);
802 
803  p = mat->ptr;
804  for(k=0;k+1<nz;++k,++p) if(p[0].i==p[1].i && p[0].j==p[1].j) break;
805  if(++k<nz) {
806  uint i=p->i,j=p->j;
807  q = p+1;
808  for(;k<nz;++k,++q) {
809  if(i==q->i&&j==q->j) p->v += q->v, --mat->n;
810  else ++p, p->i=i=q->i,p->j=j=q->j, p->v=q->v;
811  }
812  }
813 
814  nz=mat->n;
815  out->n=nr;
816  out->Arp = tmalloc(uint,nr+1+mat->n);
817  out->Aj = out->Arp+nr+1;
818  out->A = tmalloc(double,mat->n);
819  for(k=0;k<nr;++k) out->Arp[k]=0;
820  for(p=mat->ptr,k=0;k<nz;++k,++p)
821  out->Arp[p->i]++, out->Aj[k]=p->j, out->A[k]=p->v;
822  nz=0; for(k=0;k<=nr;++k) { uint t=out->Arp[k]; out->Arp[k]=nz, nz+=t; }
823 }
824 
825 static void separate_matrix(
826  uint nz, const uint *Ai, const uint *Aj, const double *A,
827  const sint *perm, uint ln, uint sn,
828  struct csr_mat *out_ll, struct csr_mat *out_sl, struct csr_mat *out_ss,
829  buffer *buf
830 )
831 {
832  uint k,n;
833  struct array mat_ll, mat_sl, mat_ss;
834  struct yale_mat *mll, *msl, *mss;
835  array_init(struct yale_mat,&mat_ll,2*nz), mll=mat_ll.ptr;
836  array_init(struct yale_mat,&mat_sl,2*nz), msl=mat_sl.ptr;
837  array_init(struct yale_mat,&mat_ss,2*nz), mss=mat_ss.ptr;
838  for(k=0;k<nz;++k) {
839  sint i=perm[Ai[k]], j=perm[Aj[k]];
840  if(i<0 || j<0 || A[k]==0) continue;
841  if((uint)i<ln) {
842  if((uint)j<ln)
843  n=mat_ll.n++,mll[n].i=i,mll[n].j=j,mll[n].v=A[k];
844  } else {
845  if((uint)j<ln)
846  n=mat_sl.n++,msl[n].i=i-ln,msl[n].j=j,msl[n].v=A[k];
847  else
848  n=mat_ss.n++,mss[n].i=i-ln,mss[n].j=j-ln,mss[n].v=A[k];
849  }
850  }
851  condense_matrix(&mat_ll,ln,out_ll,buf);
852  condense_matrix(&mat_sl,sn,out_sl,buf);
853  condense_matrix(&mat_ss,sn,out_ss,buf);
854  array_free(&mat_ll);
855  array_free(&mat_sl);
856  array_free(&mat_ss);
857 }
858 
859 struct xxt *crs_setup(
860  uint n, const ulong *id,
861  uint nz, const uint *Ai, const uint *Aj, const double *A,
862  uint null_space, const struct comm *comm)
863 {
864  struct xxt *data = tmalloc(struct xxt,1);
865  sint *perm_x2c;
866  struct array dofa;
867  struct csr_mat A_ll, A_ss;
868  buffer buf;
869 
870  comm_dup(&data->comm,comm);
871 
872  locate_proc(data);
873 
874  data->null_space=null_space;
875 
876  buffer_init(&buf,1024);
877 
878  discover_dofs(data,n,id,&dofa,&buf,&data->comm);
879  discover_sep_sizes(data,&dofa,&buf);
880 
881  perm_x2c = discover_sep_ids(data,&dofa,&buf);
882  if(data->null_space) {
883  uint i; double count = 0; struct dof *dof = dofa.ptr;
884  for(i=0;i<data->cn;++i) count+=1/(double)dof[i].count;
885  count=1/sum(data,count,data->xn,0);
886  data->share_weight=tmalloc(double,data->cn);
887  for(i=0;i<data->cn;++i)
888  data->share_weight[i]=count/dof[i].count;
889  }
890  array_free(&dofa);
891 
892  if(!data->null_space || data->xn!=0) {
893  separate_matrix(nz,Ai,Aj,A,data->perm_u2c,
894  data->ln,data->sn,
895  &A_ll,&data->A_sl,&A_ss,
896  &buf);
897  } else {
898  separate_matrix(nz,Ai,Aj,A,data->perm_u2c,
899  data->ln-1,1,
900  &A_ll,&data->A_sl,&A_ss,
901  &buf);
902  }
903 
904  sparse_cholesky_factor(A_ll.n,A_ll.Arp,A_ll.Aj,A_ll.A,
905  &data->fac_A_ll, &buf);
906  free(A_ll.Arp); free(A_ll.A);
907 
908  data->vl = tmalloc(double,data->ln+data->cn+2*data->xn);
909  data->vc = data->vl+data->ln;
910  data->vx = data->vc+data->cn;
911  data->combuf = data->vx+data->xn;
912 
913  orthogonalize(data,&A_ss,perm_x2c,&buf);
914  free(A_ss.Arp); free(A_ss.A);
915  free(perm_x2c);
916  buffer_free(&buf);
917 
918  return data;
919 }
920 
921 void crs_solve(double *x, struct xxt *data, const double *b)
922 {
923  uint cn=data->cn, un=data->un, ln=data->ln, sn=data->sn, xn=data->xn;
924  double *vl=data->vl, *vc=data->vc, *vx=data->vx;
925  uint i;
926  for(i=0;i<cn;++i) vc[i]=0;
927  for(i=0;i<un;++i) {
928  sint p=data->perm_u2c[i];
929  if(p>=0) vc[p]+=b[i];
930  }
931  if(xn>0 && (!data->null_space || xn>1)) {
932  if(data->null_space) --xn;
933  sparse_cholesky_solve(vc,&data->fac_A_ll,vc);
934  apply_m_Asl(vc+ln,sn, data, vc);
935  apply_Xt(vx,xn, data, vc+ln);
936  apply_QQt(data,vx,xn,0);
937  apply_X(vc+ln,sn, data, vx,xn);
938  for(i=0;i<ln;++i) vl[i]=0;
939  apply_p_Als(vl, data, vc+ln,sn);
940  sparse_cholesky_solve(vl,&data->fac_A_ll,vl);
941  for(i=0;i<ln;++i) vc[i]-=vl[i];
942  } else {
943  sparse_cholesky_solve(vc,&data->fac_A_ll,vc);
944  if(data->null_space) {
945  if(xn==0) vc[ln-1]=0;
946  else if(sn==1) vc[ln]=0;
947  }
948  }
949  if(data->null_space) {
950  double s=0;
951  for(i=0;i<cn;++i) s+=data->share_weight[i]*vc[i];
952  s = sum(data,s,data->xn,0);
953  for(i=0;i<cn;++i) vc[i]-=s;
954  }
955  for(i=0;i<un;++i) {
956  sint p=data->perm_u2c[i];
957  x[i] = p>=0 ? vc[p] : 0;
958  }
959 }
960 
961 void crs_stats(struct xxt *data)
962 {
963  int a,b; uint xcol;
964  if(data->comm.id==0) {
965  unsigned s;
966  printf("xxt: separator sizes on %d =",(int)data->comm.id);
967  for(s=0;s<data->nsep;++s) printf(" %d",(int)data->sep_size[s]);
968  printf("\n");
969  printf("xxt: shared dofs on %d = %d\n",(int)data->comm.id,(int)data->sn);
970  }
971  a=data->ln;
972  comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
973  if(data->comm.id==0) printf("xxt: max non-shared dofs = %d\n",a);
974  a=data->sn;
975  comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
976  if(data->comm.id==0) printf("xxt: max shared dofs = %d\n",a);
977  xcol=data->xn; if(xcol&&data->null_space) --xcol;
978  a=xcol;
979  comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
980  if(data->comm.id==0) printf("xxt: max X cols = %d\n",a);
981  a=data->Xp[xcol]*sizeof(double);
982  comm_allreduce(&data->comm,gs_int,gs_max, &a,1, &b);
983  if(data->comm.id==0) printf("xxt: max X size = %d bytes\n",a);
984 }
985 
986 void crs_free(struct xxt *data)
987 {
988  comm_free(&data->comm);
989  free(data->pother);
990  free(data->req);
991  free(data->sep_size);
992  free(data->perm_u2c);
993  if(data->null_space) free(data->share_weight);
995  free(data->A_sl.Arp); free(data->A_sl.A);
996  free(data->Xp); free(data->X);
997  free(data->vl);
998  free(data);
999 }
#define crs_solve
Definition: crs_xxt.c:11
#define crs_stats
Definition: crs_xxt.c:12
void sparse_cholesky_solve(double *x, const struct sparse_cholesky *fac, double *b)
Definition: crs_xxt.c:169
#define crs_setup
Definition: crs_xxt.c:10
void sparse_cholesky_free(struct sparse_cholesky *fac)
Definition: crs_xxt.c:197
void sparse_cholesky_factor(uint n, const uint *Arp, const uint *Aj, const double *A, struct sparse_cholesky *out, buffer *buf)
Definition: crs_xxt.c:187
#define crs_free
Definition: crs_xxt.c:13
#define CHECK(i)
The nomenclature of the interpolating fields saved by Nek into the binary file int_fld is here explained v
Definition: field_list.txt:2
uint n
Definition: crs_xxt.c:204
uint * Aj
Definition: crs_xxt.c:204
double * A
Definition: crs_xxt.c:204
uint * Arp
Definition: crs_xxt.c:204
Definition: crs_xxt.c:328
ulong id
Definition: crs_xxt.c:328
uint count
Definition: crs_xxt.c:328
uint level
Definition: crs_xxt.c:328
double * D
Definition: crs_xxt.c:57
double * L
Definition: crs_xxt.c:57
uint * Lrp
Definition: crs_xxt.c:56
uint * Lj
Definition: crs_xxt.c:56
Definition: crs_xxt.c:207
sint * perm_u2c
Definition: crs_xxt.c:243
struct csr_mat A_sl
Definition: crs_xxt.c:253
uint ln
Definition: crs_xxt.c:245
unsigned null_space
Definition: crs_xxt.c:230
double * combuf
Definition: crs_xxt.c:257
unsigned nsep
Definition: crs_xxt.c:224
double * X
Definition: crs_xxt.c:254
double * share_weight
Definition: crs_xxt.c:231
comm_req * req
Definition: crs_xxt.c:220
double * vx
Definition: crs_xxt.c:257
unsigned plevels
Definition: crs_xxt.c:213
struct sparse_cholesky fac_A_ll
Definition: crs_xxt.c:252
struct comm comm
Definition: crs_xxt.c:211
uint pcoord
Definition: crs_xxt.c:212
double * vl
Definition: crs_xxt.c:257
uint cn
Definition: crs_xxt.c:242
uint sn
Definition: crs_xxt.c:245
uint * sep_size
Definition: crs_xxt.c:225
uint * Xp
Definition: crs_xxt.c:254
double * vc
Definition: crs_xxt.c:257
uint un
Definition: crs_xxt.c:235
uint xn
Definition: crs_xxt.c:249
sint * pother
Definition: crs_xxt.c:214
uint j
Definition: crs_xxt.c:793
uint i
Definition: crs_xxt.c:793
double v
Definition: crs_xxt.c:793