tree-mpi.h

📄 View in API Reference (Doxygen) · View on basilisk.fr

// this can be used to control the outputs of debug_mpi()
int debug_iteration = -1;

void debug_mpi (FILE * fp1);

typedef struct {
  CacheLevel * halo; // ghost cell indices for each level
  void * buf;        // MPI buffer
  MPI_Request r;     // MPI request
  int depth;         // the maximum number of levels
  int pid;           // the rank of the PE  
  int maxdepth;      // the maximum depth for this PE (= depth or depth + 1)
} Rcv;

typedef struct {
  Rcv * rcv;
  char * name;
  int npid;
} RcvPid;

typedef struct {
  RcvPid * rcv, * snd;
} SndRcv;

typedef struct {
  Boundary parent;
  
  SndRcv mpi_level, mpi_level_root, restriction;
  Array * send, * receive; // which pids do we send to/receive from
} MpiBoundary;

static void cache_level_init (CacheLevel * c)
{
  c->p = NULL;
  c->n = c->nm = 0;
}

static void rcv_append (Point point, Rcv * rcv)
{
  if (level > rcv->depth) {
    qrealloc (rcv->halo, level + 1, CacheLevel);
    for (int j = rcv->depth + 1; j <= level; j++)
      cache_level_init (&rcv->halo[j]);
    rcv->depth = level;
  }
  cache_level_append (&rcv->halo[level], point);
  if (level > rcv->maxdepth)
    rcv->maxdepth = level;
}

void rcv_print (Rcv * rcv, FILE * fp, const char * prefix)
{
  for (int l = 0; l <= rcv->depth; l++)
    if (rcv->halo[l].n > 0)
      foreach_cache_level(rcv->halo[l], l)
	fprintf (fp, "%s%g %g %g %d %d\n", prefix, x, y, z, rcv->pid, level);
}

static void rcv_free_buf (Rcv * rcv)
{
  if (rcv->buf) {
    prof_start ("rcv_pid_receive");
    MPI_Wait (&rcv->r, MPI_STATUS_IGNORE);
    free (rcv->buf);
    rcv->buf = NULL;
    prof_stop();
  }
}

static void rcv_destroy (Rcv * rcv)
{
  rcv_free_buf (rcv);
  for (int i = 0; i <= rcv->depth; i++)
    if (rcv->halo[i].n > 0)
      free (rcv->halo[i].p);
  free (rcv->halo);
}

static RcvPid * rcv_pid_new (const char * name)
{
  RcvPid * r = qcalloc (1, RcvPid);
  r->name = strdup (name);
  return r;
}

static Rcv * rcv_pid_pointer (RcvPid * p, int pid)
{
  assert (pid >= 0 && pid < npe());
  
  int i;
  for (i = 0; i < p->npid; i++)
    if (pid == p->rcv[i].pid)
      break;

  if (i == p->npid) {
    qrealloc (p->rcv, ++p->npid, Rcv);
    Rcv * rcv = &p->rcv[p->npid-1];
    rcv->pid = pid;
    rcv->depth = rcv->maxdepth = 0;
    rcv->halo = qmalloc (1, CacheLevel);
    rcv->buf = NULL;
    cache_level_init (&rcv->halo[0]);
  }
  return &p->rcv[i];
}

static void rcv_pid_append (RcvPid * p, int pid, Point point)
{
  rcv_append (point, rcv_pid_pointer (p, pid));
}

static void rcv_pid_append_pids (RcvPid * p, Array * pids)
{
  // appends the pids of @p to @pids without duplication
  for (int i = 0; i < p->npid; i++) {
    int pid = p->rcv[i].pid, j, * a;
    for (j = 0, a = pids->p; j < pids->len/sizeof(int); j++,a++)
      if (*a == pid)
	break;
    if (j == pids->len/sizeof(int))
      array_append (pids, &pid, sizeof(int));
  }
}

void rcv_pid_write (RcvPid * p, const char * name)
{
  for (int i = 0; i < p->npid; i++) {
    Rcv * rcv = &p->rcv[i];
    char fname[80];
    sprintf (fname, "%s-%d-%d", name, pid(), rcv->pid);
    FILE * fp = fopen (fname, "w");
    rcv_print (rcv, fp, "");
    fclose (fp);
  }
}

static void rcv_pid_print (RcvPid * p, FILE * fp, const char * prefix)
{
  for (int i = 0; i < p->npid; i++)
    rcv_print (&p->rcv[i], fp, prefix);
}

static void rcv_pid_destroy (RcvPid * p)
{
  for (int i = 0; i < p->npid; i++)
    rcv_destroy (&p->rcv[i]);
  free (p->rcv);
  free (p->name);
  free (p);
}

static Boundary * mpi_boundary = NULL;

#define BOUNDARY_TAG(level) (level)
#define COARSEN_TAG(level)  ((level) + 64)
#define REFINE_TAG()        (128)
#define MOVED_TAG()         (256)

void debug_mpi (FILE * fp1);

static void apply_bc (Rcv * rcv, scalar * list, scalar * listv,
		      vector * listf, int l, MPI_Status s)
{
  double * b = rcv->buf;
  foreach_cache_level(rcv->halo[l], l) {
    for (scalar s in list) {
      memcpy (&s[], b, sizeof(double)*s.block);
      b += s.block;
    }
    for (vector v in listf)
      foreach_dimension() {
	memcpy (&v.x[], b, sizeof(double)*v.x.block);
	b += v.x.block;
	if (*b != nodata && allocated(1))
	  memcpy (&v.x[1], b, sizeof(double)*v.x.block);
	b += v.x.block;
      }
    for (scalar s in listv) {
      for (int i = 0; i <= 1; i++)
	for (int j = 0; j <= 1; j++)
#if dimension == 3
	  for (int k = 0; k <= 1; k++) {
	    if (*b != nodata && allocated(i,j,k))
	      memcpy (&s[i,j,k], b, sizeof(double)*s.block);
	    b += s.block;
	  }
#else // dimension == 2
          {
	    if (*b != nodata && allocated(i,j))
	      memcpy (&s[i,j], b, sizeof(double)*s.block);
	    b += s.block;	    
          }
#endif // dimension == 2
    }
  }
  size_t size = b - (double *) rcv->buf;
  free (rcv->buf);
  rcv->buf = NULL;

  int rlen;
  MPI_Get_count (&s, MPI_DOUBLE, &rlen);
  if (rlen != size) {
    fprintf (stderr,
	     "rlen (%d) != size (%ld), %d receiving from %d at level %d\n"
	     "Calling debug_mpi(NULL)...\n"
	     "Aborting...\n",
	     rlen, size, pid(), rcv->pid, l);
    fflush (stderr);
    debug_mpi (NULL);
    MPI_Abort (MPI_COMM_WORLD, -2);
  }
}

#ifdef TIMEOUT
static struct {
  int count, source, tag;
  const char * name;
} RecvArgs;

static void timeout (int sig, siginfo_t *si, void *uc)
{
  fprintf (stderr,
	   "ERROR MPI_Recv \"%s\" (count = %d, source = %d, tag = %d):\n"
	   "Time out\n"
	   "Calling debug_mpi(NULL)...\n"
	   "Aborting...\n",
	   RecvArgs.name, RecvArgs.count, RecvArgs.source, RecvArgs.tag);
  fflush (stderr);
  debug_mpi (NULL);
  MPI_Abort (MPI_COMM_WORLD, -1);
}
#endif // TIMEOUT

static void mpi_recv_check (void * buf, int count, MPI_Datatype datatype,
			    int source, int tag,
			    MPI_Comm comm, MPI_Status * status,
			    const char * name)
{
#ifdef TIMEOUT
  RecvArgs.count = count;
  RecvArgs.source = source;
  RecvArgs.tag = tag;
  RecvArgs.name = name;
  
  timer_t timerid;
  extern double t;
  if (t > TIMEOUT) {
    struct sigaction sa;
    sa.sa_flags = SA_SIGINFO;
    sa.sa_sigaction = timeout;
    sigemptyset(&sa.sa_mask);
    assert (sigaction(SIGRTMIN, &sa, NULL) != -1);

    struct sigevent sev;
    sev.sigev_notify = SIGEV_SIGNAL;
    sev.sigev_signo = SIGRTMIN;
    sev.sigev_value.sival_ptr = &timerid;
    assert (timer_create(CLOCK_REALTIME, &sev, &timerid) != -1);

    struct itimerspec its;
    its.it_value.tv_sec = 10;
    its.it_value.tv_nsec = 0;
    its.it_interval.tv_sec = 0;
    its.it_interval.tv_nsec = 0;
    assert (timer_settime(timerid, 0, &its, NULL) != -1);
  }
#endif // TIMEOUT
  
  int errorcode = MPI_Recv (buf, count, datatype, source, tag, comm, status);
  if (errorcode != MPI_SUCCESS) {
    char string[MPI_MAX_ERROR_STRING];
    int resultlen;
    MPI_Error_string (errorcode, string, &resultlen);
    fprintf (stderr,
	     "ERROR MPI_Recv \"%s\" (count = %d, source = %d, tag = %d):\n%s\n"
	     "Calling debug_mpi(NULL)...\n"
	     "Aborting...\n",
	     name, count, source, tag, string);
    fflush (stderr);
    debug_mpi (NULL);
    MPI_Abort (MPI_COMM_WORLD, -1);
  }

#ifdef TIMEOUT  
  if (t > TIMEOUT)
    assert (timer_delete (timerid) != -1);
#endif
}

trace
static int mpi_waitany (int count, MPI_Request array_of_requests[], int *indx,
			MPI_Status *status)
{
  return MPI_Waitany (count, array_of_requests, indx, status);
}

static int list_lenb (scalar * list) {
  int len = 0;
  for (scalar s in list)
    len += s.block;
  return len;
}

static int vectors_lenb (vector * list) {
  int len = 0;
  for (vector v in list)
    len += v.x.block;
  return len;
}

static void rcv_pid_receive (RcvPid * m, scalar * list, scalar * listv,
			     vector * listf, int l)
{
  if (m->npid == 0)
    return;
  
  prof_start ("rcv_pid_receive");

  int len = list_lenb (list) + 2*dimension*vectors_lenb (listf) +
    (1 << dimension)*list_lenb (listv);

  MPI_Request r[m->npid];
  Rcv * rrcv[m->npid]; // fixme: using NULL requests should be OK
  int nr = 0;
  for (int i = 0; i < m->npid; i++) {
    Rcv * rcv = &m->rcv[i];
    if (l <= rcv->depth && rcv->halo[l].n > 0) {
      assert (!rcv->buf);
      rcv->buf = malloc (sizeof (double)*rcv->halo[l].n*len);
#if 0
      fprintf (stderr, "%s receiving %d doubles from %d level %d\n",
	       m->name, rcv->halo[l].n*len, rcv->pid, l);
      fflush (stderr);
#endif
#if 1 /* initiate non-blocking receive */
      MPI_Irecv (rcv->buf, rcv->halo[l].n*len, MPI_DOUBLE, rcv->pid,
		 BOUNDARY_TAG(l), MPI_COMM_WORLD, &r[nr]);
      rrcv[nr++] = rcv;
#else /* blocking receive (useful for debugging) */
      MPI_Status s;
      mpi_recv_check (rcv->buf, rcv->halo[l].n*len, MPI_DOUBLE, rcv->pid,
		      BOUNDARY_TAG(l), MPI_COMM_WORLD, &s, "rcv_pid_receive");
      apply_bc (rcv, list, listf, listv, l, s);
#endif
    }
  }

  /* non-blocking receives (does nothing when using blocking receives) */
  if (nr > 0) {
    int i;
    MPI_Status s;
    mpi_waitany (nr, r, &i, &s);
    while (i != MPI_UNDEFINED) {
      Rcv * rcv = rrcv[i];
      assert (l <= rcv->depth && rcv->halo[l].n > 0);
      assert (rcv->buf);
      apply_bc (rcv, list, listv, listf, l, s);
      mpi_waitany (nr, r, &i, &s);
    }
  }
  
  prof_stop();
}

trace
static void rcv_pid_wait (RcvPid * m)
{
  /* wait for completion of send requests */
  for (int i = 0; i < m->npid; i++)
    rcv_free_buf (&m->rcv[i]);
}

static void rcv_pid_send (RcvPid * m, scalar * list, scalar * listv,
			  vector * listf, int l)
{
  if (m->npid == 0)
    return;

  prof_start ("rcv_pid_send");

  int len = list_lenb (list) + 2*dimension*vectors_lenb (listf) +
    (1 << dimension)*list_lenb (listv);

  /* send ghost values */
  for (int i = 0; i < m->npid; i++) {
    Rcv * rcv = &m->rcv[i];
    if (l <= rcv->depth && rcv->halo[l].n > 0) {
      assert (!rcv->buf);
      rcv->buf = malloc (sizeof (double)*rcv->halo[l].n*len);
      double * b = rcv->buf;
      foreach_cache_level(rcv->halo[l], l) {
	for (scalar s in list) {
	  memcpy (b, &s[], sizeof(double)*s.block);
	  b += s.block;
	}
	for (vector v in listf)
	  foreach_dimension() {
	    memcpy (b, &v.x[], sizeof(double)*v.x.block);
	    b += v.x.block;
	    if (allocated(1))
	      memcpy (b, &v.x[1], sizeof(double)*v.x.block);
	    else
	      *b = nodata;
	    b += v.x.block;
	  }
	for (scalar s in listv) {
	  for (int i = 0; i <= 1; i++)
	    for (int j = 0; j <= 1; j++)
#if dimension == 3
	      for (int k = 0; k <= 1; k++) {
		if (allocated(i,j,k))
		  memcpy (b, &s[i,j,k], sizeof(double)*s.block);
		else
		  *b = nodata;
		b += s.block;
	      }
#else // dimension == 2
	      {
		if (allocated(i,j))
		  memcpy (b, &s[i,j], sizeof(double)*s.block);
		else
		  *b = nodata;
		b += s.block;
	      }
#endif // dimension == 2
	}
      }
#if 0
      fprintf (stderr, "%s sending %d doubles to %d level %d\n",
	       m->name, rcv->halo[l].n*len, rcv->pid, l);
      fflush (stderr);
#endif
      MPI_Isend (rcv->buf, (b - (double *) rcv->buf),
		 MPI_DOUBLE, rcv->pid, BOUNDARY_TAG(l), MPI_COMM_WORLD,
		 &rcv->r);
    }
  }

  prof_stop();
}

static void rcv_pid_sync (SndRcv * m, scalar * list, int l)
{
  scalar * listr = NULL, * listv = NULL;
  vector * listf = NULL;
  for (scalar s in list)
    if (!is_constant(s) && s.block > 0) {
      if (s.face)
	listf = vectors_add (listf, s.v);
      else if (s.restriction == restriction_vertex)
	listv = list_add (listv, s);
      else
	listr = list_add (listr, s);
    }
  rcv_pid_send (m->snd, listr, listv, listf, l);
  rcv_pid_receive (m->rcv, listr, listv, listf, l);
  rcv_pid_wait (m->snd);
  free (listr);
  free (listf);
  free (listv);
}

static void snd_rcv_destroy (SndRcv * m)
{
  rcv_pid_destroy (m->rcv);
  rcv_pid_destroy (m->snd);
}

static void snd_rcv_init (SndRcv * m, const char * name)
{
  char s[strlen(name) + 5];
  strcpy (s, name);
  strcat (s, ".rcv");
  m->rcv = rcv_pid_new (s);
  strcpy (s, name);
  strcat (s, ".snd");
  m->snd = rcv_pid_new (s);
}

static void mpi_boundary_destroy (Boundary * b)
{
  MpiBoundary * m = (MpiBoundary *) b;
  snd_rcv_destroy (&m->mpi_level);
  snd_rcv_destroy (&m->mpi_level_root);
  snd_rcv_destroy (&m->restriction);
  array_free (m->send);
  array_free (m->receive);
  free (m);
}

trace
static void mpi_boundary_level (const Boundary * b, scalar * list, int l)
{
  MpiBoundary * m = (MpiBoundary *) b;
  rcv_pid_sync (&m->mpi_level, list, l);
  rcv_pid_sync (&m->mpi_level_root, list, l);
}

trace
static void mpi_boundary_restriction (const Boundary * b, scalar * list, int l)
{
  MpiBoundary * m = (MpiBoundary *) b;
  rcv_pid_sync (&m->restriction, list, l);
}

void mpi_boundary_new()
{
  mpi_boundary = (Boundary *) qcalloc (1, MpiBoundary);
  mpi_boundary->destroy = mpi_boundary_destroy;
  mpi_boundary->level = mpi_boundary_level;
  mpi_boundary->restriction = mpi_boundary_restriction;
  MpiBoundary * mpi = (MpiBoundary *) mpi_boundary;
  snd_rcv_init (&mpi->mpi_level, "mpi_level");
  snd_rcv_init (&mpi->mpi_level_root, "mpi_level_root");
  snd_rcv_init (&mpi->restriction, "restriction");
  mpi->send = array_new();
  mpi->receive = array_new();
  add_boundary (mpi_boundary);
}

static FILE * fopen_prefix (FILE * fp, const char * name, char * prefix)
{
  if (fp) {
    sprintf (prefix, "%s-%d ", name, pid());
    return fp;
  }
  else {
    strcpy (prefix, "");
    char fname[80];
    if (debug_iteration >= 0)
      sprintf (fname, "%s-%d-%d", name, debug_iteration, pid());
    else
      sprintf (fname, "%s-%d", name, pid());
    return fopen (fname, "w");
  }
}

void debug_mpi (FILE * fp1)
{
  void output_cells_internal (FILE * fp);

  char prefix[80];
  FILE * fp;

  // cleanup
  if (fp1 == NULL) {
    char name[80];
    sprintf (name, "halo-%d", pid()); remove (name);
    sprintf (name, "cells-%d", pid()); remove (name);
    sprintf (name, "faces-%d", pid()); remove (name);
    sprintf (name, "vertices-%d", pid()); remove (name);
    sprintf (name, "neighbors-%d", pid()); remove (name);
    sprintf (name, "mpi-level-rcv-%d", pid()); remove (name);
    sprintf (name, "mpi-level-snd-%d", pid()); remove (name);
    sprintf (name, "mpi-level-root-rcv-%d", pid()); remove (name);
    sprintf (name, "mpi-level-root-snd-%d", pid()); remove (name);
    sprintf (name, "mpi-restriction-rcv-%d", pid()); remove (name);
    sprintf (name, "mpi-restriction-snd-%d", pid()); remove (name);
    sprintf (name, "mpi-border-%d", pid()); remove (name);
    sprintf (name, "exterior-%d", pid()); remove (name);
    sprintf (name, "depth-%d", pid()); remove (name);
    sprintf (name, "refined-%d", pid()); remove (name);
  }
  
  // local halo
  fp = fopen_prefix (fp1, "halo", prefix);
  for (int l = 0; l < depth(); l++)
    foreach_halo (prolongation, l)
      foreach_child()
        fprintf (fp, "%s%g %g %g %d\n", prefix, x, y, z, level);
  if (!fp1)
    fclose (fp);

  if (!fp1) {
    fp = fopen_prefix (fp1, "cells", prefix);
    output_cells_internal (fp);
    fclose (fp);
  }
  
  fp = fopen_prefix (fp1, "faces", prefix);
  foreach_face()
    fprintf (fp, "%s%g %g %g %d\n", prefix, x, y, z, level);
  if (!fp1)
    fclose (fp);

  fp = fopen_prefix (fp1, "vertices", prefix);
  foreach_vertex()
    fprintf (fp, "%s%g %g %g %d\n", prefix, x, y, z, level);
  if (!fp1)
    fclose (fp);

  fp = fopen_prefix (fp1, "neighbors", prefix);
  foreach() {
    int n = 0;
    foreach_neighbor(1)
      if (is_refined(cell))
	n++;
    fprintf (fp, "%s%g %g %g %d\n", prefix, x, y, z, cell.neighbors);
    assert (cell.neighbors == n);
  }
  if (!fp1)
    fclose (fp);

  MpiBoundary * mpi = (MpiBoundary *) mpi_boundary;
  
  fp = fopen_prefix (fp1, "mpi-level-rcv", prefix);
  rcv_pid_print (mpi->mpi_level.rcv, fp, prefix);
  if (!fp1)
    fclose (fp);

  fp = fopen_prefix (fp1, "mpi-level-root-rcv", prefix);
  rcv_pid_print (mpi->mpi_level_root.rcv, fp, prefix);
  if (!fp1)
    fclose (fp);
    
  fp = fopen_prefix (fp1, "mpi-restriction-rcv", prefix);
  rcv_pid_print (mpi->restriction.rcv, fp, prefix);
  if (!fp1)
    fclose (fp);
    
  fp = fopen_prefix (fp1, "mpi-level-snd", prefix);
  rcv_pid_print (mpi->mpi_level.snd, fp, prefix);
  if (!fp1)
    fclose (fp);

  fp = fopen_prefix (fp1, "mpi-level-root-snd", prefix);
  rcv_pid_print (mpi->mpi_level_root.snd, fp, prefix);
  if (!fp1)
    fclose (fp);

  fp = fopen_prefix (fp1, "mpi-restriction-snd", prefix);
  rcv_pid_print (mpi->restriction.snd, fp, prefix);
  if (!fp1)
    fclose (fp);
    
  fp = fopen_prefix (fp1, "mpi-border", prefix);
  foreach_cell() {
    if (is_border(cell))
      fprintf (fp, "%s%g %g %g %d %d %d\n",
	       prefix, x, y, z, level, cell.neighbors, cell.pid);
    else
      continue;
    if (is_leaf(cell))
      continue;
  }
  if (!fp1)
    fclose (fp);

  fp = fopen_prefix (fp1, "exterior", prefix);
  foreach_cell() {
    if (!is_local(cell))
      fprintf (fp, "%s%g %g %g %d %d %d %d\n",
	       prefix, x, y, z, level, cell.neighbors,
	       cell.pid, cell.flags & leaf);
#if 0
    else if (is_active(cell) && !is_border(cell))
      continue;
    if (is_leaf(cell))
      continue;
#endif
  }
  if (!fp1)
    fclose (fp);

  fp = fopen_prefix (fp1, "depth", prefix);
  fprintf (fp, "depth: %d %d\n", pid(), depth());
  fprintf (fp, "======= mpi_level.snd ======\n");
  RcvPid * snd = mpi->mpi_level.snd;
  for (int i = 0; i < snd->npid; i++)
    fprintf (fp, "%d %d %d\n", pid(), snd->rcv[i].pid, snd->rcv[i].maxdepth);
  fprintf (fp, "======= mpi_level.rcv ======\n");
  snd = mpi->mpi_level.rcv;
  for (int i = 0; i < snd->npid; i++)
    fprintf (fp, "%d %d %d\n", pid(), snd->rcv[i].pid, snd->rcv[i].maxdepth);
  if (!fp1)
    fclose (fp);

  fp = fopen_prefix (fp1, "refined", prefix);
  foreach_cache (tree->refined)
    fprintf (fp, "%s%g %g %g %d\n", prefix, x, y, z, level);
  if (!fp1)
    fclose (fp);
}

static void snd_rcv_free (SndRcv * p)
{
  char name[strlen(p->rcv->name) + 1];
  strcpy (name, p->rcv->name);
  rcv_pid_destroy (p->rcv);
  p->rcv = rcv_pid_new (name);
  strcpy (name, p->snd->name);
  rcv_pid_destroy (p->snd);
  p->snd = rcv_pid_new (name);
}

static bool is_root (Point point)
{
  if (is_refined(cell))
    foreach_child()
      if (is_local(cell))
	return true;
  return false;
}

// see src/figures/prolongation.svg
static bool is_local_prolongation (Point point, Point p)
{
#if dimension == 2
  struct { int x, y; } dp = {p.i - point.i, p.j - point.j};
#elif dimension == 3
  struct { int x, y, z; } dp = {p.i - point.i, p.j - point.j, p.k - point.k};
#endif
  foreach_dimension() {
    if (dp.x == 0 && (is_refined(neighbor(-1)) || is_refined(neighbor(1))))
      return true;
    if (is_refined(neighbor(dp.x)))
      return true;
  }
  return false;
}

#define is_remote(cell) (cell.pid >= 0 && cell.pid != pid())

static void append_pid (Array * pids, int pid)
{
  for (int i = 0, * p = (int *) pids->p; i < pids->len/sizeof(int); i++, p++)
    if (*p == pid)
      return;
  array_append (pids, &pid, sizeof(int));
}

static int locals_pids (Point point, Array * pids)
{
  if (is_leaf(cell)) { // prolongation
    if (is_local(cell)) {
      Point p = point;
      foreach_neighbor(1) {
	if (is_remote(cell) &&
	    (is_refined(cell) || is_local_prolongation (point, p)))
	  append_pid (pids, cell.pid);
	if (is_refined(cell))
	  foreach_child()
	    if (is_remote(cell))
	      append_pid (pids, cell.pid);
      }
    }
  }
  else
    foreach_neighbor(1) {
      if (is_remote(cell))
	append_pid (pids, cell.pid);
      if (is_refined(cell))
	foreach_child()
	  if (is_remote(cell))
	    append_pid (pids, cell.pid);
    }
  return pids->len/sizeof(int);
}

static int root_pids (Point point, Array * pids)
{
  foreach_child()
    if (is_remote(cell))
      append_pid (pids, cell.pid);
  return pids->len/sizeof(int);
}

// turns on tree_check() and co with outputs controlled by the condition
// #define DEBUGCOND (pid() >= 300 && pid() <= 400 && t > 0.0784876)

// turns on tree_check() and co without any outputs
// #define DEBUGCOND false

static void rcv_pid_row (RcvPid * m, int l, int * row)
{
  for (int i = 0; i < npe(); i++)
    row[i] = 0;
  for (int i = 0; i < m->npid; i++) {
    Rcv * rcv = &m->rcv[i];
    if (l <= rcv->depth && rcv->halo[l].n > 0)
      row[rcv->pid] = rcv->halo[l].n;
  }
}

void check_snd_rcv_matrix (SndRcv * sndrcv, const char * name)
{
  int maxlevel = depth();
  mpi_all_reduce (maxlevel, MPI_INT, MPI_MAX);
  int * row = qmalloc (npe(), int);
  for (int l = 0; l <= maxlevel; l++) {
    int status = 0;
    if (pid() == 0) { // master
      // send/receive matrix i.e.
      // send[i][j] = number of points sent/received from i to j
      int ** send = matrix_new (npe(), npe(), sizeof(int));
      int ** receive = matrix_new (npe(), npe(), sizeof(int));
      rcv_pid_row (sndrcv->snd, l, row);
      MPI_Gather (row, npe(), MPI_INT, &send[0][0], npe(), MPI_INT, 0,
		  MPI_COMM_WORLD); 
      rcv_pid_row (sndrcv->rcv, l, row);
      MPI_Gather (row, npe(), MPI_INT, &receive[0][0], npe(), MPI_INT, 0,
		  MPI_COMM_WORLD);

      int * astatus = qmalloc (npe(), int);
      for (int i = 0; i < npe(); i++)
	astatus[i] = 0;
      for (int i = 0; i < npe(); i++)
	for (int j = 0; j < npe(); j++)
	  if (send[i][j] != receive[j][i]) {
	    fprintf (stderr, "%s: %d sends    %d to   %d at level %d\n",
		     name, i, send[i][j], j, l);
	    fprintf (stderr, "%s: %d receives %d from %d at level %d\n",
		     name, j, receive[j][i], i, l);
	    fflush (stderr);
	    for (int k = i - 2; k <= i + 2; k++)
	      if (k >= 0 && k < npe())
		astatus[k] = 1;
	    for (int k = j - 2; k <= j + 2; k++)
	      if (k >= 0 && k < npe())
		astatus[k] = 1;
	  }
      MPI_Scatter (astatus, 1, MPI_INT, &status, 1, MPI_INT, 0, MPI_COMM_WORLD);
      free (astatus);

      matrix_free (send);
      matrix_free (receive);
    }
    else { // slave
      rcv_pid_row (sndrcv->snd, l, row);
      MPI_Gather (row, npe(), MPI_INT, NULL, npe(), MPI_INT, 0, MPI_COMM_WORLD); 
      rcv_pid_row (sndrcv->rcv, l, row);
      MPI_Gather (row, npe(), MPI_INT, NULL, npe(), MPI_INT, 0, MPI_COMM_WORLD);
      MPI_Scatter (NULL, 1, MPI_INT, &status, 1, MPI_INT, 0, MPI_COMM_WORLD);
    }
    if (status) {
      fprintf (stderr,
	       "check_snd_rcv_matrix \"%s\" failed\n"
	       "Calling debug_mpi(NULL)...\n"
	       "Aborting...\n",
	       name);
      fflush (stderr);
      debug_mpi (NULL);
      MPI_Abort (MPI_COMM_WORLD, -3);
    }
  }
  free (row);
}

static bool has_local_child (Point point)
{
  foreach_child()
    if (is_local(cell))
      return true;
  return false;
}

trace
void mpi_boundary_update_buffers()
{
  if (npe() == 1)
    return;

  prof_start ("mpi_boundary_update_buffers");

  MpiBoundary * m = (MpiBoundary *) mpi_boundary;
  SndRcv * mpi_level = &m->mpi_level;
  SndRcv * mpi_level_root = &m->mpi_level_root;
  SndRcv * restriction = &m->restriction;

  snd_rcv_free (mpi_level);
  snd_rcv_free (mpi_level_root);
  snd_rcv_free (restriction);
  
  static const unsigned short used = 1 << user;
  foreach_cell() {
    if (is_active(cell) && !is_border(cell))
      /* We skip the interior of the local domain.
	 Note that this can be commented out in case of suspicion that
	 something is wrong with border cell tagging. */
      continue;
    
    if (cell.neighbors) {
      // sending
      Array pids = {NULL, 0, 0};
      int n = locals_pids (point, &pids);
      if (n) {
	foreach_child()
	  if (is_local(cell))
	    for (int i = 0, * p = (int *) pids.p; i < n; i++, p++)
	      rcv_pid_append (mpi_level->snd, *p, point);
	free (pids.p);
      }
      // receiving
      bool locals = false;
      if (is_leaf(cell)) { // prolongation
	if (is_remote(cell)) {
	  Point p = point;
	  foreach_neighbor(1)
	    if ((is_local(cell) &&
		 (is_refined(cell) || is_local_prolongation (point, p))) ||
		is_root(point)) {
	      locals = true; break;
	    }
	}
      }
      else
	foreach_neighbor(1)
	  if (is_local(cell) || is_root(point)) {
	    locals = true; break;
	  }
      if (locals)
	foreach_child()
	  if (is_remote(cell))
            rcv_pid_append (mpi_level->rcv, cell.pid, point),
	      cell.flags |= used;
      
      // root cells
      if (!is_leaf(cell)) {
	// sending
	if (is_local(cell)) {
	  Array pids = {NULL, 0, 0};
	  // root cell
	  int n = root_pids (point, &pids);
	  if (n) {
	    foreach_neighbor()
	      for (int i = 0, * p = (int *) pids.p; i < n; i++, p++)
		if (cell.pid >= 0 && cell.pid != *p)
		  rcv_pid_append (mpi_level_root->snd, *p, point);
	    // restriction (remote root)
	    for (int i = 0, * p = (int *) pids.p; i < n; i++, p++)
	      rcv_pid_append (restriction->snd, *p, point);
	    free (pids.p);
	  }
	}
	// receiving
	else if (is_remote(cell)) {
	  bool root = false;
	  foreach_child()
	    if (is_local(cell)) {
	      root = true; break;
	    }
	  if (root) {
	    int pid = cell.pid;
	    foreach_neighbor()
	      if (is_remote(cell))
		rcv_pid_append (mpi_level_root->rcv, pid, point),
		  cell.flags |= used;
	    // restriction (remote root)
	    rcv_pid_append (restriction->rcv, pid, point);
	  }
	}
      }
    }

    // restriction (remote siblings/children)
    if (level > 0) {
      if (is_local(cell)) {
	// sending
	Array pids = {NULL, 0, 0};
	if (is_remote(aparent(0)))
	  append_pid (&pids, aparent(0).pid);
	int n = root_pids (parent, &pids);
	if (n) {
	  for (int i = 0, * p = (int *) pids.p; i < n; i++, p++)
	    rcv_pid_append (restriction->snd, *p, point);
	  free (pids.p);
	}
      }
      else if (is_remote(cell)) {
	// receiving
	if (is_local(aparent(0)) || has_local_child (parent))
	  rcv_pid_append (restriction->rcv, cell.pid, point);
      }
    }
  }
    
  /* we remove unused cells
     we do a breadth-first traversal from fine to coarse, so that
     coarsening of unused cells can proceed fully. */
  
  static const unsigned short keep = 1 << (user + 1);
  for (int l = depth(); l >= 0; l--)
    foreach_cell()
      if (level == l) {
	if (level > 0 && (cell.pid < 0 || is_local(cell) || (cell.flags & used)))
	  aparent(0).flags |= keep;
	if (is_refined(cell) && !(cell.flags & keep))
	  coarsen_cell (point, NULL);
	cell.flags &= ~(used|keep);
	continue; // level == l
      }

  /* we update the list of send/receive pids */
  m->send->len = m->receive->len = 0;
  rcv_pid_append_pids (mpi_level->snd, m->send);
  rcv_pid_append_pids (mpi_level_root->snd, m->send);
  rcv_pid_append_pids (mpi_level->rcv, m->receive);
  rcv_pid_append_pids (mpi_level_root->rcv, m->receive);
  
  prof_stop();

#if DEBUG_MPI
  debug_mpi (NULL);
#endif

#ifdef DEBUGCOND
  extern double t; NOT_UNUSED(t);
  check_snd_rcv_matrix (mpi_level, "mpi_level");
  check_snd_rcv_matrix (mpi_level_root, "mpi_level_root");
  check_snd_rcv_matrix (restriction, "restriction");
  if (DEBUGCOND)
    debug_mpi (NULL);
  tree_check();
#endif
}

trace
void mpi_boundary_refine (scalar * list)
{
  prof_start ("mpi_boundary_refine");

  MpiBoundary * mpi = (MpiBoundary *) mpi_boundary;

  /* Send refinement cache to each neighboring process. */
  Array * snd = mpi->send;
  MPI_Request r[2*snd->len/sizeof(int)];
  int nr = 0;
  for (int i = 0, * dest = snd->p; i < snd->len/sizeof(int); i++,dest++) {
    int len = tree->refined.n;
    MPI_Isend (&tree->refined.n, 1, MPI_INT, *dest,
	       REFINE_TAG(), MPI_COMM_WORLD, &r[nr++]);
    if (len > 0)
      MPI_Isend (tree->refined.p, sizeof(Index)/sizeof(int)*len,
		 MPI_INT, *dest, REFINE_TAG(), MPI_COMM_WORLD, &r[nr++]);
  }

  /* Receive refinement cache from each neighboring process. 
   fixme: use non-blocking receives */
  Array * rcv = mpi->receive;
  Cache rerefined = {NULL, 0, 0};
  for (int i = 0, * source = rcv->p; i < rcv->len/sizeof(int); i++,source++) {
    int len;
    mpi_recv_check (&len, 1, MPI_INT, *source, REFINE_TAG(),
		    MPI_COMM_WORLD, MPI_STATUS_IGNORE,
		    "mpi_boundary_refine (len)");
    if (len > 0) {
      Index p[len];
      mpi_recv_check (p, sizeof(Index)/sizeof(int)*len,
		      MPI_INT, *source, REFINE_TAG(),
		      MPI_COMM_WORLD, MPI_STATUS_IGNORE,
		      "mpi_boundary_refine (p)");
      Cache refined = {p, len, len};
      foreach_cache (refined)
	if (level <= depth() && allocated(0)) {
	  if (is_leaf(cell)) {
	    bool neighbors = false;
	    foreach_neighbor()
	      if (allocated(0) && (is_active(cell) || is_local(aparent(0)))) {
		neighbors = true; break;
	      }
	    // refine the cell only if it has local neighbors
	    if (neighbors)
	      refine_cell (point, list, 0, &rerefined);
	  }
	}
    }
  }

  /* check that caches were received OK and free ressources */
  if (nr)
    MPI_Waitall (nr, r, MPI_STATUSES_IGNORE);

  /* update the refinement cache with "re-refined" cells */
  free (tree->refined.p);
  tree->refined = rerefined;
  
  prof_stop();

  /* if any cell has been re-refined, we repeat the process to take care
     of recursive refinements induced by the 2:1 constraint */
  mpi_all_reduce (rerefined.n, MPI_INT, MPI_SUM);
  if (rerefined.n)
    mpi_boundary_refine (list);
  for (scalar s in list)
    set_dirty_stencil (s);
}

static void check_depth()
{
#if DEBUG_MPI 
  int max = 0;
  foreach_cell_all()
    if (level > max)
      max = level;
  if (depth() != max) {
    FILE * fp = fopen ("layer", "w");
    fprintf (fp, "depth() = %d, max = %d\n", depth(), max);
    for (int l = 0; l <= depth(); l++) {
      Layer * L = tree->L[l];
      fprintf (fp, "Layer level = %d, nc = %d, len = %d\n", l, L->nc, L->len);
      for (int i = 0; i < L->len; i++)
	if (L->m[i]) {
	  fprintf (fp, "  i = %d, refcount = %d\n", i,
		   *((int *)(((char *)L->m[i]) + L->len*sizeof(char *))));
	  for (int j = 0; j < L->len; j++)
	    if (L->m[i][j]) {
	      fprintf (fp, "    j = %d\n", j);
	      fprintf (fp, "point %g %g %d\n",
		       X0 + (i - 1.5)*L0/(1 << l), Y0 + (j - 1.5)*L0/(1 << l),
		       l);
	    }
	}
    }
    fclose (fp);
    fp = fopen ("colls", "w");
    output_cells_internal (fp);
    fclose (fp);
    assert (false);
  }
#endif
}

typedef struct {
  int refined, leaf;
} Remote;

#define REMOTE() ((Remote *)&val(remote,0))

trace
void mpi_boundary_coarsen (int l, int too_fine)
{
  if (npe() == 1)
    return;
  
  check_depth();

  assert (sizeof(Remote) == sizeof(double));
  
  scalar remote[];
  foreach_cell() {
    if (level == l) {
      if (is_local(cell)) {
	REMOTE()->refined = is_refined(cell);
	REMOTE()->leaf = is_leaf(cell);
      }
      else {
	REMOTE()->refined = true;
	REMOTE()->leaf = false;
      }
      continue;
    }
    if (is_leaf(cell))
      continue;
  }
  mpi_boundary_level (mpi_boundary, {remote}, l);
  
  foreach_cell() {
    if (level == l) {
      if (!is_local(cell)) {
	if (is_refined(cell) && !REMOTE()->refined)
	  coarsen_cell_recursive (point, NULL);
	else if (is_leaf(cell) && cell.neighbors && REMOTE()->leaf) {
	  int pid = cell.pid;
	  foreach_child()
	    cell.pid = pid;
	}
      }
      continue;
    }
    if (is_leaf(cell))
      continue;
  }

  check_depth();

  if (l > 0) {
    foreach_cell() {
      if (level == l) {
	remote[] = is_local(cell) ? cell.neighbors : 0;
	continue;
      }
      if (is_leaf(cell))
	continue;
    }
    mpi_boundary_level (mpi_boundary, {remote}, l);
    foreach_cell() {
      if (level == l)
	if (!is_local(cell) && is_local(aparent(0)) && remote[]) {
	  aparent(0).flags &= ~too_fine;
	  continue;
	}
      if (is_leaf(cell))
	continue;
    }
  }
}

static void flag_border_cells()
{
  foreach_cell() {
    if (is_active(cell)) {
      short flags = cell.flags & ~border;
      foreach_neighbor() {
	if (!is_local(cell) || (level > 0 && !is_local(aparent(0)))) {
	  flags |= border; break;
	}
	// root cell
	if (is_refined_check())
	  foreach_child()
	    if (!is_local(cell)) {
	      flags |= border; break;
	    }
	if (flags & border)
	  break;
      }
      cell.flags = flags;
    }
    else {
      cell.flags &= ~border;
      //      continue; // fixme
    }
    if (is_leaf(cell)) {
      if (cell.neighbors) {
	foreach_child()
	  cell.flags &= ~border;
	if (is_border(cell)) {
	  bool remote = false;
	  foreach_neighbor (GHOSTS/2)
	    if (!is_local(cell)) {
	      remote = true; break;
	    }
	  if (remote)
	    foreach_child()
	      cell.flags |= border;
	}
      }
      continue;
    }
  }
}

static int balanced_pid (long index, long nt, int nproc)
{
  long ne = max(1, nt/nproc), nr = nt % nproc;
  int pid = index < nr*(ne + 1) ?
    index/(ne + 1) :
    nr + (index - nr*(ne + 1))/ne;
  return min(nproc - 1, pid);
}

// static partitioning: only used for tests
trace
void mpi_partitioning()
{
  prof_start ("mpi_partitioning");

  long nt = 0;
  foreach (serial)
    nt++;

  /* set the pid of each cell */
  long i = 0;
  tree->dirty = true;
  foreach_cell_post (is_active (cell))
    if (is_active (cell)) {
      if (is_leaf (cell)) {
	cell.pid = balanced_pid (i++, nt, npe());
	if (cell.neighbors > 0) {
	  int pid = cell.pid;
	  foreach_child()
	    cell.pid = pid;
	}
	if (!is_local(cell))
	  cell.flags &= ~active;
      }
      else {
	cell.pid = child(0).pid;
	bool inactive = true;
	foreach_child()
	  if (is_active(cell)) {
	    inactive = false; break;
	  }
	if (inactive)
	  cell.flags &= ~active;
      }
    }

  flag_border_cells();
  
  prof_stop();
  
  mpi_boundary_update_buffers();
}

void restore_mpi (FILE * fp, scalar * list1)
{
  long index = 0, nt = 0, start = ftell (fp);
  scalar size[], * list = list_concat ({size}, list1);;
  long offset = sizeof(double)*list_len(list);

  // read local cells
  static const unsigned short set = 1 << user;
  scalar * listm = is_constant(cm) ? NULL : (scalar *){fm};
  foreach_cell()
    if (balanced_pid (index, nt, npe()) <= pid()) {
      unsigned flags;
      if (fread (&flags, sizeof(unsigned), 1, fp) != 1) {
	fprintf (stderr, "restore(): error: expecting 'flags'\n");
	exit (1);
      }
      for (scalar s in list) {
	double val;
	if (fread (&val, sizeof(double), 1, fp) != 1) {
	  fprintf (stderr, "restore(): error: expecting scalar\n");
	  exit (1);
	}
	if (s.i != INT_MAX)
	  s[] = val;
      }
      if (level == 0)
	nt = size[];
      cell.pid = balanced_pid (index, nt, npe());
      cell.flags |= set;
      if (!(flags & leaf) && is_leaf(cell)) {
	if (balanced_pid (index + size[] - 1, nt, npe()) < pid()) {
	  fseek (fp, (sizeof(unsigned) + offset)*(size[] - 1), SEEK_CUR);
	  index += size[];
	  continue;
	}
	refine_cell (point, listm, 0, NULL);
      }
      index++;
      if (is_leaf(cell))
	continue;
    }

  // read non-local neighbors
  fseek (fp, start, SEEK_SET);
  index = 0;
  foreach_cell() {
    unsigned flags;
    if (fread (&flags, sizeof(unsigned), 1, fp) != 1) {
      fprintf (stderr, "restore(): error: expecting 'flags'\n");
      exit (1);
    }
    if (cell.flags & set)
      fseek (fp, offset, SEEK_CUR);
    else {
      for (scalar s in list) {
	double val;
	if (fread (&val, sizeof(double), 1, fp) != 1) {
	  fprintf (stderr, "restore(): error: expecting a scalar\n");
	  exit (1);
	}
	if (s.i != INT_MAX)
	  s[] = val;
      }
      cell.pid = balanced_pid (index, nt, npe());
      if (is_leaf(cell) && cell.neighbors) {
	int pid = cell.pid;
	foreach_child()
	  cell.pid = pid;
      }
    }
    if (!(flags & leaf) && is_leaf(cell)) {
      bool locals = false;
      foreach_neighbor(1)
	if ((cell.flags & set) && (is_local(cell) || is_root(point))) {
	  locals = true; break;
	}
      if (locals)
	refine_cell (point, listm, 0, NULL);
      else {
	fseek (fp, (sizeof(unsigned) + offset)*(size[] - 1), SEEK_CUR);
	index += size[];
	continue;
      }
    }
    index++;
    if (is_leaf(cell))
      continue;
  }

  /* set active flags */
  foreach_cell_post (is_active (cell)) {
    cell.flags &= ~set;
    if (is_active (cell)) {
      if (is_leaf (cell)) {
	if (cell.neighbors > 0) {
	  int pid = cell.pid;
	  foreach_child()
	    cell.pid = pid;
	}
	if (!is_local(cell))
	  cell.flags &= ~active;
      }
      else if (!is_local(cell)) {
	bool inactive = true;
	foreach_child()
	  if (is_active(cell)) {
	    inactive = false; break;
	  }
	if (inactive)
	  cell.flags &= ~active;
      }
    }
  }

  flag_border_cells();
  
  scalar * clean = NULL;
  for (scalar s in list)
    if (s.i != INT_MAX)
      clean = list_append (clean, s);
  free (list);
  mpi_boundary_update (clean);
  free (clean);
}

*z_indexing()*: fills *index* with the Z-ordering index.

If leaves is true only leaves are indexed, otherwise all active cells are indexed.

On the master process (pid() == 0), the function returns the (global) maximum index (and -1 on all other processes).

On a single processor, we would just need something like (for leaves)

~~~literatec double i = 0; foreach() index[] = i++; ~~~

In parallel, this is a bit more difficult.

trace
double z_indexing (scalar index, bool leaves)
{

We first compute the size of each subtree.

scalar size[];
  subtree_size (size, leaves);

The maximum index value is the size of the entire tree (i.e. the value of size in the root cell on the master process) minus one.

double maxi = -1.;
  if (pid() == 0)
    foreach_level(0, serial)
      maxi = size[] - 1.;

fixme: doc

foreach_level(0)
    index[] = 0;
  for (int l = 0; l < depth(); l++) {
    boundary_iterate (restriction, {index}, l);
    foreach_cell() {
      if (level == l) {
	if (is_leaf(cell)) {
	  if (is_local(cell) && cell.neighbors) {
	    int i = index[];
	    foreach_child()
	      index[] = i;
	  }
	}
	else { // not leaf
	  bool loc = is_local(cell);
	  if (!loc)
	    foreach_child()
	      if (is_local(cell)) {
		loc = true; break;
	      }
	  if (loc) {
	    int i = index[] + !leaves;
	    foreach_child() {
	      index[] = i;
	      i += size[]; 
	    }
	  }
	}
	continue; // level == l
      }
      if (is_leaf(cell))
	continue;
    }
  }
  boundary_iterate (restriction, {index}, depth());

  return maxi;
}