/* Destination word for the one-word "ack" stores (value 1) issued by the
   reduce and broadcast code in this file.  Only its address is used in the
   code visible here; the stored value is never read back. */
static int ack_slot;

/* Receive / send counters handed to the store and sync calls of the reduce code. */
static Counter red_rctr, red_sctr;
/* Landing slots for reduce values stored by child boxes.  A sender picks the
   slot on its parent with (its own box number & 1). */
static jlong reduce_buf[2];

/* Receive / send counters handed to the store and sync calls of the broadcast code. */
static Counter bcast_rctr, bcast_sctr;
/* Landing slot for the broadcast value stored by the parent box. */
#ifdef WIDE_POINTERS
static jGPointer bcast_buf; /* sizeof(jGPointer) on 64-bit is > sizeof(jlong) */ 
#else
static jlong bcast_buf;
#endif



/* This is a pipelined implementation of reduce. Each node sends acks to its
	 children indicating when it is ready for another value. */
#ifdef COMM_AM2
/*
 * REDUCE_CODE(NAME, TYPE, COMB, PUT) -- variant built when COMM_AM2 is defined.
 *
 * Instantiating the macro defines, for one reduction operator, the static
 * storage local_reduce_buf_NAME[MAX_BOX_PROCS] and global_reduce_val_NAME,
 * plus two functions:
 *
 *   TYPE global_NAME(TYPE val)
 *     Inter-box stage, run by one proxy processor per box.  Boxes form a
 *     binary tree rooted at box 0: box b reports to box (b - 1) / 2 and
 *     collects from boxes 2b + 1 ("odd") and 2b + 2 ("even").  A child stores
 *     its value into reduce_buf[child & 1] on the parent, so the odd child
 *     lands in slot 1 and the even child in slot 0.  A non-root box first
 *     waits for its parent's ack; every box then waits for the children it
 *     has, acks them, and (unless it is box 0) forwards
 *     COMB(COMB(val, odd), even) to its parent with PUT.
 *     Returns the combined value on box 0; every other box returns `val`
 *     unchanged.
 *
 *   TYPE NAME(TYPE val)
 *     Entry point.  With more than one processor in the box, processor 0
 *     combines the per-processor values between two local_barrier() calls
 *     (as COMB(slot[i], running)), runs global_NAME unless BOXES == 1, and
 *     all processors return global_reduce_val_NAME.  With one processor per
 *     box it calls global_NAME directly.
 *
 * Parameters:
 *   NAME  identifier of the generated entry point (also pasted into the
 *         helper and storage names)
 *   TYPE  element type; received values are read through a TYPE* cast of the
 *         jlong reduce_buf slots (no size check is made here)
 *   COMB  two-argument combiner, used as COMB(a, b)
 *   PUT   remote store, used as PUT(dest, value, remote_rctr, &red_sctr)
 */
#define REDUCE_CODE(NAME, TYPE, COMB, PUT)                                          \
                                                                                    \
static TYPE volatile local_reduce_buf_ ## NAME [MAX_BOX_PROCS];                     \
static TYPE volatile global_reduce_val_ ## NAME;                                    \
                                                                                    \
/* Only the proxy processor will run this */                                        \
TYPE global_ ## NAME(TYPE val)                                                      \
{                                                                                   \
  TYPE lbuf[2];                                                                     \
  int parent_box = (MYBOX - 1) / 2;                                                 \
  int odd_box = 2 * MYBOX + 1;                                                      \
  int even_box = 2 * MYBOX + 2;                                                     \
  int parity = MYBOX & 1;                                                           \
  /* Wait for the ack from our parent.  Box 0 is the root and has no parent, */     \
  /* so it must skip this wait; only non-root boxes are ever acked.          */     \
  if (MYBOX != 0)  store_sync_ctr(sizeof(jint), &red_rctr, &red_sctr);              \
                                                                                    \
  if (MYBOX == 0) {                                                                 \
    if (BOXES == 1) {                                                               \
       tic_poll();                                                                  \
       return(val);                                                                 \
    } else if (BOXES == 2) {                                                        \
      /* root with a single (odd) child */                                          \
      store_sync_ctr(sizeof(TYPE), &red_rctr, &red_sctr);                           \
      lbuf[1] = *((TYPE *)&reduce_buf[1]);                                          \
      /* ack child */                                                               \
      __i_store_ctr(toglobalb_static(odd_box, &ack_slot), 1,                        \
                    TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,odd_box), &red_sctr);      \
      return(COMB (val, lbuf[1]));                                                  \
    } else {                                                                        \
      /* root with both children */                                                 \
      store_sync_ctr(2*sizeof(TYPE), &red_rctr, &red_sctr);                         \
      lbuf[0] = *((TYPE *)&reduce_buf[0]);                                          \
      lbuf[1] = *((TYPE *)&reduce_buf[1]);                                          \
      /* ack children */                                                            \
      __i_store_ctr(toglobalb_static(odd_box, &ack_slot), 1,                        \
                    TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,odd_box), &red_sctr);      \
      __i_store_ctr(toglobalb_static(even_box, &ack_slot), 1,                       \
                    TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,even_box), &red_sctr);     \
      return (COMB (COMB (val, lbuf[1]), lbuf[0]));                                 \
    }                                                                               \
  } else {                                                                          \
    /* our slot on the parent is selected by our own parity */                      \
    jGPointer dest = toglobalb_static(parent_box, &reduce_buf[parity]);             \
                                                                                    \
    if (odd_box >= BOXES) {     /* Not expecting anything from any child */         \
       PUT(dest, val, TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,parent_box), &red_sctr); \
    } else if (even_box >= BOXES) {                                                 \
      /* Not expecting anything from an even child, but wait for the odd one. */    \
       store_sync_ctr(sizeof(TYPE), &red_rctr, &red_sctr);                          \
       lbuf[1] = *((TYPE *)&reduce_buf[1]);                                         \
       /* ack child */                                                              \
       __i_store_ctr(toglobalb_static(odd_box, &ack_slot), 1,                       \
                     TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,odd_box), &red_sctr);     \
       PUT(dest, COMB(val, lbuf[1]),                                                \
           TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,parent_box), &red_sctr);            \
    } else {                                                                        \
       store_sync_ctr(2*sizeof(TYPE), &red_rctr, &red_sctr);                        \
       lbuf[0] = *((TYPE *)&reduce_buf[0]);                                         \
       lbuf[1] = *((TYPE *)&reduce_buf[1]);                                         \
       /* ack children */                                                           \
       __i_store_ctr(toglobalb_static(odd_box, &ack_slot), 1,                       \
                     TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,odd_box), &red_sctr);     \
       __i_store_ctr(toglobalb_static(even_box, &ack_slot), 1,                      \
                     TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,even_box), &red_sctr);    \
       PUT(dest, COMB (COMB (val, lbuf[1]), lbuf[0]),                               \
           TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,parent_box), &red_sctr);            \
    }                                                                               \
  }                                                                                 \
  /* non-root boxes hand their own value back; the result lives on box 0 */         \
  return(val);                                                                      \
}                                                                                   \
                                                                                    \
TYPE NAME (TYPE val)                                                                \
{                                                                                   \
  local_reduce_buf_ ## NAME [MYBOXPROC] = val;                                      \
                                                                                    \
  if (MYBOXPROCS > 1) {                                                             \
    local_barrier();                                                                \
    if (MYBOXPROC == 0) {                                                           \
      int i;                                                                        \
      TYPE local_val = local_reduce_buf_ ## NAME [0];                               \
      for(i = 1; i < MYBOXPROCS; i++) {                                             \
        local_val = COMB(local_reduce_buf_ ## NAME [i], local_val);                 \
      }                                                                             \
      if (BOXES == 1) { /* Are we the only box here? */                             \
        global_reduce_val_ ## NAME = local_val;                                     \
      } else {                                                                      \
        global_reduce_val_ ## NAME = global_ ## NAME (local_val);                   \
      }                                                                             \
      local_barrier();                                                              \
      return global_reduce_val_ ## NAME;                                            \
    }                                                                               \
    else {                                                                          \
      local_barrier();                                                              \
      return global_reduce_val_ ## NAME;                                            \
    }                                                                               \
  }                                                                                 \
  else { /* One processor per box, runtime-determined */                            \
    return global_reduce_val_ ## NAME = global_ ## NAME(val);                       \
  }                                                                                 \
}

#else /* not AMII */

/*
 * REDUCE_CODE(NAME, TYPE, COMB, PUT) -- variant built without COMM_AM2.
 *
 * Generates `TYPE NAME(TYPE val)`, a reduction over the processors of the
 * calling box only, together with its private static storage
 * (local_reduce_buf_NAME[MAX_BOX_PROCS] and global_reduce_val_NAME).
 *
 *   - Every caller first deposits `val` in local_reduce_buf_NAME[MYBOXPROC].
 *   - If MYBOXPROCS is not greater than 1, `val` is returned as is.
 *   - Otherwise, after a local_barrier(), processor 0 folds slots
 *     1 .. MYBOXPROCS-1 into slot 0's value with COMB(slot[i], running) and
 *     stores the result in global_reduce_val_NAME.  A second local_barrier()
 *     follows and every processor returns global_reduce_val_NAME.
 *
 * PUT is not used by this expansion.
 */
#define REDUCE_CODE(NAME, TYPE, COMB, PUT)                              \
                                                                        \
static TYPE volatile local_reduce_buf_ ## NAME [MAX_BOX_PROCS];         \
static TYPE volatile global_reduce_val_ ## NAME;                        \
                                                                        \
TYPE NAME (TYPE val)                                                    \
{                                                                       \
  /* Deposit this processor's contribution. */                          \
  local_reduce_buf_ ## NAME [MYBOXPROC] = val;                          \
                                                                        \
  /* One processor per box (runtime-determined): nothing to combine. */ \
  if (!(MYBOXPROCS > 1)) {                                              \
    return val;                                                         \
  }                                                                     \
                                                                        \
  local_barrier();   /* precedes processor 0's pass over the slots */   \
  if (MYBOXPROC == 0) {                                                 \
    TYPE box_total = local_reduce_buf_ ## NAME [0];                     \
    int slot = 1;                                                       \
    while (slot < MYBOXPROCS) {                                         \
      box_total = COMB(local_reduce_buf_ ## NAME [slot], box_total);    \
      slot++;                                                           \
    }                                                                   \
    global_reduce_val_ ## NAME = box_total;                             \
  }                                                                     \
  local_barrier();   /* precedes everybody's read of the result */      \
  return global_reduce_val_ ## NAME;                                    \
}
#endif /* AMII */



/* This is a pipelined bcast. Each node sends an ack to its parent when it is
	 ready to receive another value. */

/* The broadcast_phase field of the object returned by myProcess().  BCAST_CODE
   reads it to pick one of its two local_bcast_val_ slots and then stores back
   the logical negation of the value it read. */
#define BCAST_PHASE (myProcess()->broadcast_phase)

#ifdef COMM_AM2

/*
 * BCAST_CODE(NAME, TYPE, PUT) -- variant built when COMM_AM2 is defined.
 *
 * Instantiating the macro yields a two-slot static buffer
 * local_bcast_val_NAME[2] and two functions.
 *
 *   TYPE global_NAME(TYPE val)
 *     Inter-box stage.  Boxes form a binary tree rooted at box 0 (the parent
 *     of box b is (b - 1) / 2, its children are 2b + 1 and 2b + 2).  In this
 *     order, a box
 *       1. waits for one ack per child it has,
 *       2. if it is not box 0: waits for one TYPE to arrive, loads it from
 *          bcast_buf into `val`, and stores an ack to its parent,
 *       3. PUTs `val` into bcast_buf of each child it has.
 *     Returns `val` (on non-root boxes, the value that was received).
 *     Asserts that bcast_buf is at least as large as TYPE.
 *
 *   TYPE NAME(TYPE val)
 *     Entry point.  With one processor per box it is just global_NAME(val).
 *     With several, each call uses the slot selected by BCAST_PHASE (and
 *     flips that flag): on the box that hosts processor 0, processor 0
 *     stores `val` (replaced by global_NAME's result when BOXES > 1); on
 *     any other box, the box's processor 0 stores global_NAME's result.
 *     After a local_barrier() every processor returns the slot's content.
 *
 * Parameters:
 *   NAME  identifier of the generated entry point (also pasted into the
 *         helper and storage names)
 *   TYPE  element type
 *   PUT   remote store, used as PUT(dest, value, remote_rctr, &bcast_sctr)
 */
#define BCAST_CODE(NAME, TYPE, PUT)                                                  \
                                                                                     \
static TYPE volatile local_bcast_val_ ## NAME[2];                                    \
                                                                                     \
TYPE global_ ## NAME(TYPE val)                                                       \
{                                                                                    \
  int upstream = (MYBOX - 1) / 2;                                                    \
  int kid_lo = 2 * MYBOX + 1;                                                        \
  int kid_hi = 2 * MYBOX + 2;                                                        \
  assert(sizeof(bcast_buf) >= sizeof(TYPE));                                         \
                                                                                     \
  /* Step 1: one ack from each existing child. */                                    \
  if (kid_lo < BOXES)                                                                \
    store_sync_ctr (sizeof(jint), &bcast_rctr, &bcast_sctr);                         \
  if (kid_hi < BOXES)                                                                \
    store_sync_ctr (sizeof(jint), &bcast_rctr, &bcast_sctr);                         \
                                                                                     \
  /* Step 2: every box but the root takes its value from the parent ... */           \
  if (MYBOX != 0) {                                                                  \
    store_sync_ctr(sizeof(TYPE), &bcast_rctr, &bcast_sctr);                          \
    val = *((TYPE*) &bcast_buf);                                                     \
    /* ... and acks the parent. */                                                   \
    __i_store_ctr(toglobalb_static(upstream, &ack_slot), 1,                          \
                  TIC_TRANSLATE_CSTATIC_ADDR(&bcast_rctr,upstream), &bcast_sctr);    \
  }                                                                                  \
                                                                                     \
  /* Step 3: pass the value on to each existing child. */                            \
  if (kid_lo < BOXES)                                                                \
    PUT(toglobalb_static(kid_lo, (TYPE*)&bcast_buf), val,                            \
        TIC_TRANSLATE_CSTATIC_ADDR(&bcast_rctr,kid_lo), &bcast_sctr);                \
  if (kid_hi < BOXES)                                                                \
    PUT(toglobalb_static(kid_hi, (TYPE*)&bcast_buf), val,                            \
        TIC_TRANSLATE_CSTATIC_ADDR(&bcast_rctr,kid_hi), &bcast_sctr);                \
  return val;                                                                        \
}                                                                                    \
                                                                                     \
TYPE NAME (TYPE val) {                                                               \
  TIC_BEGIN_FUNCTION                                                                 \
  if (MYBOXPROCS > 1) { /* SMP box */                                                \
    int const slot = BCAST_PHASE;                                                    \
    BCAST_PHASE = !(slot);                                                           \
    if (MYBOX != COMM_GetBoxNumberForProcNumber(0)) {                                \
      /* The broadcaster is on another box: box-processor 0 fetches the value. */    \
      if (MYBOXPROC == 0) {                                                          \
        local_bcast_val_ ## NAME [slot] = global_ ## NAME ( val );                   \
      }                                                                              \
    } else if (MYPROC == 0) {                                                        \
      /* This processor is the broadcaster. */                                       \
      local_bcast_val_ ## NAME [slot] = val;                                         \
      if (BOXES > 1) { /* on a cluster: run the inter-box stage too */               \
        local_bcast_val_ ## NAME [slot] = global_ ## NAME (val);                     \
      }                                                                              \
    }                                                                                \
    local_barrier();                                                                 \
    return *(TYPE *)&(local_bcast_val_ ## NAME [slot]);                              \
  }                                                                                  \
  /* Only one processor per box. */                                                  \
  return global_ ## NAME ( val );                                                    \
}

#else /* NOT AMII */

/*
 * BCAST_CODE(NAME, TYPE, PUT) -- variant built without COMM_AM2.
 *
 * Generates `TYPE NAME(TYPE val)` and a two-slot static buffer
 * local_bcast_val_NAME[2].
 *
 *   - If MYBOXPROCS is not greater than 1, `val` is returned unchanged.
 *   - Otherwise the call uses the slot selected by BCAST_PHASE and flips that
 *     flag; the processor with MYPROC == 0 stores its `val` in the slot, a
 *     local_barrier() follows, and every processor returns the slot's content.
 *
 * PUT is not used by this expansion.
 */
#define BCAST_CODE(NAME, TYPE, PUT)                                       \
                                                                          \
static TYPE volatile local_bcast_val_ ## NAME[2];                         \
                                                                          \
TYPE NAME (TYPE val) {                                                    \
  TIC_BEGIN_FUNCTION                                                      \
  if (MYBOXPROCS > 1) { /* SMP box */                                     \
    int const slot = BCAST_PHASE;                                         \
    BCAST_PHASE = !(slot);                                                \
                                                                          \
    /* Only the broadcaster writes the slot. */                           \
    if (MYPROC == 0) {                                                    \
      local_bcast_val_ ## NAME [slot] = val;                              \
    }                                                                     \
    local_barrier();                                                      \
    return *(TYPE *)&(local_bcast_val_ ## NAME [slot]);                   \
  }                                                                       \
  /* Single processor in the box: the value is already where it belongs. */ \
  return val;                                                             \
}

#endif /* AMII */

/*
 * REDUCE_TO_ALL_CODE(NAME, BCAST_NAME, REDUCE_NAME, TYPE)
 *
 * Generates `TYPE NAME(TYPE val)`, which passes `val` to REDUCE_NAME and then
 * returns whatever BCAST_NAME yields for that reduced value.
 *
 *   NAME         identifier of the generated function
 *   BCAST_NAME   callable taking and returning a TYPE
 *   REDUCE_NAME  callable taking and returning a TYPE
 *   TYPE         value type
 */
#define REDUCE_TO_ALL_CODE(NAME, BCAST_NAME, REDUCE_NAME, TYPE) \
                                                                \
TYPE NAME(TYPE val)                                             \
{                                                               \
    /* Reduce first; the temporary keeps it to one evaluation. */ \
    TYPE reduced = REDUCE_NAME(val);                            \
                                                                \
    return BCAST_NAME(reduced);                                 \
}


/* Number of elements in the scan scratch arrays below.
   NOTE(review): presumably an upper bound on log2(PROCS) -- confirm. */
#define MAX_LOGP	20

/* Use two phase counter to avoid barriers. */
/* `phase` selects which element of scan_rctr / scan_sctr a SCAN_CODE-generated
   function uses; each call toggles it between 0 and 1. */
static int phase = 0;
static Counter scan_rctr[2], scan_sctr[2];
/* Slots into which partial scan values are PUT for this processor to combine. */
static long long scan_buf[MAX_LOGP];
/* Running partial results recorded locally while combining scan_buf entries. */
static long long temp_scan_val[MAX_LOGP];
/* Slot into which the value read back as result_val by SCAN_CODE is PUT. */
static long long scan_result_buf;

/* A slightly convoluted implementation of scan; to be cleaned
   up later. The algorithm works in two phases: first, each
   processor waits for all the values from the tree rooted at
   MYPROC; then, once it has received them all, it spreads the
   results back out in the second phase.

   DOB: the SCAN_CODE below is totally broken - do not use!
   It's incorrect for hierarchical configurations (CLUMP), and
   doesn't handle static data translation correctly.

*/

/*
 * SCAN_CODE(NAME, TYPE, COMB, PUT)
 *
 * Expands to `TYPE NAME(TYPE val)`, the scan described in the comment that
 * precedes this macro (which also flags it as broken -- do not use).  What
 * the generated function does, in order:
 *
 *   1. Zeroes scan_sctr[phase] and scan_rctr[phase] while holding
 *      Store_mutex, then flips `phase`; the rest of the call uses the
 *      counters of the new phase.
 *   2. Walks bit_vec = 1, 2, 4, ... until it reaches a bit that is set in
 *      MYPROC or bit_vec equals PROCS.  `count` is the number of bits
 *      skipped; `dest` is MYPROC with the bit found cleared.
 *   3. Waits (store_sync_ctr) for count * sizeof(TYPE) bytes, then folds
 *      scan_buf[1..count] into its own value with COMB, recording each
 *      partial result in temp_scan_val[0..count].
 *   4. PUTs the accumulated value into scan_buf[count + 1] on `dest`.
 *   5. Processors other than 0 wait for one TYPE and read it from
 *      scan_result_buf as result_val.  Every processor then, for each
 *      remaining bit, PUTs into scan_result_buf of (MYPROC | bit_vec):
 *      processor 0 sends temp_scan_val[count], the others send
 *      COMB(result_val, temp_scan_val[count]).
 *   6. Processor 0 returns `val`; the others return COMB(result_val, val).
 *
 * NOTE(review): for MYPROC == 0 the loop in step 2 stops only when bit_vec
 * equals PROCS exactly, so PROCS is presumably expected to be a power of
 * two -- confirm.
 * NOTE(review): scan_buf[count + 1] and temp_scan_val[count] are indexed
 * without any check against MAX_LOGP -- confirm the bound holds for callers.
 */
#define SCAN_CODE(NAME, TYPE, COMB, PUT)                                                                    \
                                                                                                            \
TYPE NAME(TYPE val)                                                                                         \
{                                                                                                           \
    int  dest=0;                                                                                            \
    int  i, count;                                                                                          \
    int  bit_vec = 1;                                                                                       \
    TYPE scan_val;                                                                                          \
    long long *local_buf = (long long *)scan_buf;                                                           \
                                                                                                            \
        ti_hsl_lock(&Store_mutex);                                                                          \
                scan_sctr[phase] = scan_rctr[phase] = 0;                                                    \
                ti_hsl_unlock(&Store_mutex);                                                                \
                phase = (phase+1)%2;                                                                        \
    *((TYPE *)&temp_scan_val[0]) = scan_val = val;                                                          \
                                                                                                            \
    /* determine the number of vals expected from other procs. */                                           \
    for (count=0; bit_vec != PROCS; count++, bit_vec = bit_vec << 1) {                                      \
                        dest = MYPROC & (~bit_vec);                                                         \
                        if (dest != MYPROC)                                                                 \
                break;                                                                                      \
    }                                                                                                       \
                                                                                                            \
    /* Wait for all those vals. */                                                                          \
    store_sync_ctr(count*sizeof(TYPE), &scan_rctr[phase], &scan_sctr[phase]);                               \
                                                                                                            \
    /* Now accumulate the vals. */                                                                          \
    for (i=1; i<=count; i++) {                                                                              \
                        scan_val = COMB(scan_val, *((TYPE *)&local_buf[i]));                                \
                        *((TYPE *)&temp_scan_val[i]) = scan_val;                                            \
    }                                                                                                       \
                                                                                                            \
    /* And store the value in the "dest". */                                                                \
                PUT(toglobalp(dest, &scan_buf[count+1]), scan_val, &scan_rctr[phase], &scan_sctr[phase]);   \
                                                                                                            \
    count--;                                                                                                \
    bit_vec = bit_vec >> 1;                                                                                 \
                                                                                                            \
    if (MYPROC != 0) {                                                                                      \
                        TYPE result_val;                                                                    \
                                                                                                            \
                        store_sync_ctr(sizeof(TYPE), &scan_rctr[phase], &scan_sctr[phase]);                 \
                        result_val = *((TYPE *)&scan_result_buf);                                           \
                        for (; count >= 0; count--, bit_vec = bit_vec >> 1) {                               \
                  dest = MYPROC | bit_vec;                                                                  \
                                PUT(toglobalp(dest, &scan_result_buf),                                      \
                                                        COMB(result_val, *((TYPE *)&temp_scan_val[count])), \
                                                &scan_rctr[phase], &scan_sctr[phase]);                      \
                }                                                                                           \
                return COMB(result_val, val);                                                               \
    }                                                                                                       \
    else {                                                                                                  \
                        for (; count >= 0; count--, bit_vec = bit_vec >> 1) {                               \
                  dest = MYPROC | bit_vec;                                                                  \
                                PUT(toglobalp(dest, &scan_result_buf),                                      \
                  *((TYPE *)&temp_scan_val[count]),                                                         \
                                                &scan_rctr[phase], &scan_sctr[phase]);                      \
                }                                                                                           \
                return val;                                                                                 \
  }                                                                                                         \
}