/*
 * SOMETIMES_SINGLE is a type qualifier used on the return type of the
 * all-process variant of OP further down.  It expands to the Titanium
 * qualifier "single" only when both NOT_SINGLE and SCAN evaluate to zero
 * (an undefined name counts as zero inside #if); otherwise it expands
 * to nothing.
 */
#if NOT_SINGLE || SCAN
#define SOMETIMES_SINGLE
#else
#define SOMETIMES_SINGLE single
#endif

/*
 * Call helpers for the two generated private methods.
 *   INVOKE_OP(value,to)  calls inner_<OP>
 *   COMPUTE_OP(value)    calls compute_<OP>
 * When INTERFACE is defined, the operator object named Oper (a parameter
 * of every method in this file) is forwarded as an extra first argument.
 */
#ifdef INTERFACE
#define INVOKE_OP(value,to) CONCAT(inner_,OP)(Oper, value, to)
#define COMPUTE_OP(value)   CONCAT(compute_,OP)(Oper, value)
#else
#define INVOKE_OP(value,to) CONCAT(inner_,OP)(value, to)
#define COMPUTE_OP(value)   CONCAT(compute_,OP)(value)
#endif

#if !SCAN
  /*
   * Targeted variant of OP (only generated when SCAN is zero).
   *
   * Every process contributes val; the contributions are combined by the
   * inner_<OP> method with the requested destination.
   *
   *   Oper  operator object, present only when INTERFACE is defined.
   *   val   the contribution of the calling process.
   *   to    destination process, or -1 to deliver the result everywhere.
   *
   * Returns whatever inner_<OP> returns: the combined value on process
   * to (or on every process when to is -1), and DEFAULT_VALUE on the
   * remaining processes.
   */
  public static sglobal T OP(
#ifdef INTERFACE
				INTERFACE Oper,
#endif
				T val, int single to) {
    /* Log the call before doing any communication. */
    Reduce.trace(SCAN,STRINGIFY(OP),STRINGIFY(T));

    T outcome = INVOKE_OP(val, to);
    return outcome;
  }
#endif /* !SCAN */

  /*
   * All-process variant of OP.
   *
   * Every process contributes val and inner_<OP> is invoked with a
   * destination of -1.
   *
   *   Oper  operator object, present only when INTERFACE is defined.
   *   val   the contribution of the calling process.
   *
   * Returns the value produced by inner_<OP>, cast to T with the
   * SOMETIMES_SINGLE qualifier (which is "single" or empty depending on
   * NOT_SINGLE and SCAN).
   */
  public static sglobal T SOMETIMES_SINGLE OP(
#ifdef INTERFACE
				INTERFACE Oper,
#endif
				T val) {
    /* Log the call before doing any communication. */
    Reduce.trace(SCAN,STRINGIFY(OP),STRINGIFY(T));

    T SOMETIMES_SINGLE outcome = (T SOMETIMES_SINGLE)INVOKE_OP(val, -1);
    return outcome;
  }

  /*
   * Shared implementation behind both public OP variants.
   *
   * Each process deposits val into a landing zone owned by the root
   * process, all processes synchronize on a barrier, and the root then
   * runs compute_<OP> over the deposited values.
   *
   *   Oper  operator object, present only when INTERFACE is defined;
   *         forwarded to compute_<OP> by the COMPUTE_OP macro.
   *   val   the contribution of the calling process.
   *   to    destination process, or -1 meaning "all processes"
   *         (process 0 then acts as root).
   *
   * Return value:
   *   SCAN mode      the entry of the landing zone belonging to the
   *                  calling process, read after the root has finished.
   *   otherwise      to == -1            : the root result, broadcast;
   *                  to == this process  : the root result;
   *                  any other process   : DEFAULT_VALUE.
   *
   * Relies on names declared outside this block: the per-type landing
   * zone variable lz_<T>, the variable phase, and DEFAULT_VALUE.
   */
  private static sglobal T CONCAT(inner_,OP)(
#ifdef INTERFACE
				INTERFACE Oper,
#endif
				T val, int single to) {
    int single root = (to == -1 ? 0 : to); /* determine root: process 0 when delivering to all */

    T [1d] single [2d] lz = CONCAT(lz_,T); /* find a landing zone (cached per element type) */
    if (lz == null) {
      /* First use for this type: one slot per process, each slot to hold
         a 2d array with rows 0..1 and columns 0..numProcs-1. */
      lz = new T [0:Ti.numProcs()-1] [2d];
      /* NOTE(review): exchange presumably stores the array allocated by
         process p into lz[p] on every process -- confirm against the
         Titanium reference. */
      lz.exchange(new T [[0,0]:[1,Ti.numProcs()-1]]);
      CONCAT(lz_,T) = lz;
    } 
    /* Flip the low bit of phase; it selects which of the two rows is used
       for this call.  NOTE(review): alternating rows presumably keeps a
       process that races ahead into the next call from overwriting values
       the root is still reading -- confirm. */
    phase = phase ^ 1;
    T [2d] active_lz = lz[root];

    active_lz[[phase,Ti.thisProc()]] = val; /* fill landing zone and await arrival */
    Ti.barrier();

    /* perform computation */
    /* result is assigned only on the root process. */
    T result;
    if (Ti.thisProc() == root) 
	/* The cast treats the landing zone as local to this process; the
	   slice fixes dimension 1 at phase, giving the 1d row of values. */
	result = COMPUTE_OP(((T [2d] local)active_lz).slice(1,phase));
 
#if SCAN
    /* In SCAN mode compute_<OP> writes the running value back into each
       entry after the first, so wait for the root to finish and then
       read the entry of this process. */
    Ti.barrier(); 
    return active_lz[[phase,Ti.thisProc()]];
#else
    if (to == -1) return (broadcast result from root);
    else if (to == Ti.thisProc()) return result;
    else return DEFAULT_VALUE;
#endif
  }

  /*
   * Combine the entries of a in ascending index order.
   *
   * The accumulator starts as a[0]; each entry a[1] .. a[numProcs-1] is
   * then folded in through the UPDATE macro.  In SCAN mode the running
   * accumulator is also stored back into the entry just consumed, so a
   * ends up holding the running values (a[0] is left untouched).
   *
   *   Oper  operator object, present only when INTERFACE is defined.
   *   a     local 1d array with one entry per process.
   *
   * Returns the accumulator after the last entry has been folded in.
   */
  private inline static T CONCAT(compute_,OP)(
#ifdef INTERFACE
				INTERFACE Oper,
#endif
				T [1d] local a) {
    T val = a[0];
    for (int slot = 1, count = Ti.numProcs(); slot < count; ++slot) {
      UPDATE(val, a[slot]);
#if SCAN
      a[slot] = val;
#endif
    }
    return val;
  }

/*
 * Remove the helper macros defined at the top of this file.
 * NOTE(review): presumably this lets the file be included again with a
 * different OP / T / SCAN / INTERFACE configuration -- confirm with the
 * including file.
 */
#undef INVOKE_OP
#undef COMPUTE_OP
#undef SOMETIMES_SINGLE