static int ack_slot; static Counter red_rctr, red_sctr; static jlong reduce_buf[2]; static Counter bcast_rctr, bcast_sctr; #ifdef WIDE_POINTERS static jGPointer bcast_buf; /* sizeof(jGPointer) on 64-bit is > sizeof(jlong) */ #else static jlong bcast_buf; #endif /* This is a pipelined implementation of reduce. Each node sends acks to its children indicating when it is ready for another value. */ #ifdef COMM_AM2 #define REDUCE_CODE(NAME, TYPE, COMB, PUT) \ \ static TYPE volatile local_reduce_buf_ ## NAME [MAX_BOX_PROCS]; \ static TYPE volatile global_reduce_val_ ## NAME; \ \ /* Only the proxy processor will run this */ \ TYPE global_ ## NAME(TYPE val) \ { \ TYPE lbuf[2]; \ int parent_box = (MYBOX - 1) / 2; \ int odd_box = 2 * MYBOX + 1; \ int even_box = 2 * MYBOX + 2; \ int parity = MYBOX & 1; \ /* wait for ack from parent */ \ if (MYBOX == 0) store_sync_ctr(sizeof(jint), &red_rctr, &red_sctr); \ \ if (MYBOX == 0) { \ if (BOXES == 1) { \ tic_poll(); \ return(val); \ } else if (BOXES == 2) { \ store_sync_ctr(sizeof(TYPE), &red_rctr, &red_sctr); \ lbuf[1] = *((TYPE *)&reduce_buf[1]); \ /* ack child */ \ __i_store_ctr(toglobalb_static(odd_box, &ack_slot), 1, \ TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,odd_box), &red_sctr); \ return(COMB (val, lbuf[1])); \ } else { \ store_sync_ctr(2*sizeof(TYPE), &red_rctr, &red_sctr); \ lbuf[0] = *((TYPE *)&reduce_buf[0]); \ lbuf[1] = *((TYPE *)&reduce_buf[1]); \ /* ack children */ \ __i_store_ctr(toglobalb_static(odd_box, &ack_slot), 1, \ TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,odd_box), &red_sctr); \ __i_store_ctr(toglobalb_static(even_box, &ack_slot), 1, \ TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,even_box), &red_sctr); \ return (COMB (COMB (val, lbuf[1]), lbuf[0])); \ } \ } else { \ jGPointer dest = toglobalb_static(parent_box, &reduce_buf[parity]); \ \ if (odd_box >= BOXES) { /* Not expecting anything from any child */ \ PUT(dest, val, TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,parent_box), &red_sctr); \ } else if (even_box >= BOXES) { \ /* Not 
expecting anything from an even child, but wait for the odd one. */ \ store_sync_ctr(sizeof(TYPE), &red_rctr, &red_sctr); \ lbuf[1] = *((TYPE *)&reduce_buf[1]); \ /* ack child */ \ __i_store_ctr(toglobalb_static(odd_box, &ack_slot), 1, \ TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,odd_box), &red_sctr); \ PUT(dest, COMB(val, lbuf[1]), \ TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,parent_box), &red_sctr); \ } else { \ store_sync_ctr(2*sizeof(TYPE), &red_rctr, &red_sctr); \ lbuf[0] = *((TYPE *)&reduce_buf[0]); \ lbuf[1] = *((TYPE *)&reduce_buf[1]); \ /* ack children */ \ __i_store_ctr(toglobalb_static(odd_box, &ack_slot), 1, \ TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,odd_box), &red_sctr); \ __i_store_ctr(toglobalb_static(even_box, &ack_slot), 1, \ TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,even_box), &red_sctr); \ PUT(dest, COMB (COMB (val, lbuf[1]), lbuf[0]), \ TIC_TRANSLATE_CSTATIC_ADDR(&red_rctr,parent_box), &red_sctr); \ } \ } \ return(val); \ } \ \ TYPE NAME (TYPE val) \ { \ local_reduce_buf_ ## NAME [MYBOXPROC] = val; \ \ if (MYBOXPROCS > 1) { \ local_barrier(); \ if (MYBOXPROC == 0) { \ int i; \ TYPE local_val = local_reduce_buf_ ## NAME [0]; \ for(i = 1; i < MYBOXPROCS; i++) { \ local_val = COMB(local_reduce_buf_ ## NAME [i], local_val); \ } \ if (BOXES == 1) { /* Are we the only box here? 
*/ \ global_reduce_val_ ## NAME = local_val; \ } else { \ global_reduce_val_ ## NAME = global_ ## NAME (local_val); \ } \ local_barrier(); \ return global_reduce_val_ ## NAME; \ } \ else { \ local_barrier(); \ return global_reduce_val_ ## NAME; \ } \ } \ else { /* One processor per box, runtime-determined */ \ return global_reduce_val_ ## NAME = global_ ## NAME(val); \ } \ } #else /* not AMII */ #define REDUCE_CODE(NAME, TYPE, COMB, PUT) \ \ static TYPE volatile local_reduce_buf_ ## NAME [MAX_BOX_PROCS]; \ static TYPE volatile global_reduce_val_ ## NAME; \ \ \ TYPE NAME (TYPE val) \ { \ local_reduce_buf_ ## NAME [MYBOXPROC] = val; \ \ if (MYBOXPROCS > 1) { \ local_barrier(); \ if (MYBOXPROC == 0) { \ int i; \ TYPE local_val = local_reduce_buf_ ## NAME [0]; \ for(i = 1; i < MYBOXPROCS; i++) { \ local_val = COMB(local_reduce_buf_ ## NAME [i], local_val); \ } \ global_reduce_val_ ## NAME = local_val; \ } \ local_barrier(); \ return global_reduce_val_ ## NAME; \ } \ else { /* One processor per box, runtime-determined */ \ return val; \ } \ } #endif /* AMII */ /* This is a pipelined bcast. Each node sends an ack to its parent when it is ready to receive another value. 
*/

/*
 * Phase flag kept in the structure returned by myProcess().  The generated
 * broadcast wrappers read it, flip it, and use the value read to pick one of
 * the two local_bcast_val_NAME slots.
 * NOTE(review): presumably the two slots let a processor start the next
 * broadcast while slower processors of the box are still reading the previous
 * value - confirm.
 */
#define BCAST_PHASE (myProcess()->broadcast_phase)

#ifdef COMM_AM2
/*
 * BCAST_CODE(NAME, TYPE, PUT) expands to the definition of
 *
 *     TYPE NAME(TYPE val);
 *
 * together with the two-slot buffer local_bcast_val_NAME and the inter-box
 * helper global_NAME().
 *
 *   NAME - identifier of the generated entry point; also pasted into the
 *          helper and buffer names.
 *   TYPE - value type being broadcast; global_NAME() asserts that it fits in
 *          bcast_buf.
 *   PUT  - remote store, invoked as
 *          PUT(<child's bcast_buf>, val,
 *              <bcast_rctr address translated for the child box>, &bcast_sctr).
 *
 * global_NAME(val): boxes form a binary tree (parent (MYBOX - 1) / 2,
 * children 2*MYBOX + 1 and 2*MYBOX + 2).  For each child that exists the
 * helper first waits on the counters once, then - on every box except box 0 -
 * waits for a value, reads it from bcast_buf and acks the parent.  Finally the
 * value is PUT into each existing child's bcast_buf.  Returns the value
 * forwarded: its own argument on box 0, the received value elsewhere.
 *
 * NAME(val):
 *   - several processors per box: on the box reported by
 *     COMM_GetBoxNumberForProcNumber(0), processor 0 stores val in the current
 *     phase slot (and, if BOXES > 1, replaces it with global_NAME(val)); on
 *     every other box MYBOXPROC 0 stores global_NAME(val).  All processors
 *     then pass local_barrier() and return the slot.
 *   - one processor per box: returns global_NAME(val) directly.
 * NOTE(review): global_NAME() treats box 0 as the tree root while the wrapper
 * locates the broadcaster via COMM_GetBoxNumberForProcNumber(0); this assumes
 * both name the same box - confirm.
 */
#define BCAST_CODE(NAME, TYPE, PUT) \
\
static TYPE volatile local_bcast_val_ ## NAME[2]; \
\
TYPE global_ ## NAME(TYPE val) \
{ \
  int parent_box = (MYBOX - 1) / 2; \
  int left_box = 2 * MYBOX + 1; \
  int right_box = 2 * MYBOX + 2; \
  assert(sizeof(bcast_buf) >= sizeof(TYPE)); \
  /* wait for acks from children */ \
  if (left_box < BOXES) \
    store_sync_ctr (sizeof(jint), &bcast_rctr, &bcast_sctr); \
  if (right_box < BOXES) \
    store_sync_ctr (sizeof(jint), &bcast_rctr, &bcast_sctr); \
\
  if (MYBOX != 0) { \
    store_sync_ctr(sizeof(TYPE), &bcast_rctr, &bcast_sctr); \
    val = *((TYPE*) &bcast_buf); \
    /* send ack to parent */ \
    __i_store_ctr(toglobalb_static(parent_box, &ack_slot), 1, \
                  TIC_TRANSLATE_CSTATIC_ADDR(&bcast_rctr,parent_box), &bcast_sctr); \
  } \
  if (left_box < BOXES) \
    PUT(toglobalb_static(left_box, (TYPE*)&bcast_buf), val, \
        TIC_TRANSLATE_CSTATIC_ADDR(&bcast_rctr,left_box), &bcast_sctr); \
  if (right_box < BOXES) \
    PUT(toglobalb_static(right_box, (TYPE*)&bcast_buf), val, \
        TIC_TRANSLATE_CSTATIC_ADDR(&bcast_rctr,right_box), &bcast_sctr); \
  return(val); \
} \
\
TYPE NAME (TYPE val) { \
  TIC_BEGIN_FUNCTION \
  if (MYBOXPROCS > 1) { /* SMP */ \
    int const this_phase = BCAST_PHASE; \
    BCAST_PHASE = !(this_phase); \
    if (MYBOX == COMM_GetBoxNumberForProcNumber(0)) { /* Is the broadcaster here? */ \
      if (MYPROC == 0) { /* Am I the broadcaster? */ \
        local_bcast_val_ ## NAME [this_phase] = val; \
        if (BOXES > 1) { /* I'm on a cluster. */ \
          local_bcast_val_ ## NAME [this_phase] = global_ ## NAME (val); \
        } \
      } \
      local_barrier(); \
      return *(TYPE *)&(local_bcast_val_## NAME [this_phase]); \
    } else { /* The broadcaster is not here. */ \
      if (MYBOXPROC == 0) { \
        local_bcast_val_ ## NAME [this_phase] = global_ ## NAME ( val ); \
      } \
      local_barrier(); \
      return *(TYPE *)&(local_bcast_val_ ## NAME [this_phase]); \
    } \
  } else { /* Only one processor per box. */ \
    return global_ ## NAME ( val ); \
  } \
}
#else /* NOT AMII */
/*
 * BCAST_CODE without COMM_AM2: there is no inter-box step.  With several
 * processors per box, processor 0 stores val in the current phase slot, all
 * processors pass local_barrier() and return that slot.  With a single
 * processor per box the argument is returned unchanged.  PUT is unused here.
 */
#define BCAST_CODE(NAME, TYPE, PUT) \
\
static TYPE volatile local_bcast_val_ ## NAME[2]; \
\
TYPE NAME (TYPE val) { \
  TIC_BEGIN_FUNCTION \
  if (MYBOXPROCS > 1) { /* SMP */ \
    int const this_phase = BCAST_PHASE; \
    BCAST_PHASE = !(this_phase); \
\
    if (MYPROC == 0) { /* Am I the broadcaster? */ \
      local_bcast_val_ ## NAME [this_phase] = val; \
    } \
    local_barrier(); \
    return *(TYPE *)&(local_bcast_val_ ## NAME [this_phase]); \
  } else return val; \
}
#endif /* AMII */

/*
 * REDUCE_TO_ALL_CODE(NAME, BCAST_NAME, REDUCE_NAME, TYPE) expands to
 *
 *     TYPE NAME(TYPE val);
 *
 * which calls REDUCE_NAME(val) and then returns BCAST_NAME() of that result,
 * i.e. a reduction followed by a broadcast of the reduced value.
 */
#define REDUCE_TO_ALL_CODE(NAME, BCAST_NAME, REDUCE_NAME, TYPE) \
\
TYPE NAME(TYPE val) \
{ \
  TYPE result = REDUCE_NAME(val); \
  return (BCAST_NAME(result)); \
}

/* Number of entries in the scan buffers below. */
#define MAX_LOGP 20

/* Use a two-phase counter to avoid barriers: `phase` selects which element
   of scan_rctr / scan_sctr the current scan uses. */
static int phase = 0;
static Counter scan_rctr[2], scan_sctr[2];
static long long scan_buf[MAX_LOGP];
static long long temp_scan_val[MAX_LOGP];
static long long scan_result_buf;

/* A slightly convoluted implementation of scan.  Will come back and clean it
   up later.  The way the algorithm works is: initially you wait for all the
   values in the tree rooted at MYPROC; then, after getting all the values,
   you spread them back in the second phase.

   DOB: the SCAN_CODE below is totally broken - do not use!
   It's incorrect for hierarchical configurations (CLUMP),
   and doesn't handle static data translation correctly.

   SCAN_CODE(NAME, TYPE, COMB, PUT) expands to the definition of
       TYPE NAME(TYPE val);
   where COMB is the binary combiner and PUT is invoked as
       PUT(toglobalp(dest, <buffer>), value,
           &scan_rctr[phase], &scan_sctr[phase]).

   NOTE(review): the counting loop only stops early when MYPROC has the
   tested bit set; for MYPROC == 0 it runs until bit_vec == PROCS, which is
   never reached if PROCS is not a power of two.
   NOTE(review): scan_buf is indexed up to count+1 and temp_scan_val up to
   count, but nothing here checks count against MAX_LOGP.
 */
#define SCAN_CODE(NAME, TYPE, COMB, PUT) \
\
TYPE NAME(TYPE val) \
{ \
  int dest=0; \
  int i, count; \
  int bit_vec = 1; \
  TYPE scan_val; \
  long long *local_buf = (long long *)scan_buf; \
\
  ti_hsl_lock(&Store_mutex); \
  scan_sctr[phase] = scan_rctr[phase] = 0; \
  ti_hsl_unlock(&Store_mutex); \
  phase = (phase+1)%2; \
  *((TYPE *)&temp_scan_val[0]) = scan_val = val; \
\
  /* determine the number of vals expected from other procs. */ \
  for (count=0; bit_vec != PROCS; count++, bit_vec = bit_vec << 1) { \
    dest = MYPROC & (~bit_vec); \
    if (dest != MYPROC) \
      break; \
  } \
\
  /* Wait for all those vals. */ \
  store_sync_ctr(count*sizeof(TYPE), &scan_rctr[phase], &scan_sctr[phase]); \
\
  /* Now accumulate the vals. */ \
  for (i=1; i<=count; i++) { \
    scan_val = COMB(scan_val, *((TYPE *)&local_buf[i])); \
    *((TYPE *)&temp_scan_val[i]) = scan_val; \
  } \
\
  /* And store the value in the "dest". */ \
  PUT(toglobalp(dest, &scan_buf[count+1]), scan_val, &scan_rctr[phase], &scan_sctr[phase]); \
\
  count--; \
  bit_vec = bit_vec >> 1; \
\
  if (MYPROC != 0) { \
    TYPE result_val; \
\
    store_sync_ctr(sizeof(TYPE), &scan_rctr[phase], &scan_sctr[phase]); \
    result_val = *((TYPE *)&scan_result_buf); \
    for (; count >= 0; count--, bit_vec = bit_vec >> 1) { \
      dest = MYPROC | bit_vec; \
      PUT(toglobalp(dest, &scan_result_buf), \
          COMB(result_val, *((TYPE *)&temp_scan_val[count])), \
          &scan_rctr[phase], &scan_sctr[phase]); \
    } \
    return COMB(result_val, val); \
  } \
  else { \
    for (; count >= 0; count--, bit_vec = bit_vec >> 1) { \
      dest = MYPROC | bit_vec; \
      PUT(toglobalp(dest, &scan_result_buf), \
          *((TYPE *)&temp_scan_val[count]), \
          &scan_rctr[phase], &scan_sctr[phase]); \
    } \
    return val; \
  } \
}