/* see copyright.txt for usage terms */
#define TI_NO_SRCPOS
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/time.h> /* struct timeval, select() */

#ifndef DEBUG_BARRIERS
#ifdef NDEBUG
#define DEBUG_BARRIERS 0
#else
#define DEBUG_BARRIERS 1
#endif
#endif

#if DEBUG_BARRIERS
#undef NDEBUG
#include <assert.h>
#define ASSERT(x) assert(x)
#define CHECKZERO(x) if (x) perror(#x)
static int _debug_barriers_verbose_firsttime = 1;
static int _debug_barriers_verbose = 0;
#define DEBUG_BARRIERS_VERBOSE(cmds) do {                              \
    if_pf (_debug_barriers_verbose_firsttime) {                        \
      _debug_barriers_verbose = !!getenvMaster("TI_BARRIERS_VERBOSE"); \
      _debug_barriers_verbose_firsttime = 0;                           \
    }                                                                  \
    if (_debug_barriers_verbose) { cmds; fflush(stderr); }             \
  } while (0)
#else
#define ASSERT(x)
#define CHECKZERO(x) (x)
#define DEBUG_BARRIERS_VERBOSE(cmds) ((void)0)
#endif

/* POLITE_BARRIER_USE_SEMAPHORE = perform signaling in polite barriers with
 *   semaphores rather than pthread cond variables
 * POLITE_BARRIER_USE_PHASEDMON = use phased pthread cond variables to avoid an
 *   ambiguity in the POSIX spec - it's somewhat unclear what happens when
 *   waiting threads have been signaled and are attempting to reacquire the
 *   mutex just as a new thread comes along and calls wait; some pthread_cond
 *   implementations may not have well-defined behavior for this case
 */
#if defined(CYGWIN) || defined(CRAYX1)
/* pthread cond is broken on some versions of Cygwin and the Cray X-1 */
#define POLITE_BARRIER_USE_SEMAPHORE 1
#endif
#ifdef HPUX
#define POLITE_BARRIER_USE_PHASEDMON 1
#else
#define POLITE_BARRIER_USE_PHASEDMON 0
#endif

#if POLITE_BARRIER_USE_SEMAPHORE
#include <semaphore.h>
#endif

typedef struct {
  /* these belong on a cache line together */
  volatile int counter;
#if POLITE_BARRIER_USE_SEMAPHORE
  ti_lock_t lock;
  sem_t sem[2];
#else
  tic_simplemonitor_t mon[2];
#endif
  char _pad1[256]; /* make sure flag is on a cache line by itself */
  volatile int flag;
  char _pad2[256]; /* make sure flag is on a cache line by itself */
} tic_barrier_t;

static volatile tic_barrier_t local_barrier_object __attribute__((aligned));
static Counter up_buf[2];
static Counter down_buf;
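/* Note on the padding above: _pad1/_pad2 conservatively keep `flag` on a cache
   line of its own, so threads spinning on `flag` are not invalidated by writes
   to the fields above it (false sharing). The same idea can be expressed
   directly on C11 compilers with alignas; a minimal sketch - the 64-byte line
   size and all names here are illustrative assumptions, not this runtime's: */
#if 0 /* illustration only, not compiled */
#include <stdalign.h>
typedef struct {
  alignas(64) volatile int counter; /* arrival count: written by every thread */
  alignas(64) volatile int flag;    /* release flag: written once per barrier,
                                       spun on by all waiters */
} padded_barrier_t;
#endif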
void barrier_init() {
  /* CAUTION: this routine executes very early during startup, before any of
     the system-wide parameters have been set - don't rely on the values of any
     globals or parameter-returning functions (e.g. MYBOXPROCS, etc.) in here */
  int i;
  up_buf[0] = 0; up_buf[1] = 0; down_buf = 0; /* these are OK */
  for (i = 0; i < COMM_GetMyBParallelDegree(); i++) {
    COMM_GetHisBoxProcess(i)->barrier_phase = 0;
  }
#if POLITE_BARRIER_USE_SEMAPHORE
  /* must use ti_lock_initializer because ti_lock_init may reference MYBOXPROCS */
  local_barrier_object.lock = ti_lock_initializer;
  if (sem_init((sem_t *)&(local_barrier_object.sem[0]), 0, 0)) perror("sem_init");
  if (sem_init((sem_t *)&(local_barrier_object.sem[1]), 0, 0)) perror("sem_init");
#else
  simplemonitor_init((tic_simplemonitor_t *)&(local_barrier_object.mon[0]));
  simplemonitor_init((tic_simplemonitor_t *)&(local_barrier_object.mon[1]));
#endif
  local_barrier_object.counter = 0;
  local_barrier_object.flag = 0;
}

#ifdef COMM_AM2
/* active message handler: increments a counter at the target to signal
   barrier arrival */
TI_INLINE(barrier_incr_request)
void barrier_incr_request(tic_amtoken_t token, void *val) {
  int *pval = (int *)val;
  (*pval)++;
  TIC_NULL_REPLY(token);
}
TIC_AMSHORT(barrier_incr_request, 1, 2,
            (token, TIC_AMRECV_PTR32(a0)),
            (token, TIC_AMRECV_PTR64(a0, a1)));
#endif /* COMM_AM2 */

#if DEBUG_BARRIERS
volatile int debug_localbarriercount = 0;
#endif
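/* For reference, the "phased monitor" trick used by the polite barrier below
   can be written with plain pthreads: each barrier episode signals on its own
   condition variable, alternating between two, so a thread that races ahead
   into the *next* barrier can never call wait on the condition variable that
   the previous episode's waiters are still leaving. A minimal self-contained
   sketch - all names here are illustrative assumptions, not this runtime's API: */
#if 0 /* illustration only, not compiled */
#include <pthread.h>

typedef struct {
  pthread_mutex_t lock;
  pthread_cond_t cond[2]; /* one condition variable per phase */
  int count;              /* arrivals in the current episode */
  int episode[2];         /* per-phase generation count, guards spurious wakeups */
  int nthreads;
} phased_barrier_t;

/* each thread passes its own phase bit, flipped once per barrier call */
static void phased_barrier_wait(phased_barrier_t *b, int *myphase) {
  const int phase = *myphase;
  *myphase = !phase;
  pthread_mutex_lock(&b->lock);
  if (++b->count == b->nthreads) { /* last arrival releases this episode */
    b->count = 0;
    b->episode[phase]++;
    pthread_cond_broadcast(&b->cond[phase]);
  } else {
    const int ep = b->episode[phase];
    while (b->episode[phase] == ep) /* tolerate spurious wakeups */
      pthread_cond_wait(&b->cond[phase], &b->lock);
  }
  pthread_mutex_unlock(&b->lock);
}
#endif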
/*------------------------------------------------------------------------------------*/
/* Shared memory barrier algorithm
 * written & tuned by Dan Bonachea
 * key design issues (for the "impolite" high-performance version):
 *  - avoid expensive pthread mutex locking
 *  - design for a linear number of cache invalidations for good scaling
 *  - avoid any false sharing of cache lines
 *  - pipeline the reads of remote data for coherence implementations that allow
 *    multiple outstanding remote reads (like the Origin 2000)
 */
void _local_barrier() {
  TIC_BEGIN_FUNCTION
  DEBUG_BARRIERS_VERBOSE(fprintf(stderr, "P%i: Barrier %i Enter\n",
                                 MYBOXPROC, debug_localbarriercount));
  /* ensure that all previous writes have been globally completed */
  tic_local_wmb();
  if (politep) { /* "polite" barrier (low-performance) */
    const int numboxprocs = MYBOXPROCS;
    int mycount;
    if (numboxprocs == 1) { tic_local_rmb(); return; }
#if POLITE_BARRIER_USE_SEMAPHORE
    {
      processInfo * const pi = COMM_GetMyProcess();
      const int phase = pi->barrier_phase;
      sem_t * const sem = (sem_t *)&(local_barrier_object.sem[phase]);
      ti_lock_t * const lock = (ti_lock_t *)&(local_barrier_object.lock);
      pi->barrier_phase = !phase;
      ti_lock(lock);
      mycount = local_barrier_object.counter++;
      ASSERT(mycount >= 0 && mycount < numboxprocs);
      if (mycount == numboxprocs-1) { /* last arrival wakes everyone else */
        int i;
        DEBUG_BARRIERS_VERBOSE(fprintf(stderr, "--- Barrier %i ---\n",
                                       debug_localbarriercount++));
        local_barrier_object.counter = 0;
        ti_unlock(lock);
        for (i = numboxprocs-1; i; i--) {
          CHECKZERO(sem_post(sem));
        }
      } else {
        ti_unlock(lock);
        CHECKZERO(sem_wait(sem));
      }
    }
#else /* use monitors for polite barriers */
    {
#if POLITE_BARRIER_USE_PHASEDMON
      /* work around a bug (observed on cygwin) where threads awakening on a
         monitor can be knocked back into permanent sleep by a new wait call -
         the fix is to use phased monitors */
      processInfo * const pi = COMM_GetMyProcess();
      const int phase = pi->barrier_phase;
      tic_simplemonitor_t * const mon =
        (tic_simplemonitor_t *)&(local_barrier_object.mon[phase]);
      pi->barrier_phase = !phase;
#else
      tic_simplemonitor_t * const mon =
        (tic_simplemonitor_t *)&(local_barrier_object.mon[0]);
#endif
      ti_lock(&(mon->lock));
      mycount = local_barrier_object.counter++;
      ASSERT(mycount >= 0 && mycount < numboxprocs);
      if (mycount == numboxprocs-1) { /* last arrival broadcasts the release */
        DEBUG_BARRIERS_VERBOSE(fprintf(stderr, "--- Barrier %i ---\n",
                                       debug_localbarriercount++));
        local_barrier_object.counter = 0;
        ti_mon_broadcast(mon);
      } else
        ti_mon_wait(mon);
      ti_unlock(&(mon->lock));
    }
#endif
    DEBUG_BARRIERS_VERBOSE(fprintf(stderr, "P%i: Barrier %i Exit\n",
                                   MYBOXPROC, debug_localbarriercount-1));
  } else { /* "impolite" high-performance barrier (assumes this thread has the CPU to itself) */
    processInfo * const pi = COMM_GetMyProcess();
    const int myboxproc = MYBOXPROC;
    const int numboxprocs = MYBOXPROCS;
    register const int oldphase = pi->barrier_phase;
    const int newphase = !oldphase;
    register int i;
    if (numboxprocs == 1) return;
    pi->barrier_phase = newphase; /* signal arrival */
    if (myboxproc == 0) {
      register int n = numboxprocs - 1; /* number of still-outstanding procs */
#if TRUE_ALLOCA
      /* use alloca to try and guarantee thread-affinity of this memory */
      register volatile int ** const addr =
        (volatile int **)tic_alloca(numboxprocs*sizeof(int*));
      register int * const val = (int *)tic_alloca(numboxprocs*sizeof(int));
#else
      static volatile int ** addr = NULL;
      static int * val = NULL;
      if (!addr) {
        addr = (volatile int **)ti_malloc(numboxprocs*sizeof(int*));
        val = (int *)ti_malloc(numboxprocs*sizeof(int));
      }
#endif
      for (i = 1; i < numboxprocs; i++) {
        register processInfo const *rpi = COMM_GetHisBoxProcess(i);
        addr[i-1] = (volatile int *)&(rpi->barrier_phase);
        val[i-1] = rpi->barrier_phase; /* pipelined read */
      }
      while (n) {
        for (i = 0; i < n; ) {
          if (val[i] == oldphase) {
            val[i] = *addr[i]; /* pipelined read */
            i++;
          } else { /* proc has arrived: compact it out of the wait list */
            n--;
            addr[i] = addr[n];
            val[i] = val[n];
          }
        }
        gasnett_spinloop_hint();
      }
      DEBUG_BARRIERS_VERBOSE(fprintf(stderr, "--- Barrier %i ---\n",
                                     debug_localbarriercount++));
      local_barrier_object.flag = newphase; /* signal completion */
    } else { /* procs other than 0 */
      /* might as well spin - no one else is running on this CPU anyway */
      while (local_barrier_object.flag == oldphase) {
        gasnett_spinloop_hint();
      }
    }
    DEBUG_BARRIERS_VERBOSE(fprintf(stderr, "P%i: Barrier %i Exit\n",
                                   MYBOXPROC, debug_localbarriercount-1));
  }
  /* prevent any subsequent loads from prefetching past this barrier */
  tic_local_rmb();
  return;
}
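/* The "impolite" algorithm above, reduced to portable C11 atomics: every thread
   owns a phase flag on its own cache line; thread 0 collects all phase flips
   (the addr[]/val[] arrays above additionally pipeline those reads, which this
   sketch does not), then flips a single release flag that everyone else spins
   on. A minimal self-contained sketch - the thread count, 64-byte line size,
   and all names here are illustrative assumptions, not this runtime's API: */
#if 0 /* illustration only, not compiled */
#include <stdatomic.h>
#include <stdalign.h>
#define NTHREADS 4
typedef struct { alignas(64) atomic_int phase; } padded_flag_t;
static padded_flag_t arrived[NTHREADS];      /* one arrival flag per thread  */
static alignas(64) atomic_int release_flag;  /* flipped by thread 0 per round */

static void spin_barrier(int me) {
  static _Thread_local int myphase = 0; /* sense-reversing phase bit */
  const int oldphase = myphase;
  myphase = !oldphase;
  if (me == 0) {
    /* wait until every other thread has flipped its arrival flag */
    for (int i = 1; i < NTHREADS; i++)
      while (atomic_load(&arrived[i].phase) == oldphase)
        ; /* spin */
    atomic_store(&release_flag, myphase); /* release everyone */
  } else {
    atomic_store(&arrived[me].phase, myphase); /* signal arrival */
    while (atomic_load(&release_flag) == oldphase)
      ; /* spin until released */
  }
}
#endif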
/*------------------------------------------------------------------------------------*/
void barrier_up() {
  /* if there's only one box, just do a local barrier */
  if (BOXES == 1) {
#ifndef MEMORY_SHARED
    return;
#else
    /* only need to do something if there are other threads here */
    if (MYBOXPROCS > 1) local_barrier();
    return;
#endif /* MEMORY_SHARED */
  }
  /* if we're on an SMP, run a local barrier */
  if (MYBOXPROCS > 1) {
    /* MYBOXPROC == 0 does the local barrier BEFORE the global barrier in
       barrier_up() to prevent remote processors from continuing before all
       local threads arrive */
    local_barrier();
    if (MYBOXPROC > 0) return;
  } else tic_local_wmb();

#ifdef COMM_AM2
  { /* only MYBOXPROC == 0 on each box gets to run this part */
    const int parent = (MYBOX - 1) / 2;
    const int odd_child = 2 * MYBOX + 1;
    const int even_child = 2 * MYBOX + 2;
    const int parity = MYBOX & 1;
    /* wait for both children (if any) to signal, then signal our parent */
    if (even_child < BOXES) { tic_poll_until(up_buf[0] != 0); up_buf[0] = 0; }
    if (odd_child < BOXES)  { tic_poll_until(up_buf[1] != 0); up_buf[1] = 0; }
    if (MYBOX > 0) {
      tic_AMRequest(1, 2, ((Box)parent, TIC_AMIDX(barrier_incr_request),
        TIC_AMSEND_PTR(TIC_TRANSLATE_CSTATIC_ADDR(&up_buf[parity], parent))));
    }
  }
#endif /* COMM_AM2 */
}

void barrier_down() {
  if (BOXES == 1) {
#ifndef MEMORY_SHARED
    return;
#else
    /* only need to do something if there are other threads here */
    if (MYBOXPROCS > 1) local_barrier();
    return;
#endif /* MEMORY_SHARED */
  }
  if (MYBOXPROC > 0) { local_barrier(); return; }

#ifdef COMM_AM2
  { /* only MYBOXPROC == 0 on each box runs this part */
    const int left = 2 * MYBOX + 1;
    const int right = 2 * MYBOX + 2;
    /* wait for our parent's signal, then signal both children (if any) */
    if (MYBOX > 0) { tic_poll_until(down_buf != 0); down_buf = 0; }
    if (left < BOXES) {
      tic_AMRequest(1, 2, ((Box)left, TIC_AMIDX(barrier_incr_request),
        TIC_AMSEND_PTR(TIC_TRANSLATE_CSTATIC_ADDR(&down_buf, left))));
    }
    if (right < BOXES) {
      tic_AMRequest(1, 2, ((Box)right, TIC_AMIDX(barrier_incr_request),
        TIC_AMSEND_PTR(TIC_TRANSLATE_CSTATIC_ADDR(&down_buf, right))));
    }
  }
#endif /* COMM_AM2 */

  if (MYBOXPROCS > 1) { local_barrier(); return; }
  else tic_local_rmb();
}

#ifdef COMM_GASNET
extern void barrier_gasnet(void) {
  /* wait for local threads to arrive */
  if (MYBOXPROCS > 1) local_barrier();
  else tic_local_wmb();
  /* perform the global barrier */
  if (BOXES > 1) {
    if (MYBOXPROC == 0) { /* thread 0 talks to GASNet */
      gasnet_barrier_notify(0, GASNET_BARRIERFLAG_ANONYMOUS);
      gasnet_barrier_wait(0, GASNET_BARRIERFLAG_ANONYMOUS);
    }
    /* wait for the global barrier to complete, if other threads are present */
    if (MYBOXPROCS > 1) local_barrier();
    else tic_local_rmb();
  }
}
#endif /* COMM_GASNET */

/*
 * __os_sleep --
 *	Yield the processor for a period of time.
 *
 * PUBLIC: int __os_sleep __P((u_long, u_long));
 */
int __os_sleep(long secs, long usecs) {
#if 1
  /* the small-interval performance of this totally sucks on Linux, where the
   * minimum interval is 10 ms (WAY too long to be useful for much of anything);
   * the same is apparently true for other OS's (IRIX, etc.)
   * why doesn't anybody implement high-granularity timers? */
  struct timeval t;
  /* don't require that the values be normalized */
  for (; usecs >= 1000000; ++secs, usecs -= 1000000) {}
  /* it's important that we yield the processor here so that other
   * processes or threads are permitted to run */
  t.tv_sec = secs;
  t.tv_usec = usecs;
  return (select(0, NULL, NULL, NULL, &t) == -1 ? errno : 0);
#elif 0 /* unfortunately, these are no better */
  struct timespec t;
  /* don't require that the values be normalized */
  for (; usecs >= 1000000; ++secs, usecs -= 1000000) {}
  t.tv_sec = secs;
  t.tv_nsec = usecs * 1000;
  return (nanosleep(&t, NULL) == -1 ? errno : 0);
#elif 0
  return (poll(NULL, 0, (int)(secs*1000.0 + usecs/1000.0)) == -1 ? errno : 0);
#else
  usleep((int)(secs*1000000.0 + usecs));
  return 0;
#endif
}

/*
 * __os_yield --
 *	Yield the processor.
 *
 * PUBLIC: void __os_yield __P((u_long));
 */
void __os_yield(long usecs) {
  __os_sleep(0, usecs);
}
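/* For reference, the up/down phases above traverse an implicit binary tree over
   boxes: box 0 is the root, box i's parent is (i-1)/2, and its children are
   2i+1 (odd) and 2i+2 (even); a child's parity (i & 1) selects which up_buf
   slot it increments at the parent, so siblings never share a flag. A minimal
   sketch that just prints the topology - the box count and main() are
   illustrative assumptions: */
#if 0 /* illustration only, not compiled */
#include <stdio.h>
int main(void) {
  const int boxes = 7;
  for (int i = 0; i < boxes; i++) {
    const int odd_child = 2*i + 1, even_child = 2*i + 2;
    printf("box %d: ", i);
    if (i > 0) printf("signals up_buf[%d] at parent %d; ", i & 1, (i - 1) / 2);
    else       printf("root; ");
    if (odd_child  < boxes) printf("waits on up_buf[1] for child %d; ", odd_child);
    if (even_child < boxes) printf("waits on up_buf[0] for child %d; ", even_child);
    printf("\n");
  }
  return 0;
}
#endif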
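/* Usage note for __os_sleep: the arguments need not be normalized - the loop at
   the top of the function folds excess microseconds into seconds. A minimal
   sketch (main() and the intervals are illustrative assumptions): */
#if 0 /* illustration only, not compiled */
int main(void) {
  __os_sleep(1, 500000);  /* sleep ~1.5 s */
  __os_sleep(0, 2500000); /* equivalent to __os_sleep(2, 500000) */
  __os_yield(0);          /* just give up the timeslice */
  return 0;
}
#endif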