/* This library provides global communication support for Split-C, including broadcasts, reductions, and scans. It provides operations in which all processors participate. There are two basic styles: a processor-oriented functional interface and a spread array interface. Currently only the processor interface is supported. Eventually it will also provide autonomous operations. An all_ routine end with a barrier, so that there is no conflict on use of internal state. */ #define TI_NO_SRCPOS #include #define ADD(A, B) (A + B) #ifdef MAX #undef MAX #endif #define MAX(A, B) (A < B ? B : A) #ifdef MIN #undef MIN #endif #define MIN(A, B) (A < B ? A : B) #define MULT(A, B) (A * B) #define OR(A, B) (A | B) #define XOR(A, B) (A ^ B) #define AND(A, B) (A & B) #include "comfun.def" #include "broadcast.h" REDUCE_CODE(all_reduce_to_one_iadd, jint, ADD, __i_store_ctr) REDUCE_CODE(all_reduce_to_one_badd, jbyte, ADD, __b_store_ctr) REDUCE_CODE(all_reduce_to_one_cadd, jchar, ADD, __sh_store_ctr) REDUCE_CODE(all_reduce_to_one_shadd, jshort, ADD, __sh_store_ctr) REDUCE_CODE(all_reduce_to_one_ladd, jlong, ADD, __l_store_ctr) REDUCE_CODE(all_reduce_to_one_fadd, jfloat, ADD, __f_store_ctr) REDUCE_CODE(all_reduce_to_one_dadd, jdouble, ADD, __d_store_ctr) REDUCE_CODE(all_reduce_to_one_imult, jint, MULT, __i_store_ctr) REDUCE_CODE(all_reduce_to_one_bmult, jbyte, MULT, __b_store_ctr) REDUCE_CODE(all_reduce_to_one_cmult, jchar, MULT, __sh_store_ctr) REDUCE_CODE(all_reduce_to_one_shmult, jshort, MULT, __sh_store_ctr) REDUCE_CODE(all_reduce_to_one_lmult, jlong, MULT, __l_store_ctr) REDUCE_CODE(all_reduce_to_one_fmult, jfloat, MULT, __f_store_ctr) REDUCE_CODE(all_reduce_to_one_dmult, jdouble, MULT, __d_store_ctr) REDUCE_CODE(all_reduce_to_one_imax, jint, MAX, __i_store_ctr) REDUCE_CODE(all_reduce_to_one_bmax, jbyte, MAX, __b_store_ctr) REDUCE_CODE(all_reduce_to_one_cmax, jchar, MAX, __sh_store_ctr) REDUCE_CODE(all_reduce_to_one_shmax, jshort, MAX, __sh_store_ctr) 
/* maximum (continued: jlong, jfloat, jdouble) */
REDUCE_CODE(all_reduce_to_one_lmax, jlong, MAX, __l_store_ctr)
REDUCE_CODE(all_reduce_to_one_fmax, jfloat, MAX, __f_store_ctr)
REDUCE_CODE(all_reduce_to_one_dmax, jdouble, MAX, __d_store_ctr)

/* minimum */
REDUCE_CODE(all_reduce_to_one_imin, jint, MIN, __i_store_ctr)
REDUCE_CODE(all_reduce_to_one_bmin, jbyte, MIN, __b_store_ctr)
REDUCE_CODE(all_reduce_to_one_cmin, jchar, MIN, __sh_store_ctr)
REDUCE_CODE(all_reduce_to_one_shmin, jshort, MIN, __sh_store_ctr)
REDUCE_CODE(all_reduce_to_one_lmin, jlong, MIN, __l_store_ctr)
REDUCE_CODE(all_reduce_to_one_fmin, jfloat, MIN, __f_store_ctr)
REDUCE_CODE(all_reduce_to_one_dmin, jdouble, MIN, __d_store_ctr)

/* bitwise or / xor / and on jboolean */
REDUCE_CODE(all_reduce_to_one_or, jboolean, OR, __b_store_ctr)
REDUCE_CODE(all_reduce_to_one_xor, jboolean, XOR, __b_store_ctr)
REDUCE_CODE(all_reduce_to_one_and, jboolean, AND, __b_store_ctr)

/*
 * Broadcast instantiations, one per element type.  Each line supplies
 * the generated routine's name, its element type, and a *_store_ctr
 * primitive.
 */
/* To make sure other programs don't break */
BCAST_CODE(all_bcast_i, jint, __i_store_ctr)
BCAST_CODE(all_bcast_bool, jboolean, __b_store_ctr)
BCAST_CODE(all_bcast_b, jbyte, __b_store_ctr)
BCAST_CODE(all_bcast_c, jchar, __sh_store_ctr)
BCAST_CODE(all_bcast_sh, jshort, __sh_store_ctr)
BCAST_CODE(all_bcast_l, jlong, __l_store_ctr)
BCAST_CODE(all_bcast_f, jfloat, __f_store_ctr)
BCAST_CODE(all_bcast_d, jdouble, __d_store_ctr)
BCAST_CODE(all_bcast_lp, void *, __lp_store_ctr)
BCAST_CODE(all_bcast_gp, jGPointer, __gp_store_ctr)

/*
 * Reduce-to-all instantiations (sum).  Each line pairs a reduce-to-one
 * routine with the broadcast routine for the same element type and names
 * that element type last.
 */
REDUCE_TO_ALL_CODE(all_reduce_to_all_iadd, all_bcast_i, all_reduce_to_one_iadd, jint)
REDUCE_TO_ALL_CODE(all_reduce_to_all_badd, all_bcast_b, all_reduce_to_one_badd, jbyte)
REDUCE_TO_ALL_CODE(all_reduce_to_all_cadd, all_bcast_c, all_reduce_to_one_cadd, jchar)
REDUCE_TO_ALL_CODE(all_reduce_to_all_shadd, all_bcast_sh, all_reduce_to_one_shadd, jshort)
REDUCE_TO_ALL_CODE(all_reduce_to_all_ladd, all_bcast_l, all_reduce_to_one_ladd, jlong)
/*
 * The jfloat sum is paired with the jfloat broadcast, as the jfloat
 * product, maximum and minimum are; it was previously wired to
 * all_bcast_d, the jdouble broadcast.
 */
REDUCE_TO_ALL_CODE(all_reduce_to_all_fadd, all_bcast_f, all_reduce_to_one_fadd, jfloat)
REDUCE_TO_ALL_CODE(all_reduce_to_all_dadd, all_bcast_d, all_reduce_to_one_dadd, jdouble)
REDUCE_TO_ALL_CODE(all_reduce_to_all_imult, all_bcast_i, all_reduce_to_one_imult, jint) REDUCE_TO_ALL_CODE(all_reduce_to_all_bmult, all_bcast_b, all_reduce_to_one_bmult, jbyte) REDUCE_TO_ALL_CODE(all_reduce_to_all_cmult, all_bcast_c, all_reduce_to_one_cmult, jchar) REDUCE_TO_ALL_CODE(all_reduce_to_all_shmult, all_bcast_sh, all_reduce_to_one_shmult, jshort) REDUCE_TO_ALL_CODE(all_reduce_to_all_lmult, all_bcast_l, all_reduce_to_one_lmult, jlong) REDUCE_TO_ALL_CODE(all_reduce_to_all_fmult, all_bcast_f, all_reduce_to_one_fmult, jfloat) REDUCE_TO_ALL_CODE(all_reduce_to_all_dmult, all_bcast_d, all_reduce_to_one_dmult, jdouble) REDUCE_TO_ALL_CODE(all_reduce_to_all_imax, all_bcast_i, all_reduce_to_one_imax, jint) REDUCE_TO_ALL_CODE(all_reduce_to_all_bmax, all_bcast_b, all_reduce_to_one_bmax, jbyte) REDUCE_TO_ALL_CODE(all_reduce_to_all_cmax, all_bcast_c, all_reduce_to_one_cmax, jchar) REDUCE_TO_ALL_CODE(all_reduce_to_all_shmax, all_bcast_sh, all_reduce_to_one_shmax, jshort) REDUCE_TO_ALL_CODE(all_reduce_to_all_lmax, all_bcast_l, all_reduce_to_one_lmax, jlong) REDUCE_TO_ALL_CODE(all_reduce_to_all_fmax, all_bcast_f, all_reduce_to_one_fmax, jfloat) REDUCE_TO_ALL_CODE(all_reduce_to_all_dmax, all_bcast_d, all_reduce_to_one_dmax, jdouble) REDUCE_TO_ALL_CODE(all_reduce_to_all_imin, all_bcast_i, all_reduce_to_one_imin, jint) REDUCE_TO_ALL_CODE(all_reduce_to_all_bmin, all_bcast_b, all_reduce_to_one_bmin, jbyte) REDUCE_TO_ALL_CODE(all_reduce_to_all_cmin, all_bcast_c, all_reduce_to_one_cmin, jchar) REDUCE_TO_ALL_CODE(all_reduce_to_all_shmin, all_bcast_sh, all_reduce_to_one_shmin, jshort) REDUCE_TO_ALL_CODE(all_reduce_to_all_lmin, all_bcast_l, all_reduce_to_one_lmin, jlong) REDUCE_TO_ALL_CODE(all_reduce_to_all_fmin, all_bcast_f, all_reduce_to_one_fmin, jfloat) REDUCE_TO_ALL_CODE(all_reduce_to_all_dmin, all_bcast_d, all_reduce_to_one_dmin, jdouble) REDUCE_TO_ALL_CODE(all_reduce_to_all_or, all_bcast_bool, all_reduce_to_one_or, jboolean) REDUCE_TO_ALL_CODE(all_reduce_to_all_xor, 
all_bcast_bool, all_reduce_to_one_xor, jboolean) REDUCE_TO_ALL_CODE(all_reduce_to_all_and, all_bcast_bool, all_reduce_to_one_and, jboolean) SCAN_CODE(all_scan_iadd, jint, ADD, __i_store_ctr) SCAN_CODE(all_scan_badd, jbyte, ADD, __b_store_ctr) SCAN_CODE(all_scan_cadd, jchar, ADD, __sh_store_ctr) SCAN_CODE(all_scan_shadd, jshort, ADD, __sh_store_ctr) SCAN_CODE(all_scan_ladd, jlong, ADD, __l_store_ctr) SCAN_CODE(all_scan_fadd, jfloat, ADD, __f_store_ctr) SCAN_CODE(all_scan_dadd, jdouble, ADD, __d_store_ctr) SCAN_CODE(all_scan_imult, jint, MULT, __i_store_ctr) SCAN_CODE(all_scan_bmult, jbyte, MULT, __b_store_ctr) SCAN_CODE(all_scan_cmult, jchar, MULT, __sh_store_ctr) SCAN_CODE(all_scan_shmult, jshort, MULT, __sh_store_ctr) SCAN_CODE(all_scan_lmult, jlong, MULT, __l_store_ctr) SCAN_CODE(all_scan_fmult, jfloat, MULT, __f_store_ctr) SCAN_CODE(all_scan_dmult, jdouble, MULT, __d_store_ctr) SCAN_CODE(all_scan_imax, jint, MAX, __i_store_ctr) SCAN_CODE(all_scan_bmax, jbyte, MAX, __b_store_ctr) SCAN_CODE(all_scan_cmax, jchar, MAX, __sh_store_ctr) SCAN_CODE(all_scan_shmax, jshort, MAX, __sh_store_ctr) SCAN_CODE(all_scan_lmax, jlong, MAX, __l_store_ctr) SCAN_CODE(all_scan_fmax, jfloat, MAX, __f_store_ctr) SCAN_CODE(all_scan_dmax, jdouble, MAX, __d_store_ctr) SCAN_CODE(all_scan_imin, jint, MIN, __i_store_ctr) SCAN_CODE(all_scan_bmin, jbyte, MIN, __b_store_ctr) SCAN_CODE(all_scan_cmin, jchar, MIN, __sh_store_ctr) SCAN_CODE(all_scan_shmin, jshort, MIN, __sh_store_ctr) SCAN_CODE(all_scan_lmin, jlong, MIN, __l_store_ctr) SCAN_CODE(all_scan_fmin, jfloat, MIN, __f_store_ctr) SCAN_CODE(all_scan_dmin, jdouble, MIN, __d_store_ctr) SCAN_CODE(all_scan_or, jboolean, OR, __b_store_ctr) SCAN_CODE(all_scan_xor, jboolean, XOR, __b_store_ctr) SCAN_CODE(all_scan_and, jboolean, AND, __b_store_ctr) #ifdef COMM_AM2 void * global_all_bcast_buffer(void *local, juint size) { int parent_box = (MYBOX - 1) / 2; int left_box = 2 * MYBOX + 1; int right_box = 2 * MYBOX + 2; /* wait for acks from children */ if 
(left_box < BOXES) store_sync_ctr (sizeof(jint), &bcast_rctr, &bcast_sctr); if (right_box < BOXES) store_sync_ctr (sizeof(jint), &bcast_rctr, &bcast_sctr); if (MYBOX != 0) { store_sync_ctr(size, &bcast_rctr, &bcast_sctr); /* send ack to parent */ __i_store_ctr(toglobalb_static(parent_box, &ack_slot), 1, TIC_TRANSLATE_CSTATIC_ADDR(&bcast_rctr, parent_box), &bcast_sctr); } if (left_box < BOXES) __bulk_store_ctr(toglobalb_static(left_box, local), local, size, TIC_TRANSLATE_CSTATIC_ADDR(&bcast_rctr,left_box), &bcast_sctr); if (right_box < BOXES) __bulk_store_ctr(toglobalb_static(right_box, local), local, size, TIC_TRANSLATE_CSTATIC_ADDR(&bcast_rctr,right_box), &bcast_sctr); return(local); } static void * volatile local_bcast_val_all_bcast_buffer[2]; void *all_bcast_buffer(void *local, juint size) { TIC_BEGIN_FUNCTION if (MYBOXPROCS > 1) { /* SMP */ int const this_phase = BCAST_PHASE; BCAST_PHASE = !(this_phase); if (MYBOX == COMM_GetBoxNumberForProcNumber(0)) { /* Is the broadcaster here? */ if (MYPROC == 0) { /* Am I the broadcaster? */ local_bcast_val_all_bcast_buffer[this_phase] = local; if (BOXES > 1) { /* I'm on a cluster. */ local_bcast_val_all_bcast_buffer[this_phase] = global_all_bcast_buffer(local, size); } } local_barrier(); return local_bcast_val_all_bcast_buffer[this_phase]; } else { /* The broadcaster is not here. */ if (MYBOXPROC == 0) { local_bcast_val_all_bcast_buffer[this_phase] = global_all_bcast_buffer(local, size); } local_barrier(); return local_bcast_val_all_bcast_buffer[this_phase]; } } else { /* Only one processor per box. */ return global_all_bcast_buffer(local, size); } } #else /* NOT AMII */ static void * volatile local_bcast_val_all_bcast_buffer[2]; void *all_bcast_buffer(void *local, juint size) { TIC_BEGIN_FUNCTION if (MYBOXPROCS > 1) { /* SMP */ int const this_phase = BCAST_PHASE; BCAST_PHASE = !(this_phase); if (MYPROC == 0) { /* Am I the broadcaster? 
*/ local_bcast_val_all_bcast_buffer[this_phase] = local; } local_barrier(); return local_bcast_val_all_bcast_buffer[this_phase]; } else return local; } #endif /* AMII */ void init_bcast () { int left, right, left_box, right_box; left_box = 2 * MYBOX + 1; right_box = 2 * MYBOX + 2; left = (left_box >= BOXES) ? PROCS : COMM_GetProxyProcNumberForBoxNumber(left_box); right = (right_box >= BOXES) ? PROCS : COMM_GetProxyProcNumberForBoxNumber(right_box); bcast_rctr = bcast_sctr = 0; if (left_box < BOXES) { bcast_rctr += sizeof(jint); bcast_sctr += sizeof(jint); } if (right_box < BOXES) { bcast_rctr += sizeof(jint); bcast_sctr += sizeof(jint); } BCAST_PHASE = 0; CYCLE = 0; } void init_reduce() { if (MYPROC > 0) red_rctr = red_sctr = sizeof(int); else red_rctr = red_sctr = 0; } void init_scan() { scan_rctr[0] = scan_rctr[1] = scan_sctr[0] = scan_sctr[1] = 0; } void com_init() { init_bcast(); init_reduce(); init_scan(); } #undef REDUCE_CODE #undef BCAST_CODE #undef REDUCE_TO_ALL_CODE #undef SCAN_CODE