/* ti_array.c */ /* This file assumes T is a type and N is an arity. */ /* To add a method you must (as of 12/97): edit code-grid.cc, adding a line to the part that looks like D("DOMAIN_METHOD", "_ti_arraymethod_domain(" << t << ", " << n << ")", edit tlib/ti/internal/tiArray.java and tlib/ti/internal/tiArrayL.java edit titanium.h add the method defn to this file or ti_array.h */ #include "config.h" #include "runtime-options.h" #include "backend.h" #include "comm.h" #include "assertExists.h" #include "gp-type.h" #include "array-addr.h" #include #include #include #include #if (N == 2) || (N == 3) #include #endif #define N_LARGER_THAN_1 N_MINUS_1 #define N_IS_1 (N && !N_LARGER_THAN_1) #ifndef GLOBAL_ARRAY #define GLOBAL_ARRAY 1 #endif #ifndef DEBUG_ARRAYS #define DEBUG_ARRAYS 0 #endif #ifndef DUMPABLE_ARRAYS #define DUMPABLE_ARRAYS 0 #endif #ifndef DEFAULT_DUMP_DEST #define DEFAULT_DUMP_DEST "|/disks/orodruin/titanium/work/pike/bin/aview" #endif #if GLOBAL_ARRAY #define INDEX INDEX_GLOBAL #define LOCAL_PART TO_LOCAL #define BOX_PART TO_BOX #define PROC_PART TO_PROC #define EQUAL(ptr1,ptr2) EQUAL_GLOBAL((ptr1),(ptr2)) #define DIRECTLY_ADDRESSABLE(ptr) isDirectlyAddressable(ptr) #define GPTR_TO_T PTR_TO_T #else #define INDEX INDEX_LOCAL #define LOCAL_PART(p) (p) #define PROC_PART(p) MYPROC #define BOX_PART(p) MYBOX #define EQUAL(ptr1,ptr2) ((ptr1) == (ptr2)) #define DIRECTLY_ADDRESSABLE(ptr) (&*ptr,1) #define GPTR_TO_T GP_ ## T typedef GP_type(T) GPTR_TO_T; #endif /* allow calling of methods on other array types */ #if GLOBAL_ARRAY #define EXTERNAL_ARRAYMETHOD(fname, T, N) _ti_global_arraymethod_ ## fname(T,N) #define EXTERNAL_METHOD(fname, T, N) _ti_global_ ## fname(T,N) #else #define EXTERNAL_ARRAYMETHOD(fname, T, N) _ti_arraymethod_ ## fname(T,N) #define EXTERNAL_METHOD(fname, T, N) _ti_ ## fname(T,N) #endif #if GLOBAL_ARRAY && !GLOBAL_ELEMENTS # define assignable 0 #else # define assignable 1 #endif #if ELEMENTS_ARE_ATOMIC # define array_ralloc ralloc_atomic # define array_malloc ti_malloc_atomic_huge #else # define array_ralloc ralloc # define array_malloc ti_malloc_huge #endif /* region safety check for array copies */ #if !ELEMENTS_ARE_ATOMIC && !defined(NOMEMCHECKS) #if GLOBAL_ARRAY #define REGION_SAFETY_CHECK(dst_array, src_array, where) do { \ RegionId src_regionid = PG2RegionId(src_array.A); \ RegionId dst_regionid = PG2RegionId(dst_array.A); \ if (!SHARED_REGIONID(src_regionid) && SHARED_REGIONID(dst_regionid)) \ tossArrayStoreException_str("Error: May not copy elements with non-atomic type" \ " from private region to shared region " where); \ } while (0) #else #define REGION_SAFETY_CHECK(dst_array, src_array, where) do { \ RegionId src_regionid = PL2RegionId(src_array.A); \ RegionId dst_regionid = PL2RegionId(dst_array.A); \ if (!SHARED_REGIONID(src_regionid) && SHARED_REGIONID(dst_regionid)) \ tossArrayStoreException_str("Error: May not copy elements with non-atomic type" \ " from private region to shared region " where); \ } while (0) #endif #else #define REGION_SAFETY_CHECK(dst_array, src_array, where) #endif /* forward decls */ jboolean ISCONTIGUOUS_METHOD(const ti_ARRAY x); jboolean ISCONTIGUOUSOVERDOMAIN_METHOD(const ti_ARRAY x, const ti_RECTDOMAIN R); PTR_TO_T ti_get_array_data_ptr(const ti_ARRAY x); PTR_TO_T ti_get_array_data_ptr_with_domain(const ti_ARRAY x, const ti_RECTDOMAIN R); #define TI_ARRAY_C #include "ti_array_addr.h" /* must come after defines above */ #if DUMPABLE_ARRAYS void ti_dump(ti_ARRAY *px, char *dest, char *userArg) { ti_ARRAY x = *px; FILE *f; ti_RECTDOMAIN R = x.domain; ti_POINT p, q, s; jint lo[N], hi[N], stride[N], i; if (dest == NULL || dest[0] == '\0') dest = DEFAULT_DUMP_DEST; if (dest[0] == '|') f = popen(dest + 1, "w"); else f = fopen(dest, "w"); if (f == NULL) { fprintf(stderr, "Unable to open `%s'; dump request failed.\n", dest); return; } if (userArg != NULL) fprintf(f, "%s\n", userArg); fprintf(f, "%d\n", N); PRINT_DOMAIN(x.domain, f); fprintf(f, "\ndomainaddr <%d, %x>", TO_PROC(x.domain), TO_LOCAL(x.domain)); fprintf(f, "\nstride"); for (i = 0; i < N; i++) fprintf(f, " %d", x.stride[i]); fprintf(f, "\nbase"); for (i = 0; i < N; i++) fprintf(f, " %d", x.base[i]); fprintf(f, "\nsideFactors"); for (i = 0; i < N; i++) fprintf(f, " %d", x.sideFactors[i]); fprintf(f, "\ndesc %lx\nA %lx\nancestor %lx\nwhere %s\n", (long)px, (long)LOCAL_PART(x.A), (long)LOCAL_PART(x.ancestor), x.where); fprintf(f, "elementSize %d\n", sizeof(T)); fputs("elementType " T_string "\n", f); p = RECTDOMAIN_MIN(R); q = RECTDOMAIN_MAX(R); s = RECTDOMAIN_STRIDE(R); for (i = 0; i < N; i++) { lo[i] = POINT_INDEX(p, i + 1); hi[i] = POINT_INDEX(q, i + 1); stride[i] = POINT_INDEX(s, i + 1); } forall(e, x, lo, hi, stride) { fputc('!', f); PRINT_FORALL_POINT(f); fputc(',', f); if (strcmp("jint", T_string) == 0) fprintf(f, "%d", deref(e)); else if (strcmp("jdouble", T_string) == 0) fprintf(f, "%g", deref(e)); else if (strcmp("jfloat", T_string) == 0) fprintf(f, "%f", deref(e)); else { T buf = deref(e); for (i = 0; i < sizeof(T); i++) fprintf(f, "%02x", (unsigned int)(((unsigned char *) &buf)[i])); } fputc('!', f); } if (dest[0] == '|') pclose(f); else fclose(f); } #endif jboolean ti_isnull(ti_ARRAY x) { return (LOCAL_PART(x.A) == 0); } ti_ARRAY ti_empty() { ti_ARRAY x; memset(&x, 0, sizeof(x)); return x; } ti_ARRAY ti_construct(const char *where, Region pregion, ti_RECTDOMAIN R, int isShared) { ti_ARRAY r; jint i, max[N], size, bytes; ti_POINT p; T *array; r.domain = R; r.where = where; #if DUMPABLE_ARRAYS r.dumpfn = (void(*)()) ti_dump; #endif p = RECTDOMAIN_MIN(R); /* minimum point in R */ for (i = 0; i < N; i++) r.base[i] = POINT_INDEX(p, i + 1); p = RECTDOMAIN_STRIDE(R); /* stride of R */ for (i = 0; i < N; i++) r.stride[i] = POINT_INDEX(p, i + 1); p = RECTDOMAIN_UPB(R); /* one greater than maximum point in R */ for (i = 0; i < N; i++) max[i] = PFAST_DIVIDE((POINT_INDEX(p, i + 1) - 1 - r.base[i]), r.stride[i]); r.sideFactors[N - 1] = 1; if (N > 1) { jint sf = 1; for (i = N - 2; i >= 0; i--) r.sideFactors[i] = sf *= (1 + max[i + 1]); } size = 1 + max[0]; if (N > 1) size *= r.sideFactors[0]; #if DEBUG_ARRAYS fprintf(stderr, "p%d: new array with domain = ", MYPROC); PRINT_DOMAIN(R, stderr); fprintf(stderr, "\nArray<%d>:\tbase\tmax\tstride\tsideFactors\n", N); for (i = 0; i < N; i++) { fprintf(stderr, " %d\t\t%d\t%d\t%d\t%d\n", i, r.base[i], max[i], r.stride[i], r.sideFactors[i]); } #endif bytes = size * sizeof(T); if (pregion) array = (T *)array_ralloc(pregion, bytes); else array = (T *)array_malloc(bytes); tally_memory(bytes, isShared); #if ELEMENTS_ARE_ATOMIC /* atomic allocators don't automatically clear */ memset(array, 0, bytes); #endif #if GLOBAL_ARRAY globalize(r.A, array); #else r.A = array; #endif r.ancestor = r.A; #if EXPLICITLY_STORE_CREATOR r.creator = MYPROC; #endif return r; } #if N_LARGER_THAN_1 ti_ARRAY_SLICE SLICE_METHOD(const ti_ARRAY x, jint k, jint j) { jint i, l; ti_ARRAY_SLICE r; ti_POINT p; assertExists(x, "(in slice method)"); r.domain = RECTDOMAIN_SLICE(x.domain, k); r.where = x.where; for (i = l = 0; i < N_MINUS_1; i++, l++) { if (i == k - 1) ++l; r.stride[i] = x.stride[l]; r.base[i] = x.base[l]; r.sideFactors[i] = x.sideFactors[l]; } #if BOUNDS_CHECKING p = POINT_SET(RECTDOMAIN_MIN(x.domain), k - 1, j); ti_convinl("(in slice method)", &x, p); /* just for bounds check */ #endif INDEX(r.A, x.A, PFAST_DIVIDE((j - x.base[k - 1]), x.stride[k - 1]) * x.sideFactors[k - 1]); r.ancestor = x.ancestor; #if EXPLICITLY_STORE_CREATOR r.creator = x.creator; #endif #if DEBUG_ARRAYS fprintf(stderr, "slice(%d, %d) of [%x]\n Array<%d>:\tbase\tstride\tsideFactors\n", k, j, (int) LOCAL_PART(x.A), N); for (i = 0; i < N; i++) { fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, x.base[i], x.stride[i], x.sideFactors[i]); } fprintf(stderr, "is [%x]\n Array<%d>:\tbase\tstride\tsideFactors\n", (int) LOCAL_PART(r.A), N - 1); for (i = 0; i < N - 1; i++) { fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, r.base[i], r.stride[i], r.sideFactors[i]); } #endif return r; } #else void SLICE_METHOD(const ti_ARRAY x, jint k, jint j) { fputs("Error: Slice on 1d array (ignored)\n", stderr); } #endif /* ------------------------------------------------------------------------------------ */ /* return true if the elements of x and y might overlap within rectdomain R * the test is conservative, so it may return true when they actually don't overlap */ int ti_mayoverlap(const ti_ARRAY x, const ti_ARRAY y, ti_RECTDOMAIN R) { #if GLOBAL_ARRAY if (TO_BOX(x.A) != TO_BOX(y.A)) return 0; /* diff mem spaces */ #endif if (RECTDOMAIN_ISNULL(R)) return 0; /* empty domain */ { ti_POINT const rdmin = RECTDOMAIN_MIN(R); ti_POINT const rdupb = RECTDOMAIN_UPB(R); PTR_TO_T const xbegin = ti_get_array_data_ptr_with_domain(x, R); PTR_TO_T const ybegin = ti_get_array_data_ptr_with_domain(y, R); PTR_TO_T xend, yend; ti_POINT xptwithmaxaddr, yptwithmaxaddr; int i; /* determine address of max storage address */ for (i = 0; i < N; i++) { if (x.sideFactors[i] >= 0) xptwithmaxaddr = POINT_SET(xptwithmaxaddr, i, POINT_INDEX(rdupb, i + 1) - 1); else xptwithmaxaddr = POINT_SET(xptwithmaxaddr, i, POINT_INDEX(rdmin, i + 1)); if (y.sideFactors[i] >= 0) yptwithmaxaddr = POINT_SET(yptwithmaxaddr, i, POINT_INDEX(rdupb, i + 1) - 1); else yptwithmaxaddr = POINT_SET(yptwithmaxaddr, i, POINT_INDEX(rdmin, i + 1)); } INDEX(xend, x.A, ti_convinl("array copy maxpt", &x, xptwithmaxaddr)); INDEX(yend, y.A, ti_convinl("array copy maxpt", &y, yptwithmaxaddr)); /* see if the bounding memory regions are disjoint */ if (LOCAL_PART(xend) < LOCAL_PART(ybegin) || LOCAL_PART(xbegin) > LOCAL_PART(yend)) return 0; /* we could add more precise overlap detection tests here * (to detect interleaved but non-overlapping arrays) */ } return 1; } /* ------------------------------------------------------------------------------------ */ #if assignable /* x.exchange(w) is equivalent to: ordered_foreach (p in Ti.myTeam().domain()) x[p] = broadcast w from p[1]; The current implementation assumes myTeam().domain() is [0 : PROCS - 1]. */ void EXCHANGE_METHOD(const ti_ARRAY x, T w) { assertExists(x, "(in exchange method)"); if (N != 1) fprintf(stderr, "Error: exchange() used on %dd array; " "it should only be used on 1d arrays.\n", N); #if N_IS_1 if (!ISCONTIGUOUS_METHOD(x)) { jint p1; for (p1 = 0; p1 < PROCS; p1++) { T receive; { BROADCAST_BEGIN( T, p1 ); BROADCAST_bulk(receive, T, p1, w); BROADCAST_END( receive, T, p1 ); } { ti_POINT p; PTR_TO_T target; mi9constructmT8tiPoint17domains2ti(p, p1); INDEX(target, x.A, ti_convinl("(in exchange method)", &x, p)); assign(target, receive); } } } else { /* common case of contiguous arrays - use some shortcuts * (note that thanks to single restrictions, if one array is contiguous then they all are) */ GPTR_TO_T gp_array; GPTR_TO_T root_array; GPTR_TO_T target; int myproc = MYPROC; #if GLOBAL_ARRAY gp_array = x.A; #else globalize(gp_array, x.A); #endif { /* Broadcast the address for process 0's array */ BROADCAST_BEGIN( GPTR_TO_T, 0 ); BROADCAST_gp( root_array, GPTR_TO_T, 0, gp_array ); BROADCAST_END( root_array, GPTR_TO_T, 0 ); } { /* Calculate pointer offset and copy value into process 0's array */ ti_POINT p; mi9constructmT8tiPoint17domains2ti(p, myproc); INDEX_GLOBAL(target, root_array, ti_convinl("(in exchange method)", &x, p)); ASSIGN_GLOBAL_bulk(target, w); } /* Make sure every process has completed it's copy */ barrier(); /* Everyone, except 0, copy back the entire array * (could use broadcast here if all arrays local and we had an anonymous bulk broadcast) */ if (myproc != 0) { if (DIRECTLY_ADDRESSABLE(x.A)) global_to_local_copy(root_array, LOCAL_PART(x.A), PROCS); else { T *buf = (T *) ti_malloc(sizeof(T)*PROCS); global_to_local_copy(root_array, buf, PROCS); local_to_global_copy(buf, gp_array, PROCS); ti_free(buf); } } } #endif /* * Ensure that every process is done writing into its respective * array before continuing. Otherwise, one process can see * another process's array in a partially-written state. */ barrier(); } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ #if assignable /* This is a "copy descriptor", which encapsulates the domain and array information a remote node will need to pack array data into a buffer. */ typedef struct { ti_ARRAY x; /* Array involved in the copying. */ ti_RECTDOMAIN R; /* The subarray that will actually be copied. */ } copy_desc_t; /* Note that this name is actually a macro, to preserve uniqueness among different array types. */ /* Takes the given copy descriptor and returns a buffer that contains the desired elements of the array. The array_data_size argument is an output parameter. If "allocated" is a non-null address, the data will be copied into that provided buffer if it is null, a new buffer will be allocated and returned if it is (void *)-1, then the new buffer will be allocated as uncollectable memory */ T *PACK_METHOD(void *copy_desc_ptr, int *array_data_size, void *allocated) { ti_ARRAY x; ti_RECTDOMAIN R; int i, lo[N], hi[N], stride[N], size, idx; ti_POINT p, q, s; T *buf; copy_desc_t copy_desc = *(copy_desc_t *)copy_desc_ptr; R = copy_desc.R; x = copy_desc.x; if (RECTDOMAIN_ISNULL(R)) { *array_data_size = 0; return allocated; } p = RECTDOMAIN_MIN(R); q = RECTDOMAIN_UPB(R); s = RECTDOMAIN_STRIDE(R); for (i = 0; i < N; i++) { lo[i] = POINT_INDEX(p, i + 1); hi[i] = POINT_INDEX(q, i + 1) - 1; stride[i] = POINT_INDEX(s, i + 1); } size = RECTDOMAIN_NUMPOINTS(R) * sizeof(T); #ifdef USE_GC_NONE if (allocated == NULL || allocated == (void*)-1) buf = (T *)ti_malloc(size); #else if (allocated == NULL) buf = (T *)ti_malloc(size); else if (allocated == (void*)-1) buf = (void *)ti_alloccheck(GC_malloc_uncollectable(size), size); #endif else buf = allocated; /* 2D case */ #if (N == 2) /* Two checks: 1. check if elements are contiguous along the last dimension if contiguous, sidefactor[N-1] should be 1 2. check if the stride of the domain of the array is the same as the stride of the intersection domain */ if ((x.sideFactors[1] == 1) && (x.stride[1] == stride[1])){ int xIncrement; int numOfRows; int stripSize; int stripSizeInBytes; T *src; T *bufTmp; int i; #if GLOBAL_ARRAY src = LOCAL_PART(x.A) + ti_convinl("pack()", &x, p); #else src = x.A + ti_convinl("pack()", &x, p); #endif xIncrement = x.sideFactors[0] * PFAST_DIVIDE(stride[0],x.stride[0]); /* divide by stride[0], because stride[0] is a multiple of x.stride[0] */ numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1; stripSize = PFAST_DIVIDE((hi[1] - lo[1]),x.stride[1]) + 1; stripSizeInBytes = stripSize * sizeof(T); bufTmp = buf; for (i = 0; i < numOfRows; i++){ memcpy(bufTmp, src, stripSizeInBytes); bufTmp += stripSize; src += xIncrement; } *array_data_size = size; return buf; } #endif #if (N == 3) if ((x.sideFactors[2] == 1) && (x.stride[2] == stride[2])){ int xIncrement; int yIncrement; int numOfRows; int numOfCols; int stripSize; int stripSizeInBytes; T *src; T *rowStartPtr; T *bufTmp; int i, j; #if GLOBAL_ARRAY src = LOCAL_PART(x.A) + ti_convinl("pack()", &x, p); #else src = x.A + ti_convinl("pack()", &x, p); #endif xIncrement = x.sideFactors[0] * PFAST_DIVIDE(stride[0], x.stride[0]); yIncrement = x.sideFactors[1] * PFAST_DIVIDE(stride[1], x.stride[1]); numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1; numOfCols = PFAST_DIVIDE((hi[1] - lo[1]),stride[1]) + 1; stripSize = PFAST_DIVIDE((hi[2] - lo[2]),x.stride[2]) + 1; stripSizeInBytes = stripSize * sizeof(T); bufTmp = buf; for (i = 0; i < numOfRows; i++){ rowStartPtr = src; for (j = 0; j < numOfCols; j++){ memcpy(bufTmp, src, stripSizeInBytes); bufTmp += stripSize; src += yIncrement; } src = rowStartPtr + xIncrement; } *array_data_size = size; return buf; } #endif idx = 0; forall(e, x, lo, hi, stride, deref(buf[idx++],e);, DEREF_LOCAL(buf[idx++],e);); *array_data_size = size; return buf; } /* Unpacks the given buffer into the array with the given copy descriptor. */ void UNPACK_METHOD(copy_desc_t *copy_desc_ptr, T *buf) { int i, lo[N], hi[N], stride[N], size, idx; ti_POINT p, q, s; ti_ARRAY x = copy_desc_ptr->x; ti_RECTDOMAIN R = copy_desc_ptr->R; T tmp; assertExists(x, "(in unpack method)"); if (RECTDOMAIN_ISNULL(R)) return; p = RECTDOMAIN_MIN(R); q = RECTDOMAIN_UPB(R); s = RECTDOMAIN_STRIDE(R); for (i = 0; i < N; i++) { lo[i] = POINT_INDEX(p, i + 1); hi[i] = POINT_INDEX(q, i + 1) - 1; stride[i] = POINT_INDEX(s, i + 1); } /* 2D case */ #if (N == 2) /* Two checks: 1. check if elements are contiguous along the last dimension if contiguous, sidefactor[N-1] should be 1 2. check if the stride of the domain of the array is the same as the stride of the intersection domain */ if ((x.sideFactors[1] == 1) && (x.stride[1] == stride[1])){ int xIncrement; int numOfRows; int stripSize; int stripSizeInBytes; T *dest; int i; #if GLOBAL_ARRAY dest = LOCAL_PART(x.A) + ti_convinl("unpack()", &x, p); #else dest = x.A + ti_convinl("unpack()", &x, p); #endif xIncrement = x.sideFactors[0] * PFAST_DIVIDE(stride[0], x.stride[0]); numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1; stripSize = PFAST_DIVIDE((hi[1] - lo[1]),x.stride[1]) + 1; stripSizeInBytes = stripSize * sizeof(T); for (i = 0; i < numOfRows; i++){ memcpy(dest, buf, stripSizeInBytes); buf += stripSize; dest += xIncrement; } return; } #endif #if (N == 3) if ((x.sideFactors[2] == 1) && (x.stride[2] == stride[2])){ int xIncrement; int yIncrement; int numOfRows; int numOfCols; int stripSize; int stripSizeInBytes; T *dest; T *rowStartPtr; int i, j; #if GLOBAL_ARRAY dest = LOCAL_PART(x.A) + ti_convinl("unpack()", &x, p); #else dest = x.A + ti_convinl("unpack()", &x, p); #endif xIncrement = x.sideFactors[0] * PFAST_DIVIDE(stride[0],x.stride[0]); yIncrement = x.sideFactors[1] * PFAST_DIVIDE(stride[1],x.stride[1]); numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1; numOfCols = PFAST_DIVIDE((hi[1] - lo[1]),stride[1]) + 1; stripSize = PFAST_DIVIDE((hi[2] - lo[2]),x.stride[2]) + 1; stripSizeInBytes = stripSize * sizeof(T); for (i = 0; i < numOfRows; i++){ rowStartPtr = dest; for (j = 0; j < numOfCols; j++){ memcpy(dest, buf, stripSizeInBytes); buf += stripSize; dest += yIncrement; } dest = rowStartPtr + xIncrement; } return; } #endif idx = 0; #if 0 forall(e, x, lo, hi, stride, weak_assign(e, buf[idx++]);); ti_sync(); /* this is bogus - can't run ti_sync() within the context of an AM handler */ #else assert(DIRECTLY_ADDRESSABLE(x.A)); /* unpack can _only_ be used to unpack data into a local array */ forall(e, x, lo, hi, stride, assign(e, buf[idx++]);, ASSIGN_LOCAL(e, buf[idx++]);); #endif } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ #ifdef DEBUG_ARRAY_COPY #define DEBUG_COPY(s) do { printf("%i: DEBUG_ARRAY_COPY: %s\n",(int)MYPROC,s); fflush(stdout); } while(0) #else #define DEBUG_COPY(s) #endif /* a smart memcpy operation that correctly handles overlap */ #ifndef smart_memcpy_defined #define smart_memcpy_defined static void smart_memcpy(void *dest, void *src, int num_bytes) { assert(num_bytes >= 0); if (num_bytes == 0) return; else { if (((jbyte *)src) + num_bytes <= ((jbyte *)dest) || ((jbyte *)dest) + num_bytes <= ((jbyte *)src)) memcpy(dest, src, num_bytes); else memmove(dest, src, num_bytes); /* handles overlap correctly */ } } #endif /* ------------------------------------------------------------------------------------ */ #if assignable void COPY_METHOD(const ti_ARRAY x, const ti_ARRAY y) { ti_RECTDOMAIN R; /* safety checks */ assertExists(x, "(in copy method)"); assertExists(y, "(in copy method)"); REGION_SAFETY_CHECK(x, y, "(in copy method)"); /* check for a simple case, where both arrays are the same array (possibly restricted) * this is always an identity transformation, regardless of the rectdomain intersection */ if (EQUAL(x.A, y.A)) { /* we need to check not only the array data ptr, but also the base, stride and sidefactors */ if (!memcmp(x.base, y.base, sizeof(x.base)) && !memcmp(x.stride, y.stride, sizeof(x.stride)) && !memcmp(x.sideFactors, y.sideFactors, sizeof(x.sideFactors))) { DEBUG_COPY("skipping identity copy of array to itself"); return; } } { /* faster check for empty copy - try to avoid expensive intersection logic */ ti_POINT const xmin = RECTDOMAIN_MIN(x.domain); ti_POINT const xupb = RECTDOMAIN_UPB(x.domain); ti_POINT const ymin = RECTDOMAIN_MIN(y.domain); ti_POINT const yupb = RECTDOMAIN_UPB(y.domain); #if (N <= 3) if (xupb.x0 <= ymin.x0 || yupb.x0 <= xmin.x0 #if N > 1 || xupb.x1 <= ymin.x1 || yupb.x1 <= xmin.x1 #endif #if N > 2 || xupb.x2 <= ymin.x2 || yupb.x2 <= xmin.x2 #endif ) return; #endif { ti_POINT const xstride = RECTDOMAIN_STRIDE(x.domain); ti_POINT const ystride = RECTDOMAIN_STRIDE(y.domain); #if (N <= 3) if (xmin.x0 == ymin.x0 && xupb.x0 == yupb.x0 && xstride.x0 == ystride.x0 #if N > 1 && xmin.x1 == ymin.x1 && xupb.x1 == yupb.x1 && xstride.x1 == ystride.x1 #endif #if N > 2 && xmin.x2 == ymin.x2 && xupb.x2 == yupb.x2 && xstride.x2 == ystride.x2 #endif ) R = x.domain; else #endif { R = RECTDOMAIN_INTERSECTION(y.domain, x.domain); if (RECTDOMAIN_ISNULL(R)) return; /* empty copy */ } } } #if 0 /* DOB: this optimization is still under development */ #if GLOBAL_ARRAY if (TO_BOX(x.A) == TO_BOX(y.A)) #endif { /* check for the case where one array is a sub-array of another (e.g. both arrays are slices of a super-array along different dimensions) where copy would be an identity transformation This may be too rare to be worth checking for... */ ti_POINT dommin = RECTDOMAIN_MIN(R); int xminoffset = ti_convinl("copy()", &x, dommin); int yminoffset = ti_convinl("copy()", &y, dommin); /* first see if the initial points are located at the same memory address */ #if GLOBAL_ARRAY if (TO_LOCAL(x.A)+xminoffset == TO_LOCAL(y.A)+yminoffset) { #else if (x.A+xminoffset == y.A+yminoffset) { #endif /* next, check that the rest of the points will fall in the same place * (note that contiguity is not required for this) */ /* TODO: add this check - not sure how to quickly check this */ DEBUG_COPY("skipping identity copy of overlapping sub-array region"); return; } } #endif #if !ELEMENTS_ARE_ATOMIC && GLOBAL_ARRAY #if defined(USE_DISTRIBUTED_GC) && !defined(USE_GC_NONE) /* Distributed GC: we may have some escaping pointers to handle * Happens if the elements contain global pointers and the elements move across boxes */ if (!DIRECTLY_ADDRESSABLE(x.A) && DIRECTLY_ADDRESSABLE(y.A)) { /* "pushing" some global pointers */ int lo[N], hi[N], stride[N]; int i; ti_POINT const p = RECTDOMAIN_MIN(R); ti_POINT const q = RECTDOMAIN_UPB(R); ti_POINT const s = RECTDOMAIN_STRIDE(R); for (i = 0; i < N; i++) { lo[i] = POINT_INDEX(p, i + 1); hi[i] = POINT_INDEX(q, i + 1) - 1; stride[i] = POINT_INDEX(s, i + 1); } if (sizeof(T) == sizeof(jGPointer)) { /* elements must be global pointers */ forall(e, y, lo, hi, stride, GC_PTR_ESC(TO_LOCAL(e), 1);, GC_PTR_ESC(e, 1);); } else { /* elements are immutables containing globals - hash everything */ forall(e, y, lo, hi, stride, GC_PTR_ESC_ALL_GPS(TO_LOCAL(e), sizeof(T));, GC_PTR_ESC_ALL_GPS(e, sizeof(T));); } } else if (BOX_PART(x.A) != BOX_PART(y.A)) { /* "pulling" some global pointers */ /* TODO: need to inform the source - * this is currently taken care of conservatively by the AM request handler for bulk gets * but we may be able to improve that somewhat with the type info available here */ } #endif #endif /* First we check for the fastest, simplest (and hopefully common) case - both arrays are contiguous over the domain intersection, and they store the corresponding elements in the same order do a bulk copy of the segment */ if (ISCONTIGUOUSOVERDOMAIN_METHOD(x, R) && ISCONTIGUOUSOVERDOMAIN_METHOD(y, R)) { /* we still need to check the elements are stored in the same order * (the condition coded below is sufficient, but not necessary - * it's possible to further weaken it for cases where the rectdomain * intersection doesn't span more than a single unit in some dimension, * but this case seems rare enough that we needn't bother optimizing it for now) */ int i; for (i = 0; i < N; i++) { if (PFAST_DIVIDE(((float)x.sideFactors[i]),x.stride[i]) != PFAST_DIVIDE(((float)y.sideFactors[i]),y.stride[i])) break; } if (i == N) { /* safe to use a direct region copy */ /* get pointers to start of contiguous regions to be copied */ PTR_TO_T xdata = ti_get_array_data_ptr_with_domain(x, R); PTR_TO_T ydata = ti_get_array_data_ptr_with_domain(y, R); /* get size of contiguous region */ size_t size = sizeof(T) * RECTDOMAIN_NUMPOINTS(R); #if GLOBAL_ARRAY #ifdef MEMORY_DISTRIBUTED if (isDirectlyAddressable(x.A) && isDirectlyAddressable(y.A)) { DEBUG_COPY("bulk copying contiguous region (local <- local)"); smart_memcpy(TO_LOCAL(xdata), TO_LOCAL(ydata), size); } else if (isDirectlyAddressable(x.A)) { DEBUG_COPY("bulk copying contiguous region (local <- remote)"); ti_bulk_read(TO_LOCAL(xdata), TO_BOX(ydata), TO_LOCAL(ydata), size, (ELEMENTS_ARE_ATOMIC?tic_no_ptrs:tic_gp_only)); } else if (isDirectlyAddressable(y.A)) { DEBUG_COPY("bulk copying contiguous region (remote <- local)"); ti_bulk_write(TO_BOX(xdata), TO_LOCAL(xdata), TO_LOCAL(ydata), size); } else { unsigned char *buf; DEBUG_COPY("bulk copying contiguous region (remote <- remote)"); buf = (unsigned char *) ti_malloc(size); ti_bulk_read(buf, TO_BOX(ydata), TO_LOCAL(ydata), size, (ELEMENTS_ARE_ATOMIC?tic_no_ptrs:tic_gp_only)); ti_bulk_write(TO_BOX(xdata), TO_LOCAL(xdata), buf, size); ti_free(buf); } #else /* this is shared memory */ assert(isDirectlyAddressable(xdata) && isDirectlyAddressable(ydata)); DEBUG_COPY("bulk copying contiguous region"); smart_memcpy(TO_LOCAL(xdata), TO_LOCAL(ydata), size); #endif #else /* array is local - all we need is memcpy */ DEBUG_COPY("bulk copying contiguous region"); smart_memcpy(xdata, ydata, size); #endif return; } } #if GLOBAL_ARRAY && defined(COMM_AM2) { /* we have special AM functions for scatter-gather to optimize non-contiguous copies across memory spaces * PACK_METHOD is gather, UNPACK_METHOD is scatter */ copy_desc_t copy_desc; T *buf; copy_desc.R = R; if (isDirectlyAddressable(x.A) && !isDirectlyAddressable(y.A)) { /* remote to local copy */ int num_bytes = RECTDOMAIN_NUMPOINTS(R) * sizeof(T); int fromBox = TO_BOX(y.A); DEBUG_COPY("scatter-gather AM-based copy (local <- remote)"); copy_desc.x = y; buf = (T *)ti_malloc(num_bytes); get_array((void *)PACK_METHOD, ©_desc, sizeof(copy_desc), fromBox, (void *)buf, ELEMENTS_ARE_ATOMIC); copy_desc.x = x; UNPACK_METHOD(©_desc, buf); ti_sync(); ti_free(buf); return; } else if (!isDirectlyAddressable(x.A) && isDirectlyAddressable(y.A)) { /* local to remote copy */ int num_bytes; int toBox = TO_BOX(x.A); DEBUG_COPY("scatter-gather AM-based copy (remote <- local)"); copy_desc.x = y; buf = PACK_METHOD(©_desc, &num_bytes, NULL); copy_desc.x = x; put_array((void *)UNPACK_METHOD, ©_desc, sizeof(copy_desc), buf, num_bytes, toBox); ti_sync(); ti_free(buf); return; } else if (!isDirectlyAddressable(x.A) && !isDirectlyAddressable(y.A)) { /* remote to remote copy */ int num_bytes = RECTDOMAIN_NUMPOINTS(R) * sizeof(T); int fromBox = TO_BOX(y.A); int toBox = TO_BOX(x.A); DEBUG_COPY("scatter-gather AM-based copy (remote <- remote)"); copy_desc.x = y; buf = (T *)ti_malloc(num_bytes); get_array((void *)PACK_METHOD, ©_desc, sizeof(copy_desc), fromBox, (void *)buf, ELEMENTS_ARE_ATOMIC); ti_sync(); copy_desc.x = x; put_array((void *)UNPACK_METHOD, ©_desc, sizeof(copy_desc), buf, num_bytes, toBox); ti_sync(); ti_free(buf); return; } } #endif /* GLOBAL_ARRAY && AM2 */ { const int arraysMayOverlap = ti_mayoverlap(x,y,R); int i, lo[N], hi[N], stride[N]; ti_POINT const p = RECTDOMAIN_MIN(R); ti_POINT const q = RECTDOMAIN_UPB(R); ti_POINT const s = RECTDOMAIN_STRIDE(R); for (i = 0; i < N; i++) { lo[i] = POINT_INDEX(p, i + 1); hi[i] = POINT_INDEX(q, i + 1) - 1; stride[i] = POINT_INDEX(s, i + 1); } /* if both arrays are local and last dimension is contiguous on both arrays, use memcpy along the last dimension for higher bandwidth check if elements are contiguous along the last dimension if contiguous, sidefactor[N-1] should be 1 check if the stride of the domain of the array is the same as the stride of the intersection domain */ if (DIRECTLY_ADDRESSABLE(x.A) && DIRECTLY_ADDRESSABLE(y.A) && !arraysMayOverlap && (x.sideFactors[N-1] == 1) && (y.sideFactors[N-1] == 1) && (x.stride[N-1] == stride[N-1]) && (y.stride[N-1] == stride[N-1])) { T *src; T *dst; DEBUG_COPY("local non-contiguous copy with memcpy in last dimension"); #if GLOBAL_ARRAY src = LOCAL_PART(y.A) + ti_convinl("copy()", &y, p); dst = LOCAL_PART(x.A) + ti_convinl("copy()", &x, p); #else src = y.A + ti_convinl("copy()", &y, p); dst = x.A + ti_convinl("copy()", &x, p); #endif #if (N == 1) abort(); /* N == 1 should never make it this far */ #elif (N == 2) { int i; const int xIncrement0 = x.sideFactors[0] * PFAST_DIVIDE(stride[0],x.stride[0]); const int yIncrement0 = y.sideFactors[0] * PFAST_DIVIDE(stride[0],y.stride[0]); /* divide by stride[0], because stride[0] is a multiple of x.stride[0] */ const int numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1; const int stripSizeInBytes = (PFAST_DIVIDE((hi[1] - lo[1]),stride[1]) + 1) * sizeof(T); for (i = numOfRows; i; i--){ memcpy(dst, src, stripSizeInBytes); src += yIncrement0; dst += xIncrement0; } return; } #elif (N == 3) { int i, j; const int xIncrement0 = x.sideFactors[0] * PFAST_DIVIDE(stride[0], x.stride[0]); const int xIncrement1 = x.sideFactors[1] * PFAST_DIVIDE(stride[1], x.stride[1]); const int yIncrement0 = y.sideFactors[0] * PFAST_DIVIDE(stride[0], y.stride[0]); const int yIncrement1 = y.sideFactors[1] * PFAST_DIVIDE(stride[1], y.stride[1]); const int numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1; const int numOfCols = PFAST_DIVIDE((hi[1] - lo[1]),stride[1]) + 1; const int stripSizeInBytes = (PFAST_DIVIDE((hi[2] - lo[2]),stride[2]) + 1) * sizeof(T); for (i = numOfRows; i; i--){ T * const srcrowStartPtr = src; T * const dstrowStartPtr = dst; for (j = numOfCols; j; j--){ memcpy(dst, src, stripSizeInBytes); src += yIncrement1; dst += xIncrement1; } src = srcrowStartPtr + yIncrement0; dst = dstrowStartPtr + xIncrement0; } return; } #endif } /* finally, the slow, general case that handles anything */ /* Do an el-by-el copy. */ if (arraysMayOverlap) { /* areas might overlap - use an intermediate to be sure */ int const num_bytes = RECTDOMAIN_NUMPOINTS(R) * sizeof(T); T *buf = (T *)ti_malloc(num_bytes); int i=0; DEBUG_COPY("element-wise generalized copy (with overlap)"); forall(f, y, lo, hi, stride, deref(buf[i++], f), DEREF_LOCAL(buf[i++], f)); i=0; forall(e, x, lo, hi, stride, weak_assign(e, buf[i++]), WEAK_ASSIGN_LOCAL(e, buf[i++])); ti_sync(); ti_free(buf); } else { /* areas do not overlap */ T tmp; DEBUG_COPY("element-wise generalized copy (no overlap)"); forall2(e, x, f, y, lo, hi, stride, deref(tmp, f); weak_assign(e, tmp), DEREF_LOCAL(tmp, f); WEAK_ASSIGN_LOCAL(e, tmp)); ti_sync(); } return; } } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ ti_1DARRAY_OF_POINT_N EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,ti_POINT,1)(ti_1DARRAY_OF_POINT_N); ti_1DARRAY_OF_T EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,T,1)(ti_1DARRAY_OF_T); ti_1DARRAY_OF_T EXTERNAL_METHOD(CONSTRUCT,T,1)(const char *, Region, ti_RECTDOMAIN1, int); PTR_TO_T EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,ti_POINT,1)(ti_1DARRAY_OF_POINT_N); PTR_TO_T EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,T,1)(ti_1DARRAY_OF_T); void EXTERNAL_ARRAYMETHOD(copy,T,1)(ti_1DARRAY_OF_T, ti_1DARRAY_OF_T); ti_1DARRAY_OF_POINT_N DOMAIN_POINTLIST(ti_DOMAIN); /* ------------------------------------------------------------------------------------ */ #if assignable void GATHER_METHOD(const ti_ARRAY srcArray, const ti_1DARRAY_OF_T packedArray, const ti_1DARRAY_OF_POINT_N ptArray) { int numpts = -1; int src_box = BOX_PART(srcArray.A); ti_1DARRAY_OF_T lc_packedArray; ti_1DARRAY_OF_POINT_N lc_ptArray; ti_POINT *ptList = NULL; T **addrList = NULL; T *destList = NULL; /* safety checks */ assertExists(srcArray, "(in gather method)"); assertExists(packedArray, "(in gather method)"); assertExists(ptArray, "(in gather method)"); REGION_SAFETY_CHECK(packedArray, srcArray, "(in gather method)"); numpts = m12getNumPointsmT13tiRectDomain17domains2ti(ptArray.domain); if (numpts == 0) return; /* nothing to do */ addrList = (T**)ti_malloc(numpts * sizeof(T*)); /* build local & contiguous versions of params(with same domains), if necessary */ lc_ptArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,ti_POINT,1)(ptArray); lc_packedArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,T,1)(packedArray); assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A)); assert(DIRECTLY_ADDRESSABLE(lc_packedArray.A)); ptList = (ti_POINT*)LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,ti_POINT,1)(lc_ptArray)); destList = LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,T,1)(lc_packedArray)); { /* build list of addresses and do array-bounds check */ int i; T *baseAddr = LOCAL_PART(srcArray.A); #if ((N == 1) || (N == 2) || (N == 3)) && (BOUNDS_CHECKING == 0) T *alteredBaseAddr; #if N == 1 if ((srcArray.stride[0] == 1) && (srcArray.sideFactors[0] == 1)){ alteredBaseAddr = baseAddr - srcArray.base[0]; #endif #if N == 2 if ((srcArray.stride[0] == 1) && (srcArray.stride[1] == 1) && (srcArray.sideFactors[1] == 1)){ alteredBaseAddr = baseAddr - (srcArray.base[0]*srcArray.sideFactors[0] + srcArray.base[1]); #endif #if N == 3 if ((srcArray.stride[0] == 1) && (srcArray.stride[1] == 1) && (srcArray.stride[2] == 1) && (srcArray.sideFactors[2] == 1)){ alteredBaseAddr = baseAddr - (srcArray.base[0]*srcArray.sideFactors[0] + srcArray.base[1]*srcArray.sideFactors[1] + srcArray.base[2]); #endif for (i=0; i < numpts; i++) { TI_ARRAY_FASTBESTCONV(N, addrList[i], alteredBaseAddr, srcArray, ptList[i]); } } else{ for (i=0; i < numpts; i++) { TI_ARRAY_FASTCONV(N, addrList[i], baseAddr, srcArray, ptList[i]); } } #else for (i=0; i < numpts; i++) { #if BOUNDS_CHECKING addrList[i] = baseAddr + ti_convinl("gather()", &srcArray, ptList[i]); #else TI_ARRAY_FASTCONV(N, addrList[i], baseAddr, srcArray, ptList[i]); #endif } #endif } /* is the src simply local? (or are we missing AM?) - use a simple copy */ #ifdef COMM_AM2 if (DIRECTLY_ADDRESSABLE(srcArray.A)) #endif { int i; for (i=0; i < numpts; i++) { #if GLOBAL_ARRAY PTR_TO_T p; TO_GLOBALB(p, src_box, addrList[i]); deref(destList[i], p); #else deref(destList[i], addrList[i]); #endif } } #ifdef COMM_AM2 else { /* use AM to do a fast remote gather */ sparse_gather(destList, (void **)addrList, src_box, numpts, sizeof(T), ELEMENTS_ARE_ATOMIC); } #endif /* copy back & free temps */ if (!EQUAL(lc_packedArray.A, packedArray.A)) { /* need to copy back result */ EXTERNAL_ARRAYMETHOD(copy,T,1)(packedArray, lc_packedArray); assert(DIRECTLY_ADDRESSABLE(lc_packedArray.A)); ti_free(LOCAL_PART(lc_packedArray.A)); } if (!EQUAL(lc_ptArray.A, ptArray.A)) { assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A)); ti_free(LOCAL_PART(lc_ptArray.A)); } ti_free(addrList); } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ #if assignable void SCATTER_METHOD(const ti_ARRAY destArray, const ti_1DARRAY_OF_T packedArray, const ti_1DARRAY_OF_POINT_N ptArray) { int numpts = -1; int dest_box = BOX_PART(destArray.A); ti_1DARRAY_OF_T lc_packedArray; ti_1DARRAY_OF_POINT_N lc_ptArray; ti_POINT *ptList = NULL; T **addrList = NULL; T *srcList = NULL; /* safety checks */ assertExists(destArray, "(in scatter method)"); assertExists(packedArray, "(in scatter method)"); assertExists(ptArray, "(in scatter method)"); REGION_SAFETY_CHECK(destArray, packedArray, "(in scatter method)"); numpts = m12getNumPointsmT13tiRectDomain17domains2ti(ptArray.domain); if (numpts == 0) return; /* nothing to do */ addrList = (T**)ti_malloc(numpts * sizeof(T*)); /* build local & contiguous versions of params(with same domains), if necessary */ lc_ptArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,ti_POINT,1)(ptArray); lc_packedArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,T,1)(packedArray); assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A)); assert(DIRECTLY_ADDRESSABLE(lc_packedArray.A)); ptList = (ti_POINT *)LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,ti_POINT,1)(lc_ptArray)); srcList = LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,T,1)(lc_packedArray)); { /* build list of addresses and do array-bounds check */ int i; T *baseAddr = LOCAL_PART(destArray.A); #if ((N == 1) || (N == 2) || (N == 3)) && (BOUNDS_CHECKING == 0) T *alteredBaseAddr; #if N == 1 if ((destArray.stride[0] == 1) && (destArray.sideFactors[0] == 1)){ alteredBaseAddr = baseAddr - destArray.base[0]; #endif #if N == 2 if ((destArray.stride[0] == 1) && (destArray.stride[1] == 1) && (destArray.sideFactors[1] == 1)){ alteredBaseAddr = baseAddr - (destArray.base[0]*destArray.sideFactors[0] + destArray.base[1]); #endif #if N == 3 if ((destArray.stride[0] == 1) && (destArray.stride[1] == 1) && (destArray.stride[2] == 1) && (destArray.sideFactors[2] == 1)){ alteredBaseAddr = baseAddr - (destArray.base[0]*destArray.sideFactors[0] + destArray.base[1]*destArray.sideFactors[1] + destArray.base[2]); #endif for (i=0; i < numpts; i++) { TI_ARRAY_FASTBESTCONV(N, addrList[i], alteredBaseAddr, destArray, ptList[i]); } } else{ for (i=0; i < numpts; i++) { TI_ARRAY_FASTCONV(N, addrList[i], baseAddr, destArray, ptList[i]); } } #else for (i=0; i < numpts; i++) { #if BOUNDS_CHECKING addrList[i] = baseAddr + ti_convinl("gather()", &destArray, ptList[i]); #else TI_ARRAY_FASTCONV(N, addrList[i], baseAddr, destArray, ptList[i]); #endif } #endif } /* is the dest simply local? (or are we missing AM?) - use a simple copy */ #ifdef COMM_AM2 if (DIRECTLY_ADDRESSABLE(destArray.A)) #endif { int i; for (i=0; i < numpts; i++) { #if GLOBAL_ARRAY PTR_TO_T p; TO_GLOBALB(p, dest_box, addrList[i]); weak_assign(p, srcList[i]); #else weak_assign(addrList[i], srcList[i]); #endif } ti_sync(); } #ifdef COMM_AM2 else { /* use AM to do a fast remote scatter */ sparse_scatter((void **)addrList, srcList, dest_box, numpts, sizeof(T), ELEMENTS_ARE_ATOMIC); } #endif /* free temps */ if (!EQUAL(lc_packedArray.A,packedArray.A)) { assert(DIRECTLY_ADDRESSABLE(lc_packedArray.A)); ti_free(LOCAL_PART(lc_packedArray.A)); } if (!EQUAL(lc_ptArray.A,ptArray.A)) { assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A)); ti_free(LOCAL_PART(lc_ptArray.A)); } ti_free(addrList); } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ #if assignable void COPY_WITHRECTDOMAIN_METHOD(const ti_ARRAY x, const ti_ARRAY y, const ti_RECTDOMAIN R) { ti_ARRAY n = y; assertExists(y, "(in copy_withrectdomain method)"); n.domain = RECTDOMAIN_INTERSECTION(y.domain, R); COPY_METHOD(x, n); } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ #if assignable void COPY_WITHPTARRAY_METHOD(const ti_ARRAY x, const ti_ARRAY y, const ti_1DARRAY_OF_POINT_N ptArray) { int numpts = -1; ti_POINT *ptList = NULL; ti_1DARRAY_OF_POINT_N lc_ptArray; /* safety checks */ assertExists(x, "(in copy_withptarray method)"); assertExists(y, "(in copy_withptarray method)"); assertExists(ptArray, "(in copy_withptarray method)"); REGION_SAFETY_CHECK(x, y, "(in copy_withptarray method)"); numpts = m12getNumPointsmT13tiRectDomain17domains2ti(ptArray.domain); if (numpts == 0) return; /* nothing to do */ /* build local & contiguous versions of params(with same domains), if necessary */ lc_ptArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,ti_POINT,1)(ptArray); ptList = (ti_POINT*)LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,ti_POINT,1)(lc_ptArray)); if (DIRECTLY_ADDRESSABLE(x.A) && DIRECTLY_ADDRESSABLE(y.A) && !ti_mayoverlap(x,y,RECTDOMAIN_INTERSECTION(y.domain, x.domain))) { /* everything is local and non-overlapping - use a direct copy */ T *baseAddrX = LOCAL_PART(x.A); T *baseAddrY = LOCAL_PART(y.A); int i; for (i=0; i < numpts; i++) { T *pXelem = baseAddrX + ti_convinl("(in copy_withptarray method)", &x, ptList[i]); T *pYelem = baseAddrY + ti_convinl("(in copy_withptarray method)", &y, ptList[i]); *pXelem = *pYelem; } } else { /* use scatter gather through a temp */ ti_1DARRAY_OF_T packedArray = EXTERNAL_METHOD(CONSTRUCT,T,1)("(in copy_withptarray method)", NULL, ptArray.domain, 0); GATHER_METHOD(y, packedArray, lc_ptArray); SCATTER_METHOD(x, packedArray, lc_ptArray); assert(DIRECTLY_ADDRESSABLE(packedArray.A)); ti_free(LOCAL_PART(packedArray.A)); } if (!EQUAL(lc_ptArray.A, ptArray.A)) { assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A)); ti_free(LOCAL_PART(lc_ptArray.A)); } } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ #if assignable void COPY_WITHDOMAIN_METHOD(const ti_ARRAY x, const ti_ARRAY y, const ti_DOMAIN dom) { /* safety checks */ assertExists(x, "(in copy_withdomain method)"); assertExists(y, "(in copy_withdomain method)"); REGION_SAFETY_CHECK(x, y, "(in copy_withptarray method)"); if (TO_LOCAL(dom) == NULL) { fprintf(stderr, "fatal error: attempt to use uninitialized domain in copy_withdomain method\n"); abort(); } #if 0 ti_1DARRAY_OF_RECTDOMAIN_N rdArray = DOMAIN_RECTDOMAINLIST(dom); #endif /* for now, simply punt to the ptArray method * future optimizations: detect strongly clustered domains and do a * rectangle-by-rectangle copy using the copy_withrectdomain method */ { /*ti_1DARRAY_OF_POINT_N ptArray = DOMAIN_POINTLIST((PT19tiMultiRectADomain17domains2ti)dom);*/ ti_1DARRAY_OF_POINT_N ptArray = DOMAIN_POINTLIST(dom); COPY_WITHPTARRAY_METHOD(x, y, ptArray); } } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ #if assignable /* * x.set(w) is equivalent to: * foreach (p in x.domain()) x[p] = w; */ void SET_METHOD(const ti_ARRAY x, T w) { ti_RECTDOMAIN R = x.domain; if (!RECTDOMAIN_ISNULL(R)) { int i; char *p = (char *)&w; assertExists(x, "(in set method)"); for (i=1; i < sizeof(T); i++) { /* see if all bytes are the same */ if (*p != p[i]) break; } if (i == sizeof(T) && DIRECTLY_ADDRESSABLE(x.A) && ISCONTIGUOUS_METHOD(x)) { /* use memset when possible */ PTR_TO_T pdata = ti_get_array_data_ptr(x); memset(LOCAL_PART(pdata), *p, sizeof(T) * RECTDOMAIN_NUMPOINTS(R)); } else { jint lo[N], hi[N], stride[N], i; ti_POINT const p = RECTDOMAIN_MIN(R); ti_POINT const q = RECTDOMAIN_UPB(R); ti_POINT const s = RECTDOMAIN_STRIDE(R); for (i = 0; i < N; i++) { lo[i] = POINT_INDEX(p, i + 1); hi[i] = POINT_INDEX(q, i + 1) - 1; stride[i] = POINT_INDEX(s, i + 1); } forall(e, x, lo, hi, stride, assign(e, w), ASSIGN_LOCAL(e, w)); } } } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ ti_ARRAY TRANSLATE_METHOD(ti_ARRAY x, ti_POINT p) { ti_ARRAY n = x; int i; assertExists(x, "(in translate method)"); n.domain = RECTDOMAIN_TRANSLATE(x.domain, p); /*p = RECTDOMAIN_MIN(n.domain);*/ for (i = 0; i < N; i++) n.base[i] += POINT_INDEX(p, i + 1); #if DEBUG_ARRAYS fprintf(stderr, "p%d: translated array from domain = ", MYPROC); PRINT_DOMAIN(x.domain, stderr); fprintf(stderr, "\n"); fprintf(stderr, "p%d: translated array to domain = ", MYPROC); PRINT_DOMAIN(n.domain, stderr); fprintf(stderr, "\n"); fprintf(stderr, "Array<%d>:\tbase\tstride\tsideFactors\n", N); for (i = 0; i < N; i++) { fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, n.base[i], n.stride[i], n.sideFactors[i]); } #endif return n; } /* ------------------------------------------------------------------------------------ */ #define PERMUTE_C_ARRAY(src, dest, p) \ do { \ int i_; \ \ for (i_ = 0; i_ < N; i_++) dest[POINT_INDEX(p, i_ + 1) - 1] = src[i_]; \ } while (0) ti_ARRAY PERMUTE_METHOD(ti_ARRAY x, ti_POINT p) { ti_ARRAY n = x; assertExists(x, "(in permute method)"); n.domain = RECTDOMAIN_PERMUTE(n.domain, p); PERMUTE_C_ARRAY(x.base, n.base, p); PERMUTE_C_ARRAY(x.stride, n.stride, p); PERMUTE_C_ARRAY(x.sideFactors, n.sideFactors, p); return n; } #undef PERMUTE_C_ARRAY /* ------------------------------------------------------------------------------------ */ ti_ARRAY INJECT_METHOD(ti_ARRAY x, ti_POINT p) { ti_ARRAY n = x; int i; assertExists(x, "(in inject method)"); n.domain = RECTDOMAIN_INJECT(x.domain, p); for (i = 0; i < N; i++) { int px = POINT_INDEX(p, i + 1); n.base[i] *= px; if ((n.stride[i] *= px) < 0) { n.stride[i] *= -1; n.sideFactors[i] *= -1; } } #if DEBUG_ARRAYS fprintf(stderr, "p%d: injected array from domain = ", MYPROC); PRINT_DOMAIN(x.domain, stderr); fprintf(stderr, "\n"); fprintf(stderr, "p%d: injected array to domain = ", MYPROC); PRINT_DOMAIN(n.domain, stderr); fprintf(stderr, "\n"); fprintf(stderr, "\nArray<%d>:\tbase\tstride\tsideFactors\n", N); for (i = 0; i < N; i++) { fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, n.base[i], n.stride[i], n.sideFactors[i]); } #endif return n; } /* ------------------------------------------------------------------------------------ */ ti_ARRAY PROJECT_METHOD(ti_ARRAY x, ti_POINT p) { ti_ARRAY n = x; int i; assertExists(x, "(in project method)"); n.domain = RECTDOMAIN_PROJECT(x.domain, p); for (i = 0; i < N; i++) { int px = POINT_INDEX(p, i + 1); assert(("project() beyond limits", n.base[i] % px == 0)); n.base[i] /= px; assert(("project() beyond limits", n.stride[i] % px == 0)); if ((n.stride[i] /= px) < 0) { n.stride[i] *= -1; n.sideFactors[i] *= -1; } } #if DEBUG_ARRAYS fprintf(stderr, "p%d: projected array from domain = ", MYPROC); PRINT_DOMAIN(x.domain, stderr); fprintf(stderr, "\n"); fprintf(stderr, "p%d: projected array to domain = ", MYPROC); PRINT_DOMAIN(n.domain, stderr); fprintf(stderr, "\n"); fprintf(stderr, "\nArray<%d>:\tbase\tstride\tsideFactors\n", N); for (i = 0; i < N; i++) { fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, n.base[i], n.stride[i], n.sideFactors[i]); } #endif return n; } /* ------------------------------------------------------------------------------------ */ void ti_printdomain(ti_ARRAY x, FILE *f) { assertExists(x, "(in printDomain method)"); PRINT_DOMAIN(x.domain, f); } /* ------------------------------------------------------------------------------------ */ /* Output an array bounds violation error message and abort. The error message will contain the text of arg1 and arg2, among other things. The first argument is where the error occured in the source program. */ void ti_abv_ds(const char *errorlocation, const ti_ARRAY *x, int dim, int arg1, const char *arg2) { TI_ABV(*x, dim, errorlocation, ti_printdomain, "%d%s%s", arg1, arg2, ""); } /* ------------------------------------------------------------------------------------ */ /* Output an array bounds violation error message and abort. The error message will contain the text of arg1, arg2, and arg3, among other things. The first argument is where the error occured in the source program. */ void ti_abv_dsd(const char *errorlocation, const ti_ARRAY *x, int dim, int arg1, const char *arg2, int arg3) { TI_ABV(*x, dim, errorlocation, ti_printdomain, "%d%s%d", arg1, arg2, arg3); } /* ------------------------------------------------------------------------------------ */ void ti_boundscheck(const char *where, const ti_ARRAY *x, ti_POINT p) { #if N_LARGER_THAN_1 jint i; #else #define i 1 #endif assertExists(*x, where); #if N_LARGER_THAN_1 for (i = N; i >= 1; i--) #endif { jint const q = POINT_INDEX(p, i); jint const qm = q - x->base[i-1]; jint const lo = POINT_INDEX(RECTDOMAIN_MIN(x->domain), i); jint const hi = POINT_INDEX(RECTDOMAIN_UPB(x->domain), i) - 1; jint const s = POINT_INDEX(RECTDOMAIN_STRIDE(x->domain), i); if (s > 1 && ((q - lo) % s) != 0) { fprintf(stderr, "bounds checking error at %s:\n failed condition: ", where); PRINT_DOMAIN(x->domain, stderr); fprintf(stderr, ".contains("); PRINT_POINT(p, stderr); fprintf(stderr, ")\n array was allocated at %s\n", x->where); assert(("invalid array index", 0)); exit(-2); } else if (q < lo || q > hi) { fprintf(stderr, "bounds checking error at %s:\n failed condition: ", where); PRINT_POINT(RECTDOMAIN_MIN(x->domain), stderr); fprintf(stderr, " <= "); PRINT_POINT(p, stderr); fprintf(stderr, " <= "); PRINT_POINT(RECTDOMAIN_MAX(x->domain), stderr); fprintf(stderr, "\n array was allocated at %s\n", x->where); assert(("array index out of bounds", 0)); exit(-2); } if ((qm % x->stride[i-1]) != 0) assert(("internal error: bad array descriptor", 0)); } #if !N_LARGER_THAN_1 #undef i #endif } /* ------------------------------------------------------------------------------------ */ #define DEBUG_ISCONTIGUOUS 0 /* EFFECTS: returns true if all elements in the domain of x are stored contiguously in memory */ jboolean ISCONTIGUOUS_METHOD(const ti_ARRAY x) { assertExists(x, "(in isContiguous method)"); return ISCONTIGUOUSOVERDOMAIN_METHOD(x, x.domain); } /* REQUIRES: given domain (R) is a subset of x.domain() * EFFECTS: returns true if all elements in the given domain * are stored contiguously in memory in array x */ jboolean ISCONTIGUOUSOVERDOMAIN_METHOD(const ti_ARRAY x, const ti_RECTDOMAIN R) { assertExists(x, "(in isContiguousOverDomain method)"); if (RECTDOMAIN_ISNULL(R)) return 1; { #if 0 /* this test fails in the presence of negative sideFactors */ int numelements = RECTDOMAIN_NUMPOINTS(R); int minoffset = ti_convinl("isContiguous()", &x, RECTDOMAIN_MIN(R)); int maxoffset = ti_convinl("isContiguous()", &x, RECTDOMAIN_MAX(R)); return (numelements == maxoffset - minoffset + 1); #else /* this test handles negative sideFactors */ int i; ti_POINT pointwithminaddr, pointwithmaxaddr; ti_POINT const dommin = RECTDOMAIN_MIN(R); ti_POINT const domupb = RECTDOMAIN_UPB(R); #if DEBUG_ISCONTIGUOUS fprintf(stderr, "\n"); fprintf(stderr, "dommin="); PRINT_POINT(dommin, stderr); fprintf(stderr, "\ndomupb="); PRINT_POINT(domupb, stderr); fprintf(stderr, "\ndomstride="); PRINT_POINT(RECTDOMAIN_STRIDE(R), stderr); fflush(stderr); #endif for (i = 0; i < N; i++) { int const lo = POINT_INDEX(dommin, i + 1); int const hi = POINT_INDEX(domupb, i + 1) - 1; #if DEBUG_ISCONTIGUOUS fprintf(stderr, "\nlo=%i",lo); fprintf(stderr, "\nhi=%i",hi); #endif if (x.sideFactors[i] >= 0) { pointwithminaddr = POINT_SET(pointwithminaddr, i, lo); pointwithmaxaddr = POINT_SET(pointwithmaxaddr, i, hi); } else { pointwithminaddr = POINT_SET(pointwithminaddr, i, hi); pointwithmaxaddr = POINT_SET(pointwithmaxaddr, i, lo); } } #if DEBUG_ISCONTIGUOUS fprintf(stderr, "\npointwithminaddr="); PRINT_POINT(pointwithminaddr, stderr); fprintf(stderr, "\npointwithmaxaddr="); PRINT_POINT(pointwithmaxaddr, stderr); fprintf(stderr, "\n"); fflush(stderr); #endif { int numelements = RECTDOMAIN_NUMPOINTS(R); int minoffset = ti_convinl("isContiguous()", &x, pointwithminaddr); int maxoffset = ti_convinl("isContiguous()", &x, pointwithmaxaddr); return (numelements == maxoffset - minoffset + 1); } #endif } } #undef DEBUG_ISCONTIGUOUS /* ------------------------------------------------------------------------------------ */ /* EFFECTS: returns a ptr to the element with the lowest address in the current array * (should be used to get a handle on the array data when performing bulk operations on contiguous arrays) */ PTR_TO_T ti_get_array_data_ptr(const ti_ARRAY x) { assertExists(x, "(in ti_get_array_data_ptr)"); return ti_get_array_data_ptr_with_domain(x, x.domain); } /* REQUIRES: given domain (R) is a subset of x.domain * EFFECTS: returns a ptr to the element with the lowest address in the current array * (should be used to get a handle on the array data when performing bulk operations on contiguous arrays) */ PTR_TO_T ti_get_array_data_ptr_with_domain(const ti_ARRAY x, const ti_RECTDOMAIN R) { assertExists(x, "(in ti_get_array_data_ptr_with_domain)"); { ti_POINT pointwithminaddr; PTR_TO_T tiarray_firstelem; #if 0 /* no negative sideFactors */ pointwithminaddr = RECTDOMAIN_MIN(R); #else int i; ti_POINT const dommin = RECTDOMAIN_MIN(R); ti_POINT const domupb = RECTDOMAIN_UPB(R); for (i = 0; i < N; i++) { if (x.sideFactors[i] >= 0) pointwithminaddr = POINT_SET(pointwithminaddr, i, POINT_INDEX(dommin, i + 1)); else pointwithminaddr = POINT_SET(pointwithminaddr, i, POINT_INDEX(domupb, i + 1) - 1); } #endif INDEX(tiarray_firstelem, x.A, ti_convinl("ti_get_array_data_ptr", &x, pointwithminaddr)); return tiarray_firstelem; } } /* ------------------------------------------------------------------------------------ */ #if assignable ti_ARRAY MAKELOCALANDCONTIGUOUS_METHOD(const ti_ARRAY x) { /* returns a locally situated & contiguous version of this array, * with the same domain and same region as x * possibly copying the elements (if necessary) */ assertExists(x, "(in makeLocalAndContiguous)"); if (DIRECTLY_ADDRESSABLE(x.A) && ISCONTIGUOUS_METHOD(x)) return x; else { Region r; RegionId rid; ti_ARRAY retval; #if GLOBAL_ARRAY rid = PG2RegionId(x.A); #else rid = PL2RegionId(x.A); #endif if (rid == UNK_REGIONID) r = NULL; else r = RegionId2Region(rid); retval = ti_construct("(in makeLocalAndContiguous)", r, x.domain, 0); COPY_METHOD(retval, x); return retval; } } #endif /* assignable */ /* ------------------------------------------------------------------------------------ */ /* bulk I/O methods */ /* Note: the following methods allow reading/writing Ti arrays to RandomAccessFile objects and DataInputStream/DataOutputStream objects. Another design possibility is to replace the DIS/DOS methods with ones taking InputStream/OutputStream. I don't currently see an advantage either way - both solutions are equally expressive and neither is clearly more efficient. I stuck with first solution because dispatch was easier. */ #ifndef __bulkio_typedefs #define __bulkio_typedefs /* these three seem to always get generated */ #include "layout!T16RandomAccessFile2io4java.h" #include "layout!T15DataInputStream2io4java.h" #include "layout!T16DataOutputStream2io4java.h" #include "layout!PTAjbyte.h" /* these three are more of a struggle */ #ifdef _include_layout_PT16RandomAccessFile2io4java_h_ #define GP_RAF PT16RandomAccessFile2io4java #else typedef GP_type( T16RandomAccessFile2io4java ) GP_RAF; #endif #ifdef _include_layout_PT15DataInputStream2io4java_h_ #define GP_DIS PT15DataInputStream2io4java #else typedef GP_type( T15DataInputStream2io4java ) GP_DIS; #endif #ifdef _include_layout_PT16DataOutputStream2io4java_h_ #define GP_DOS PT16DataOutputStream2io4java #else typedef GP_type( T16DataOutputStream2io4java ) GP_DOS; #endif #endif #if ELEMENTS_ARE_ATOMIC /* RandomAccessFile */ #if assignable #define TIARRAY_BULKIO_READFUNC_HEADER \ void READFROMRAF_METHOD(const ti_ARRAY x, GP_RAF fileobj) #define TIARRAY_BULKIO_READFUNC m9readFullyPTAjbyteIImT16RandomAccessFile2io4java #define TIARRAY_BULKIO_READFUNC_CALLIO(fileobj, array, offset, count) \ TIARRAY_BULKIO_READFUNC( (fileobj), (array), (offset), (count) ) extern void TIARRAY_BULKIO_READFUNC(GP_RAF, PTAjbyte, jint, jint); #endif #define TIARRAY_BULKIO_WRITEFUNC_HEADER \ void WRITETORAF_METHOD(const ti_ARRAY x, GP_RAF fileobj) #define TIARRAY_BULKIO_WRITEFUNC m5writePTAjbyteIImT16RandomAccessFile2io4java #define TIARRAY_BULKIO_WRITEFUNC_CALLIO(fileobj, array, offset, count) \ TIARRAY_BULKIO_WRITEFUNC( (fileobj), (array), (offset), (count) ) extern void TIARRAY_BULKIO_WRITEFUNC(GP_RAF, PTAjbyte, jint, jint); #include "ti_array_bulkio.c" #if assignable #undef TIARRAY_BULKIO_READFUNC_HEADER #undef TIARRAY_BULKIO_READFUNC #undef TIARRAY_BULKIO_READFUNC_CALLIO #endif #undef TIARRAY_BULKIO_WRITEFUNC_HEADER #undef TIARRAY_BULKIO_WRITEFUNC #undef TIARRAY_BULKIO_WRITEFUNC_CALLIO /* DataInputStream/DataOutputStream */ #if assignable #define TIARRAY_BULKIO_READFUNC_HEADER \ void READFROMDIS_METHOD(const ti_ARRAY x, GP_DIS fileobj) #define TIARRAY_BULKIO_READFUNC m9readFullyPTAjbyteIImT15DataInputStream2io4java #define TIARRAY_BULKIO_READFUNC_CALLIO(fileobj, array, offset, count) \ TIARRAY_BULKIO_READFUNC( (fileobj), (array), (offset), (count) ) extern void TIARRAY_BULKIO_READFUNC(GP_DIS, PTAjbyte, jint, jint); #endif #define TIARRAY_BULKIO_WRITEFUNC_HEADER \ void WRITETODOS_METHOD(const ti_ARRAY x, GP_DOS fileobj) #define TIARRAY_BULKIO_WRITEFUNC m5writePTAjbyteIImT16DataOutputStream2io4java #define TIARRAY_BULKIO_WRITEFUNC_CALLIO(fileobj, array, offset, count) \ TIARRAY_BULKIO_WRITEFUNC( (fileobj), (array), (offset), (count) ) extern void TIARRAY_BULKIO_WRITEFUNC(GP_DOS, PTAjbyte, jint, jint); #include "ti_array_bulkio.c" #if assignable #undef TIARRAY_BULKIO_READFUNC_HEADER #undef TIARRAY_BULKIO_READFUNC #undef TIARRAY_BULKIO_READFUNC_CALLIO #endif #undef TIARRAY_BULKIO_WRITEFUNC_HEADER #undef TIARRAY_BULKIO_WRITEFUNC #undef TIARRAY_BULKIO_WRITEFUNC_CALLIO #endif /* ------------------------------------------------------------------------------------ */ #undef TI_ARRAY_C #undef ti_convinl #undef PRINT_POINT #undef PRINT_DOMAIN #undef N_LARGER_THAN_1 #undef LOCAL_PART #undef PROC_PART #undef BOX_PART #undef EXTERNAL_ARRAYMETHOD #undef EXTERNAL_METHOD #undef EQUAL #undef DIRECTLY_ADDRESSABLE #undef GPTR_TO_T #undef REGION_SAFETY_CHECK #undef deref #undef INDEX #undef assign #undef weak_assign #undef assignable #undef array_ralloc #undef array_malloc