/* ti_array.c
 *
 * Template implementation of Titanium's runtime arrays.  This file assumes
 * T is an element type and N is an arity (dimensionality), both supplied by
 * the preprocessor when this file is instantiated for a concrete array type.
 *
 * To add a method you must (as of 12/97):
 *   - edit code-grid.cc, adding a line to the part that looks like
 *       D("DOMAIN_METHOD", "_ti_arraymethod_domain(" << t << ", " << n << ")",
 *   - edit tlib/ti/internal/tiArray.java and tlib/ti/internal/tiArrayL.java
 *   - edit titanium.h
 *   - add the method defn to this file or ti_array.h
 */
#include "ti_config.h"
#include "runtime-options.h"
#include "backend.h"
#include "comm.h"
#include "gp-type.h"
#include "array-addr.h"
#include "T6Handle4lang2ti.h"
/* NOTE(review): the five system header names were lost when this file was
 * mangled (bare "#include" directives).  The headers below are reconstructed
 * from usage in this file (fprintf/fopen/popen/pclose, memcpy/memset/memcmp/
 * strcmp, assert, size_t) -- verify against the original source. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stddef.h>

/* N_LARGER_THAN_1 is nonzero iff N >= 2; N_IS_1 is nonzero iff N == 1. */
#define N_LARGER_THAN_1 N_MINUS_1
#define N_IS_1 (N && !N_LARGER_THAN_1)

#ifndef GLOBAL_ARRAY
#define GLOBAL_ARRAY 1
#endif
#ifndef DEBUG_ARRAYS
#define DEBUG_ARRAYS 0
#endif

/* Pointer-representation shims: when GLOBAL_ARRAY is set, element pointers
 * are global (box,proc,addr) pointers and these map onto the global-pointer
 * primitives; otherwise they are plain local pointers and the shims are
 * identities. */
#if GLOBAL_ARRAY
#define INDEX INDEX_GLOBAL
#define LOCAL_PART TO_LOCAL
#define BOX_PART TO_BOX
#define PROC_PART TO_PROC
#define EQUAL(ptr1,ptr2) EQUAL_GLOBAL((ptr1),(ptr2))
#define DIRECTLY_ADDRESSABLE(ptr) isDirectlyAddressable(ptr)
#define GPTR_TO_T PTR_TO_T
#else
#define INDEX INDEX_LOCAL
#define LOCAL_PART(p) (p)
#define PROC_PART(p) MYPROC
#define BOX_PART(p) MYBOX
#define EQUAL(ptr1,ptr2) ((ptr1) == (ptr2))
#define DIRECTLY_ADDRESSABLE(ptr) (&*ptr,1)
#define GPTR_TO_T GP_ ## T
typedef GP_type(T) GPTR_TO_T;
#endif

/* allow calling of methods on other array types */
#if GLOBAL_ARRAY
#define EXTERNAL_ARRAYMETHOD(fname, T, N) _ti_global_arraymethod_ ## fname(T,N)
#define EXTERNAL_METHOD(fname, T, N) _ti_global_ ## fname(T,N)
#else
#define EXTERNAL_ARRAYMETHOD(fname, T, N) _ti_arraymethod_ ## fname(T,N)
#define EXTERNAL_METHOD(fname, T, N) _ti_ ## fname(T,N)
#endif

/* A global array of non-global elements cannot be assigned through. */
#if GLOBAL_ARRAY && !GLOBAL_ELEMENTS
# define assignable 0
#else
# define assignable 1
#endif

/* Atomic (pointer-free) element types may use the non-GC-scanned allocators. */
#if ELEMENTS_ARE_ATOMIC
# define array_ralloc ralloc_atomic
# define array_malloc ti_malloc_atomic_huge
#else
# define array_ralloc ralloc
# define array_malloc ti_malloc_huge
#endif

/* region safety check for array copies */
#if !ELEMENTS_ARE_ATOMIC && \
!defined(NOMEMCHECKS)
#if GLOBAL_ARRAY
/* Reject copies that would move non-atomic (pointer-containing) elements
 * from a private region into a shared region. */
#define REGION_SAFETY_CHECK(dst_array, src_array, where) do { \
    RegionId src_regionid = PG2RegionId(src_array.A); \
    RegionId dst_regionid = PG2RegionId(dst_array.A); \
    if (!SHARED_REGIONID(src_regionid) && SHARED_REGIONID(dst_regionid)) \
      tossArrayStoreException_str("Error: May not copy elements with non-atomic type" \
                                  " from private region to shared region " where); \
  } while (0)
#else
#define REGION_SAFETY_CHECK(dst_array, src_array, where) do { \
    RegionId src_regionid = PL2RegionId(src_array.A); \
    RegionId dst_regionid = PL2RegionId(dst_array.A); \
    if (!SHARED_REGIONID(src_regionid) && SHARED_REGIONID(dst_regionid)) \
      tossArrayStoreException_str("Error: May not copy elements with non-atomic type" \
                                  " from private region to shared region " where); \
  } while (0)
#endif
#else
#define REGION_SAFETY_CHECK(dst_array, src_array, where)
#endif

/* Walk two arrays x and y in lockstep over the 3-d iteration space
 * [lo..hi] by stride, visiting dimensions in the order dim0,dim1,dim2.
 * e and f are cursors of type TPTR advanced with INDEXFN; the per-dimension
 * increments fold in the correction for the inner loops already having
 * advanced the cursor (hence the "- Increment*len" terms). */
#define _forall2ordered3(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,dim2,body,TPTR,INDEXFN,CVT) do { \
    const int len0 = PFAST_DIVIDE((hi[dim0] - lo[dim0]),stride[dim0]) + 1; \
    const int len1 = PFAST_DIVIDE((hi[dim1] - lo[dim1]),stride[dim1]) + 1; \
    const int len2 = PFAST_DIVIDE((hi[dim2] - lo[dim2]),stride[dim2]) + 1; \
    const int xIncrement2 = x.sideFactors[dim2] * PFAST_DIVIDE(stride[dim2], x.stride[dim2]); \
    const int xIncrement1t= x.sideFactors[dim1] * PFAST_DIVIDE(stride[dim1], x.stride[dim1]); \
    const int xIncrement1 = xIncrement1t - xIncrement2*len2; \
    const int xIncrement0 = x.sideFactors[dim0] * PFAST_DIVIDE(stride[dim0], x.stride[dim0]) \
                            - xIncrement1t*len1; \
    const int yIncrement2 = y.sideFactors[dim2] * PFAST_DIVIDE(stride[dim2], y.stride[dim2]); \
    const int yIncrement1t= y.sideFactors[dim1] * PFAST_DIVIDE(stride[dim1], y.stride[dim1]); \
    const int yIncrement1 = yIncrement1t - yIncrement2*len2; \
    const int yIncrement0 = y.sideFactors[dim0] * PFAST_DIVIDE(stride[dim0], y.stride[dim0]) \
                            - yIncrement1t*len1; \
    jint _i0,_i1,_i2; \
    TPTR e; TPTR f; \
    INDEXFN(e, CVT(x.A), ti_convinl("copy()", &x, rdmin)); \
    INDEXFN(f, CVT(y.A), ti_convinl("copy()", &y, rdmin)); \
    for(_i0=len0;_i0;_i0--) { \
      for(_i1=len1;_i1;_i1--) { \
        for(_i2=len2;_i2;_i2--) { \
          body; \
          INDEXFN(e,e,xIncrement2);INDEXFN(f,f,yIncrement2); \
        } \
        INDEXFN(e,e,xIncrement1);INDEXFN(f,f,yIncrement1); \
      } \
      INDEXFN(e,e,xIncrement0);INDEXFN(f,f,yIncrement0); \
    } \
  } while(0) \

/* Dispatch: when both arrays are directly addressable, use the faster
 * local-pointer traversal (localbody); otherwise use global pointers. */
#if GLOBAL_ARRAY
#define forall2ordered3(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,dim2,body,localbody) do { \
    if (isDirectlyAddressable(x.A) && isDirectlyAddressable(y.A)) \
      _forall2ordered3(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,dim2,localbody,T*,INDEX_LOCAL,TO_LOCAL); \
    else \
      _forall2ordered3(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,dim2,body,PTR_TO_T,INDEX,_IDENTITY); \
  } while(0)
#else
#define forall2ordered3(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,dim2,body,localbody) \
  _forall2ordered3(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,dim2,localbody,T*,INDEX_LOCAL,_IDENTITY)
#endif

/* 2-d analogue of _forall2ordered3 (see above). */
#define _forall2ordered2(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,body,TPTR,INDEXFN,CVT) do { \
    const int len0 = PFAST_DIVIDE((hi[dim0] - lo[dim0]),stride[dim0]) + 1; \
    const int len1 = PFAST_DIVIDE((hi[dim1] - lo[dim1]),stride[dim1]) + 1; \
    const int xIncrement1 = x.sideFactors[dim1] * PFAST_DIVIDE(stride[dim1], x.stride[dim1]); \
    const int xIncrement0 = x.sideFactors[dim0] * PFAST_DIVIDE(stride[dim0], x.stride[dim0]) \
                            - xIncrement1*len1; \
    const int yIncrement1 = y.sideFactors[dim1] * PFAST_DIVIDE(stride[dim1], y.stride[dim1]); \
    const int yIncrement0 = y.sideFactors[dim0] * PFAST_DIVIDE(stride[dim0], y.stride[dim0]) \
                            - yIncrement1*len1; \
    jint _i0,_i1; \
    TPTR e; TPTR f; \
    INDEXFN(e, CVT(x.A), ti_convinl("copy()", &x, rdmin)); \
    INDEXFN(f, CVT(y.A), ti_convinl("copy()", &y, rdmin)); \
    for(_i0=len0;_i0;_i0--) { \
      for(_i1=len1;_i1;_i1--) { \
        body; \
        INDEXFN(e,e,xIncrement1);INDEXFN(f,f,yIncrement1); \
      } \
      INDEXFN(e,e,xIncrement0);INDEXFN(f,f,yIncrement0); \
    } \
  } while(0)

#if GLOBAL_ARRAY
#define forall2ordered2(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,body,localbody) do \
{ \
    if (isDirectlyAddressable(x.A) && isDirectlyAddressable(y.A)) \
      _forall2ordered2(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,localbody,T*,INDEX_LOCAL,TO_LOCAL); \
    else \
      _forall2ordered2(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,body,PTR_TO_T,INDEX,_IDENTITY); \
  } while(0)
#else
#define forall2ordered2(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,body,localbody) \
  _forall2ordered2(e,x,f,y,lo,hi,stride,rdmin,dim0,dim1,localbody,T*,INDEX_LOCAL,_IDENTITY)
#endif

/* forward decls */
jboolean ISCONTIGUOUS_METHOD(const ti_ARRAY x);
jboolean ISCONTIGUOUSOVERDOMAIN_METHOD(const ti_ARRAY x, const ti_RECTDOMAIN R);
#define INTERNAL_ISCONTIGUOUS_METHOD _CONCAT(_CONCAT(ti_internal_,ti_ARRAY),_isContiguousOverDomain)
static jboolean INTERNAL_ISCONTIGUOUS_METHOD(const ti_ARRAY *px, const ti_RECTDOMAIN *pR, size_t *pnumelements, PTR_TO_T *pRdataptr);
PTR_TO_T ti_get_array_data_ptr(const ti_ARRAY x);
PTR_TO_T ti_get_array_data_ptr_with_domain(const ti_ARRAY *x, const ti_RECTDOMAIN *R);

#define TI_ARRAY_C
#include "ti_array_addr.h" /* must come after defines above */

#if DUMPABLE_ARRAYS
/* Dump the descriptor and element contents of *px in a textual format.
 * dest names a file, or a pipe when it starts with '|'; when empty, the
 * TI_ARRAY_DUMPFILE environment variable is consulted.  userArg, when
 * non-NULL, is echoed as the first line of the dump. */
void ti_dump(ti_ARRAY *px, char *dest, char *userArg) {
  ti_ARRAY x = *px;
  FILE *f;
  ti_RECTDOMAIN R = x.domain;
  ti_POINT p, q, s;
  jint lo[N], hi[N], stride[N], i;
  if (dest == NULL || dest[0] == '\0') {
    dest = getenvMaster("TI_ARRAY_DUMPFILE");
    if (dest == NULL || dest[0] == '\0') {
      fprintf(stderr,"ERROR on p%i: no filename specified for array dump - try setting TI_ARRAY_DUMPFILE; dump request failed\n", MYPROC);
      return;
    }
  }
  /* a leading '|' means "pipe through this command" */
  if (dest[0] == '|') f = popen(dest + 1, "w");
  else f = fopen(dest, "w");
  if (f == NULL) {
    fprintf(stderr, "Unable to open `%s'; dump request failed.\n", dest);
    return;
  }
  if (userArg != NULL) fprintf(f, "%s\n", userArg);
  fprintf(f, "%d\n", N);
  PRINT_DOMAIN(x.domain, f);
  fprintf(f, "\ndomainaddr <%d, %x>", TO_PROC(x.domain), TO_LOCAL(x.domain));
  fprintf(f, "\nstride");
  for (i = 0; i < N; i++) fprintf(f, " %d", x.stride[i]);
  fprintf(f, "\nbase");
  for (i = 0; i < N; i++) fprintf(f, " %d", x.base[i]);
  fprintf(f, "\nsideFactors");
  for (i = 0; i < N; i++) fprintf(f, " %d", x.sideFactors[i]);
  fprintf(f, "\ndesc %lx\nA %lx\nancestor %lx\n", (long)px, (long)LOCAL_PART(x.A), (long)LOCAL_PART(x.ancestor));
#if EXPLICITLY_STORE_CREATOR
  fprintf(f, "creator %i\n", x.creator);
#endif
#if EXPLICITLY_STORE_ALLOCATION_SITE
  fprintf(f, "where %s\n", x.where);
#endif
  fprintf(f, "elementSize %d\n", sizeof(T));
  fputs("elementType " T_string "\n", f);
  p = RECTDOMAIN_MIN(R);
  q = RECTDOMAIN_MAX(R);
  s = RECTDOMAIN_STRIDE(R);
  for (i = 0; i < N; i++) {
    lo[i] = POINT_INDEX(p, i + 1);
    hi[i] = POINT_INDEX(q, i + 1);
    stride[i] = POINT_INDEX(s, i + 1);
  }
  /* emit each element as !point,value! - known primitive types are printed
   * natively, anything else as a byte-wise hex dump */
  forall(e, x, lo, hi, stride) {
    fputc('!', f);
    PRINT_FORALL_POINT(f);
    fputc(',', f);
    if (strcmp("jint", T_string) == 0) fprintf(f, "%d", deref(e));
    else if (strcmp("jdouble", T_string) == 0) fprintf(f, "%g", deref(e));
    else if (strcmp("jfloat", T_string) == 0) fprintf(f, "%f", deref(e));
    else {
      T buf = deref(e);
      for (i = 0; i < sizeof(T); i++)
        fprintf(f, "%02x", (unsigned int)(((unsigned char *) &buf)[i]));
    }
    fputc('!', f);
  }
  if (dest[0] == '|') pclose(f);
  else fclose(f);
}
#endif

/* Allocate and initialize a new array descriptor over rectdomain R.
 * Storage comes from pregion when non-NULL, otherwise from the huge
 * allocator.  base/stride/sideFactors are derived from R so that
 * element (p) lives at offset sum_i ((p_i - base_i)/stride_i) * sideFactors_i,
 * with the last dimension innermost (sideFactors[N-1] == 1). */
ti_ARRAY ti_construct(const char *where, Region pregion, ti_RECTDOMAIN R, int isShared) {
  ti_ARRAY r;
  jint i, max[N], size, bytes;
  ti_POINT p;
  T *array;
  r.domain = R;
  p = RECTDOMAIN_MIN(R); /* minimum point in R */
  for (i = 0; i < N; i++) r.base[i] = POINT_INDEX(p, i + 1);
  p = RECTDOMAIN_STRIDE(R); /* stride of R */
  for (i = 0; i < N; i++) r.stride[i] = POINT_INDEX(p, i + 1);
  p = RECTDOMAIN_UPB(R); /* one greater than maximum point in R */
  for (i = 0; i < N; i++)
    max[i] = PFAST_DIVIDE((POINT_INDEX(p, i + 1) - 1 - r.base[i]), r.stride[i]);
  r.sideFactors[N - 1] = 1;
  if (N > 1) {
    jint sf = 1;
    /* running product of inner extents, innermost to outermost */
    for (i = N - 2; i >= 0; i--) r.sideFactors[i] = sf *= (1 + max[i + 1]);
  }
  size = 1 + max[0];
  if (N > 1) size *= r.sideFactors[0];
#if DEBUG_ARRAYS
  fprintf(stderr, "p%d: new array with domain = ", MYPROC);
  PRINT_DOMAIN(R, stderr);
  fprintf(stderr,
          "\nArray<%d>:\tbase\tmax\tstride\tsideFactors\n", N);
  for (i = 0; i < N; i++) {
    fprintf(stderr, " %d\t\t%d\t%d\t%d\t%d\n", i, r.base[i], max[i], r.stride[i], r.sideFactors[i]);
  }
#endif
  bytes = size * sizeof(T);
  if (pregion) array = (T *)array_ralloc(pregion, bytes);
  else array = (T *)array_malloc(bytes);
  tally_memory(bytes, isShared);
#if ELEMENTS_ARE_ATOMIC
  /* atomic allocators don't automatically clear */
  memset(array, 0, bytes);
#endif
#if GLOBAL_ARRAY
  globalize(r.A, array);
#else
  r.A = array;
#endif
  r.ancestor = r.A;
#if DUMPABLE_ARRAYS
  r.dumpfn = (void(*)()) ti_dump;
#endif
#if EXPLICITLY_STORE_ALLOCATION_SITE
  r.where = where;
#endif
#if EXPLICITLY_STORE_CREATOR
  r.creator = MYPROC;
#endif
  return r;
}

#if N_LARGER_THAN_1
/* Slice: fix dimension k at coordinate j, producing an (N-1)-d view that
 * shares storage with x (same ancestor, no copying). */
ti_ARRAY_SLICE SLICE_METHOD(const ti_ARRAY x, jint k, jint j) {
  jint i, l;
  ti_ARRAY_SLICE r;
  assertExists(x, "(in slice method)");
  r.domain = RECTDOMAIN_SLICE(x.domain, k);
  /* copy descriptor entries, skipping the sliced dimension k-1 */
  for (i = l = 0; i < N_MINUS_1; i++, l++) {
    if (i == k - 1) ++l;
    r.stride[i] = x.stride[l];
    r.base[i] = x.base[l];
    r.sideFactors[i] = x.sideFactors[l];
  }
#if BOUNDS_CHECKING
  {
    ti_POINT p = POINT_SET(RECTDOMAIN_MIN(x.domain), k - 1, j);
    ti_convinl("(in slice method)", &x, p); /* just for bounds check */
  }
#endif
  /* advance the data pointer to the start of the selected hyperplane */
  INDEX(r.A, x.A, PFAST_DIVIDE((j - x.base[k - 1]), x.stride[k - 1]) * x.sideFactors[k - 1]);
  r.ancestor = x.ancestor;
#if EXPLICITLY_STORE_ALLOCATION_SITE
  r.where = x.where;
#endif
#if EXPLICITLY_STORE_CREATOR
  r.creator = x.creator;
#endif
#if DUMPABLE_ARRAYS
  r.dumpfn = x.dumpfn;
#endif
#if DEBUG_ARRAYS
  fprintf(stderr, "slice(%d, %d) of [%x]\n Array<%d>:\tbase\tstride\tsideFactors\n", k, j, (int) LOCAL_PART(x.A), N);
  for (i = 0; i < N; i++) {
    fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, x.base[i], x.stride[i], x.sideFactors[i]);
  }
  fprintf(stderr, "is [%x]\n Array<%d>:\tbase\tstride\tsideFactors\n", (int) LOCAL_PART(r.A), N - 1);
  for (i = 0; i < N - 1; i++) {
    fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, r.base[i], r.stride[i], r.sideFactors[i]);
  }
#endif
  return r;
}
#else
/* 1-d instantiation: slicing is meaningless, so just warn and do nothing. */
void SLICE_METHOD(const ti_ARRAY x, jint k, jint j) {
  fputs("Error: Slice on 1d array (ignored)\n", stderr);
}
#endif

/* ------------------------------------------------------------------------------------ */
/* return true if the elements of x and y might overlap within rectdomain R
 * the test is conservative, so it may return true when they actually don't overlap
 */
int ti_mayoverlap(const ti_ARRAY *x, const ti_ARRAY *y, ti_RECTDOMAIN *R) {
#if GLOBAL_ARRAY
  if (TO_BOX(x->A) != TO_BOX(y->A)) return 0; /* diff mem spaces */
#endif
#if 0 /* not currently a useful optimization - we never call this on empty rectdomains */
  if (RECTDOMAIN_ISNULL(*R)) return 0; /* empty domain */
#endif
  {
    PTR_TO_T xend, yend;
    int i;
    /* determine address of max storage address: a negative sideFactor means
     * the dimension runs backwards in memory, so the minimum coordinate in
     * that dimension holds the highest address */
    ti_POINT xptwithmaxaddr = RECTDOMAIN_MAX(*R);
    ti_POINT yptwithmaxaddr = xptwithmaxaddr;
    for (i = 0; i < N; i++) {
      if (x->sideFactors[i] < 0)
        xptwithmaxaddr = POINT_SET(xptwithmaxaddr, i, POINT_INDEX(RECTDOMAIN_MIN(*R), i + 1));
      if (y->sideFactors[i] < 0)
        yptwithmaxaddr = POINT_SET(yptwithmaxaddr, i, POINT_INDEX(RECTDOMAIN_MIN(*R), i + 1));
    }
    INDEX(xend, x->A, ti_convinl("array copy maxpt", x, xptwithmaxaddr));
    INDEX(yend, y->A, ti_convinl("array copy maxpt", y, yptwithmaxaddr));
    /* see if the bounding memory regions are disjoint */
    /* start with a faster, more conservative check based on the following
     * property, that is likely to catch most cases */
    assert(LOCAL_PART(ti_get_array_data_ptr_with_domain(x, R)) >= LOCAL_PART(x->A));
    assert(LOCAL_PART(ti_get_array_data_ptr_with_domain(y, R)) >= LOCAL_PART(y->A));
    if (LOCAL_PART(xend) < LOCAL_PART(y->A) || LOCAL_PART(x->A) > LOCAL_PART(yend))
      return 0;
    {
      /* more precise test based on actual starting offset */
      PTR_TO_T const xbegin = ti_get_array_data_ptr_with_domain(x, R);
      PTR_TO_T const ybegin = ti_get_array_data_ptr_with_domain(y, R);
      if (LOCAL_PART(xend) < LOCAL_PART(ybegin) || LOCAL_PART(xbegin) > LOCAL_PART(yend))
        return 0;
    }
    /* we could add more precise overlap
detection tests here
     * (to detect interleaved but non-overlapping arrays) */
  }
  return 1;
}

/* ------------------------------------------------------------------------------------ */
#if assignable
/* x.exchange(w) is equivalent to:
     ordered_foreach (p in Ti.myTeam().domain()) x[p] = broadcast w from p[1];
   The current implementation assumes myTeam().domain() is [0 : PROCS - 1].
   This is a collective operation: every process must call it. */
void EXCHANGE_METHOD(const ti_ARRAY x, T w) {
  assertExists(x, "(in exchange method)");
  if (N != 1)
    fprintf(stderr, "Error: exchange() used on %dd array; "
            "it should only be used on 1d arrays.\n", N);
  ti_srcpos_freeze();
  ti_trace_printf(("COLLECTIVE exchange: sz = %i, " _STRINGIFY(T), sizeof(T)));
#if N_IS_1
  if (!ISCONTIGUOUS_METHOD(x)) {
    /* general case: broadcast each process's value in turn and store it
     * element-by-element */
    jint p1;
    for (p1 = 0; p1 < PROCS; p1++) {
      T receive;
      {
        BROADCAST_BEGIN( T, p1 );
        BROADCAST_bulk(receive, T, p1, w);
        BROADCAST_END( receive, T, p1 );
      }
      {
        ti_POINT p;
        PTR_TO_T target;
        mi9constructmT8tiPoint17domains2ti(p, p1);
        INDEX(target, x.A, ti_convinl("(in exchange method)", &x, p));
        assign(target, receive);
      }
    }
  } else {
    /* common case of contiguous arrays - use some shortcuts
     * (note that thanks to single restrictions, if one array is contiguous
     * then they all are) */
    GPTR_TO_T gp_array;
    GPTR_TO_T root_array;
    GPTR_TO_T target;
    int myproc = MYPROC;
#if GLOBAL_ARRAY
    gp_array = x.A;
#else
    globalize(gp_array, x.A);
#endif
    {
      /* Broadcast the address for process 0's array */
      BROADCAST_BEGIN( GPTR_TO_T, 0 );
      BROADCAST_gp( root_array, GPTR_TO_T, 0, gp_array );
      BROADCAST_END( root_array, GPTR_TO_T, 0 );
    }
    {
      /* Calculate pointer offset and copy value into process 0's array */
      ti_POINT p;
      mi9constructmT8tiPoint17domains2ti(p, myproc);
      INDEX_GLOBAL(target, root_array, ti_convinl("(in exchange method)", &x, p));
      ASSIGN_GLOBAL_bulk(target, w);
    }
    /* Make sure every process has completed it's copy */
    barrier();
    /* Everyone, except 0, copy back the entire array
     * (could use broadcast here if all arrays local and we had an anonymous
     * bulk broadcast) */
    if (myproc != 0) {
      if (DIRECTLY_ADDRESSABLE(x.A))
        global_to_local_copy(root_array, LOCAL_PART(x.A), PROCS);
      else {
        /* stage through a temporary local buffer */
        T *buf = (T *) ti_malloc_atomic_huge(sizeof(T)*PROCS);
        global_to_local_copy(root_array, buf, PROCS);
        local_to_global_copy(buf, gp_array, PROCS);
        ti_free(buf);
      }
    }
  }
#endif
  /*
   * Ensure that every process is done writing into its respective
   * array before continuing.  Otherwise, one process can see
   * another process's array in a partially-written state.
   */
  barrier();
  ti_srcpos_unfreeze();
}
#endif /* assignable */

/* ------------------------------------------------------------------------------------ */
#if assignable
/* This is a "copy descriptor", which encapsulates the domain and array
   information a remote node will need to pack array data into a buffer. */
typedef struct {
  ti_ARRAY x;      /* Array involved in the copying. */
  ti_RECTDOMAIN R; /* The subarray that will actually be copied. */
} copy_desc_t;
/* Note that this name is actually a macro, to preserve uniqueness among
   different array types. */

/* Takes the given copy descriptor and returns a buffer that contains the
   desired elements of the array.  The array_data_size argument is an output
   parameter.
   If "allocated" is a non-null address, the data will be copied into that
   provided buffer
   if it is null, a new buffer will be allocated and returned
   if it is (void *)-1, then the new buffer will be allocated as handlersafe,
   uncollectable memory */
#define RETURN(x) do { ti_srcpos_unfreeze(); return (x); } while (0)
T *PACK_METHOD(void *copy_desc_ptr, int *array_data_size, void *allocated) {
  ti_ARRAY x;
  ti_RECTDOMAIN R;
  int i, lo[N], hi[N], stride[N], size, idx;
  ti_POINT p, q, s;
  T *buf;
  copy_desc_t copy_desc = *(copy_desc_t *)copy_desc_ptr;
  R = copy_desc.R;
  x = copy_desc.x;
  ti_srcpos_freeze();
  if (RECTDOMAIN_ISNULL(R)) {
    *array_data_size = 0;
    RETURN(allocated);
  }
  p = RECTDOMAIN_MIN(R);
  q = RECTDOMAIN_UPB(R);
  s = RECTDOMAIN_STRIDE(R);
  for (i = 0; i < N; i++) {
    lo[i] = POINT_INDEX(p, i + 1);
    hi[i] = POINT_INDEX(q, i + 1) - 1; /* inclusive upper bound */
    stride[i] = POINT_INDEX(s, i + 1);
  }
  size = RECTDOMAIN_SIZE(R) * sizeof(T);
#ifdef USE_GC_NONE
  if (allocated == NULL || allocated == (void*)-1) buf = (T *)ti_malloc_atomic_huge(size);
#else
  if (allocated == NULL) buf = (T *)ti_malloc_atomic_huge(size);
  else if (allocated == (void*)-1) buf = ti_malloc_handlersafe(size);
#endif
  else buf = allocated;
  /* 2D case */
#if (N == 2)
  /* Two checks:
     1. check if elements are contiguous along the last dimension
        if contiguous, sidefactor[N-1] should be 1
     2. check if the stride of the domain of the array is the same as the
        stride of the intersection domain */
  if ((x.sideFactors[1] == 1) && (x.stride[1] == stride[1])){
    int xIncrement;
    int numOfRows;
    int stripSize;
    int stripSizeInBytes;
    T *src;
    T *bufTmp;
    int i;
#if GLOBAL_ARRAY
    src = LOCAL_PART(x.A) + ti_convinl("pack()", &x, p);
#else
    src = x.A + ti_convinl("pack()", &x, p);
#endif
    xIncrement = x.sideFactors[0] * PFAST_DIVIDE(stride[0],x.stride[0]); /* divide by stride[0], because stride[0] is a multiple of x.stride[0] */
    numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1;
    stripSize = PFAST_DIVIDE((hi[1] - lo[1]),x.stride[1]) + 1;
    stripSizeInBytes = stripSize * sizeof(T);
    bufTmp = buf;
    /* copy one contiguous row-strip at a time */
    for (i = 0; i < numOfRows; i++){
      memcpy(bufTmp, src, stripSizeInBytes);
      bufTmp += stripSize;
      src += xIncrement;
    }
    *array_data_size = size;
    RETURN(buf);
  }
#endif
#if (N == 3)
  /* 3D analogue of the strip-copy fast path above */
  if ((x.sideFactors[2] == 1) && (x.stride[2] == stride[2])){
    int xIncrement;
    int yIncrement;
    int numOfRows;
    int numOfCols;
    int stripSize;
    int stripSizeInBytes;
    T *src;
    T *rowStartPtr;
    T *bufTmp;
    int i, j;
#if GLOBAL_ARRAY
    src = LOCAL_PART(x.A) + ti_convinl("pack()", &x, p);
#else
    src = x.A + ti_convinl("pack()", &x, p);
#endif
    xIncrement = x.sideFactors[0] * PFAST_DIVIDE(stride[0], x.stride[0]);
    yIncrement = x.sideFactors[1] * PFAST_DIVIDE(stride[1], x.stride[1]);
    numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1;
    numOfCols = PFAST_DIVIDE((hi[1] - lo[1]),stride[1]) + 1;
    stripSize = PFAST_DIVIDE((hi[2] - lo[2]),x.stride[2]) + 1;
    stripSizeInBytes = stripSize * sizeof(T);
    bufTmp = buf;
    for (i = 0; i < numOfRows; i++){
      rowStartPtr = src;
      for (j = 0; j < numOfCols; j++){
        memcpy(bufTmp, src, stripSizeInBytes);
        bufTmp += stripSize;
        src += yIncrement;
      }
      src = rowStartPtr + xIncrement;
    }
    *array_data_size = size;
    RETURN(buf);
  }
#endif
  /* general (non-contiguous) case: element-by-element traversal */
  idx = 0;
  forall(e, x, lo, hi, stride, deref(buf[idx++],e);, DEREF_LOCAL(buf[idx++],e););
  *array_data_size = size;
  RETURN(buf);
}
#undef RETURN

/* Unpacks the given buffer into the array
   with the given copy descriptor. */
/* local "return" is redefined so every exit unfreezes the srcpos state */
#define return do { ti_srcpos_unfreeze(); return; } while (0)
void UNPACK_METHOD(copy_desc_t *copy_desc_ptr, T *buf) {
  /* NOTE(review): 'size' and 'tmp' below appear to be unused in this
   * function - candidates for removal, verify before touching. */
  int i, lo[N], hi[N], stride[N], size, idx;
  ti_POINT p, q, s;
  ti_ARRAY x = copy_desc_ptr->x;
  ti_RECTDOMAIN R = copy_desc_ptr->R;
  T tmp;
  ti_srcpos_freeze();
  assertExists(x, "(in unpack method)");
  if (RECTDOMAIN_ISNULL(R)) return;
  p = RECTDOMAIN_MIN(R);
  q = RECTDOMAIN_UPB(R);
  s = RECTDOMAIN_STRIDE(R);
  for (i = 0; i < N; i++) {
    lo[i] = POINT_INDEX(p, i + 1);
    hi[i] = POINT_INDEX(q, i + 1) - 1; /* inclusive upper bound */
    stride[i] = POINT_INDEX(s, i + 1);
  }
  /* 2D case */
#if (N == 2)
  /* Two checks:
     1. check if elements are contiguous along the last dimension
        if contiguous, sidefactor[N-1] should be 1
     2. check if the stride of the domain of the array is the same as the
        stride of the intersection domain */
  if ((x.sideFactors[1] == 1) && (x.stride[1] == stride[1])){
    int xIncrement;
    int numOfRows;
    int stripSize;
    int stripSizeInBytes;
    T *dest;
    int i;
#if GLOBAL_ARRAY
    dest = LOCAL_PART(x.A) + ti_convinl("unpack()", &x, p);
#else
    dest = x.A + ti_convinl("unpack()", &x, p);
#endif
    xIncrement = x.sideFactors[0] * PFAST_DIVIDE(stride[0], x.stride[0]);
    numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1;
    stripSize = PFAST_DIVIDE((hi[1] - lo[1]),x.stride[1]) + 1;
    stripSizeInBytes = stripSize * sizeof(T);
    /* copy one contiguous row-strip at a time */
    for (i = 0; i < numOfRows; i++){
      memcpy(dest, buf, stripSizeInBytes);
      buf += stripSize;
      dest += xIncrement;
    }
    return;
  }
#endif
#if (N == 3)
  /* 3D analogue of the strip-copy fast path above */
  if ((x.sideFactors[2] == 1) && (x.stride[2] == stride[2])){
    int xIncrement;
    int yIncrement;
    int numOfRows;
    int numOfCols;
    int stripSize;
    int stripSizeInBytes;
    T *dest;
    T *rowStartPtr;
    int i, j;
#if GLOBAL_ARRAY
    dest = LOCAL_PART(x.A) + ti_convinl("unpack()", &x, p);
#else
    dest = x.A + ti_convinl("unpack()", &x, p);
#endif
    xIncrement = x.sideFactors[0] * PFAST_DIVIDE(stride[0],x.stride[0]);
    yIncrement = x.sideFactors[1] * PFAST_DIVIDE(stride[1],x.stride[1]);
    numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1;
    numOfCols = PFAST_DIVIDE((hi[1] - lo[1]),stride[1]) + 1;
    stripSize = PFAST_DIVIDE((hi[2] - lo[2]),x.stride[2]) + 1;
    stripSizeInBytes = stripSize * sizeof(T);
    for (i = 0; i < numOfRows; i++){
      rowStartPtr = dest;
      for (j = 0; j < numOfCols; j++){
        memcpy(dest, buf, stripSizeInBytes);
        buf += stripSize;
        dest += yIncrement;
      }
      dest = rowStartPtr + xIncrement;
    }
    return;
  }
#endif
  idx = 0;
#if 0
  forall(e, x, lo, hi, stride, weak_assign(e, buf[idx++]););
  ti_sync(); /* this is bogus - can't run ti_sync() within the context of an AM handler */
#else
  assert(DIRECTLY_ADDRESSABLE(x.A)); /* unpack can _only_ be used to unpack data into a local array */
  forall(e, x, lo, hi, stride, assign(e, buf[idx++]);, ASSIGN_LOCAL(e, buf[idx++]););
#endif
}
#undef return
#endif /* assignable */

/* ------------------------------------------------------------------------------------ */
/* Array-copy tracing: enabled in traced or debug builds.  _DEBUG_COPY prints
 * a per-copy diagnostic when TI_DEBUG_ARRAY_COPY is set in the environment,
 * and always feeds the message to the trace log.  sz == 0 means "compute the
 * size lazily from R"; sz == (size_t)-1 means "size unknown/not applicable". */
#if defined(TI_TRACE) || !defined(NDEBUG)
#define DEBUG_COPY_ENABLED 1
const char *debugArrayCopy;
#define _DEBUG_COPY(args,sz) do { \
    size_t _ti_copy_sz = (sz); \
    if (_ti_copy_sz == 0) { /* force eval lazily computed sz */ \
      _ti_copy_sz = RECTDOMAIN_SIZE(R) * sizeof(T); \
    } else if (_ti_copy_sz == (size_t)-1) { \
      _ti_copy_sz = 0; \
    } \
    if_pf (debugArrayCopy == 0) \
      debugArrayCopy = getenvMaster("TI_DEBUG_ARRAY_COPY")+1; \
    if_pf (debugArrayCopy != (char*)1) { \
      const char *_file = NULL; unsigned int _line = 0; \
      ti_get_srcpos(&_file, &_line); \
      printf("%i: ", (int)MYPROC); \
      if (_file) printf("[%s:%i] ", (_file?_file:""), _line); \
      printf args; \
      printf("\n"); fflush(stdout); \
    } \
    ti_trace_printf(args); \
  } while(0)
#else
#define DEBUG_COPY_ENABLED 0
#define _DEBUG_COPY(args,sz) ((void)0)
#endif
#define DEBUG_COPY(str, sz) \
  _DEBUG_COPY(("TI_ARRAY_COPY: (%llu bytes) %s %s", (unsigned long long)_ti_copy_sz, \
               _STRINGIFY(ti_ARRAY), str), sz)
#define FAILED_NONBLOCKING_WARNING() \
  _DEBUG_COPY(("TI_ARRAY_COPY: Next TiArray.copy() was issued as non-blocking %s," \
               " but was forced to be fully blocking", \
               (handle==HANDLE_NBI?"NBI":"NB")),-1)

/* a smart memcpy operation that correctly handles overlap */
/* NOTE(review): the disjointness test below compares pointers into what may
 * be unrelated objects, which is formally undefined in ISO C (C99 6.5.8p5);
 * it works on flat-address-space targets.  A uintptr_t-based comparison
 * would be strictly conforming - consider if <stdint.h> is available. */
#ifndef smart_memcpy_defined
#define smart_memcpy_defined
static void smart_memcpy(void *dest, void *src, int num_bytes) {
  assert(num_bytes >= 0);
  if (num_bytes == 0) return;
  else {
    /* use the cheaper memcpy only when the regions are provably disjoint */
    if (((jbyte *)src) + num_bytes <= ((jbyte *)dest) ||
        ((jbyte *)dest) + num_bytes <= ((jbyte *)src))
      memcpy(dest, src, num_bytes);
    else memmove(dest, src, num_bytes); /* handles overlap correctly */
  }
}
#endif

/* ------------------------------------------------------------------------------------ */
#if assignable
/* local "return" is redefined so every exit unfreezes the srcpos state */
#define return do { ti_srcpos_unfreeze(); return; } while (0)
/* Copy the elements of y into x over the intersection of their domains.
 * handle selects blocking (NULL), NBI, or NB non-blocking communication. */
extern void COPYINNER_METHOD(const ti_ARRAY x, const ti_ARRAY y, T6Handle4lang2ti *handle) {
  ti_RECTDOMAIN R;
  size_t R_numelements = 0;
  ti_srcpos_freeze();
  /* safety checks */
  assertExists(x, "(in copy method)");
  assertExists(y, "(in copy method)");
  REGION_SAFETY_CHECK(x, y, "(in copy method)");
  /* check for a simple case, where both arrays are the same array (possibly
   * restricted) - this is always an identity transformation, regardless of
   * the rectdomain intersection */
  if (EQUAL(x.A, y.A)) {
    /* we need to check not only the array data ptr, but also the base,
     * stride and sidefactors (assumes they are laid out consecutively) */
    assert(&(x.base[0]) == &(x.stride[0])+N && (jint*)&(x.sideFactors[0]) == &(x.stride[0])+2*N);
    if (!memcmp(x.stride, y.stride, sizeof(x.stride)+sizeof(x.base)+sizeof(x.sideFactors))) {
      DEBUG_COPY("skipping identity copy of array to itself", -1);
      return;
    }
  }
  { /* faster check for empty copy - try to avoid expensive intersection logic */
#if (N <= 3)
    ti_POINT const xmin = RECTDOMAIN_MIN(x.domain);
    ti_POINT const xupb = RECTDOMAIN_UPB(x.domain);
    ti_POINT const ymin = RECTDOMAIN_MIN(y.domain);
    ti_POINT const yupb = RECTDOMAIN_UPB(y.domain);
    if (POINTN_GET(xupb,0) <= POINTN_GET(ymin,0) || POINTN_GET(yupb,0) <= POINTN_GET(xmin,0)
#if N > 1
        || POINTN_GET(xupb,1) <= POINTN_GET(ymin,1) || POINTN_GET(yupb,1) <= POINTN_GET(xmin,1)
#endif
#if N > 2
        ||
POINTN_GET(xupb,2) <= POINTN_GET(ymin,2) || POINTN_GET(yupb,2) <= POINTN_GET(xmin,2) #endif ) { DEBUG_COPY("skipping empty copy (non-overlapping domains)", -1); return; /* empty copy */ } if (POINTN_GET(RECTDOMAIN_STRIDE(x.domain),0) == 1 && POINTN_GET(RECTDOMAIN_STRIDE(y.domain),0) == 1 #if N > 1 && POINTN_GET(RECTDOMAIN_STRIDE(x.domain),1) == 1 && POINTN_GET(RECTDOMAIN_STRIDE(y.domain),1) == 1 #endif #if N > 2 && POINTN_GET(RECTDOMAIN_STRIDE(x.domain),2) == 1 && POINTN_GET(RECTDOMAIN_STRIDE(y.domain),2) == 1 #endif ) { /* strides are equal, so do a quick unit-stride intersection */ R = x.domain; if (POINTN_GET(ymin,0) > POINTN_GET(xmin,0)) POINTN_GET(RECTDOMAIN_MIN(R),0) = POINTN_GET(ymin,0); if (POINTN_GET(yupb,0) < POINTN_GET(xupb,0)) POINTN_GET(RECTDOMAIN_UPB(R),0) = POINTN_GET(yupb,0); #if N > 1 if (POINTN_GET(ymin,1) > POINTN_GET(xmin,1)) POINTN_GET(RECTDOMAIN_MIN(R),1) = POINTN_GET(ymin,1); if (POINTN_GET(yupb,1) < POINTN_GET(xupb,1)) POINTN_GET(RECTDOMAIN_UPB(R),1) = POINTN_GET(yupb,1); #endif #if N > 2 if (POINTN_GET(ymin,2) > POINTN_GET(xmin,2)) POINTN_GET(RECTDOMAIN_MIN(R),2) = POINTN_GET(ymin,2); if (POINTN_GET(yupb,2) < POINTN_GET(xupb,2)) POINTN_GET(RECTDOMAIN_UPB(R),2) = POINTN_GET(yupb,2); #endif } else #endif { R = RECTDOMAIN_INTERSECTION(y.domain, x.domain); if (RECTDOMAIN_ISNULL(R)) { DEBUG_COPY("skipping empty copy (disjoint or non-overlapping domains)", -1); return; /* empty copy */ } } } #if 0 /* DOB: this optimization is still under development */ #if GLOBAL_ARRAY if (TO_BOX(x.A) == TO_BOX(y.A)) #endif { /* check for the case where one array is a sub-array of another (e.g. both arrays are slices of a super-array along different dimensions) where copy would be an identity transformation This may be too rare to be worth checking for... 
*/ ti_POINT dommin = RECTDOMAIN_MIN(R); int xminoffset = ti_convinl("copy()", &x, dommin); int yminoffset = ti_convinl("copy()", &y, dommin); /* first see if the initial points are located at the same memory address */ #if GLOBAL_ARRAY if (TO_LOCAL(x.A)+xminoffset == TO_LOCAL(y.A)+yminoffset) { #else if (x.A+xminoffset == y.A+yminoffset) { #endif /* next, check that the rest of the points will fall in the same place * (note that contiguity is not required for this) */ /* TODO: add this check - not sure how to quickly check this */ DEBUG_COPY("skipping identity copy of overlapping sub-array region"); return; } } #endif #if !ELEMENTS_ARE_ATOMIC && GLOBAL_ARRAY #if defined(USE_DISTRIBUTED_GC) && !defined(USE_GC_NONE) /* Distributed GC: we may have some escaping pointers to handle * Happens if the elements contain global pointers and the elements move across boxes */ if (!DIRECTLY_ADDRESSABLE(x.A) && DIRECTLY_ADDRESSABLE(y.A)) { /* "pushing" some global pointers */ int lo[N], hi[N], stride[N]; int i; ti_POINT const p = RECTDOMAIN_MIN(R); ti_POINT const q = RECTDOMAIN_UPB(R); ti_POINT const s = RECTDOMAIN_STRIDE(R); for (i = 0; i < N; i++) { lo[i] = POINT_INDEX(p, i + 1); hi[i] = POINT_INDEX(q, i + 1) - 1; stride[i] = POINT_INDEX(s, i + 1); } if (sizeof(T) == sizeof(jGPointer)) { /* elements must be global pointers */ forall(e, y, lo, hi, stride, GC_PTR_ESC(TO_LOCAL(e), 1);, GC_PTR_ESC(e, 1);); } else { /* elements are immutables containing globals - hash everything */ forall(e, y, lo, hi, stride, GC_PTR_ESC_ALL_GPS(TO_LOCAL(e), sizeof(T));, GC_PTR_ESC_ALL_GPS(e, sizeof(T));); } } else if (BOX_PART(x.A) != BOX_PART(y.A)) { /* "pulling" some global pointers */ /* TODO: need to inform the source - * this is currently taken care of conservatively by the AM request handler for bulk gets * but we may be able to improve that somewhat with the type info available here */ } #endif #endif { #define DONT_KNOW (-1) int iscontiguousx = DONT_KNOW; /* 1, 0 or DONT_KNOW indication of 
x,y contiguity over R */ int iscontiguousy = DONT_KNOW; PTR_TO_T xdata; /* pointer to the start of the array data in R, valid iff iscontiguousx,y == 1 */ PTR_TO_T ydata; /* First we check for the fastest, simplest (and hopefully common) case - A) both arrays are contiguous in linear memory over the elements in the domain intersection, and B) they store the corresponding elements of that region in the same order in linear memory which implies we can safely do a bulk copy of the data region Property B can be checked more efficiently, so start with that. */ /* Property B) we need to check the elements are stored in the same order. Because contiguity is being checked separately, we can quickly check for correct element ordering by comparing side factors - if they are unequal then either we have a row length difference in some dimension, or the dimensional ordering has been permuted (either due to a permutation or negative injection), which implies either a violation of property A or B (respectively) iff the intersection domain transcends more than a single row in the given dimension (ie iff the region is non-singular in the dimension where the sideFactors mismatch) */ /* check for a sideFactor mismatch along a non-singular dimension */ #if N==1 if ((x.sideFactors[0] == y.sideFactors[0] || POINTN_GET(RECTDOMAIN_UPB(R),0) == POINTN_GET(RECTDOMAIN_MIN(R),0) + 1) && #elif N==2 if ((x.sideFactors[0] == y.sideFactors[0] || POINTN_GET(RECTDOMAIN_UPB(R),0) == POINTN_GET(RECTDOMAIN_MIN(R),0) + 1) && (x.sideFactors[1] == y.sideFactors[1] || POINTN_GET(RECTDOMAIN_UPB(R),1) == POINTN_GET(RECTDOMAIN_MIN(R),1) + 1) && #elif N==3 if ((x.sideFactors[0] == y.sideFactors[0] || POINTN_GET(RECTDOMAIN_UPB(R),0) == POINTN_GET(RECTDOMAIN_MIN(R),0) + 1) && (x.sideFactors[1] == y.sideFactors[1] || POINTN_GET(RECTDOMAIN_UPB(R),1) == POINTN_GET(RECTDOMAIN_MIN(R),1) + 1) && (x.sideFactors[2] == y.sideFactors[2] || POINTN_GET(RECTDOMAIN_UPB(R),2) == POINTN_GET(RECTDOMAIN_MIN(R),2) + 1) && #else 
int i; int matchedorder = 1; for (i = 0; i < N; i++) { if (x.sideFactors[i] != y.sideFactors[i] && POINT_INDEX(RECTDOMAIN_UPB(R),i+1) > POINT_INDEX(RECTDOMAIN_MIN(R),i+1) + 1) { matchedorder = 0; break; } } if (matchedorder && #endif (iscontiguousx = INTERNAL_ISCONTIGUOUS_METHOD(&x, &R, &R_numelements, &xdata)) && (iscontiguousy = INTERNAL_ISCONTIGUOUS_METHOD(&y, &R, &R_numelements, &ydata))) { /* safe to use a direct region copy from ydata to xdata */ /* get size of contiguous region */ size_t size = sizeof(T) * R_numelements; #if GLOBAL_ARRAY #ifdef MEMORY_DISTRIBUTED if (isDirectlyAddressable(x.A) && isDirectlyAddressable(y.A)) { DEBUG_COPY("bulk copying contiguous region (local <- local)", size); smart_memcpy(TO_LOCAL(xdata), TO_LOCAL(ydata), size); } else if (isDirectlyAddressable(x.A)) { if (handle == NULL) { DEBUG_COPY("bulk copying contiguous region (get: local <- remote)", size); ti_bulk_read(TO_LOCAL(xdata), TO_BOX(ydata), TO_LOCAL(ydata), size, (ELEMENTS_ARE_ATOMIC?tic_no_ptrs:tic_gp_only)); } else if (!ELEMENTS_ARE_ATOMIC) { /* cannot use ti_bulk_read_weak here - otherwise we'd need to * change the behavior of Handle.syncNBI() to include a ti_sync() on GASNet * (currently does not need it). Non-atomic gets generally give crappy * performance anyhow due to GC issues, so they're probably not worth * optimizing - ie the added cost to all syncNBI() is probably not worth the * little overlap we'd gain for non-atomic gets. 
*/ DEBUG_COPY("NB/NBI array get, but elements non-atomic - using blocking bulk copying contiguous region (get: local <- remote)", size); ti_bulk_read(TO_LOCAL(xdata), TO_BOX(ydata), TO_LOCAL(ydata), size, tic_gp_only); } else if (handle == HANDLE_NBI) { DEBUG_COPY("NBI bulk copying contiguous region (get: local <- remote)", size); TI_GET_NBI_BULK_NOPTRS(TO_LOCAL(xdata), TO_BOX(ydata), TO_LOCAL(ydata), size); } else { DEBUG_COPY("NB bulk copying contiguous region (get: local <- remote)", size); TI_GET_NB_BULK_NOPTRS(handle, TO_LOCAL(xdata), TO_BOX(ydata), TO_LOCAL(ydata), size); } } else if (isDirectlyAddressable(y.A)) { if (handle == NULL) { DEBUG_COPY("bulk copying contiguous region (put: remote <- local)", size); ti_bulk_write(TO_BOX(xdata), TO_LOCAL(xdata), TO_LOCAL(ydata), size); } else if (handle == HANDLE_NBI){ DEBUG_COPY("NBI bulk copying contiguous region (put: remote <- local)", size); TI_PUT_NBI_BULK(TO_BOX(xdata), TO_LOCAL(xdata), TO_LOCAL(ydata), size); } else { DEBUG_COPY("NB bulk copying contiguous region (put: remote <- local)", size); TI_PUT_NB_BULK(handle, TO_BOX(xdata), TO_LOCAL(xdata), TO_LOCAL(ydata), size); } } else { unsigned char *buf; if (handle != NULL) FAILED_NONBLOCKING_WARNING(); DEBUG_COPY("bulk copying contiguous region (3rd party: remote <- remote)", size); buf = (unsigned char *) ti_malloc_atomic_huge(size); ti_bulk_read(buf, TO_BOX(ydata), TO_LOCAL(ydata), size, (ELEMENTS_ARE_ATOMIC?tic_no_ptrs:tic_gp_only)); ti_bulk_write(TO_BOX(xdata), TO_LOCAL(xdata), buf, size); ti_free(buf); } #else /* this is shared memory */ assert(isDirectlyAddressable(xdata) && isDirectlyAddressable(ydata)); DEBUG_COPY("bulk copying contiguous region", size); smart_memcpy(TO_LOCAL(xdata), TO_LOCAL(ydata), size); #endif #else /* array is local - all we need is memcpy */ DEBUG_COPY("bulk copying contiguous region", size); smart_memcpy(xdata, ydata, size); #endif return; } #if GLOBAL_ARRAY && defined(COMM_AM2) if (!isDirectlyAddressable(x.A) || 
!isDirectlyAddressable(y.A)) { /* we have special AM functions for scatter-gather to optimize non-contiguous copies across memory spaces * PACK_METHOD is gather, UNPACK_METHOD is scatter */ copy_desc_t copy_desc; T *buf; int isrowmajorx, isrowmajory; copy_desc.R = R; /* ensure we have full information - * we already know that at least one of the predicates * (iscontiguousx, iscontiguousy, matchedorder) is false, but * we may still be able to take advantage of one array's contiguity */ if (iscontiguousx == DONT_KNOW) iscontiguousx = INTERNAL_ISCONTIGUOUS_METHOD(&x, &R, &R_numelements, &xdata); if (iscontiguousy == DONT_KNOW) iscontiguousy = INTERNAL_ISCONTIGUOUS_METHOD(&y, &R, &R_numelements, &ydata); /* we actually need something stronger than contiguity and * subtlely different than matchedorder - * need to check the elements are stored in positive row-major order, because * our packing algorithm currently always packs to row-major order according * to the intersection rectdomain. * We could make this more aggressive by parameterizing the packing * algorithm to pack into an arbitrary dimensional ordering and direction, * which would allow us to always pack/unpack with the format matching the * contiguous array (even if that array has been permuted or negatively injected) */ #if (N == 1) isrowmajorx = (x.sideFactors[0] > 0); isrowmajory = (y.sideFactors[0] > 0); #elif (N == 2) isrowmajorx = (x.sideFactors[0] > x.sideFactors[1]) && (x.sideFactors[1] > 0); isrowmajory = (y.sideFactors[0] > y.sideFactors[1]) && (y.sideFactors[1] > 0); #elif (N == 3) isrowmajorx = (x.sideFactors[0] > x.sideFactors[1]) && (x.sideFactors[1] > x.sideFactors[2]) && (x.sideFactors[2] > 0); isrowmajory = (y.sideFactors[0] > y.sideFactors[1]) && (y.sideFactors[1] > y.sideFactors[2]) && (y.sideFactors[2] > 0); #else { int i; int lastx = 0, lasty = 0; isrowmajorx = 1; isrowmajory = 1; for (i = N-1; i >= 0; i--) { if (!(x.sideFactors[i] > lastx)) isrowmajorx = 0; if (!(y.sideFactors[i] > lasty)) 
isrowmajory = 0; lastx = x.sideFactors[i]; lasty = y.sideFactors[i]; } } #endif #if DEBUG_COPY_ENABLED { char msg[255]; if (handle != NULL) FAILED_NONBLOCKING_WARNING(); sprintf(msg,"scatter-gather AM-based copy" "(%s %scontiguous %srow-major <- %s %scontiguous %srow-major)", (isDirectlyAddressable(x.A)?"local":"remote"), (iscontiguousx?"":"non-"), (isrowmajorx?"":"non-"), (isDirectlyAddressable(y.A)?"local":"remote"), (iscontiguousy?"":"non-"), (isrowmajory?"":"non-")); DEBUG_COPY(msg, R_numelements * sizeof(T)); } #endif if (isDirectlyAddressable(x.A)) { /* remote to local copy */ int num_bytes = R_numelements * sizeof(T); if (iscontiguousx && isrowmajorx) { buf = LOCAL_PART(xdata); /* fetch directly into x */ } else { buf = (T *)ti_malloc_atomic_huge(num_bytes); } if (iscontiguousy && isrowmajory) { ti_bulk_read(buf, TO_BOX(ydata), TO_LOCAL(ydata), num_bytes, (ELEMENTS_ARE_ATOMIC?tic_no_ptrs:tic_gp_only)); } else { copy_desc.x = y; get_array((void *)PACK_METHOD, ©_desc, sizeof(copy_desc), TO_BOX(y.A), (void *)buf, ELEMENTS_ARE_ATOMIC); ti_sync(); } if (!(iscontiguousx && isrowmajorx)) { copy_desc.x = x; UNPACK_METHOD(©_desc, buf); ti_free(buf); } return; } else if (isDirectlyAddressable(y.A)) { /* local to remote copy */ int num_bytes; if (iscontiguousy && isrowmajory) { buf = LOCAL_PART(ydata); /* push directly from y */ num_bytes = R_numelements * sizeof(T); } else { copy_desc.x = y; buf = PACK_METHOD(©_desc, &num_bytes, NULL); } if (iscontiguousx && isrowmajorx) { ti_bulk_write(TO_BOX(xdata), TO_LOCAL(xdata), buf, num_bytes); } else { copy_desc.x = x; put_array((void *)UNPACK_METHOD, ©_desc, sizeof(copy_desc), buf, num_bytes, TO_BOX(x.A)); ti_sync(); } if (!(iscontiguousy && isrowmajory)) { ti_free(buf); } return; } else { /* remote to remote copy */ int num_bytes = R_numelements * sizeof(T); buf = (T *)ti_malloc_atomic_huge(num_bytes); if (iscontiguousy && isrowmajory) { ti_bulk_read(buf, TO_BOX(ydata), TO_LOCAL(ydata), num_bytes, 
(ELEMENTS_ARE_ATOMIC?tic_no_ptrs:tic_gp_only)); } else { copy_desc.x = y; get_array((void *)PACK_METHOD, ©_desc, sizeof(copy_desc), TO_BOX(y.A), (void *)buf, ELEMENTS_ARE_ATOMIC); ti_sync(); } if (iscontiguousx && isrowmajorx) { ti_bulk_write(TO_BOX(xdata), TO_LOCAL(xdata), buf, num_bytes); } else { copy_desc.x = x; put_array((void *)UNPACK_METHOD, ©_desc, sizeof(copy_desc), buf, num_bytes, TO_BOX(x.A)); ti_sync(); } ti_free(buf); return; } } #endif /* GLOBAL_ARRAY && AM2 */ { const int arraysMayOverlap = ti_mayoverlap(&x,&y,&R); int i, lo[N], hi[N], stride[N]; ti_POINT const p = RECTDOMAIN_MIN(R); ti_POINT const q = RECTDOMAIN_UPB(R); ti_POINT const s = RECTDOMAIN_STRIDE(R); for (i = 0; i < N; i++) { lo[i] = POINT_INDEX(p, i + 1); hi[i] = POINT_INDEX(q, i + 1) - 1; stride[i] = POINT_INDEX(s, i + 1); } /* if both arrays are local and last dimension is contiguous on both arrays, use memcpy along the last dimension for higher bandwidth check if elements are contiguous along the last dimension if contiguous, sidefactor[N-1] should be 1 check if the stride of the domain of the array is the same as the stride of the intersection domain */ if (DIRECTLY_ADDRESSABLE(x.A) && DIRECTLY_ADDRESSABLE(y.A) && !arraysMayOverlap && (x.sideFactors[N-1] == 1) && (y.sideFactors[N-1] == 1) && (x.stride[N-1] == stride[N-1]) && (y.stride[N-1] == stride[N-1])) { T *src; T *dst; DEBUG_COPY("local non-contiguous copy with memcpy in last dimension", R_numelements * sizeof(T)); #if GLOBAL_ARRAY src = LOCAL_PART(y.A) + ti_convinl("copy()", &y, p); dst = LOCAL_PART(x.A) + ti_convinl("copy()", &x, p); #else src = y.A + ti_convinl("copy()", &y, p); dst = x.A + ti_convinl("copy()", &x, p); #endif #if (N == 1) abort(); /* N == 1 should never make it this far */ #elif (N == 2) { int i; const int xIncrement0 = x.sideFactors[0] * PFAST_DIVIDE(stride[0],x.stride[0]); const int yIncrement0 = y.sideFactors[0] * PFAST_DIVIDE(stride[0],y.stride[0]); /* divide by stride[0], because stride[0] is a multiple 
of x.stride[0] */ const int numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1; const int stripSizeInBytes = (PFAST_DIVIDE((hi[1] - lo[1]),stride[1]) + 1) * sizeof(T); for (i = numOfRows; i; i--){ memcpy(dst, src, stripSizeInBytes); src += yIncrement0; dst += xIncrement0; } return; } #elif (N == 3) { int i, j; const int xIncrement0 = x.sideFactors[0] * PFAST_DIVIDE(stride[0], x.stride[0]); const int xIncrement1 = x.sideFactors[1] * PFAST_DIVIDE(stride[1], x.stride[1]); const int yIncrement0 = y.sideFactors[0] * PFAST_DIVIDE(stride[0], y.stride[0]); const int yIncrement1 = y.sideFactors[1] * PFAST_DIVIDE(stride[1], y.stride[1]); const int numOfRows = PFAST_DIVIDE((hi[0] - lo[0]),stride[0]) + 1; const int numOfCols = PFAST_DIVIDE((hi[1] - lo[1]),stride[1]) + 1; const int stripSizeInBytes = (PFAST_DIVIDE((hi[2] - lo[2]),stride[2]) + 1) * sizeof(T); for (i = numOfRows; i; i--){ T * const srcrowStartPtr = src; T * const dstrowStartPtr = dst; for (j = numOfCols; j; j--){ memcpy(dst, src, stripSizeInBytes); src += yIncrement1; dst += xIncrement1; } src = srcrowStartPtr + yIncrement0; dst = dstrowStartPtr + xIncrement0; } return; } #endif } /* finally, the slow, general case that handles anything */ /* Do an el-by-el copy. */ if (arraysMayOverlap) { /* areas might overlap - use an intermediate to be sure */ size_t const num_bytes = R_numelements * sizeof(T); T *buf = (T *)ti_malloc_atomic_huge(num_bytes); size_t i=0; DEBUG_COPY("element-wise generalized with-overlap naive copy", num_bytes); forall(f, y, lo, hi, stride, deref(buf[i++], f), DEREF_LOCAL(buf[i++], f)); i=0; forall(e, x, lo, hi, stride, weak_assign(e, buf[i++]), WEAK_ASSIGN_LOCAL(e, buf[i++])); ti_sync(); ti_free(buf); } else { /* areas do not overlap */ /* We're going to do an element-by-element copy - any ordering will work, * but we'll get better cache performance if we follow the memory contiguity * of one or both arrays. 
Additionally, we give preference to the contiguity * direction of the array being written, since this gets better performance * on write-back caches. As a secondary heuristic (for 3D), if both arrays have a * contiguous direction (and they differ), then those two directions should * be the inner two directions in the loop nest. * * TODO: we'd get better performance on non-contiguous arrays with a heuristic * that gives preference to the direction with the lowest sideFactor (assuming * it's small enough to fit two elements in a cache line) - the current heuristic * uses an arbitrarily chosen order when no sideFactors are 1. However, favoring the * min sidefactor entails greater heuristic overhead to calculate the right traversal. */ #define DEBUG_COPY_ORDERED(x,y,dir,dirall) do { \ char msg[255]; \ sprintf(msg,"element-wise generalized no-overlap copy, " \ "%i arrs matching cache direction %0"_STRINGIFY(N)"x", \ (x.sideFactors[dir] == 1) + (y.sideFactors[dir] == 1), dirall); \ DEBUG_COPY(msg, R_numelements * sizeof(T)); \ } while (0) #if (N == 2) { int dir = 0; T tmp; if (x.sideFactors[1] == 1) dir = 1; else if (x.sideFactors[0] == 1) dir = 0; else if (y.sideFactors[1] == 1) dir = 1; else if (y.sideFactors[0] == 1) dir = 0; else dir = 1; if (dir == 1) { DEBUG_COPY_ORDERED(x,y,1,0x01); forall2ordered2(e, x, f, y, lo, hi, stride, p, 0,1, deref(tmp, f); weak_assign(e, tmp), *e = *f); ti_sync(); } else { DEBUG_COPY_ORDERED(x,y,0,0x10); forall2ordered2(e, x, f, y, lo, hi, stride, p, 1,0, deref(tmp, f); weak_assign(e, tmp), *e = *f); ti_sync(); } } #elif (N == 3) { int dir = 0; T tmp; if (x.sideFactors[2] == 1) { if (y.sideFactors[0] == 1) dir = 0x102; else dir = 0x012; } else if (x.sideFactors[1] == 1) { if (y.sideFactors[0] == 1) dir = 0x201; else dir = 0x021; } else if (x.sideFactors[0] == 1) { if (y.sideFactors[1] == 1) dir = 0x210; else dir = 0x120; } else if (y.sideFactors[2] == 1) dir = 0x012; else if (y.sideFactors[1] == 1) dir = 0x021; else if (y.sideFactors[0] 
== 1) dir = 0x120;
else dir = 0x012;
DEBUG_COPY_ORDERED(x,y,(dir&0xF),dir);
/* Tail of the general element-wise copy path: dispatch on the chosen
 * 3-D traversal order (dir encodes the loop-nest dimension order as hex
 * digits).  CASE expands to one switch case that runs the ordered
 * element-wise copy macro for that dimension permutation. */
#define CASE(dir0,dir1,dir2) \
  case 0x##dir0##dir1##dir2: \
    forall2ordered3(e, x, f, y, lo, hi, stride, p, dir0,dir1,dir2, \
                    deref(tmp, f); weak_assign(e, tmp), \
                    *e = *f); \
    break
switch (dir) {
  CASE(0,1,2);
  CASE(1,0,2);
  CASE(0,2,1);
  CASE(2,0,1);
  CASE(1,2,0);
  CASE(2,1,0);
  default: abort();
}
#undef CASE
ti_sync();
}
#else
{ /* N > 3: naive element-wise copy, no cache-order heuristic */
  T tmp;
  DEBUG_COPY("element-wise generalized no-overlap naive copy", R_numelements * sizeof(T));
  forall2(e, x, f, y, lo, hi, stride,
          deref(tmp, f); weak_assign(e, tmp),
          *e = *f);
  ti_sync();
}
#endif
#undef DEBUG_COPY_ORDERED
}
return;
} } }
#undef return   /* 'return' was redefined earlier in the copy method; restore it */
#endif /* assignable */
#undef DEBUG_COPY
#undef _DEBUG_COPY
/* ------------------------------------------------------------------------------------ */
/* Forward declarations of methods from other template instantiations
 * (1-D arrays of T and of ti_POINT) used by the gather/scatter code below. */
ti_1DARRAY_OF_POINT_N EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,ti_POINT,1)(ti_1DARRAY_OF_POINT_N);
ti_1DARRAY_OF_T EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,T,1)(ti_1DARRAY_OF_T);
ti_1DARRAY_OF_T EXTERNAL_METHOD(CONSTRUCT,T,1)(const char *, Region, ti_RECTDOMAIN1, int);
/* NOTE(review): both GET_ARRAY_DATA_PTR declarations return PTR_TO_T; the
 * ti_POINT variant is cast to (ti_POINT*) at every call site — presumably
 * intentional, verify against the instantiation of that method. */
PTR_TO_T EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,ti_POINT,1)(ti_1DARRAY_OF_POINT_N);
PTR_TO_T EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,T,1)(ti_1DARRAY_OF_T);
void EXTERNAL_ARRAYMETHOD(copyinner,T,1)(ti_1DARRAY_OF_T, ti_1DARRAY_OF_T, T6Handle4lang2ti *);
ti_1DARRAY_OF_POINT_N DOMAIN_POINTLIST(ti_MRAD);
/* ------------------------------------------------------------------------------------ */
#if assignable
/* Gather: packedArray[i] = srcArray[ptArray[i]] for every point listed in
 * ptArray.  Builds a list of element addresses in srcArray, then either
 * dereferences them directly (src directly addressable, or no AM support)
 * or issues one sparse_gather AM operation to the source box. */
void GATHER_METHOD(const ti_ARRAY srcArray, const ti_1DARRAY_OF_T packedArray, const ti_1DARRAY_OF_POINT_N ptArray) {
  int numpts = -1;
  int src_box = BOX_PART(srcArray.A);
  ti_1DARRAY_OF_T lc_packedArray;
  ti_1DARRAY_OF_POINT_N lc_ptArray;
  ti_POINT *ptList = NULL;
  T **addrList = NULL;
  T *destList = NULL;
  /* safety checks */
  assertExists(srcArray, "(in gather method)");
  assertExists(packedArray, "(in gather method)");
  assertExists(ptArray, "(in gather method)");
  REGION_SAFETY_CHECK(packedArray, srcArray, "(in gather method)");
  numpts = m4sizemT13tiRectDomain17domains2ti(ptArray.domain);
  if (numpts == 0) return; /* nothing to do */
  addrList = (T**)ti_malloc_atomic_huge(numpts * sizeof(T*));
  /* build local & contiguous versions of params(with same domains), if necessary */
  lc_ptArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,ti_POINT,1)(ptArray);
  lc_packedArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,T,1)(packedArray);
  assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A));
  assert(DIRECTLY_ADDRESSABLE(lc_packedArray.A));
  ptList = (ti_POINT*)LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,ti_POINT,1)(lc_ptArray));
  destList = LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,T,1)(lc_packedArray));
  { /* build list of addresses and do array-bounds check */
    int i;
    T *baseAddr = LOCAL_PART(srcArray.A);
#if ((N == 1) || (N == 2) || (N == 3)) && (BOUNDS_CHECKING == 0)
    T *alteredBaseAddr;
    /* Each preprocessor branch below OPENS an if(...) whose body and else
     * follow after the #endif — the unit-stride fast path pre-folds the
     * array base into the pointer so per-point conversion is cheaper. */
#if N == 1
    if ((srcArray.stride[0] == 1) && (srcArray.sideFactors[0] == 1)){
      alteredBaseAddr = baseAddr - srcArray.base[0];
#elif N == 2
    if ((srcArray.stride[0] == 1) && (srcArray.stride[1] == 1) && (srcArray.sideFactors[1] == 1)){
      alteredBaseAddr = baseAddr - (srcArray.base[0]*srcArray.sideFactors[0] + srcArray.base[1]);
#elif N == 3
    if ((srcArray.stride[0] == 1) && (srcArray.stride[1] == 1) && (srcArray.stride[2] == 1) && (srcArray.sideFactors[2] == 1)){
      alteredBaseAddr = baseAddr - (srcArray.base[0]*srcArray.sideFactors[0] + srcArray.base[1]*srcArray.sideFactors[1] + srcArray.base[2]);
#endif
      for (i=0; i < numpts; i++) {
        TI_ARRAY_FASTBESTCONV(N, addrList[i], alteredBaseAddr, srcArray, ptList[i]);
      }
    } else{
      for (i=0; i < numpts; i++) {
        TI_ARRAY_FASTCONV(N, addrList[i], baseAddr, srcArray, ptList[i]);
      }
    }
#else /* N > 3 */
    for (i=0; i < numpts; i++) {
#if BOUNDS_CHECKING
      addrList[i] = baseAddr + ti_convinl("gather()", &srcArray, ptList[i]);
#else
      TI_ARRAY_FASTCONV(N, addrList[i], baseAddr, srcArray, ptList[i]);
#endif
    }
#endif
  }
  /* is the src simply local? (or are we missing AM?) - use a simple copy */
#ifdef COMM_AM2
  if (DIRECTLY_ADDRESSABLE(srcArray.A))
#endif
  {
    int i;
    for (i=0; i < numpts; i++) {
#if GLOBAL_ARRAY
      PTR_TO_T p;
      TO_GLOBALB(p, src_box, addrList[i]);
      deref(destList[i], p);
#else
      deref(destList[i], addrList[i]);
#endif
    }
  }
#ifdef COMM_AM2
  else { /* use AM to do a fast remote gather */
    sparse_gather(destList, (void **)addrList, src_box, numpts, sizeof(T), ELEMENTS_ARE_ATOMIC);
  }
#endif
  /* copy back & free temps */
  if (!EQUAL(lc_packedArray.A, packedArray.A)) {
    /* need to copy back result */
    EXTERNAL_ARRAYMETHOD(copyinner,T,1)(packedArray, lc_packedArray, NULL);
    assert(DIRECTLY_ADDRESSABLE(lc_packedArray.A));
    ti_free(LOCAL_PART(lc_packedArray.A));
  }
  if (!EQUAL(lc_ptArray.A, ptArray.A)) {
    assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A));
    ti_free(LOCAL_PART(lc_ptArray.A));
  }
  ti_free(addrList);
}
#endif /* assignable */
/* ------------------------------------------------------------------------------------ */
#if assignable
/* Scatter: destArray[ptArray[i]] = packedArray[i] for every point listed in
 * ptArray.  Mirror image of GATHER_METHOD: builds destination addresses,
 * then writes directly (dest directly addressable, or no AM support) or
 * issues one sparse_scatter AM operation to the destination box. */
void SCATTER_METHOD(const ti_ARRAY destArray, const ti_1DARRAY_OF_T packedArray, const ti_1DARRAY_OF_POINT_N ptArray) {
  int numpts = -1;
  int dest_box = BOX_PART(destArray.A);
  ti_1DARRAY_OF_T lc_packedArray;
  ti_1DARRAY_OF_POINT_N lc_ptArray;
  ti_POINT *ptList = NULL;
  T **addrList = NULL;
  T *srcList = NULL;
  /* safety checks */
  assertExists(destArray, "(in scatter method)");
  assertExists(packedArray, "(in scatter method)");
  assertExists(ptArray, "(in scatter method)");
  REGION_SAFETY_CHECK(destArray, packedArray, "(in scatter method)");
  numpts = m4sizemT13tiRectDomain17domains2ti(ptArray.domain);
  if (numpts == 0) return; /* nothing to do */
  addrList = (T**)ti_malloc_atomic_huge(numpts * sizeof(T*));
  /* build local & contiguous versions of params(with same domains), if necessary */
  lc_ptArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,ti_POINT,1)(ptArray);
  lc_packedArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,T,1)(packedArray);
  assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A));
  assert(DIRECTLY_ADDRESSABLE(lc_packedArray.A));
  ptList = (ti_POINT *)LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,ti_POINT,1)(lc_ptArray));
  srcList = LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,T,1)(lc_packedArray));
  { /* build list of addresses and do array-bounds check */
    int i;
    T *baseAddr = LOCAL_PART(destArray.A);
#if ((N == 1) || (N == 2) || (N == 3)) && (BOUNDS_CHECKING == 0)
    T *alteredBaseAddr;
    /* as in gather: each #if branch opens the unit-stride fast-path if() */
#if N == 1
    if ((destArray.stride[0] == 1) && (destArray.sideFactors[0] == 1)){
      alteredBaseAddr = baseAddr - destArray.base[0];
#elif N == 2
    if ((destArray.stride[0] == 1) && (destArray.stride[1] == 1) && (destArray.sideFactors[1] == 1)){
      alteredBaseAddr = baseAddr - (destArray.base[0]*destArray.sideFactors[0] + destArray.base[1]);
#elif N == 3
    if ((destArray.stride[0] == 1) && (destArray.stride[1] == 1) && (destArray.stride[2] == 1) && (destArray.sideFactors[2] == 1)){
      alteredBaseAddr = baseAddr - (destArray.base[0]*destArray.sideFactors[0] + destArray.base[1]*destArray.sideFactors[1] + destArray.base[2]);
#endif
      for (i=0; i < numpts; i++) {
        TI_ARRAY_FASTBESTCONV(N, addrList[i], alteredBaseAddr, destArray, ptList[i]);
      }
    } else{
      for (i=0; i < numpts; i++) {
        TI_ARRAY_FASTCONV(N, addrList[i], baseAddr, destArray, ptList[i]);
      }
    }
#else /* N > 3 */
    for (i=0; i < numpts; i++) {
#if BOUNDS_CHECKING
      /* NOTE(review): error-location string says "gather()" — looks like a
       * copy-paste from GATHER_METHOD; presumably should read "scatter()".
       * Affects only the bounds-violation message text — confirm. */
      addrList[i] = baseAddr + ti_convinl("gather()", &destArray, ptList[i]);
#else
      TI_ARRAY_FASTCONV(N, addrList[i], baseAddr, destArray, ptList[i]);
#endif
    }
#endif
  }
  /* is the dest simply local? (or are we missing AM?) - use a simple copy */
#ifdef COMM_AM2
  if (DIRECTLY_ADDRESSABLE(destArray.A))
#endif
  {
    int i;
    for (i=0; i < numpts; i++) {
#if GLOBAL_ARRAY
      PTR_TO_T p;
      TO_GLOBALB(p, dest_box, addrList[i]);
      weak_assign(p, srcList[i]);
#else
      weak_assign(addrList[i], srcList[i]);
#endif
    }
    ti_sync(); /* weak_assign writes may be asynchronous - complete them */
  }
#ifdef COMM_AM2
  else { /* use AM to do a fast remote scatter */
    sparse_scatter((void **)addrList, srcList, dest_box, numpts, sizeof(T), ELEMENTS_ARE_ATOMIC);
  }
#endif
  /* free temps */
  if (!EQUAL(lc_packedArray.A,packedArray.A)) {
    assert(DIRECTLY_ADDRESSABLE(lc_packedArray.A));
    ti_free(LOCAL_PART(lc_packedArray.A));
  }
  if (!EQUAL(lc_ptArray.A,ptArray.A)) {
    assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A));
    ti_free(LOCAL_PART(lc_ptArray.A));
  }
  ti_free(addrList);
}
#endif /* assignable */
/* ------------------------------------------------------------------------------------ */
#if assignable
/* Copy restricted to a rectangular domain: copies y's elements within
 * (y.domain INTERSECT R) into x, by shrinking a descriptor copy of y
 * and delegating to the general COPY_METHOD. */
void COPY_WITHRECTDOMAIN_METHOD(const ti_ARRAY x, const ti_ARRAY y, const ti_RECTDOMAIN R) {
  ti_ARRAY n = y;
  assertExists(y, "(in copy_withrectdomain method)");
  n.domain = RECTDOMAIN_INTERSECTION(y.domain, R);
  COPY_METHOD(x, n);
}
#endif /* assignable */
/* ------------------------------------------------------------------------------------ */
#if assignable
/* Copy restricted to an explicit point list: x[p] = y[p] for each p in
 * ptArray.  Fast path does direct element copies when both arrays are
 * directly addressable and cannot overlap; otherwise gathers y into a
 * temporary packed array and scatters it into x. */
void COPY_WITHPTARRAY_METHOD(const ti_ARRAY x, const ti_ARRAY y, const ti_1DARRAY_OF_POINT_N ptArray) {
  int numpts = -1;
  ti_POINT *ptList = NULL;
  ti_1DARRAY_OF_POINT_N lc_ptArray;
  ti_RECTDOMAIN R;
  /* safety checks */
  assertExists(x, "(in copy_withptarray method)");
  assertExists(y, "(in copy_withptarray method)");
  assertExists(ptArray, "(in copy_withptarray method)");
  REGION_SAFETY_CHECK(x, y, "(in copy_withptarray method)");
  numpts = m4sizemT13tiRectDomain17domains2ti(ptArray.domain);
  if (numpts == 0) return; /* nothing to do */
  /* build local & contiguous versions of params(with same domains), if necessary */
  lc_ptArray = EXTERNAL_ARRAYMETHOD(makeLocalAndContiguous,ti_POINT,1)(ptArray);
  ptList = (ti_POINT*)LOCAL_PART(EXTERNAL_METHOD(GET_ARRAY_DATA_PTR,ti_POINT,1)(lc_ptArray));
  /* comma expression: compute the intersection into R, then test overlap */
  if (DIRECTLY_ADDRESSABLE(x.A) && DIRECTLY_ADDRESSABLE(y.A) &&
      (R=RECTDOMAIN_INTERSECTION(y.domain, x.domain), !ti_mayoverlap(&x,&y,&R))) {
    /* everything is local and non-overlapping - use a direct copy */
    T *baseAddrX = LOCAL_PART(x.A);
    T *baseAddrY = LOCAL_PART(y.A);
    int i;
    for (i=0; i < numpts; i++) {
      T *pXelem = baseAddrX + ti_convinl("(in copy_withptarray method)", &x, ptList[i]);
      T *pYelem = baseAddrY + ti_convinl("(in copy_withptarray method)", &y, ptList[i]);
      *pXelem = *pYelem;
    }
  } else {
    /* use scatter gather through a temp */
    ti_1DARRAY_OF_T packedArray = EXTERNAL_METHOD(CONSTRUCT,T,1)("(in copy_withptarray method)", NULL, ptArray.domain, 0);
    GATHER_METHOD(y, packedArray, lc_ptArray);
    SCATTER_METHOD(x, packedArray, lc_ptArray);
    assert(DIRECTLY_ADDRESSABLE(packedArray.A));
    ti_free(LOCAL_PART(packedArray.A));
  }
  if (!EQUAL(lc_ptArray.A, ptArray.A)) {
    assert(DIRECTLY_ADDRESSABLE(lc_ptArray.A));
    ti_free(LOCAL_PART(lc_ptArray.A));
  }
}
#endif /* assignable */
/* ------------------------------------------------------------------------------------ */
#if assignable
/* Copy restricted to a general (multi-rectangle) domain.  Currently
 * flattens the domain to a point list and delegates to the ptArray copy. */
void COPY_WITHDOMAIN_METHOD(const ti_ARRAY x, const ti_ARRAY y, const ti_DOMAIN dom) {
  /* safety checks */
  assertExists(x, "(in copy_withdomain method)");
  assertExists(y, "(in copy_withdomain method)");
  REGION_SAFETY_CHECK(x, y, "(in copy_withdomain method)");
  if (TO_LOCAL(dom) == NULL) {
    fprintf(stderr, "fatal error: attempt to use uninitialized domain in copy_withdomain method\n");
    abort();
  }
#if 0
  ti_1DARRAY_OF_RECTDOMAIN_N rdArray = DOMAIN_RECTDOMAINLIST(dom);
#endif
  /* for now, simply punt to the ptArray method
   * future optimizations: detect strongly clustered domains and do a
   * rectangle-by-rectangle copy using the copy_withrectdomain method */
  {
    /*ti_1DARRAY_OF_POINT_N ptArray = DOMAIN_POINTLIST((PT19tiMultiRectADomain17domains2ti)dom);*/
    ti_1DARRAY_OF_POINT_N ptArray = DOMAIN_POINTLIST(*(ti_MRAD*)&dom);
    COPY_WITHPTARRAY_METHOD(x, y, ptArray);
  }
}
#endif /* assignable */
/* ------------------------------------------------------------------------------------ */
#if assignable
/*
 * x.set(w) is equivalent to:
 * foreach (p in x.domain()) x[p] = w;
 */
void SET_METHOD(const ti_ARRAY x, T w) {
  ti_RECTDOMAIN R = x.domain;
  ti_srcpos_freeze();
  if (!RECTDOMAIN_ISNULL(R)) {
    int i;
    PTR_TO_T pdata;
    size_t R_numelements = 0;
    char *p = (char *)&w;
    assertExists(x, "(in set method)");
    for (i=1; i < sizeof(T); i++) { /* see if all bytes are the same */
      if (*p != p[i]) break;
    }
    if (i == sizeof(T) && DIRECTLY_ADDRESSABLE(x.A) &&
        INTERNAL_ISCONTIGUOUS_METHOD(&x, &x.domain, &R_numelements, &pdata)) {
      /* use memset when possible */
      memset(LOCAL_PART(pdata), *p, sizeof(T) * R_numelements);
    } else {
      /* general path: element-wise assignment over the full domain.
       * (inner i and p deliberately shadow the outer declarations) */
      jint lo[N], hi[N], stride[N], i;
      ti_POINT const p = RECTDOMAIN_MIN(R);
      ti_POINT const q = RECTDOMAIN_UPB(R);
      ti_POINT const s = RECTDOMAIN_STRIDE(R);
      for (i = 0; i < N; i++) {
        lo[i] = POINT_INDEX(p, i + 1);
        hi[i] = POINT_INDEX(q, i + 1) - 1;
        stride[i] = POINT_INDEX(s, i + 1);
      }
      forall(e, x, lo, hi, stride, assign(e, w), ASSIGN_LOCAL(e, w));
    }
  }
  ti_srcpos_unfreeze();
}
#endif /* assignable */
/* ------------------------------------------------------------------------------------ */
/* Returns a descriptor for x translated by point p: the domain is shifted
 * and the per-dimension base offsets adjusted so the same storage is
 * addressed under the new coordinates.  The underlying data is shared. */
ti_ARRAY TRANSLATE_METHOD(ti_ARRAY x, ti_POINT p) {
  ti_ARRAY n = x;
  int i;
  assertExists(x, "(in translate method)");
  n.domain = RECTDOMAIN_TRANSLATE(x.domain, p);
  /*p = RECTDOMAIN_MIN(n.domain);*/
  for (i = 0; i < N; i++) n.base[i] += POINT_INDEX(p, i + 1);
#if DEBUG_ARRAYS
  fprintf(stderr, "p%d: translated array from domain = ", MYPROC);
  PRINT_DOMAIN(x.domain, stderr);
  fprintf(stderr, "\n");
  fprintf(stderr, "p%d: translated array to domain = ", MYPROC);
  PRINT_DOMAIN(n.domain, stderr);
  fprintf(stderr, "\n");
  fprintf(stderr, "Array<%d>:\tbase\tstride\tsideFactors\n", N);
  for (i = 0; i < N; i++) {
    fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, n.base[i], n.stride[i], n.sideFactors[i]);
  }
#endif
  return n;
}
/*
------------------------------------------------------------------------------------ */
/* Permute the dimensions of src into dest according to point p
 * (p's i-th component selects which source dimension becomes dimension i). */
#define PERMUTE_C_ARRAY(src, dest, p) do { \
  int i_; \
  for (i_ = 0; i_ < N; i_++) \
    dest[i_] = src[POINT_INDEX(p, i_ + 1) - 1]; \
} while (0)
/* Returns a descriptor for x with its dimensions permuted by p: the domain
 * and the per-dimension base/stride/sideFactors arrays are all reordered.
 * The underlying data is shared. */
ti_ARRAY PERMUTE_METHOD(ti_ARRAY x, ti_POINT p) {
  ti_ARRAY n = x;
  assertExists(x, "(in permute method)");
  n.domain = RECTDOMAIN_PERMUTE(n.domain, p);
  PERMUTE_C_ARRAY(x.base, n.base, p);
  PERMUTE_C_ARRAY(x.stride, n.stride, p);
  PERMUTE_C_ARRAY(x.sideFactors, n.sideFactors, p);
  return n;
}
#undef PERMUTE_C_ARRAY
/* ------------------------------------------------------------------------------------ */
/* Returns a descriptor for x with each coordinate multiplied by the
 * corresponding component of p (domain and base/stride scaled).  A negative
 * factor flips that dimension: stride is kept positive and the sign is
 * folded into sideFactors.  The underlying data is shared. */
ti_ARRAY INJECT_METHOD(ti_ARRAY x, ti_POINT p) {
  ti_ARRAY n = x;
  int i;
  assertExists(x, "(in inject method)");
  n.domain = RECTDOMAIN_MULTIPLY(x.domain, p);
  for (i = 0; i < N; i++) {
    int px = POINT_INDEX(p, i + 1);
    n.base[i] *= px;
    if ((n.stride[i] *= px) < 0) {
      n.stride[i] *= -1;
      n.sideFactors[i] *= -1;
    }
  }
#if DEBUG_ARRAYS
  fprintf(stderr, "p%d: injected array from domain = ", MYPROC);
  PRINT_DOMAIN(x.domain, stderr);
  fprintf(stderr, "\n");
  fprintf(stderr, "p%d: injected array to domain = ", MYPROC);
  PRINT_DOMAIN(n.domain, stderr);
  fprintf(stderr, "\n");
  fprintf(stderr, "\nArray<%d>:\tbase\tstride\tsideFactors\n", N);
  for (i = 0; i < N; i++) {
    fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, n.base[i], n.stride[i], n.sideFactors[i]);
  }
#endif
  return n;
}
/* ------------------------------------------------------------------------------------ */
/* Inverse of inject: divides each coordinate by the corresponding component
 * of p.  Asserts that base and stride are exactly divisible; a negative
 * divisor flips the dimension via sideFactors as in inject.  Data shared. */
ti_ARRAY PROJECT_METHOD(ti_ARRAY x, ti_POINT p) {
  ti_ARRAY n = x;
  int i;
  assertExists(x, "(in project method)");
  n.domain = RECTDOMAIN_DIVIDE(x.domain, p);
  for (i = 0; i < N; i++) {
    int px = POINT_INDEX(p, i + 1);
    assert(("project() beyond limits", n.base[i] % px == 0));
    n.base[i] /= px;
    assert(("project() beyond limits", n.stride[i] % px == 0));
    if ((n.stride[i] /= px) < 0) {
      n.stride[i] *= -1;
      n.sideFactors[i] *= -1;
    }
  }
#if DEBUG_ARRAYS
  fprintf(stderr, "p%d: projected array from domain = ", MYPROC);
  PRINT_DOMAIN(x.domain, stderr);
  fprintf(stderr, "\n");
  fprintf(stderr, "p%d: projected array to domain = ", MYPROC);
  PRINT_DOMAIN(n.domain, stderr);
  fprintf(stderr, "\n");
  fprintf(stderr, "\nArray<%d>:\tbase\tstride\tsideFactors\n", N);
  for (i = 0; i < N; i++) {
    fprintf(stderr, " %d\t\t%d\t%d\t%d\n", i, n.base[i], n.stride[i], n.sideFactors[i]);
  }
#endif
  return n;
}
/* ------------------------------------------------------------------------------------ */
/* Print x's domain to stream f (used as the domain-printer callback for
 * the bounds-violation reporting macro TI_ABV below). */
void ti_printdomain(ti_ARRAY x, FILE *f) {
  assertExists(x, "(in printDomain method)");
  PRINT_DOMAIN(x.domain, f);
}
/* ------------------------------------------------------------------------------------ */
/* Output an array bounds violation error message and abort.  The error
 * message will contain the text of arg1 and arg2, among other things.
 * The first argument is where the error occurred in the source program. */
void ti_abv_ds(const char *errorlocation, const ti_ARRAY *x, int dim, int arg1, const char *arg2) {
  TI_ABV(*x, dim, errorlocation, ti_printdomain, "%d%s%s", arg1, arg2, "");
}
/* ------------------------------------------------------------------------------------ */
/* Output an array bounds violation error message and abort.  The error
 * message will contain the text of arg1, arg2, and arg3, among other things.
 * The first argument is where the error occurred in the source program. */
void ti_abv_dsd(const char *errorlocation, const ti_ARRAY *x, int dim, int arg1, const char *arg2, int arg3) {
  TI_ABV(*x, dim, errorlocation, ti_printdomain, "%d%s%d", arg1, arg2, arg3);
}
/* ------------------------------------------------------------------------------------ */
/* Verify that point p lies within x's domain (stride-aligned and inside the
 * bounds in every dimension); prints a diagnostic and aborts on violation. */
void ti_boundscheck(const char *where, const ti_ARRAY *x, ti_POINT p) {
  /* DOB: method must be defined even when not bounds checking, in order to link libraries
   * which use TiArrays and are compiled with bounds checking enabled */
#if BOUNDS_CHECKING
#if N_LARGER_THAN_1
  jint i;
#else
  /* 1-D: fold the loop variable to the constant 1 instead of looping */
#define i 1
#endif
  assertExists(*x, where);
#if N_LARGER_THAN_1
  for (i = N; i >= 1; i--)
#endif
  { /* check dimension i (runs once with i==1 when N is 1) */
    jint const q = POINT_INDEX(p, i);
    jint const qm = q - x->base[i-1];
    jint const lo = POINT_INDEX(RECTDOMAIN_MIN(x->domain), i);
    jint const hi = POINT_INDEX(RECTDOMAIN_UPB(x->domain), i) - 1;
    jint const s = POINT_INDEX(RECTDOMAIN_STRIDE(x->domain), i);
    if (s > 1 && ((q - lo) % s) != 0) {
      /* point not aligned with the domain's stride */
      fprintf(stderr, "Array bounds violation on processor %d\n"
              "bounds checking error at %s:\n failed condition: ", MYPROC, where);
      PRINT_DOMAIN(x->domain, stderr);
      fprintf(stderr, ".contains(");
      PRINT_POINT(p, stderr);
      fprintf(stderr, ")\n");
#if EXPLICITLY_STORE_ALLOCATION_SITE
      fprintf(stderr, " array was allocated at %s\n", x->where);
#endif
      abort();
      exit(-2);
    } else if (q < lo || q > hi) {
      /* point outside the domain's bounding box in this dimension */
      fprintf(stderr, "bounds checking error at %s:\n failed condition: ", where);
      PRINT_POINT(RECTDOMAIN_MIN(x->domain), stderr);
      fprintf(stderr, " <= ");
      PRINT_POINT(p, stderr);
      fprintf(stderr, " <= ");
      PRINT_POINT(RECTDOMAIN_MAX(x->domain), stderr);
      fprintf(stderr, ")\n");
#if EXPLICITLY_STORE_ALLOCATION_SITE
      fprintf(stderr, " array was allocated at %s\n", x->where);
#endif
      abort();
      exit(-2);
    }
    assert(("internal error: bad array descriptor", x->stride[i-1] == 1 || (qm % x->stride[i-1]) == 0));
  }
#if !N_LARGER_THAN_1
#undef i
#endif
#endif
}
/* ------------------------------------------------------------------------------------ */
#define
DEBUG_ISCONTIGUOUS 0
/* REQUIRES: given domain (*pR) is a subset of *px.domain()
 * *pnumelements is the number of elements in *pR, or zero
 *
 * EFFECTS: returns true iff all elements in the given domain
 * are stored contiguously in memory in array *px
 * if *pnumelements was zero, it is filled in with the number of elements in *pR
 * if pRdataptr is non-NULL and the call returns true, then *pRdataptr recieves
 * the value of ti_get_array_data_ptr_with_domain(px, pR) (computed more efficiently)
 */
/* RETURN pairs every exit with the ti_srcpos_freeze() done on entry */
#define RETURN(val) do { ti_srcpos_unfreeze(); return (val); } while (0)
static jboolean INTERNAL_ISCONTIGUOUS_METHOD(const ti_ARRAY *px, const ti_RECTDOMAIN *pR, size_t *pnumelements, PTR_TO_T *pRdataptr) {
  int i, minoffset, maxoffset;
  ti_ARRAY tmp;
  ti_ARRAY *_px = (ti_ARRAY *)px;
  int numelements = *pnumelements;
  ti_srcpos_freeze();
  if (numelements == 0) {
    numelements = RECTDOMAIN_SIZE(*pR);
    *pnumelements = numelements;
    /* an empty domain is trivially contiguous */
    if (numelements == 0) RETURN(1);
  }
  /* normalize: flip any negative sideFactors on a private copy (tmp) so that
   * the domain's min/max corners map to the min/max memory offsets */
  for (i = 0; i < N; i++) {
    if (px->sideFactors[i] < 0) {
      if (_px == px) { tmp = *px; _px = &tmp; }
      _px->sideFactors[i] = -_px->sideFactors[i];
    }
  }
#ifdef NDEBUG
  maxoffset = ti_convinl("isContiguous()", _px, RECTDOMAIN_MAX(*pR));
  if (maxoffset + 1 == numelements) { /* minoffset must be zero, no need to compute it */
    if (pRdataptr != NULL) *pRdataptr = px->A;
    RETURN(1);
  }
  minoffset = ti_convinl("isContiguous()", _px, RECTDOMAIN_MIN(*pR));
  /* contiguous iff the offset span exactly covers the element count */
  if (maxoffset - minoffset + 1 == numelements) {
    if (pRdataptr != NULL) INDEX(*pRdataptr, px->A, minoffset);
    RETURN(1);
  } else RETURN(0);
#else
  /* debug build: compute the same answer, then cross-check it against a
   * reference computation that handles negative sideFactors explicitly */
  maxoffset = ti_convinl("isContiguous()", _px, RECTDOMAIN_MAX(*pR));
  minoffset = ti_convinl("isContiguous()", _px, RECTDOMAIN_MIN(*pR));
  assert(minoffset >= 0 && maxoffset >= minoffset);
  assert(numelements <= maxoffset - minoffset + 1);
  {
    int result = (maxoffset - minoffset + 1 == numelements);
    if (result && pRdataptr != NULL) INDEX(*pRdataptr, px->A, minoffset);
    if (RECTDOMAIN_ISNULL(*pR)) { assert(result == 1); RETURN(1); }
    {
#if 0 /* this test fails in the presence of negative sideFactors */
      int numelements = RECTDOMAIN_SIZE(R);
      int minoffset = ti_convinl("isContiguous()", &x, RECTDOMAIN_MIN(R));
      int maxoffset = ti_convinl("isContiguous()", &x, RECTDOMAIN_MAX(R));
      int ref_result = (numelements == maxoffset - minoffset + 1);
      assert(ref_result == result);
      return result;
#else /* this test handles negative sideFactors */
      int i;
      ti_POINT pointwithminaddr, pointwithmaxaddr;
      ti_POINT const dommin = RECTDOMAIN_MIN(*pR);
      ti_POINT const domupb = RECTDOMAIN_UPB(*pR);
#if DEBUG_ISCONTIGUOUS
      fprintf(stderr, "\n");
      fprintf(stderr, "dommin="); PRINT_POINT(dommin, stderr);
      fprintf(stderr, "\ndomupb="); PRINT_POINT(domupb, stderr);
      fprintf(stderr, "\ndomstride="); PRINT_POINT(RECTDOMAIN_STRIDE(*pR), stderr);
      fflush(stderr);
#endif
      /* per dimension, pick whichever domain corner has the lower address,
       * depending on the sign of the corresponding sideFactor */
      for (i = 0; i < N; i++) {
        int const lo = POINT_INDEX(dommin, i + 1);
        int const hi = POINT_INDEX(domupb, i + 1) - 1;
#if DEBUG_ISCONTIGUOUS
        fprintf(stderr, "\nlo=%i",lo);
        fprintf(stderr, "\nhi=%i",hi);
#endif
        if (px->sideFactors[i] >= 0) {
          pointwithminaddr = POINT_SET(pointwithminaddr, i, lo);
          pointwithmaxaddr = POINT_SET(pointwithmaxaddr, i, hi);
        } else {
          /* negative sideFactor: addresses decrease as this index grows */
          pointwithminaddr = POINT_SET(pointwithminaddr, i, hi);
          pointwithmaxaddr = POINT_SET(pointwithmaxaddr, i, lo);
        }
      }
#if DEBUG_ISCONTIGUOUS
      fprintf(stderr, "\npointwithminaddr="); PRINT_POINT(pointwithminaddr, stderr);
      fprintf(stderr, "\npointwithmaxaddr="); PRINT_POINT(pointwithmaxaddr, stderr);
      fprintf(stderr, "\n"); fflush(stderr);
#endif
      {
        int numelements = RECTDOMAIN_SIZE(*pR);
        int minoffset = ti_convinl("isContiguous()", px, pointwithminaddr);
        int maxoffset = ti_convinl("isContiguous()", px, pointwithmaxaddr);
        int ref_result = (numelements == maxoffset - minoffset + 1);
        assert(ref_result == result);
        RETURN(result);
      }
#endif
    }
  }
#endif
}
#undef DEBUG_ISCONTIGUOUS
#undef RETURN
/* EFFECTS: returns true if all elements in the domain of x are stored
   contiguously in memory */
jboolean ISCONTIGUOUS_METHOD(const ti_ARRAY x) {
  size_t numelements = 0;
  assertExists(x, "(in isContiguous method)");
  return INTERNAL_ISCONTIGUOUS_METHOD(&x, &x.domain, &numelements, NULL);
}
/* REQUIRES: given domain (R) is a subset of x.domain()
 * EFFECTS: returns true if all elements in the given domain
 * are stored contiguously in memory in array x */
jboolean ISCONTIGUOUSOVERDOMAIN_METHOD(const ti_ARRAY x, const ti_RECTDOMAIN R) {
  size_t numelements = 0;
  assertExists(x, "(in isContiguousOverDomain method)");
  return INTERNAL_ISCONTIGUOUS_METHOD(&x, &R, &numelements, NULL);
}
/* ------------------------------------------------------------------------------------ */
/* EFFECTS: returns a ptr to the element with the lowest address in the current array
 * (should be used to get a handle on the array data when performing bulk operations
 * on contiguous arrays) */
PTR_TO_T ti_get_array_data_ptr(const ti_ARRAY x) {
  assertExists(x, "(in ti_get_array_data_ptr)");
  return ti_get_array_data_ptr_with_domain(&x, &x.domain);
}
/* REQUIRES: given domain (R) is a subset of x.domain
 * EFFECTS: returns a ptr to the element with the lowest address in the current array
 * (should be used to get a handle on the array data when performing bulk operations
 * on contiguous arrays) */
PTR_TO_T ti_get_array_data_ptr_with_domain(const ti_ARRAY *x, const ti_RECTDOMAIN *R) {
  ti_POINT pointwithminaddr;
  PTR_TO_T tiarray_firstelem;
  int i;
  assertExists(*x, "(in ti_get_array_data_ptr_with_domain)");
#if 0 /* no negative sideFactors */
  pointwithminaddr = RECTDOMAIN_MIN(R);
#elif 0
  ti_POINT const dommin = RECTDOMAIN_MIN(R);
  ti_POINT const domupb = RECTDOMAIN_UPB(R);
  for (i = 0; i < N; i++) {
    if (x.sideFactors[i] >= 0)
      pointwithminaddr = POINT_SET(pointwithminaddr, i, POINT_INDEX(dommin, i + 1));
    else pointwithminaddr = POINT_SET(pointwithminaddr, i, POINT_INDEX(domupb, i + 1) - 1);
  }
#else
  /* start from the domain minimum, then for each dimension with a negative
   * sideFactor substitute the maximum index (which has the lower address) */
  pointwithminaddr = RECTDOMAIN_MIN(*R);
  for (i = 0; i < N; i++) {
    if (x->sideFactors[i] < 0)
      pointwithminaddr = POINT_SET(pointwithminaddr, i, POINT_INDEX(RECTDOMAIN_UPB(*R), i + 1) - 1);
  }
#endif
  INDEX(tiarray_firstelem, x->A, ti_convinl("ti_get_array_data_ptr", x, pointwithminaddr));
  return tiarray_firstelem;
}
/* ------------------------------------------------------------------------------------ */
#if assignable
ti_ARRAY MAKELOCALANDCONTIGUOUS_METHOD(const ti_ARRAY x) {
  /* returns a locally situated & contiguous version of this array,
   * with the same domain and same region as x
   * possibly copying the elements (if necessary) */
  assertExists(x, "(in makeLocalAndContiguous)");
  /* already local and contiguous: no copy needed */
  if (DIRECTLY_ADDRESSABLE(x.A) && ISCONTIGUOUS_METHOD(x)) return x;
  else {
    Region r;
    RegionId rid;
    ti_ARRAY retval;
#if GLOBAL_ARRAY
    rid = PG2RegionId(x.A);
#else
    rid = PL2RegionId(x.A);
#endif
    if (rid == UNK_REGIONID) r = NULL;
    else r = RegionId2Region(rid);
    /* allocate a fresh array over the same domain/region and copy into it */
    retval = ti_construct("(in makeLocalAndContiguous)", r, x.domain, 0);
    COPY_METHOD(retval, x);
    return retval;
  }
}
#endif /* assignable */
/* ------------------------------------------------------------------------------------ */
/* bulk I/O methods */
/* Note: the following methods allow reading/writing Ti arrays to RandomAccessFile
   objects and DataInputStream/DataOutputStream objects. Another design possibility
   is to replace the DIS/DOS methods with ones taking InputStream/OutputStream.
   I don't currently see an advantage either way - both solutions are equally
   expressive and neither is clearly more efficient.
   I stuck with first solution because dispatch was easier.
*/
#ifndef __bulkio_typedefs
#define __bulkio_typedefs
/* these three seem to always get generated */
#include "layout!T16RandomAccessFile2io4java.h"
#include "layout!T15DataInputStream2io4java.h"
#include "layout!T16DataOutputStream2io4java.h"
#include "layout!PTAjbyte.h"
/* these three are more of a struggle */
/* use the generated global-pointer layout type when present, otherwise
 * synthesize one with GP_type */
#ifdef _include_layout_PT16RandomAccessFile2io4java_h_
#define GP_RAF PT16RandomAccessFile2io4java
#else
typedef GP_type( T16RandomAccessFile2io4java ) GP_RAF;
#endif
#ifdef _include_layout_PT15DataInputStream2io4java_h_
#define GP_DIS PT15DataInputStream2io4java
#else
typedef GP_type( T15DataInputStream2io4java ) GP_DIS;
#endif
#ifdef _include_layout_PT16DataOutputStream2io4java_h_
#define GP_DOS PT16DataOutputStream2io4java
#else
typedef GP_type( T16DataOutputStream2io4java ) GP_DOS;
#endif
#endif
/* bulk I/O is only generated for arrays of atomic element type; the
 * TIARRAY_BULKIO_* macros parameterize the shared template in
 * ti_array_bulkio.c, which is textually included once per stream kind */
#if ELEMENTS_ARE_ATOMIC
/* RandomAccessFile */
#if assignable
#define TIARRAY_BULKIO_READFUNC_HEADER \
 void READFROMRAF_METHOD(const ti_ARRAY x, GP_RAF fileobj)
#define TIARRAY_BULKIO_READFUNC m9readFullyPTAjbyteIImT16RandomAccessFile2io4java
#define TIARRAY_BULKIO_READFUNC_CALLIO(fileobj, array, offset, count) \
 TIARRAY_BULKIO_READFUNC( (fileobj), (array), (offset), (count) )
extern void TIARRAY_BULKIO_READFUNC(GP_RAF, PTAjbyte, jint, jint);
#endif
#define TIARRAY_BULKIO_WRITEFUNC_HEADER \
 void WRITETORAF_METHOD(const ti_ARRAY x, GP_RAF fileobj)
#define TIARRAY_BULKIO_WRITEFUNC m5writePTAjbyteIImT16RandomAccessFile2io4java
#define TIARRAY_BULKIO_WRITEFUNC_CALLIO(fileobj, array, offset, count) \
 TIARRAY_BULKIO_WRITEFUNC( (fileobj), (array), (offset), (count) )
extern void TIARRAY_BULKIO_WRITEFUNC(GP_RAF, PTAjbyte, jint, jint);
#include "ti_array_bulkio.c"
/* reset the template parameters before the next instantiation */
#if assignable
#undef TIARRAY_BULKIO_READFUNC_HEADER
#undef TIARRAY_BULKIO_READFUNC
#undef TIARRAY_BULKIO_READFUNC_CALLIO
#endif
#undef TIARRAY_BULKIO_WRITEFUNC_HEADER
#undef TIARRAY_BULKIO_WRITEFUNC
#undef TIARRAY_BULKIO_WRITEFUNC_CALLIO
/* DataInputStream/DataOutputStream */
#if assignable
#define TIARRAY_BULKIO_READFUNC_HEADER \
 void READFROMDIS_METHOD(const ti_ARRAY x, GP_DIS fileobj)
#define TIARRAY_BULKIO_READFUNC m9readFullyPTAjbyteIImT15DataInputStream2io4java
#define TIARRAY_BULKIO_READFUNC_CALLIO(fileobj, array, offset, count) \
 TIARRAY_BULKIO_READFUNC( (fileobj), (array), (offset), (count) )
extern void TIARRAY_BULKIO_READFUNC(GP_DIS, PTAjbyte, jint, jint);
#endif
#define TIARRAY_BULKIO_WRITEFUNC_HEADER \
 void WRITETODOS_METHOD(const ti_ARRAY x, GP_DOS fileobj)
#define TIARRAY_BULKIO_WRITEFUNC m5writePTAjbyteIImT16DataOutputStream2io4java
#define TIARRAY_BULKIO_WRITEFUNC_CALLIO(fileobj, array, offset, count) \
 TIARRAY_BULKIO_WRITEFUNC( (fileobj), (array), (offset), (count) )
extern void TIARRAY_BULKIO_WRITEFUNC(GP_DOS, PTAjbyte, jint, jint);
#include "ti_array_bulkio.c"
#if assignable
#undef TIARRAY_BULKIO_READFUNC_HEADER
#undef TIARRAY_BULKIO_READFUNC
#undef TIARRAY_BULKIO_READFUNC_CALLIO
#endif
#undef TIARRAY_BULKIO_WRITEFUNC_HEADER
#undef TIARRAY_BULKIO_WRITEFUNC
#undef TIARRAY_BULKIO_WRITEFUNC_CALLIO
#endif
/* ------------------------------------------------------------------------------------ */
/* end of the (T,N) template instantiation: undo every per-instantiation macro
 * so this file can be included again with a different element type / arity */
#undef TI_ARRAY_C
#undef ti_convinl
#undef PRINT_POINT
#undef PRINT_DOMAIN
#undef N_LARGER_THAN_1
#undef LOCAL_PART
#undef PROC_PART
#undef BOX_PART
#undef EXTERNAL_ARRAYMETHOD
#undef EXTERNAL_METHOD
#undef EQUAL
#undef DIRECTLY_ADDRESSABLE
#undef GPTR_TO_T
#undef REGION_SAFETY_CHECK
#undef deref
#undef INDEX
#undef assign
#undef weak_assign
#undef assignable
#undef array_ralloc
#undef array_malloc
#undef forall2ordered2
#undef _forall2ordered2
#undef forall2ordered3
#undef _forall2ordered3