#define TI_NO_SRCPOS #include "backend.h" #include "array-byteswap.h" #include #include "primitives.h" #if 0 /* TODO: these aren't getting generated #include "layout!Ljbyte.h" #include "layout!Ljchar.h" #include "layout!Ljuint.h" #include "layout!Ljulong.h" */ #else /* a temporary(?) hack */ #define Ljbyte jbyte * #define Ljchar jchar * #define Ljuint juint * #define Ljulong julong * #endif /* OK, I know this file is really ugly to read, but byte swapping could potentially be a significant bottleneck in the bulk I/O system for little-endian machines. So this module is written with a very careful eye towards memory access pattern and to give the C optimizer as much information as possible to help make this fast. Hopefully it will never need to be modified... :-) i486 has a fast BSWAP instruction that we can hopefully use through the --swabn() macros, but it's unclear whether GNU always provides this - we use it when it does. A further enhancement would detect the intel architecture and use assembly directly to get the instruction we want. */ /*------------------------------------------------------------------------------------*/ /* Make sure we have the primitives */ #ifdef HAVE_SWABN_MACROS #define extern /* this is a hack-around for an apparent bug in little_endian.h */ #include #undef extern #endif #ifndef __swab16 #define __swab16(a) ( (((jchar)(a)) << 8) | \ (((jchar)(a)) >> 8) ) #endif #ifndef __swab32 #define __swab32(a) ((juint)( \ (((juint)(a)) << 24) | \ ((((juint)(a)) & (juint)0x0000ff00UL) << 8) | \ ((((juint)(a)) & (juint)0x00ff0000UL) >> 8) | \ (((juint)(a)) >> 24) )) #endif #ifndef __swab64 #define __swab64(a) ((julong)( \ (((julong)(a)) << 56) | \ ((((julong)(a)) & (julong)0x000000000000ff00ULL) << 40) | \ ((((julong)(a)) & (julong)0x0000000000ff0000ULL) << 24) | \ ((((julong)(a)) & (julong)0x00000000ff000000ULL) << 8) | \ ((((julong)(a)) & (julong)0x000000ff00000000ULL) >> 8) | \ ((((julong)(a)) & (julong)0x0000ff0000000000ULL) >> 24) | \ ((((julong)(a)) & (julong)0x00ff000000000000ULL) >> 40) | \ (((julong)(a)) >> 56) )) #endif /*------------------------------------------------------------------------------------*/ /* Tuning Parameters */ /* This parameter controls whether we do loop unrolling and make all the explicit memory operations 64-bits wide (may help performance on arrays of 16-bit and 32-bit values) */ #define USE_ARRAY_BSWAP_64MEMOPS 0 /* use the swab() function for the 16-bit conversion */ #define USE_SWAB_FOR_16BIT 1 /* This is for performance debugging */ #define DEBUG_SWAP_PERF 0 /*------------------------------------------------------------------------------------*/ #if USE_SWAB_FOR_16BIT #include #include #endif #if DEBUG_SWAP_PERF #include #include #endif /*------------------------------------------------------------------------------------*/ /* in-place local array byte swap */ void arrayByteSwap(Ljbyte array, jint sizeofelem, jint numofelem) { #if DEBUG_SWAP_PERF clock_t starttime = clock(); #endif switch (sizeofelem) { case 2: { /* 16-bit (jchar = 16 bits) */ assert(((jUIntPointer)array) % 2 == 0); #if USE_SWAB_FOR_16BIT swab(array, array, numofelem*sizeofelem); #else #if USE_ARRAY_BSWAP_64MEMOPS /* try to reduce memory accesses by making all explicit memory ops 64-bits wide */ { /* prolog */ while (((juint)array) % 8 != 0) { *((Ljchar)array) = __swab16(*((Ljchar)array)); array += 2; numofelem--; } } { /* main loop body */ register julong a; register julong b; register Ljulong p = (Ljulong)array; register Ljulong pend = p + (numofelem / 4); while (p < pend) { a = *p; b = __swab16((jchar)a); a >>= 16; b |= (((julong)__swab16((jchar)a)) << 16); a >>= 16; b |= (((julong)__swab16((jchar)a)) << 32); a >>= 16; b |= (((julong)__swab16((jchar)a)) << 48); *(p++) = b; } { /* epilog */ Ljchar pepilog = (Ljchar)pend; int i; for (i = 0; i < numofelem % 4; i++) *(pepilog++) = __swab16((*pepilog)); } } #else { register jchar a; register Ljchar p = (Ljchar)array; register Ljchar pend = p + numofelem; while (p < pend) { a = *p; *(p++) = __swab16(a); } } #endif #endif break; } case 4: { /* 32-bit */ assert(((jUIntPointer)array) % 4 == 0); #if USE_ARRAY_BSWAP_64MEMOPS /* try to reduce memory accesses by making all explicit memory ops 64-bits wide */ { /* prolog */ if (((juint)array) % 8 != 0) { *((Ljuint)array) = __swab32(*((Ljuint)array)); array += 4; numofelem--; } } { /* main loop body */ register julong a; register julong b; register Ljulong p = (Ljulong)array; register Ljulong pend = p + ( numofelem / 2); while (p < pend) { a = *p; b = __swab32((juint)a); a >>= 32; b |= (((julong)__swab32((juint)a)) << 32); *(p++) = b; } if (numofelem % 2) { /* epilog */ *((Ljuint)pend) = __swab32(*((Ljuint)pend)); } } #else { register juint a; register Ljuint p = (Ljuint)array; register Ljuint pend = p + numofelem; while (p < pend) { a = *p; *(p++) = __swab32(a); } } #endif break; } case 8: { /* 64-bit */ register julong a; register Ljulong p = (Ljulong)array; register Ljulong pend = p + numofelem; assert(((jUIntPointer)array) % 8 == 0); while (p < pend) { a = *p; *(p++) = __swab64(a); } break; } default: abort(); /* bad element size */ } #if DEBUG_SWAP_PERF fprintf(stderr, "arrayByteSwap time: %8.5f sec, (%lu elements, %lu bytes/element)\n", ((float)clock()-starttime)/CLOCKS_PER_SEC, numofelem, sizeofelem); #endif } /*------------------------------------------------------------------------------------*/ /* local array-to-array copy and byte swap */ void arrayCopyByteSwap(Ljbyte toarray, const Ljbyte fromarray, jint sizeofelem, jint numofelem) { #if DEBUG_SWAP_PERF clock_t starttime = clock(); #endif switch (sizeofelem) { case 2: { /* 16-bit (jchar = 16 bits) */ assert(((jUIntPointer)toarray) % 2 == 0); assert(((jUIntPointer)fromarray) % 2 == 0); #if USE_SWAB_FOR_16BIT swab(fromarray, toarray, numofelem*sizeofelem); #else #if USE_ARRAY_BSWAP_64MEMOPS /* try to reduce memory accesses by making all explicit memory ops 64-bits wide */ if (((juint)toarray) % 8 == ((juint)fromarray) % 8) { /* this should always be true, but just in case */ /* prolog */ while (((juint)fromarray) % 8 != 0) { *((Ljchar)toarray) = __swab16(*((const Ljchar)fromarray)); toarray += 2; fromarray += 2; numofelem--; } { /* main loop body */ register julong a; register julong b; register const Ljulong p = (Ljulong)fromarray; register const Ljulong pend = p + ( numofelem / 4); register Ljulong top = (Ljulong)toarray; while (p < pend) { a = *(p++); b = __swab16((jchar)a); a >>= 16; b |= (((julong)__swab16((jchar)a)) << 16); a >>= 16; b |= (((julong)__swab16((jchar)a)) << 32); a >>= 16; b |= (((julong)__swab16((jchar)a)) << 48); *(top++) = b; } { /* epilog */ const Ljchar pepilog = (Ljchar)pend; Ljchar ptoepilog = (Ljchar)top; int i; for (i = 0; i < numofelem % 4; i++) *(ptoepilog++) = __swab16((*pepilog)); } } } else #endif { register jchar a; register const Ljchar p = (Ljchar)fromarray; register const Ljchar pend = p + numofelem; register Ljchar top = (Ljchar)toarray; while (p < pend) { a = *(p++); *(top++) = __swab16(a); } } #endif break; } case 4: { /* 32-bit */ assert(((jUIntPointer)toarray) % 4 == 0); assert(((jUIntPointer)fromarray) % 4 == 0); #if USE_ARRAY_BSWAP_64MEMOPS /* try to reduce memory accesses by making all explicit memory ops 64-bits wide */ if (((juint)toarray) % 8 == ((juint)fromarray) % 8) { /* this should always be true, but just in case */ /* prolog */ if (((juint)fromarray) % 8 != 0) { *((Ljuint)toarray) = __swab32(*((const Ljuint)fromarray)); toarray += 4; fromarray += 4; numofelem--; } { /* main loop body */ register julong a; register julong b; register const Ljulong p = (Ljulong)fromarray; register const Ljulong pend = p + ( numofelem / 2); register Ljulong top = (Ljulong)toarray; while (p < pend) { a = *(p++); b = __swab32((juint)a); a >>= 32; b |= (((julong)__swab32((juint)a)) << 32); *(top++) = b; } if (numofelem % 2) { /* epilog */ *((Ljuint)top) = __swab32(*((const Ljuint)pend)); } } } else #endif { register juint a; register const Ljuint p = (Ljuint)fromarray; register const Ljuint pend = p + numofelem; register Ljuint top = (Ljuint)toarray; while (p < pend) { a = *(p++); *(top++) = __swab32(a); } } break; } case 8: { /* 64-bit */ register julong a; register const Ljulong p = (Ljulong)fromarray; register const Ljulong pend = p + numofelem; register Ljulong top = (Ljulong)toarray; assert(((jUIntPointer)toarray) % 8 == 0); assert(((jUIntPointer)fromarray) % 8 == 0); while (p < pend) { a = *(p++); *(top++) = __swab64(a); } break; } default: abort(); /* bad element size */ } #if DEBUG_SWAP_PERF fprintf(stderr, "arrayCopyByteSwap time: %8.5f sec, (%lu elements, %lu bytes/element)\n", ((float)clock()-starttime)/CLOCKS_PER_SEC, numofelem, sizeofelem); #endif } /*------------------------------------------------------------------------------------*/