diff --git a/libs/libmemcpy/impl/FastMemcpy.h b/libs/libmemcpy/impl/FastMemcpy.h index f6c79eaba60..09dd4256f18 100644 --- a/libs/libmemcpy/impl/FastMemcpy.h +++ b/libs/libmemcpy/impl/FastMemcpy.h @@ -34,6 +34,11 @@ #endif +typedef __attribute__((__aligned__(1), __may_alias__)) uint16_t uint16_unaligned_t; /* Fixed-width integers with alignment 1 that may alias any object: memcpy_tiny dereferences byte pointers cast to these types, so both the alignment assumption and type-based alias analysis must be disabled for those loads/stores. */ +typedef __attribute__((__aligned__(1), __may_alias__)) uint32_t uint32_unaligned_t; +typedef __attribute__((__aligned__(1), __may_alias__)) uint64_t uint64_unaligned_t; + + //--------------------------------------------------------------------- // fast copy for different sizes @@ -103,97 +108,97 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { case 66: memcpy_sse2_64(dd - 66, ss - 66); case 2: - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 67: memcpy_sse2_64(dd - 67, ss - 67); case 3: - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); dd[-1] = ss[-1]; break; case 68: memcpy_sse2_64(dd - 68, ss - 68); case 4: - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 69: memcpy_sse2_64(dd - 69, ss - 69); case 5: - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; case 70: memcpy_sse2_64(dd - 70, ss - 70); case 6: - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 71: memcpy_sse2_64(dd - 71, ss - 71); case 7: - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; 
case 72: memcpy_sse2_64(dd - 72, ss - 72); case 8: - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); break; case 73: memcpy_sse2_64(dd - 73, ss - 73); case 9: - *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); dd[-1] = ss[-1]; break; case 74: memcpy_sse2_64(dd - 74, ss - 74); case 10: - *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 75: memcpy_sse2_64(dd - 75, ss - 75); case 11: - *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 76: memcpy_sse2_64(dd - 76, ss - 76); case 12: - *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 77: memcpy_sse2_64(dd - 77, ss - 77); case 13: - *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; case 78: memcpy_sse2_64(dd - 78, ss - 78); case 14: - *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); break; case 79: memcpy_sse2_64(dd - 79, ss - 79); case 15: - *((uint64_t*)(dd 
- 15)) = *((uint64_t*)(ss - 15)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); break; case 80: @@ -213,14 +218,14 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { memcpy_sse2_64(dd - 82, ss - 82); case 18: memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 83: memcpy_sse2_64(dd - 83, ss - 83); case 19: memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); dd[-1] = ss[-1]; break; @@ -228,14 +233,14 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { memcpy_sse2_64(dd - 84, ss - 84); case 20: memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 85: memcpy_sse2_64(dd - 85, ss - 85); case 21: memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; @@ -243,16 +248,16 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { memcpy_sse2_64(dd - 86, ss - 86); case 22: memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 87: memcpy_sse2_64(dd - 87, ss - 87); case 23: memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); 
+ *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 88: @@ -328,14 +333,14 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { memcpy_sse2_64(dd - 98, ss - 98); case 34: memcpy_sse2_32(dd - 34, ss - 34); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 99: memcpy_sse2_64(dd - 99, ss - 99); case 35: memcpy_sse2_32(dd - 35, ss - 35); - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); dd[-1] = ss[-1]; break; @@ -343,14 +348,14 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { memcpy_sse2_64(dd - 100, ss - 100); case 36: memcpy_sse2_32(dd - 36, ss - 36); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 101: memcpy_sse2_64(dd - 101, ss - 101); case 37: memcpy_sse2_32(dd - 37, ss - 37); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; @@ -358,30 +363,30 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { memcpy_sse2_64(dd - 102, ss - 102); case 38: memcpy_sse2_32(dd - 38, ss - 38); - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 103: memcpy_sse2_64(dd - 103, ss - 103); case 39: memcpy_sse2_32(dd - 39, ss - 39); - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 104: memcpy_sse2_64(dd - 104, ss - 104); case 40: 
memcpy_sse2_32(dd - 40, ss - 40); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); break; case 105: memcpy_sse2_64(dd - 105, ss - 105); case 41: memcpy_sse2_32(dd - 41, ss - 41); - *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); + *((uint64_unaligned_t*)(dd - 9)) = *((uint64_unaligned_t*)(ss - 9)); dd[-1] = ss[-1]; break; @@ -389,32 +394,32 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { memcpy_sse2_64(dd - 106, ss - 106); case 42: memcpy_sse2_32(dd - 42, ss - 42); - *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint64_unaligned_t*)(dd - 10)) = *((uint64_unaligned_t*)(ss - 10)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 107: memcpy_sse2_64(dd - 107, ss - 107); case 43: memcpy_sse2_32(dd - 43, ss - 43); - *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint64_unaligned_t*)(dd - 11)) = *((uint64_unaligned_t*)(ss - 11)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 108: memcpy_sse2_64(dd - 108, ss - 108); case 44: memcpy_sse2_32(dd - 44, ss - 44); - *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint64_unaligned_t*)(dd - 12)) = *((uint64_unaligned_t*)(ss - 12)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 109: memcpy_sse2_64(dd - 109, ss - 109); case 45: memcpy_sse2_32(dd - 45, ss - 45); - *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + *((uint64_unaligned_t*)(dd - 13)) = *((uint64_unaligned_t*)(ss - 13)); + *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; @@ -422,16 +427,16 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { memcpy_sse2_64(dd - 110, 
ss - 110); case 46: memcpy_sse2_32(dd - 46, ss - 46); - *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + *((uint64_unaligned_t*)(dd - 14)) = *((uint64_unaligned_t*)(ss - 14)); + *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); break; case 111: memcpy_sse2_64(dd - 111, ss - 111); case 47: memcpy_sse2_32(dd - 47, ss - 47); - *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); - *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); + *((uint64_unaligned_t*)(dd - 15)) = *((uint64_unaligned_t*)(ss - 15)); + *((uint64_unaligned_t*)(dd - 8)) = *((uint64_unaligned_t*)(ss - 8)); break; case 112: @@ -454,7 +459,7 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { case 50: memcpy_sse2_32(dd - 50, ss - 50); memcpy_sse2_16(dd - 18, ss - 18); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 115: @@ -462,7 +467,7 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { case 51: memcpy_sse2_32(dd - 51, ss - 51); memcpy_sse2_16(dd - 19, ss - 19); - *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); + *((uint16_unaligned_t*)(dd - 3)) = *((uint16_unaligned_t*)(ss - 3)); dd[-1] = ss[-1]; break; @@ -471,7 +476,7 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { case 52: memcpy_sse2_32(dd - 52, ss - 52); memcpy_sse2_16(dd - 20, ss - 20); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 117: @@ -479,7 +484,7 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { case 53: memcpy_sse2_32(dd - 53, ss - 53); memcpy_sse2_16(dd - 21, ss - 21); - *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); + *((uint32_unaligned_t*)(dd - 5)) = *((uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; @@ -488,8 +493,8 @@ static INLINE void *memcpy_tiny(void *dst, const void 
*src, size_t size) { case 54: memcpy_sse2_32(dd - 54, ss - 54); memcpy_sse2_16(dd - 22, ss - 22); - *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); - *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); + *((uint32_unaligned_t*)(dd - 6)) = *((uint32_unaligned_t*)(ss - 6)); + *((uint16_unaligned_t*)(dd - 2)) = *((uint16_unaligned_t*)(ss - 2)); break; case 119: @@ -497,8 +502,8 @@ static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { case 55: memcpy_sse2_32(dd - 55, ss - 55); memcpy_sse2_16(dd - 23, ss - 23); - *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); - *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); + *((uint32_unaligned_t*)(dd - 7)) = *((uint32_unaligned_t*)(ss - 7)); + *((uint32_unaligned_t*)(dd - 4)) = *((uint32_unaligned_t*)(ss - 4)); break; case 120: