#pragma once

//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2015
//
// feature:
// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
//
//=====================================================================

#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>


//---------------------------------------------------------------------
// force inline for compilers
//---------------------------------------------------------------------
#ifndef INLINE
#ifdef __GNUC__
#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
    #define INLINE         __inline__ __attribute__((always_inline))
#else
    #define INLINE         __inline__
#endif
#elif defined(_MSC_VER)
    #define INLINE __forceinline
#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
    #define INLINE __inline
#else
    #define INLINE
#endif
#endif

/// Integer types declared with 1-byte alignment, so that loads and stores
/// through pointers to them may target arbitrary (unaligned) addresses.
/// NOLINTBEGIN(modernize-use-using)
typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t;
typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t;
typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t;
/// NOLINTEND(modernize-use-using)

//---------------------------------------------------------------------
// fast copy for different sizes
//---------------------------------------------------------------------

/// Copies exactly 16 bytes from src to dst with one unaligned 128-bit load and store.
static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src)
{
    __m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
}

/// Copies exactly 32 bytes from src to dst.
/// Both 128-bit loads are issued before the first store.
static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src)
{
    __m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
    __m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
}

/// Copies exactly 64 bytes from src to dst.
/// All four 128-bit loads are issued before the first store.
static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src)
{
    __m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
    __m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
    __m128i m2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
    __m128i m3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3);
}

/// Copies exactly 128 bytes from src to dst.
/// All eight 128-bit loads are issued before the first store.
static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src)
{
    __m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
    __m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
    __m128i m2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
    __m128i m3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
    __m128i m4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
    __m128i m5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
    __m128i m6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
    __m128i m7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 4, m4);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 5, m5);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 6, m6);
    _mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 7, m7);
}


//---------------------------------------------------------------------
// tiny memory copy with jump table optimized
//---------------------------------------------------------------------
/// Attribute is used to avoid an error with undefined behaviour sanitizer
/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
__attribute__((__no_sanitize__("undefined"))) inline void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size) { unsigned char *dd = ((unsigned char*)dst) + size; const unsigned char *ss = ((const unsigned char*)src) + size; switch (size) /// NOLINT(bugprone-switch-missing-default-case) { case 64: memcpy_sse2_64(dd - 64, ss - 64); [[fallthrough]]; case 0: break; case 65: memcpy_sse2_64(dd - 65, ss - 65); [[fallthrough]]; case 1: dd[-1] = ss[-1]; break; case 66: memcpy_sse2_64(dd - 66, ss - 66); [[fallthrough]]; case 2: *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); break; case 67: memcpy_sse2_64(dd - 67, ss - 67); [[fallthrough]]; case 3: *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); dd[-1] = ss[-1]; break; case 68: memcpy_sse2_64(dd - 68, ss - 68); [[fallthrough]]; case 4: *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 69: memcpy_sse2_64(dd - 69, ss - 69); [[fallthrough]]; case 5: *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; case 70: memcpy_sse2_64(dd - 70, ss - 70); [[fallthrough]]; case 6: *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); break; case 71: memcpy_sse2_64(dd - 71, ss - 71); [[fallthrough]]; case 7: *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 72: memcpy_sse2_64(dd - 72, ss - 72); [[fallthrough]]; case 8: *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); break; case 73: memcpy_sse2_64(dd - 73, ss - 73); [[fallthrough]]; case 9: *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); dd[-1] = ss[-1]; break; case 74: memcpy_sse2_64(dd - 74, ss - 74); [[fallthrough]]; case 10: *((uint64_unaligned_t*)(dd - 10)) = *((const 
uint64_unaligned_t*)(ss - 10)); *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); break; case 75: memcpy_sse2_64(dd - 75, ss - 75); [[fallthrough]]; case 11: *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 76: memcpy_sse2_64(dd - 76, ss - 76); [[fallthrough]]; case 12: *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 77: memcpy_sse2_64(dd - 77, ss - 77); [[fallthrough]]; case 13: *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; case 78: memcpy_sse2_64(dd - 78, ss - 78); [[fallthrough]]; case 14: *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); break; case 79: memcpy_sse2_64(dd - 79, ss - 79); [[fallthrough]]; case 15: *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); break; case 80: memcpy_sse2_64(dd - 80, ss - 80); [[fallthrough]]; case 16: memcpy_sse2_16(dd - 16, ss - 16); break; case 81: memcpy_sse2_64(dd - 81, ss - 81); [[fallthrough]]; case 17: memcpy_sse2_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; case 82: memcpy_sse2_64(dd - 82, ss - 82); [[fallthrough]]; case 18: memcpy_sse2_16(dd - 18, ss - 18); *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); break; case 83: memcpy_sse2_64(dd - 83, ss - 83); [[fallthrough]]; case 19: memcpy_sse2_16(dd - 19, ss - 19); *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); dd[-1] = ss[-1]; break; case 84: memcpy_sse2_64(dd - 84, ss - 84); [[fallthrough]]; case 20: memcpy_sse2_16(dd - 20, ss - 20); 
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 85: memcpy_sse2_64(dd - 85, ss - 85); [[fallthrough]]; case 21: memcpy_sse2_16(dd - 21, ss - 21); *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; case 86: memcpy_sse2_64(dd - 86, ss - 86); [[fallthrough]]; case 22: memcpy_sse2_16(dd - 22, ss - 22); *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); break; case 87: memcpy_sse2_64(dd - 87, ss - 87); [[fallthrough]]; case 23: memcpy_sse2_16(dd - 23, ss - 23); *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 88: memcpy_sse2_64(dd - 88, ss - 88); [[fallthrough]]; case 24: memcpy_sse2_16(dd - 24, ss - 24); memcpy_sse2_16(dd - 16, ss - 16); break; case 89: memcpy_sse2_64(dd - 89, ss - 89); [[fallthrough]]; case 25: memcpy_sse2_16(dd - 25, ss - 25); memcpy_sse2_16(dd - 16, ss - 16); break; case 90: memcpy_sse2_64(dd - 90, ss - 90); [[fallthrough]]; case 26: memcpy_sse2_16(dd - 26, ss - 26); memcpy_sse2_16(dd - 16, ss - 16); break; case 91: memcpy_sse2_64(dd - 91, ss - 91); [[fallthrough]]; case 27: memcpy_sse2_16(dd - 27, ss - 27); memcpy_sse2_16(dd - 16, ss - 16); break; case 92: memcpy_sse2_64(dd - 92, ss - 92); [[fallthrough]]; case 28: memcpy_sse2_16(dd - 28, ss - 28); memcpy_sse2_16(dd - 16, ss - 16); break; case 93: memcpy_sse2_64(dd - 93, ss - 93); [[fallthrough]]; case 29: memcpy_sse2_16(dd - 29, ss - 29); memcpy_sse2_16(dd - 16, ss - 16); break; case 94: memcpy_sse2_64(dd - 94, ss - 94); [[fallthrough]]; case 30: memcpy_sse2_16(dd - 30, ss - 30); memcpy_sse2_16(dd - 16, ss - 16); break; case 95: memcpy_sse2_64(dd - 95, ss - 95); [[fallthrough]]; case 31: memcpy_sse2_16(dd - 31, ss - 31); memcpy_sse2_16(dd - 16, ss - 16); break; case 96: memcpy_sse2_64(dd - 96, ss - 96); 
[[fallthrough]]; case 32: memcpy_sse2_32(dd - 32, ss - 32); break; case 97: memcpy_sse2_64(dd - 97, ss - 97); [[fallthrough]]; case 33: memcpy_sse2_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break; case 98: memcpy_sse2_64(dd - 98, ss - 98); [[fallthrough]]; case 34: memcpy_sse2_32(dd - 34, ss - 34); *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); break; case 99: memcpy_sse2_64(dd - 99, ss - 99); [[fallthrough]]; case 35: memcpy_sse2_32(dd - 35, ss - 35); *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); dd[-1] = ss[-1]; break; case 100: memcpy_sse2_64(dd - 100, ss - 100); [[fallthrough]]; case 36: memcpy_sse2_32(dd - 36, ss - 36); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 101: memcpy_sse2_64(dd - 101, ss - 101); [[fallthrough]]; case 37: memcpy_sse2_32(dd - 37, ss - 37); *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; case 102: memcpy_sse2_64(dd - 102, ss - 102); [[fallthrough]]; case 38: memcpy_sse2_32(dd - 38, ss - 38); *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); break; case 103: memcpy_sse2_64(dd - 103, ss - 103); [[fallthrough]]; case 39: memcpy_sse2_32(dd - 39, ss - 39); *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 104: memcpy_sse2_64(dd - 104, ss - 104); [[fallthrough]]; case 40: memcpy_sse2_32(dd - 40, ss - 40); *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); break; case 105: memcpy_sse2_64(dd - 105, ss - 105); [[fallthrough]]; case 41: memcpy_sse2_32(dd - 41, ss - 41); *((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9)); dd[-1] = ss[-1]; break; case 106: memcpy_sse2_64(dd - 106, ss - 106); [[fallthrough]]; case 42: memcpy_sse2_32(dd - 42, ss - 42); 
*((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10)); *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); break; case 107: memcpy_sse2_64(dd - 107, ss - 107); [[fallthrough]]; case 43: memcpy_sse2_32(dd - 43, ss - 43); *((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11)); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 108: memcpy_sse2_64(dd - 108, ss - 108); [[fallthrough]]; case 44: memcpy_sse2_32(dd - 44, ss - 44); *((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12)); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 109: memcpy_sse2_64(dd - 109, ss - 109); [[fallthrough]]; case 45: memcpy_sse2_32(dd - 45, ss - 45); *((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13)); *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; case 110: memcpy_sse2_64(dd - 110, ss - 110); [[fallthrough]]; case 46: memcpy_sse2_32(dd - 46, ss - 46); *((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14)); *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); break; case 111: memcpy_sse2_64(dd - 111, ss - 111); [[fallthrough]]; case 47: memcpy_sse2_32(dd - 47, ss - 47); *((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15)); *((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8)); break; case 112: memcpy_sse2_64(dd - 112, ss - 112); [[fallthrough]]; case 48: memcpy_sse2_32(dd - 48, ss - 48); memcpy_sse2_16(dd - 16, ss - 16); break; case 113: memcpy_sse2_64(dd - 113, ss - 113); [[fallthrough]]; case 49: memcpy_sse2_32(dd - 49, ss - 49); memcpy_sse2_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; case 114: memcpy_sse2_64(dd - 114, ss - 114); [[fallthrough]]; case 50: memcpy_sse2_32(dd - 50, ss - 50); memcpy_sse2_16(dd - 18, ss - 18); *((uint16_unaligned_t*)(dd - 2)) = *((const 
uint16_unaligned_t*)(ss - 2)); break; case 115: memcpy_sse2_64(dd - 115, ss - 115); [[fallthrough]]; case 51: memcpy_sse2_32(dd - 51, ss - 51); memcpy_sse2_16(dd - 19, ss - 19); *((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3)); dd[-1] = ss[-1]; break; case 116: memcpy_sse2_64(dd - 116, ss - 116); [[fallthrough]]; case 52: memcpy_sse2_32(dd - 52, ss - 52); memcpy_sse2_16(dd - 20, ss - 20); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 117: memcpy_sse2_64(dd - 117, ss - 117); [[fallthrough]]; case 53: memcpy_sse2_32(dd - 53, ss - 53); memcpy_sse2_16(dd - 21, ss - 21); *((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5)); dd[-1] = ss[-1]; break; case 118: memcpy_sse2_64(dd - 118, ss - 118); [[fallthrough]]; case 54: memcpy_sse2_32(dd - 54, ss - 54); memcpy_sse2_16(dd - 22, ss - 22); *((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6)); *((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2)); break; case 119: memcpy_sse2_64(dd - 119, ss - 119); [[fallthrough]]; case 55: memcpy_sse2_32(dd - 55, ss - 55); memcpy_sse2_16(dd - 23, ss - 23); *((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7)); *((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4)); break; case 120: memcpy_sse2_64(dd - 120, ss - 120); [[fallthrough]]; case 56: memcpy_sse2_32(dd - 56, ss - 56); memcpy_sse2_16(dd - 24, ss - 24); memcpy_sse2_16(dd - 16, ss - 16); break; case 121: memcpy_sse2_64(dd - 121, ss - 121); [[fallthrough]]; case 57: memcpy_sse2_32(dd - 57, ss - 57); memcpy_sse2_16(dd - 25, ss - 25); memcpy_sse2_16(dd - 16, ss - 16); break; case 122: memcpy_sse2_64(dd - 122, ss - 122); [[fallthrough]]; case 58: memcpy_sse2_32(dd - 58, ss - 58); memcpy_sse2_16(dd - 26, ss - 26); memcpy_sse2_16(dd - 16, ss - 16); break; case 123: memcpy_sse2_64(dd - 123, ss - 123); [[fallthrough]]; case 59: memcpy_sse2_32(dd - 59, ss - 59); memcpy_sse2_16(dd 
- 27, ss - 27); memcpy_sse2_16(dd - 16, ss - 16); break; case 124: memcpy_sse2_64(dd - 124, ss - 124); [[fallthrough]]; case 60: memcpy_sse2_32(dd - 60, ss - 60); memcpy_sse2_16(dd - 28, ss - 28); memcpy_sse2_16(dd - 16, ss - 16); break; case 125: memcpy_sse2_64(dd - 125, ss - 125); [[fallthrough]]; case 61: memcpy_sse2_32(dd - 61, ss - 61); memcpy_sse2_16(dd - 29, ss - 29); memcpy_sse2_16(dd - 16, ss - 16); break; case 126: memcpy_sse2_64(dd - 126, ss - 126); [[fallthrough]]; case 62: memcpy_sse2_32(dd - 62, ss - 62); memcpy_sse2_16(dd - 30, ss - 30); memcpy_sse2_16(dd - 16, ss - 16); break; case 127: memcpy_sse2_64(dd - 127, ss - 127); [[fallthrough]]; case 63: memcpy_sse2_32(dd - 63, ss - 63); memcpy_sse2_16(dd - 31, ss - 31); memcpy_sse2_16(dd - 16, ss - 16); break; case 128: memcpy_sse2_128(dd - 128, ss - 128); break; } return dst; } //--------------------------------------------------------------------- // main routine //--------------------------------------------------------------------- void* memcpy_fast_sse(void * __restrict destination, const void * __restrict source, size_t size) /// NOLINT(misc-definitions-in-headers) { unsigned char *dst = (unsigned char*)destination; const unsigned char *src = (const unsigned char*)source; static size_t cachesize = 0x200000; // L2-cache size size_t padding; // small memory copy if (size <= 128) { return memcpy_tiny(dst, src, size); } // align destination to 16 bytes boundary padding = (16 - (((size_t)dst) & 15)) & 15; if (padding > 0) { __m128i head = _mm_loadu_si128(reinterpret_cast(src)); _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head); dst += padding; src += padding; size -= padding; } // medium size copy if (size <= cachesize) { __m128i c0, c1, c2, c3, c4, c5, c6, c7; for (; size >= 128; size -= 128) { c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); c4 
= _mm_loadu_si128((reinterpret_cast(src)) + 4); c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); src += 128; _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); _mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); dst += 128; } } else { // big memory copy __m128i c0, c1, c2, c3, c4, c5, c6, c7; _mm_prefetch((const char*)(src), _MM_HINT_NTA); if ((((size_t)src) & 15) == 0) { // source aligned for (; size >= 128; size -= 128) { c0 = _mm_load_si128((reinterpret_cast(src)) + 0); c1 = _mm_load_si128((reinterpret_cast(src)) + 1); c2 = _mm_load_si128((reinterpret_cast(src)) + 2); c3 = _mm_load_si128((reinterpret_cast(src)) + 3); c4 = _mm_load_si128((reinterpret_cast(src)) + 4); c5 = _mm_load_si128((reinterpret_cast(src)) + 5); c6 = _mm_load_si128((reinterpret_cast(src)) + 6); c7 = _mm_load_si128((reinterpret_cast(src)) + 7); _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); src += 128; _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); dst += 128; } } else { // source unaligned for (; 
size >= 128; size -= 128) { c0 = _mm_loadu_si128((reinterpret_cast(src)) + 0); c1 = _mm_loadu_si128((reinterpret_cast(src)) + 1); c2 = _mm_loadu_si128((reinterpret_cast(src)) + 2); c3 = _mm_loadu_si128((reinterpret_cast(src)) + 3); c4 = _mm_loadu_si128((reinterpret_cast(src)) + 4); c5 = _mm_loadu_si128((reinterpret_cast(src)) + 5); c6 = _mm_loadu_si128((reinterpret_cast(src)) + 6); c7 = _mm_loadu_si128((reinterpret_cast(src)) + 7); _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); src += 128; _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6); _mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7); dst += 128; } } _mm_sfence(); } memcpy_tiny(dst, src, size); return destination; }