ClickHouse/utils/memcpy-bench/FastMemcpy.h
2021-03-14 19:52:51 +03:00

771 lines
26 KiB
C++

#pragma once
//=====================================================================
//
// FastMemcpy.c - skywind3000@163.com, 2015
//
// feature:
// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
//
//=====================================================================
#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>
//---------------------------------------------------------------------
// force inline for compilers
//---------------------------------------------------------------------
#ifndef INLINE
#ifdef __GNUC__
#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
#define INLINE __inline__ __attribute__((always_inline))
#else
#define INLINE __inline__
#endif
#elif defined(_MSC_VER)
#define INLINE __forceinline
#elif (defined(__BORLANDC__) || defined(__WATCOMC__))
#define INLINE __inline
#else
#define INLINE
#endif
#endif
typedef __attribute__((__aligned__(1))) uint16_t uint16_unaligned_t;
typedef __attribute__((__aligned__(1))) uint32_t uint32_unaligned_t;
typedef __attribute__((__aligned__(1))) uint64_t uint64_unaligned_t;
//---------------------------------------------------------------------
// fast copy for different sizes
//---------------------------------------------------------------------
static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src)
{
__m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
}
static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src)
{
__m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
__m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
}
static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src)
{
__m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
__m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
__m128i m2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
__m128i m3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3);
}
static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src)
{
__m128i m0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
__m128i m1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
__m128i m2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
__m128i m3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
__m128i m4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
__m128i m5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
__m128i m6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
__m128i m7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 0, m0);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 1, m1);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 2, m2);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 3, m3);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 4, m4);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 5, m5);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 6, m6);
_mm_storeu_si128((reinterpret_cast<__m128i*>(dst)) + 7, m7);
}
//---------------------------------------------------------------------
// tiny memory copy with jump table optimized
//---------------------------------------------------------------------
/// Attribute is used to avoid an error with undefined behaviour sanitizer
/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
__attribute__((__no_sanitize__("undefined"))) inline void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
{
unsigned char *dd = ((unsigned char*)dst) + size;
const unsigned char *ss = ((const unsigned char*)src) + size;
switch (size)
{
case 64:
memcpy_sse2_64(dd - 64, ss - 64);
[[fallthrough]];
case 0:
break;
case 65:
memcpy_sse2_64(dd - 65, ss - 65);
[[fallthrough]];
case 1:
dd[-1] = ss[-1];
break;
case 66:
memcpy_sse2_64(dd - 66, ss - 66);
[[fallthrough]];
case 2:
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 67:
memcpy_sse2_64(dd - 67, ss - 67);
[[fallthrough]];
case 3:
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 68:
memcpy_sse2_64(dd - 68, ss - 68);
[[fallthrough]];
case 4:
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 69:
memcpy_sse2_64(dd - 69, ss - 69);
[[fallthrough]];
case 5:
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 70:
memcpy_sse2_64(dd - 70, ss - 70);
[[fallthrough]];
case 6:
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 71:
memcpy_sse2_64(dd - 71, ss - 71);
[[fallthrough]];
case 7:
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 72:
memcpy_sse2_64(dd - 72, ss - 72);
[[fallthrough]];
case 8:
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 73:
memcpy_sse2_64(dd - 73, ss - 73);
[[fallthrough]];
case 9:
*((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9));
dd[-1] = ss[-1];
break;
case 74:
memcpy_sse2_64(dd - 74, ss - 74);
[[fallthrough]];
case 10:
*((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 75:
memcpy_sse2_64(dd - 75, ss - 75);
[[fallthrough]];
case 11:
*((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 76:
memcpy_sse2_64(dd - 76, ss - 76);
[[fallthrough]];
case 12:
*((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 77:
memcpy_sse2_64(dd - 77, ss - 77);
[[fallthrough]];
case 13:
*((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13));
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 78:
memcpy_sse2_64(dd - 78, ss - 78);
[[fallthrough]];
case 14:
*((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14));
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 79:
memcpy_sse2_64(dd - 79, ss - 79);
[[fallthrough]];
case 15:
*((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15));
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 80:
memcpy_sse2_64(dd - 80, ss - 80);
[[fallthrough]];
case 16:
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 81:
memcpy_sse2_64(dd - 81, ss - 81);
[[fallthrough]];
case 17:
memcpy_sse2_16(dd - 17, ss - 17);
dd[-1] = ss[-1];
break;
case 82:
memcpy_sse2_64(dd - 82, ss - 82);
[[fallthrough]];
case 18:
memcpy_sse2_16(dd - 18, ss - 18);
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 83:
memcpy_sse2_64(dd - 83, ss - 83);
[[fallthrough]];
case 19:
memcpy_sse2_16(dd - 19, ss - 19);
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 84:
memcpy_sse2_64(dd - 84, ss - 84);
[[fallthrough]];
case 20:
memcpy_sse2_16(dd - 20, ss - 20);
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 85:
memcpy_sse2_64(dd - 85, ss - 85);
[[fallthrough]];
case 21:
memcpy_sse2_16(dd - 21, ss - 21);
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 86:
memcpy_sse2_64(dd - 86, ss - 86);
[[fallthrough]];
case 22:
memcpy_sse2_16(dd - 22, ss - 22);
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 87:
memcpy_sse2_64(dd - 87, ss - 87);
[[fallthrough]];
case 23:
memcpy_sse2_16(dd - 23, ss - 23);
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 88:
memcpy_sse2_64(dd - 88, ss - 88);
[[fallthrough]];
case 24:
memcpy_sse2_16(dd - 24, ss - 24);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 89:
memcpy_sse2_64(dd - 89, ss - 89);
[[fallthrough]];
case 25:
memcpy_sse2_16(dd - 25, ss - 25);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 90:
memcpy_sse2_64(dd - 90, ss - 90);
[[fallthrough]];
case 26:
memcpy_sse2_16(dd - 26, ss - 26);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 91:
memcpy_sse2_64(dd - 91, ss - 91);
[[fallthrough]];
case 27:
memcpy_sse2_16(dd - 27, ss - 27);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 92:
memcpy_sse2_64(dd - 92, ss - 92);
[[fallthrough]];
case 28:
memcpy_sse2_16(dd - 28, ss - 28);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 93:
memcpy_sse2_64(dd - 93, ss - 93);
[[fallthrough]];
case 29:
memcpy_sse2_16(dd - 29, ss - 29);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 94:
memcpy_sse2_64(dd - 94, ss - 94);
[[fallthrough]];
case 30:
memcpy_sse2_16(dd - 30, ss - 30);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 95:
memcpy_sse2_64(dd - 95, ss - 95);
[[fallthrough]];
case 31:
memcpy_sse2_16(dd - 31, ss - 31);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 96:
memcpy_sse2_64(dd - 96, ss - 96);
[[fallthrough]];
case 32:
memcpy_sse2_32(dd - 32, ss - 32);
break;
case 97:
memcpy_sse2_64(dd - 97, ss - 97);
[[fallthrough]];
case 33:
memcpy_sse2_32(dd - 33, ss - 33);
dd[-1] = ss[-1];
break;
case 98:
memcpy_sse2_64(dd - 98, ss - 98);
[[fallthrough]];
case 34:
memcpy_sse2_32(dd - 34, ss - 34);
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 99:
memcpy_sse2_64(dd - 99, ss - 99);
[[fallthrough]];
case 35:
memcpy_sse2_32(dd - 35, ss - 35);
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 100:
memcpy_sse2_64(dd - 100, ss - 100);
[[fallthrough]];
case 36:
memcpy_sse2_32(dd - 36, ss - 36);
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 101:
memcpy_sse2_64(dd - 101, ss - 101);
[[fallthrough]];
case 37:
memcpy_sse2_32(dd - 37, ss - 37);
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 102:
memcpy_sse2_64(dd - 102, ss - 102);
[[fallthrough]];
case 38:
memcpy_sse2_32(dd - 38, ss - 38);
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 103:
memcpy_sse2_64(dd - 103, ss - 103);
[[fallthrough]];
case 39:
memcpy_sse2_32(dd - 39, ss - 39);
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 104:
memcpy_sse2_64(dd - 104, ss - 104);
[[fallthrough]];
case 40:
memcpy_sse2_32(dd - 40, ss - 40);
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 105:
memcpy_sse2_64(dd - 105, ss - 105);
[[fallthrough]];
case 41:
memcpy_sse2_32(dd - 41, ss - 41);
*((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9));
dd[-1] = ss[-1];
break;
case 106:
memcpy_sse2_64(dd - 106, ss - 106);
[[fallthrough]];
case 42:
memcpy_sse2_32(dd - 42, ss - 42);
*((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 107:
memcpy_sse2_64(dd - 107, ss - 107);
[[fallthrough]];
case 43:
memcpy_sse2_32(dd - 43, ss - 43);
*((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 108:
memcpy_sse2_64(dd - 108, ss - 108);
[[fallthrough]];
case 44:
memcpy_sse2_32(dd - 44, ss - 44);
*((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 109:
memcpy_sse2_64(dd - 109, ss - 109);
[[fallthrough]];
case 45:
memcpy_sse2_32(dd - 45, ss - 45);
*((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13));
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 110:
memcpy_sse2_64(dd - 110, ss - 110);
[[fallthrough]];
case 46:
memcpy_sse2_32(dd - 46, ss - 46);
*((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14));
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 111:
memcpy_sse2_64(dd - 111, ss - 111);
[[fallthrough]];
case 47:
memcpy_sse2_32(dd - 47, ss - 47);
*((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15));
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
break;
case 112:
memcpy_sse2_64(dd - 112, ss - 112);
[[fallthrough]];
case 48:
memcpy_sse2_32(dd - 48, ss - 48);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 113:
memcpy_sse2_64(dd - 113, ss - 113);
[[fallthrough]];
case 49:
memcpy_sse2_32(dd - 49, ss - 49);
memcpy_sse2_16(dd - 17, ss - 17);
dd[-1] = ss[-1];
break;
case 114:
memcpy_sse2_64(dd - 114, ss - 114);
[[fallthrough]];
case 50:
memcpy_sse2_32(dd - 50, ss - 50);
memcpy_sse2_16(dd - 18, ss - 18);
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 115:
memcpy_sse2_64(dd - 115, ss - 115);
[[fallthrough]];
case 51:
memcpy_sse2_32(dd - 51, ss - 51);
memcpy_sse2_16(dd - 19, ss - 19);
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
dd[-1] = ss[-1];
break;
case 116:
memcpy_sse2_64(dd - 116, ss - 116);
[[fallthrough]];
case 52:
memcpy_sse2_32(dd - 52, ss - 52);
memcpy_sse2_16(dd - 20, ss - 20);
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 117:
memcpy_sse2_64(dd - 117, ss - 117);
[[fallthrough]];
case 53:
memcpy_sse2_32(dd - 53, ss - 53);
memcpy_sse2_16(dd - 21, ss - 21);
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
dd[-1] = ss[-1];
break;
case 118:
memcpy_sse2_64(dd - 118, ss - 118);
[[fallthrough]];
case 54:
memcpy_sse2_32(dd - 54, ss - 54);
memcpy_sse2_16(dd - 22, ss - 22);
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
break;
case 119:
memcpy_sse2_64(dd - 119, ss - 119);
[[fallthrough]];
case 55:
memcpy_sse2_32(dd - 55, ss - 55);
memcpy_sse2_16(dd - 23, ss - 23);
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
break;
case 120:
memcpy_sse2_64(dd - 120, ss - 120);
[[fallthrough]];
case 56:
memcpy_sse2_32(dd - 56, ss - 56);
memcpy_sse2_16(dd - 24, ss - 24);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 121:
memcpy_sse2_64(dd - 121, ss - 121);
[[fallthrough]];
case 57:
memcpy_sse2_32(dd - 57, ss - 57);
memcpy_sse2_16(dd - 25, ss - 25);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 122:
memcpy_sse2_64(dd - 122, ss - 122);
[[fallthrough]];
case 58:
memcpy_sse2_32(dd - 58, ss - 58);
memcpy_sse2_16(dd - 26, ss - 26);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 123:
memcpy_sse2_64(dd - 123, ss - 123);
[[fallthrough]];
case 59:
memcpy_sse2_32(dd - 59, ss - 59);
memcpy_sse2_16(dd - 27, ss - 27);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 124:
memcpy_sse2_64(dd - 124, ss - 124);
[[fallthrough]];
case 60:
memcpy_sse2_32(dd - 60, ss - 60);
memcpy_sse2_16(dd - 28, ss - 28);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 125:
memcpy_sse2_64(dd - 125, ss - 125);
[[fallthrough]];
case 61:
memcpy_sse2_32(dd - 61, ss - 61);
memcpy_sse2_16(dd - 29, ss - 29);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 126:
memcpy_sse2_64(dd - 126, ss - 126);
[[fallthrough]];
case 62:
memcpy_sse2_32(dd - 62, ss - 62);
memcpy_sse2_16(dd - 30, ss - 30);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 127:
memcpy_sse2_64(dd - 127, ss - 127);
[[fallthrough]];
case 63:
memcpy_sse2_32(dd - 63, ss - 63);
memcpy_sse2_16(dd - 31, ss - 31);
memcpy_sse2_16(dd - 16, ss - 16);
break;
case 128:
memcpy_sse2_128(dd - 128, ss - 128);
break;
}
return dst;
}
//---------------------------------------------------------------------
// main routine
//---------------------------------------------------------------------
void* memcpy_fast_sse(void * __restrict destination, const void * __restrict source, size_t size)
{
unsigned char *dst = (unsigned char*)destination;
const unsigned char *src = (const unsigned char*)source;
static size_t cachesize = 0x200000; // L2-cache size
size_t padding;
// small memory copy
if (size <= 128)
{
return memcpy_tiny(dst, src, size);
}
// align destination to 16 bytes boundary
padding = (16 - (((size_t)dst) & 15)) & 15;
if (padding > 0)
{
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
dst += padding;
src += padding;
size -= padding;
}
// medium size copy
if (size <= cachesize)
{
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
for (; size >= 128; size -= 128)
{
c0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
c1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
c2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
c3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
c4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
c5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
c6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
c7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
src += 128;
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
dst += 128;
}
}
else
{ // big memory copy
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
_mm_prefetch((const char*)(src), _MM_HINT_NTA);
if ((((size_t)src) & 15) == 0)
{ // source aligned
for (; size >= 128; size -= 128)
{
c0 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 0);
c1 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 1);
c2 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 2);
c3 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 3);
c4 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 4);
c5 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 5);
c6 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 6);
c7 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 7);
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
src += 128;
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
dst += 128;
}
}
else
{ // source unaligned
for (; size >= 128; size -= 128)
{
c0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
c1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
c2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
c3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
c4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
c5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
c6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
c7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
src += 128;
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
dst += 128;
}
}
_mm_sfence();
}
memcpy_tiny(dst, src, size);
return destination;
}