// Mirror of https://github.com/ClickHouse/ClickHouse.git (synced 2024-11-23 08:02:02 +00:00); 771 lines, 26 KiB, C++.
#pragma once
|
|
|
|
//=====================================================================
|
|
//
|
|
// FastMemcpy.c - skywind3000@163.com, 2015
|
|
//
|
|
// feature:
|
|
// 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc5.1)
|
|
//
|
|
//=====================================================================
|
|
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#include <emmintrin.h>
|
|
|
|
|
|
//---------------------------------------------------------------------
|
|
// force inline for compilers
|
|
//---------------------------------------------------------------------
|
|
#ifndef INLINE
/// Pick the strongest "inline this" spelling the compiler understands.
/// A definition of INLINE supplied before this point is left untouched.
#    if defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
#        define INLINE __inline__ __attribute__((always_inline))
#    elif defined(__GNUC__)
#        define INLINE __inline__
#    elif defined(_MSC_VER)
#        define INLINE __forceinline
#    elif defined(__BORLANDC__) || defined(__WATCOMC__)
#        define INLINE __inline
#    else
#        define INLINE
#    endif
#endif
|
|
|
|
/// Fixed-width integer types with alignment lowered to 1, so that they can be read and written
/// at arbitrary byte offsets.
/// `__may_alias__` is added because values of these types are accessed through pointers obtained
/// by casting byte pointers into buffers of unrelated types; without it such accesses are subject
/// to type-based alias analysis and may be reordered against accesses made through the real type.
typedef __attribute__((__aligned__(1), __may_alias__)) uint16_t uint16_unaligned_t;
typedef __attribute__((__aligned__(1), __may_alias__)) uint32_t uint32_unaligned_t;
typedef __attribute__((__aligned__(1), __may_alias__)) uint64_t uint64_unaligned_t;
|
|
|
|
//---------------------------------------------------------------------
|
|
// fast copy for different sizes
|
|
//---------------------------------------------------------------------
|
|
/// Copies exactly 16 bytes from `src` to `dst` with one unaligned SSE2 load followed by
/// one unaligned store, so neither pointer has to be aligned.
static INLINE void memcpy_sse2_16(void * __restrict dst, const void * __restrict src)
{
    const __m128i * in = reinterpret_cast<const __m128i *>(src);
    __m128i * out = reinterpret_cast<__m128i *>(dst);

    const __m128i chunk = _mm_loadu_si128(in);
    _mm_storeu_si128(out, chunk);
}
|
|
|
|
/// Copies exactly 32 bytes from `src` to `dst` as two unaligned 16-byte moves.
/// Both loads are performed before the first store.
static INLINE void memcpy_sse2_32(void * __restrict dst, const void * __restrict src)
{
    const __m128i * in = reinterpret_cast<const __m128i *>(src);
    __m128i * out = reinterpret_cast<__m128i *>(dst);

    const __m128i lo = _mm_loadu_si128(in + 0);
    const __m128i hi = _mm_loadu_si128(in + 1);

    _mm_storeu_si128(out + 0, lo);
    _mm_storeu_si128(out + 1, hi);
}
|
|
|
|
/// Copies exactly 64 bytes from `src` to `dst` as four unaligned 16-byte moves.
/// All four loads are performed before the first store.
static INLINE void memcpy_sse2_64(void * __restrict dst, const void * __restrict src)
{
    const __m128i * in = reinterpret_cast<const __m128i *>(src);
    __m128i * out = reinterpret_cast<__m128i *>(dst);

    const __m128i r0 = _mm_loadu_si128(in + 0);
    const __m128i r1 = _mm_loadu_si128(in + 1);
    const __m128i r2 = _mm_loadu_si128(in + 2);
    const __m128i r3 = _mm_loadu_si128(in + 3);

    _mm_storeu_si128(out + 0, r0);
    _mm_storeu_si128(out + 1, r1);
    _mm_storeu_si128(out + 2, r2);
    _mm_storeu_si128(out + 3, r3);
}
|
|
|
|
/// Copies exactly 128 bytes from `src` to `dst` as eight unaligned 16-byte moves.
/// All eight loads are performed before the first store.
static INLINE void memcpy_sse2_128(void * __restrict dst, const void * __restrict src)
{
    const __m128i * in = reinterpret_cast<const __m128i *>(src);
    __m128i * out = reinterpret_cast<__m128i *>(dst);

    const __m128i r0 = _mm_loadu_si128(in + 0);
    const __m128i r1 = _mm_loadu_si128(in + 1);
    const __m128i r2 = _mm_loadu_si128(in + 2);
    const __m128i r3 = _mm_loadu_si128(in + 3);
    const __m128i r4 = _mm_loadu_si128(in + 4);
    const __m128i r5 = _mm_loadu_si128(in + 5);
    const __m128i r6 = _mm_loadu_si128(in + 6);
    const __m128i r7 = _mm_loadu_si128(in + 7);

    _mm_storeu_si128(out + 0, r0);
    _mm_storeu_si128(out + 1, r1);
    _mm_storeu_si128(out + 2, r2);
    _mm_storeu_si128(out + 3, r3);
    _mm_storeu_si128(out + 4, r4);
    _mm_storeu_si128(out + 5, r5);
    _mm_storeu_si128(out + 6, r6);
    _mm_storeu_si128(out + 7, r7);
}
|
|
|
|
|
|
//---------------------------------------------------------------------
|
|
// tiny memory copy with jump table optimized
|
|
//---------------------------------------------------------------------
|
|
/// Attribute is used to avoid an error with undefined behaviour sanitizer
|
|
/// ../contrib/FastMemcpy/FastMemcpy.h:91:56: runtime error: applying zero offset to null pointer
|
|
/// Found by 01307_orc_output_format.sh, cause - ORCBlockInputFormat and external ORC library.
|
|
__attribute__((__no_sanitize__("undefined"))) inline void *memcpy_tiny(void * __restrict dst, const void * __restrict src, size_t size)
|
|
{
|
|
unsigned char *dd = ((unsigned char*)dst) + size;
|
|
const unsigned char *ss = ((const unsigned char*)src) + size;
|
|
|
|
switch (size)
|
|
{
|
|
case 64:
|
|
memcpy_sse2_64(dd - 64, ss - 64);
|
|
[[fallthrough]];
|
|
case 0:
|
|
break;
|
|
|
|
case 65:
|
|
memcpy_sse2_64(dd - 65, ss - 65);
|
|
[[fallthrough]];
|
|
case 1:
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 66:
|
|
memcpy_sse2_64(dd - 66, ss - 66);
|
|
[[fallthrough]];
|
|
case 2:
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 67:
|
|
memcpy_sse2_64(dd - 67, ss - 67);
|
|
[[fallthrough]];
|
|
case 3:
|
|
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 68:
|
|
memcpy_sse2_64(dd - 68, ss - 68);
|
|
[[fallthrough]];
|
|
case 4:
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 69:
|
|
memcpy_sse2_64(dd - 69, ss - 69);
|
|
[[fallthrough]];
|
|
case 5:
|
|
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 70:
|
|
memcpy_sse2_64(dd - 70, ss - 70);
|
|
[[fallthrough]];
|
|
case 6:
|
|
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 71:
|
|
memcpy_sse2_64(dd - 71, ss - 71);
|
|
[[fallthrough]];
|
|
case 7:
|
|
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 72:
|
|
memcpy_sse2_64(dd - 72, ss - 72);
|
|
[[fallthrough]];
|
|
case 8:
|
|
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
|
|
break;
|
|
|
|
case 73:
|
|
memcpy_sse2_64(dd - 73, ss - 73);
|
|
[[fallthrough]];
|
|
case 9:
|
|
*((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 74:
|
|
memcpy_sse2_64(dd - 74, ss - 74);
|
|
[[fallthrough]];
|
|
case 10:
|
|
*((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10));
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 75:
|
|
memcpy_sse2_64(dd - 75, ss - 75);
|
|
[[fallthrough]];
|
|
case 11:
|
|
*((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11));
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 76:
|
|
memcpy_sse2_64(dd - 76, ss - 76);
|
|
[[fallthrough]];
|
|
case 12:
|
|
*((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12));
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 77:
|
|
memcpy_sse2_64(dd - 77, ss - 77);
|
|
[[fallthrough]];
|
|
case 13:
|
|
*((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13));
|
|
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 78:
|
|
memcpy_sse2_64(dd - 78, ss - 78);
|
|
[[fallthrough]];
|
|
case 14:
|
|
*((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14));
|
|
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
|
|
break;
|
|
|
|
case 79:
|
|
memcpy_sse2_64(dd - 79, ss - 79);
|
|
[[fallthrough]];
|
|
case 15:
|
|
*((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15));
|
|
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
|
|
break;
|
|
|
|
case 80:
|
|
memcpy_sse2_64(dd - 80, ss - 80);
|
|
[[fallthrough]];
|
|
case 16:
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 81:
|
|
memcpy_sse2_64(dd - 81, ss - 81);
|
|
[[fallthrough]];
|
|
case 17:
|
|
memcpy_sse2_16(dd - 17, ss - 17);
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 82:
|
|
memcpy_sse2_64(dd - 82, ss - 82);
|
|
[[fallthrough]];
|
|
case 18:
|
|
memcpy_sse2_16(dd - 18, ss - 18);
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 83:
|
|
memcpy_sse2_64(dd - 83, ss - 83);
|
|
[[fallthrough]];
|
|
case 19:
|
|
memcpy_sse2_16(dd - 19, ss - 19);
|
|
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 84:
|
|
memcpy_sse2_64(dd - 84, ss - 84);
|
|
[[fallthrough]];
|
|
case 20:
|
|
memcpy_sse2_16(dd - 20, ss - 20);
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 85:
|
|
memcpy_sse2_64(dd - 85, ss - 85);
|
|
[[fallthrough]];
|
|
case 21:
|
|
memcpy_sse2_16(dd - 21, ss - 21);
|
|
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 86:
|
|
memcpy_sse2_64(dd - 86, ss - 86);
|
|
[[fallthrough]];
|
|
case 22:
|
|
memcpy_sse2_16(dd - 22, ss - 22);
|
|
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 87:
|
|
memcpy_sse2_64(dd - 87, ss - 87);
|
|
[[fallthrough]];
|
|
case 23:
|
|
memcpy_sse2_16(dd - 23, ss - 23);
|
|
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 88:
|
|
memcpy_sse2_64(dd - 88, ss - 88);
|
|
[[fallthrough]];
|
|
case 24:
|
|
memcpy_sse2_16(dd - 24, ss - 24);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 89:
|
|
memcpy_sse2_64(dd - 89, ss - 89);
|
|
[[fallthrough]];
|
|
case 25:
|
|
memcpy_sse2_16(dd - 25, ss - 25);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 90:
|
|
memcpy_sse2_64(dd - 90, ss - 90);
|
|
[[fallthrough]];
|
|
case 26:
|
|
memcpy_sse2_16(dd - 26, ss - 26);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 91:
|
|
memcpy_sse2_64(dd - 91, ss - 91);
|
|
[[fallthrough]];
|
|
case 27:
|
|
memcpy_sse2_16(dd - 27, ss - 27);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 92:
|
|
memcpy_sse2_64(dd - 92, ss - 92);
|
|
[[fallthrough]];
|
|
case 28:
|
|
memcpy_sse2_16(dd - 28, ss - 28);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 93:
|
|
memcpy_sse2_64(dd - 93, ss - 93);
|
|
[[fallthrough]];
|
|
case 29:
|
|
memcpy_sse2_16(dd - 29, ss - 29);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 94:
|
|
memcpy_sse2_64(dd - 94, ss - 94);
|
|
[[fallthrough]];
|
|
case 30:
|
|
memcpy_sse2_16(dd - 30, ss - 30);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 95:
|
|
memcpy_sse2_64(dd - 95, ss - 95);
|
|
[[fallthrough]];
|
|
case 31:
|
|
memcpy_sse2_16(dd - 31, ss - 31);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 96:
|
|
memcpy_sse2_64(dd - 96, ss - 96);
|
|
[[fallthrough]];
|
|
case 32:
|
|
memcpy_sse2_32(dd - 32, ss - 32);
|
|
break;
|
|
|
|
case 97:
|
|
memcpy_sse2_64(dd - 97, ss - 97);
|
|
[[fallthrough]];
|
|
case 33:
|
|
memcpy_sse2_32(dd - 33, ss - 33);
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 98:
|
|
memcpy_sse2_64(dd - 98, ss - 98);
|
|
[[fallthrough]];
|
|
case 34:
|
|
memcpy_sse2_32(dd - 34, ss - 34);
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 99:
|
|
memcpy_sse2_64(dd - 99, ss - 99);
|
|
[[fallthrough]];
|
|
case 35:
|
|
memcpy_sse2_32(dd - 35, ss - 35);
|
|
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 100:
|
|
memcpy_sse2_64(dd - 100, ss - 100);
|
|
[[fallthrough]];
|
|
case 36:
|
|
memcpy_sse2_32(dd - 36, ss - 36);
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 101:
|
|
memcpy_sse2_64(dd - 101, ss - 101);
|
|
[[fallthrough]];
|
|
case 37:
|
|
memcpy_sse2_32(dd - 37, ss - 37);
|
|
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 102:
|
|
memcpy_sse2_64(dd - 102, ss - 102);
|
|
[[fallthrough]];
|
|
case 38:
|
|
memcpy_sse2_32(dd - 38, ss - 38);
|
|
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 103:
|
|
memcpy_sse2_64(dd - 103, ss - 103);
|
|
[[fallthrough]];
|
|
case 39:
|
|
memcpy_sse2_32(dd - 39, ss - 39);
|
|
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 104:
|
|
memcpy_sse2_64(dd - 104, ss - 104);
|
|
[[fallthrough]];
|
|
case 40:
|
|
memcpy_sse2_32(dd - 40, ss - 40);
|
|
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
|
|
break;
|
|
|
|
case 105:
|
|
memcpy_sse2_64(dd - 105, ss - 105);
|
|
[[fallthrough]];
|
|
case 41:
|
|
memcpy_sse2_32(dd - 41, ss - 41);
|
|
*((uint64_unaligned_t*)(dd - 9)) = *((const uint64_unaligned_t*)(ss - 9));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 106:
|
|
memcpy_sse2_64(dd - 106, ss - 106);
|
|
[[fallthrough]];
|
|
case 42:
|
|
memcpy_sse2_32(dd - 42, ss - 42);
|
|
*((uint64_unaligned_t*)(dd - 10)) = *((const uint64_unaligned_t*)(ss - 10));
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 107:
|
|
memcpy_sse2_64(dd - 107, ss - 107);
|
|
[[fallthrough]];
|
|
case 43:
|
|
memcpy_sse2_32(dd - 43, ss - 43);
|
|
*((uint64_unaligned_t*)(dd - 11)) = *((const uint64_unaligned_t*)(ss - 11));
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 108:
|
|
memcpy_sse2_64(dd - 108, ss - 108);
|
|
[[fallthrough]];
|
|
case 44:
|
|
memcpy_sse2_32(dd - 44, ss - 44);
|
|
*((uint64_unaligned_t*)(dd - 12)) = *((const uint64_unaligned_t*)(ss - 12));
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 109:
|
|
memcpy_sse2_64(dd - 109, ss - 109);
|
|
[[fallthrough]];
|
|
case 45:
|
|
memcpy_sse2_32(dd - 45, ss - 45);
|
|
*((uint64_unaligned_t*)(dd - 13)) = *((const uint64_unaligned_t*)(ss - 13));
|
|
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 110:
|
|
memcpy_sse2_64(dd - 110, ss - 110);
|
|
[[fallthrough]];
|
|
case 46:
|
|
memcpy_sse2_32(dd - 46, ss - 46);
|
|
*((uint64_unaligned_t*)(dd - 14)) = *((const uint64_unaligned_t*)(ss - 14));
|
|
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
|
|
break;
|
|
|
|
case 111:
|
|
memcpy_sse2_64(dd - 111, ss - 111);
|
|
[[fallthrough]];
|
|
case 47:
|
|
memcpy_sse2_32(dd - 47, ss - 47);
|
|
*((uint64_unaligned_t*)(dd - 15)) = *((const uint64_unaligned_t*)(ss - 15));
|
|
*((uint64_unaligned_t*)(dd - 8)) = *((const uint64_unaligned_t*)(ss - 8));
|
|
break;
|
|
|
|
case 112:
|
|
memcpy_sse2_64(dd - 112, ss - 112);
|
|
[[fallthrough]];
|
|
case 48:
|
|
memcpy_sse2_32(dd - 48, ss - 48);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 113:
|
|
memcpy_sse2_64(dd - 113, ss - 113);
|
|
[[fallthrough]];
|
|
case 49:
|
|
memcpy_sse2_32(dd - 49, ss - 49);
|
|
memcpy_sse2_16(dd - 17, ss - 17);
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 114:
|
|
memcpy_sse2_64(dd - 114, ss - 114);
|
|
[[fallthrough]];
|
|
case 50:
|
|
memcpy_sse2_32(dd - 50, ss - 50);
|
|
memcpy_sse2_16(dd - 18, ss - 18);
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 115:
|
|
memcpy_sse2_64(dd - 115, ss - 115);
|
|
[[fallthrough]];
|
|
case 51:
|
|
memcpy_sse2_32(dd - 51, ss - 51);
|
|
memcpy_sse2_16(dd - 19, ss - 19);
|
|
*((uint16_unaligned_t*)(dd - 3)) = *((const uint16_unaligned_t*)(ss - 3));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 116:
|
|
memcpy_sse2_64(dd - 116, ss - 116);
|
|
[[fallthrough]];
|
|
case 52:
|
|
memcpy_sse2_32(dd - 52, ss - 52);
|
|
memcpy_sse2_16(dd - 20, ss - 20);
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 117:
|
|
memcpy_sse2_64(dd - 117, ss - 117);
|
|
[[fallthrough]];
|
|
case 53:
|
|
memcpy_sse2_32(dd - 53, ss - 53);
|
|
memcpy_sse2_16(dd - 21, ss - 21);
|
|
*((uint32_unaligned_t*)(dd - 5)) = *((const uint32_unaligned_t*)(ss - 5));
|
|
dd[-1] = ss[-1];
|
|
break;
|
|
|
|
case 118:
|
|
memcpy_sse2_64(dd - 118, ss - 118);
|
|
[[fallthrough]];
|
|
case 54:
|
|
memcpy_sse2_32(dd - 54, ss - 54);
|
|
memcpy_sse2_16(dd - 22, ss - 22);
|
|
*((uint32_unaligned_t*)(dd - 6)) = *((const uint32_unaligned_t*)(ss - 6));
|
|
*((uint16_unaligned_t*)(dd - 2)) = *((const uint16_unaligned_t*)(ss - 2));
|
|
break;
|
|
|
|
case 119:
|
|
memcpy_sse2_64(dd - 119, ss - 119);
|
|
[[fallthrough]];
|
|
case 55:
|
|
memcpy_sse2_32(dd - 55, ss - 55);
|
|
memcpy_sse2_16(dd - 23, ss - 23);
|
|
*((uint32_unaligned_t*)(dd - 7)) = *((const uint32_unaligned_t*)(ss - 7));
|
|
*((uint32_unaligned_t*)(dd - 4)) = *((const uint32_unaligned_t*)(ss - 4));
|
|
break;
|
|
|
|
case 120:
|
|
memcpy_sse2_64(dd - 120, ss - 120);
|
|
[[fallthrough]];
|
|
case 56:
|
|
memcpy_sse2_32(dd - 56, ss - 56);
|
|
memcpy_sse2_16(dd - 24, ss - 24);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 121:
|
|
memcpy_sse2_64(dd - 121, ss - 121);
|
|
[[fallthrough]];
|
|
case 57:
|
|
memcpy_sse2_32(dd - 57, ss - 57);
|
|
memcpy_sse2_16(dd - 25, ss - 25);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 122:
|
|
memcpy_sse2_64(dd - 122, ss - 122);
|
|
[[fallthrough]];
|
|
case 58:
|
|
memcpy_sse2_32(dd - 58, ss - 58);
|
|
memcpy_sse2_16(dd - 26, ss - 26);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 123:
|
|
memcpy_sse2_64(dd - 123, ss - 123);
|
|
[[fallthrough]];
|
|
case 59:
|
|
memcpy_sse2_32(dd - 59, ss - 59);
|
|
memcpy_sse2_16(dd - 27, ss - 27);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 124:
|
|
memcpy_sse2_64(dd - 124, ss - 124);
|
|
[[fallthrough]];
|
|
case 60:
|
|
memcpy_sse2_32(dd - 60, ss - 60);
|
|
memcpy_sse2_16(dd - 28, ss - 28);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 125:
|
|
memcpy_sse2_64(dd - 125, ss - 125);
|
|
[[fallthrough]];
|
|
case 61:
|
|
memcpy_sse2_32(dd - 61, ss - 61);
|
|
memcpy_sse2_16(dd - 29, ss - 29);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 126:
|
|
memcpy_sse2_64(dd - 126, ss - 126);
|
|
[[fallthrough]];
|
|
case 62:
|
|
memcpy_sse2_32(dd - 62, ss - 62);
|
|
memcpy_sse2_16(dd - 30, ss - 30);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 127:
|
|
memcpy_sse2_64(dd - 127, ss - 127);
|
|
[[fallthrough]];
|
|
case 63:
|
|
memcpy_sse2_32(dd - 63, ss - 63);
|
|
memcpy_sse2_16(dd - 31, ss - 31);
|
|
memcpy_sse2_16(dd - 16, ss - 16);
|
|
break;
|
|
|
|
case 128:
|
|
memcpy_sse2_128(dd - 128, ss - 128);
|
|
break;
|
|
}
|
|
|
|
return dst;
|
|
}
|
|
|
|
|
|
//---------------------------------------------------------------------
|
|
// main routine
|
|
//---------------------------------------------------------------------
|
|
void* memcpy_fast_sse(void * __restrict destination, const void * __restrict source, size_t size)
|
|
{
|
|
unsigned char *dst = (unsigned char*)destination;
|
|
const unsigned char *src = (const unsigned char*)source;
|
|
static size_t cachesize = 0x200000; // L2-cache size
|
|
size_t padding;
|
|
|
|
// small memory copy
|
|
if (size <= 128)
|
|
{
|
|
return memcpy_tiny(dst, src, size);
|
|
}
|
|
|
|
// align destination to 16 bytes boundary
|
|
padding = (16 - (((size_t)dst) & 15)) & 15;
|
|
|
|
if (padding > 0)
|
|
{
|
|
__m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
|
|
dst += padding;
|
|
src += padding;
|
|
size -= padding;
|
|
}
|
|
|
|
// medium size copy
|
|
if (size <= cachesize)
|
|
{
|
|
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
|
|
|
|
for (; size >= 128; size -= 128)
|
|
{
|
|
c0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
|
|
c1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
|
|
c2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
|
|
c3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
|
|
c4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
|
|
c5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
|
|
c6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
|
|
c7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
|
|
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
|
|
src += 128;
|
|
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
|
|
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
|
|
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
|
|
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
|
|
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
|
|
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
|
|
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
|
|
_mm_store_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
|
|
dst += 128;
|
|
}
|
|
}
|
|
else
|
|
{ // big memory copy
|
|
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
|
|
|
|
_mm_prefetch((const char*)(src), _MM_HINT_NTA);
|
|
|
|
if ((((size_t)src) & 15) == 0)
|
|
{ // source aligned
|
|
for (; size >= 128; size -= 128)
|
|
{
|
|
c0 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 0);
|
|
c1 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 1);
|
|
c2 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 2);
|
|
c3 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 3);
|
|
c4 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 4);
|
|
c5 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 5);
|
|
c6 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 6);
|
|
c7 = _mm_load_si128((reinterpret_cast<const __m128i*>(src)) + 7);
|
|
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
|
|
src += 128;
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
|
|
dst += 128;
|
|
}
|
|
}
|
|
else
|
|
{ // source unaligned
|
|
for (; size >= 128; size -= 128)
|
|
{
|
|
c0 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 0);
|
|
c1 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 1);
|
|
c2 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 2);
|
|
c3 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 3);
|
|
c4 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 4);
|
|
c5 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 5);
|
|
c6 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 6);
|
|
c7 = _mm_loadu_si128((reinterpret_cast<const __m128i*>(src)) + 7);
|
|
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA);
|
|
src += 128;
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 0), c0);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 1), c1);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 2), c2);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 3), c3);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 4), c4);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 5), c5);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 6), c6);
|
|
_mm_stream_si128(((reinterpret_cast<__m128i*>(dst)) + 7), c7);
|
|
dst += 128;
|
|
}
|
|
}
|
|
_mm_sfence();
|
|
}
|
|
|
|
memcpy_tiny(dst, src, size);
|
|
|
|
return destination;
|
|
}
|