Better memequal implementation

This commit is contained in:
Alexey Milovidov 2021-09-09 05:38:48 +03:00
parent 8588e4d9bb
commit 0e8b4e608b

View File

@ -96,6 +96,34 @@ inline bool compareSSE2x4(const char * p1, const char * p2)
inline bool memequalSSE2Wide(const char * p1, const char * p2, size_t size)
{
if (size <= 16)
{
if (size >= 8)
{
/// Chunks of 8..16 bytes.
return unalignedLoad<uint64_t>(p1) == unalignedLoad<uint64_t>(p2)
&& unalignedLoad<uint64_t>(p1 + size - 8) == unalignedLoad<uint64_t>(p2 + size - 8);
}
else if (size >= 4)
{
/// Chunks of 4..7 bytes.
return unalignedLoad<uint32_t>(p1) == unalignedLoad<uint32_t>(p2)
&& unalignedLoad<uint32_t>(p1 + size - 4) == unalignedLoad<uint32_t>(p2 + size - 4);
}
else if (size >= 2)
{
/// Chunks of 2..3 bytes.
return unalignedLoad<uint16_t>(p1) == unalignedLoad<uint16_t>(p2)
&& unalignedLoad<uint16_t>(p1 + size - 2) == unalignedLoad<uint16_t>(p2 + size - 2);
}
else if (size >= 1)
{
/// A single byte.
return *p1 == *p2;
}
return true;
}
while (size >= 64)
{
if (compareSSE2x4(p1, p2))
@ -108,39 +136,15 @@ inline bool memequalSSE2Wide(const char * p1, const char * p2, size_t size)
return false;
}
switch ((size % 64) / 16)
switch (size / 16)
{
case 3: if (!compareSSE2(p1 + 32, p2 + 32)) return false; [[fallthrough]];
case 2: if (!compareSSE2(p1 + 16, p2 + 16)) return false; [[fallthrough]];
case 1: if (!compareSSE2(p1 , p2 )) return false; [[fallthrough]];
case 0: break;
case 1: if (!compareSSE2(p1, p2)) return false; [[fallthrough]];
case 0: return compareSSE2(p1 + size - 16, p2 + size - 16);
}
p1 += (size % 64) / 16 * 16;
p2 += (size % 64) / 16 * 16;
switch (size % 16)
{
case 15: if (p1[14] != p2[14]) return false; [[fallthrough]];
case 14: if (p1[13] != p2[13]) return false; [[fallthrough]];
case 13: if (p1[12] != p2[12]) return false; [[fallthrough]];
case 12: if (unalignedLoad<uint32_t>(p1 + 8) == unalignedLoad<uint32_t>(p2 + 8)) goto l8; else return false;
case 11: if (p1[10] != p2[10]) return false; [[fallthrough]];
case 10: if (p1[9] != p2[9]) return false; [[fallthrough]];
case 9: if (p1[8] != p2[8]) return false;
l8: [[fallthrough]];
case 8: return unalignedLoad<uint64_t>(p1) == unalignedLoad<uint64_t>(p2);
case 7: if (p1[6] != p2[6]) return false; [[fallthrough]];
case 6: if (p1[5] != p2[5]) return false; [[fallthrough]];
case 5: if (p1[4] != p2[4]) return false; [[fallthrough]];
case 4: return unalignedLoad<uint32_t>(p1) == unalignedLoad<uint32_t>(p2);
case 3: if (p1[2] != p2[2]) return false; [[fallthrough]];
case 2: return unalignedLoad<uint16_t>(p1) == unalignedLoad<uint16_t>(p2);
case 1: if (p1[0] != p2[0]) return false; [[fallthrough]];
case 0: break;
}
return true;
__builtin_unreachable();
}
#endif