diff --git a/src/Common/memchrSmall.h b/src/Common/memchrSmall.h
new file mode 100644
index 00000000000..d3c345685e8
--- /dev/null
+++ b/src/Common/memchrSmall.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <base/types.h>
+#include <Common/MemorySanitizer.h>
+#include <cstdlib>
+
+#if defined(__SSE2__)
+#    include <emmintrin.h>
+
+namespace detail
+{
+/// SSE2 scan for the first byte equal to `c` in [s, s + n), 16 bytes per iteration.
+/// The last load may read up to 15 bytes past s + n; a match found in those
+/// excess bytes is discarded (nullptr is returned), so only readability of that
+/// memory is required, not its contents.
+inline const char * memchrSmallAllowOverflow15Impl(const char * s, int c, ssize_t n)
+{
+    /// Presumably stops MemorySanitizer from flagging the over-read bytes; confirm against the macro's definition.
+    __msan_unpoison_overflow_15(s, n);
+
+    __m128i c16 = _mm_set1_epi8(c);
+    while (n > 0)
+    {
+        __m128i block = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
+        UInt16 mask = _mm_movemask_epi8(_mm_cmpeq_epi8(block, c16));
+        if (mask)
+        {
+            /// Bit i of the mask corresponds to byte s[i], so the first match is the
+            /// lowest set bit: count trailing zeros, not leading zeros.
+            auto offset = std::countr_zero(mask);
+            return offset < n ? s + offset : nullptr;
+        }
+
+        s += 16;
+        n -= 16;
+    }
+
+    return nullptr;
+}
+}
+
+/// memchr variant: returns a pointer to the first byte equal to `c` among the first
+/// `n` bytes of `s`, or nullptr if there is none.
+/// Requires that up to 15 bytes past the end of the 's' region are readable.
+inline const void * memchrSmallAllowOverflow15(const void * s, int c, size_t n)
+{
+    return detail::memchrSmallAllowOverflow15Impl(reinterpret_cast<const char *>(s), c, n);
+}
+
+#else
+/// No SSE2: fall back to the standard memchr.
+inline const void * memchrSmallAllowOverflow15(const void * s, int c, size_t n)
+{
+    return memchr(s, c, n);
+}
+#endif
diff --git a/src/Functions/splitByChar.cpp b/src/Functions/splitByChar.cpp index bfaea38aec9..74156b7d4d1 100644 --- a/src/Functions/splitByChar.cpp +++ b/src/Functions/splitByChar.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -93,7 +94,7 @@ public: bool get(Pos & token_begin, Pos & token_end) { - if (!pos) + if (!pos) [[unlikely]] return false; token_begin = pos; @@ -114,7 +115,7 @@ public: return false; } - pos = reinterpret_cast(memchr(pos, separator, end - pos)); + pos = reinterpret_cast(memchrSmallAllowOverflow15(pos, separator, end - pos)); if (pos) { token_end = pos; diff --git a/tests/performance/function_tokens.xml b/tests/performance/function_tokens.xml index f2850267da8..1ff56323d62 100644 --- a/tests/performance/function_tokens.xml +++ b/tests/performance/function_tokens.xml @@ -1,4 +1,5 @@ with 'Many 
years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByChar(' ', materialize(s)) as w from numbers(1000000) with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp(' ', materialize(s)) as w from numbers(1000000) + with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp('\s+', materialize(s)) as w from numbers(100000)