ClickHouse/src/Common/StringSearcher.h

923 lines
31 KiB
C++
Raw Normal View History

#pragma once
2021-10-02 07:13:14 +00:00
#include <base/getPageSize.h>
#include <Common/Exception.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/UTF8Helpers.h>
2019-05-14 09:58:33 +00:00
#include <Core/Defines.h>
#include <Poco/Unicode.h>
2022-06-28 11:29:07 +00:00
#include <cstdint>
#include <cstring>
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
namespace DB
{
2016-01-12 02:21:15 +00:00
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
2016-01-12 02:21:15 +00:00
}
/** Variants for searching a substring in a string.
* In most cases, performance is less than Volnitsky (see Volnitsky.h).
*/
2023-02-23 12:58:42 +00:00
namespace impl
{
class StringSearcherBase
{
public:
bool force_fallback = false;
2023-02-23 12:58:42 +00:00
#ifdef __SSE2__
protected:
2023-02-23 12:58:42 +00:00
static constexpr size_t N = sizeof(__m128i);
2023-02-23 12:58:42 +00:00
bool isPageSafe(const void * const ptr) const
{
2023-02-23 12:58:42 +00:00
return ((page_size - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <= page_size - N;
}
2023-02-23 12:58:42 +00:00
private:
const Int64 page_size = ::getPageSize();
#endif
};
2023-02-23 12:58:42 +00:00
/// Performs case-sensitive or case-insensitive search of ASCII or UTF-8 strings
template <bool CaseSensitive, bool ASCII> class StringSearcher;
2023-02-24 10:13:35 +00:00
/// Case-sensitive ASCII and UTF8 searcher
template <bool ASCII>
class StringSearcher<true, ASCII> : public StringSearcherBase
{
private:
2023-02-24 10:13:35 +00:00
/// string to be searched for
2020-02-22 05:46:35 +00:00
const uint8_t * const needle;
2023-02-24 10:13:35 +00:00
const uint8_t * const needle_end;
/// first character in `needle`
uint8_t first_needle_character = 0;
#ifdef __SSE4_1__
2023-02-24 10:13:35 +00:00
/// second character of "needle" (if its length is > 1)
uint8_t second_needle_character = 0;
/// first/second needle character broadcasted into a 16 bytes vector
__m128i first_needle_character_vec;
__m128i second_needle_character_vec;
/// vector of first 16 characters of `needle`
__m128i cache = _mm_setzero_si128();
uint16_t cachemask = 0;
#endif
public:
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-24 10:13:35 +00:00
StringSearcher(const CharT * needle_, size_t needle_size)
2023-02-23 12:58:42 +00:00
: needle(reinterpret_cast<const uint8_t *>(needle_))
2023-02-24 10:13:35 +00:00
, needle_end(needle + needle_size)
{
2023-02-24 10:13:35 +00:00
if (needle_size == 0)
return;
2023-02-24 10:13:35 +00:00
first_needle_character = *needle;
2023-02-24 10:13:35 +00:00
#ifdef __SSE4_1__
first_needle_character_vec = _mm_set1_epi8(first_needle_character);
if (needle_size > 1)
{
2023-02-24 10:13:35 +00:00
second_needle_character = *(needle + 1);
second_needle_character_vec = _mm_set1_epi8(second_needle_character);
}
const auto * needle_pos = needle;
2023-02-24 10:13:35 +00:00
for (uint8_t i = 0; i < N; ++i)
{
2023-02-24 10:13:35 +00:00
cache = _mm_srli_si128(cache, 1);
2023-02-24 10:13:35 +00:00
if (needle_pos != needle_end)
{
2023-02-24 10:13:35 +00:00
cache = _mm_insert_epi8(cache, *needle_pos, N - 1);
cachemask |= 1 << i;
++needle_pos;
}
}
#endif
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-24 10:13:35 +00:00
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const
{
#ifdef __SSE4_1__
2023-02-24 10:13:35 +00:00
if (isPageSafe(pos))
{
2023-02-24 10:13:35 +00:00
const __m128i haystack_characters = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const __m128i comparison_result = _mm_cmpeq_epi8(haystack_characters, cache);
const uint16_t comparison_result_mask = _mm_movemask_epi8(comparison_result);
if (0xffff == cachemask)
{
2023-02-24 10:13:35 +00:00
if (comparison_result_mask == cachemask)
{
2023-02-24 10:13:35 +00:00
pos += N;
const auto * needle_pos = needle + N;
while (needle_pos < needle_end && *pos == *needle_pos)
++pos, ++needle_pos;
if (needle_pos == needle_end)
return true;
}
}
2023-02-24 10:13:35 +00:00
else if ((comparison_result_mask & cachemask) == cachemask)
return true;
return false;
}
#endif
2023-02-24 10:13:35 +00:00
if (*pos == first_needle_character)
{
2023-02-24 10:13:35 +00:00
++pos;
const auto * needle_pos = needle + 1;
2023-02-24 10:13:35 +00:00
while (needle_pos < needle_end && *pos == *needle_pos)
++pos, ++needle_pos;
if (needle_pos == needle_end)
return true;
}
return false;
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2020-02-22 05:46:35 +00:00
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
2023-02-24 10:13:35 +00:00
const auto needle_size = needle_end - needle;
if (needle == needle_end)
return haystack;
#ifdef __SSE4_1__
2023-02-24 10:13:35 +00:00
/// Fast path for single-character needles. Compare 16 characters of the haystack against the needle character at once.
if (needle_size == 1)
{
while (haystack < haystack_end)
{
2023-02-24 10:13:35 +00:00
if (haystack + N <= haystack_end && isPageSafe(haystack))
{
const __m128i haystack_characters = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const __m128i comparison_result = _mm_cmpeq_epi8(haystack_characters, first_needle_character_vec);
const uint16_t comparison_result_mask = _mm_movemask_epi8(comparison_result);
if (comparison_result_mask == 0)
{
haystack += N;
continue;
}
2023-02-24 10:13:35 +00:00
const int offset = std::countr_zero(comparison_result_mask);
haystack += offset;
2023-02-24 10:13:35 +00:00
return haystack;
}
if (haystack == haystack_end)
return haystack_end;
if (*haystack == first_needle_character)
return haystack;
++haystack;
}
return haystack_end;
}
#endif
while (haystack < haystack_end && haystack_end - haystack >= needle_size)
{
#ifdef __SSE4_1__
/// Compare the [0:15] bytes from haystack and broadcasted 16 bytes vector from first character of needle.
/// Compare the [1:16] bytes from haystack and broadcasted 16 bytes vector from second character of needle.
/// Bit AND the results of above two comparisons and get the mask.
if ((haystack + 1 + N) <= haystack_end && isPageSafe(haystack + 1))
{
const __m128i haystack_characters_from_1st = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const __m128i haystack_characters_from_2nd = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack + 1));
const __m128i comparison_result_1st = _mm_cmpeq_epi8(haystack_characters_from_1st, first_needle_character_vec);
const __m128i comparison_result_2nd = _mm_cmpeq_epi8(haystack_characters_from_2nd, second_needle_character_vec);
const __m128i comparison_result_combined = _mm_and_si128(comparison_result_1st, comparison_result_2nd);
const uint16_t comparison_result_mask = _mm_movemask_epi8(comparison_result_combined);
/// If the mask = 0, then first two characters [0:1] from needle are not in the [0:17] bytes of haystack.
if (comparison_result_mask == 0)
{
2023-02-23 12:58:42 +00:00
haystack += N;
continue;
}
2023-02-24 10:13:35 +00:00
const int offset = std::countr_zero(comparison_result_mask);
haystack += offset;
2023-02-23 12:58:42 +00:00
if (haystack + N <= haystack_end && isPageSafe(haystack))
{
2023-02-24 10:13:35 +00:00
/// Already find the haystack position where the [pos:pos + 1] two characters exactly match the first two characters of needle.
/// Compare the 16 bytes from needle (cache) and the first 16 bytes from haystack at once if the haystack size >= 16 bytes.
const __m128i haystack_characters = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const __m128i comparison_result_cache = _mm_cmpeq_epi8(haystack_characters, cache);
const uint16_t mask_offset = _mm_movemask_epi8(comparison_result_cache);
if (0xffff == cachemask)
{
2023-02-24 10:13:35 +00:00
if (mask_offset == cachemask)
{
2023-02-24 10:13:35 +00:00
const auto * haystack_pos = haystack + N;
const auto * needle_pos = needle + N;
while (haystack_pos < haystack_end && needle_pos < needle_end &&
*haystack_pos == *needle_pos)
++haystack_pos, ++needle_pos;
if (needle_pos == needle_end)
return haystack;
}
}
2023-02-24 10:13:35 +00:00
else if ((mask_offset & cachemask) == cachemask)
return haystack;
2023-02-24 10:13:35 +00:00
++haystack;
continue;
}
}
#endif
if (haystack == haystack_end)
return haystack_end;
2023-02-24 10:13:35 +00:00
if (*haystack == first_needle_character)
{
2023-02-24 10:13:35 +00:00
const auto * haystack_pos = haystack + 1;
const auto * needle_pos = needle + 1;
2023-02-24 10:13:35 +00:00
while (haystack_pos < haystack_end && needle_pos < needle_end &&
*haystack_pos == *needle_pos)
++haystack_pos, ++needle_pos;
if (needle_pos == needle_end)
return haystack;
}
2023-02-24 10:13:35 +00:00
++haystack;
}
return haystack_end;
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-23 12:58:42 +00:00
const CharT * search(const CharT * haystack, size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
}
};
/// Case-insensitive ASCII searcher
template <>
class StringSearcher<false, true> : public StringSearcherBase
{
private:
/// string to be searched for
2020-02-22 05:46:35 +00:00
const uint8_t * const needle;
const uint8_t * const needle_end;
/// lower and uppercase variants of the first character in `needle`
2023-02-23 12:58:42 +00:00
uint8_t l = 0;
uint8_t u = 0;
#ifdef __SSE4_1__
/// vectors filled with `l` and `u`, for determining leftmost position of the first symbol
__m128i patl, patu;
/// lower and uppercase vectors of first 16 characters of `needle`
__m128i cachel = _mm_setzero_si128(), cacheu = _mm_setzero_si128();
2023-02-23 12:58:42 +00:00
int cachemask = 0;
#endif
public:
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-23 12:58:42 +00:00
StringSearcher(const CharT * needle_, size_t needle_size)
: needle(reinterpret_cast<const uint8_t *>(needle_))
, needle_end(needle + needle_size)
{
2023-02-24 10:14:58 +00:00
if (needle_size == 0)
return;
2020-02-22 05:46:35 +00:00
l = static_cast<uint8_t>(std::tolower(*needle));
u = static_cast<uint8_t>(std::toupper(*needle));
#ifdef __SSE4_1__
patl = _mm_set1_epi8(l);
patu = _mm_set1_epi8(u);
const auto * needle_pos = needle;
2023-02-23 12:58:42 +00:00
for (size_t i = 0; i < N; ++i)
{
cachel = _mm_srli_si128(cachel, 1);
cacheu = _mm_srli_si128(cacheu, 1);
if (needle_pos != needle_end)
{
2023-02-23 12:58:42 +00:00
cachel = _mm_insert_epi8(cachel, std::tolower(*needle_pos), N - 1);
cacheu = _mm_insert_epi8(cacheu, std::toupper(*needle_pos), N - 1);
cachemask |= 1 << i;
++needle_pos;
}
}
#endif
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2020-02-22 05:46:35 +00:00
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const
{
#ifdef __SSE4_1__
2023-02-23 12:58:42 +00:00
if (isPageSafe(pos))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu);
const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
const auto mask = _mm_movemask_epi8(v_against_l_or_u);
if (0xffff == cachemask)
{
if (mask == cachemask)
{
2023-02-23 12:58:42 +00:00
pos += N;
const auto * needle_pos = needle + N;
while (needle_pos < needle_end && std::tolower(*pos) == std::tolower(*needle_pos))
2018-08-26 01:24:21 +00:00
{
++pos;
++needle_pos;
}
if (needle_pos == needle_end)
return true;
}
}
else if ((mask & cachemask) == cachemask)
return true;
return false;
}
#endif
if (*pos == l || *pos == u)
{
++pos;
const auto * needle_pos = needle + 1;
while (needle_pos < needle_end && std::tolower(*pos) == std::tolower(*needle_pos))
2018-08-26 01:24:21 +00:00
{
++pos;
++needle_pos;
}
if (needle_pos == needle_end)
return true;
}
return false;
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2020-02-22 05:46:35 +00:00
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
2019-06-12 02:35:25 +00:00
if (needle == needle_end)
return haystack;
while (haystack < haystack_end)
{
#ifdef __SSE4_1__
2023-02-23 12:58:42 +00:00
if (haystack + N <= haystack_end && isPageSafe(haystack))
{
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu);
const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
const auto mask = _mm_movemask_epi8(v_against_l_or_u);
if (mask == 0)
{
2023-02-23 12:58:42 +00:00
haystack += N;
continue;
}
const auto offset = __builtin_ctz(mask);
haystack += offset;
2023-02-23 12:58:42 +00:00
if (haystack + N <= haystack_end && isPageSafe(haystack))
{
const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l_offset = _mm_cmpeq_epi8(v_haystack_offset, cachel);
const auto v_against_u_offset = _mm_cmpeq_epi8(v_haystack_offset, cacheu);
const auto v_against_l_or_u_offset = _mm_or_si128(v_against_l_offset, v_against_u_offset);
const auto mask_offset = _mm_movemask_epi8(v_against_l_or_u_offset);
if (0xffff == cachemask)
{
if (mask_offset == cachemask)
{
2023-02-23 12:58:42 +00:00
const auto * haystack_pos = haystack + N;
const auto * needle_pos = needle + N;
while (haystack_pos < haystack_end && needle_pos < needle_end &&
std::tolower(*haystack_pos) == std::tolower(*needle_pos))
2018-08-26 01:24:21 +00:00
{
++haystack_pos;
++needle_pos;
}
if (needle_pos == needle_end)
return haystack;
}
}
else if ((mask_offset & cachemask) == cachemask)
return haystack;
++haystack;
continue;
}
}
2023-02-24 10:13:35 +00:00
#endif
if (haystack == haystack_end)
return haystack_end;
if (*haystack == l || *haystack == u)
{
const auto * haystack_pos = haystack + 1;
const auto * needle_pos = needle + 1;
while (haystack_pos < haystack_end && needle_pos < needle_end &&
std::tolower(*haystack_pos) == std::tolower(*needle_pos))
{
++haystack_pos;
++needle_pos;
}
if (needle_pos == needle_end)
return haystack;
}
++haystack;
}
return haystack_end;
}
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
}
};
/// Case-insensitive UTF-8 searcher
template <>
class StringSearcher<false, false> : public StringSearcherBase
{
private:
using UTF8SequenceBuffer = uint8_t[6];
/// substring to be searched for
const uint8_t * const needle;
const size_t needle_size;
const uint8_t * const needle_end = needle + needle_size;
/// lower and uppercase variants of the first octet of the first character in `needle`
bool first_needle_symbol_is_ascii = false;
uint8_t l = 0;
uint8_t u = 0;
#ifdef __SSE4_1__
/// vectors filled with `l` and `u`, for determining leftmost position of the first symbol
__m128i patl;
__m128i patu;
/// lower and uppercase vectors of first 16 characters of `needle`
__m128i cachel = _mm_setzero_si128();
__m128i cacheu = _mm_setzero_si128();
int cachemask = 0;
size_t cache_valid_len = 0;
size_t cache_actual_len = 0;
#endif
public:
template <typename CharT>
requires (sizeof(CharT) == 1)
StringSearcher(const CharT * needle_, size_t needle_size_)
: needle(reinterpret_cast<const uint8_t *>(needle_))
, needle_size(needle_size_)
{
2023-02-24 10:14:58 +00:00
if (needle_size == 0)
2023-02-24 10:13:35 +00:00
return;
UTF8SequenceBuffer l_seq;
UTF8SequenceBuffer u_seq;
if (*needle < 0x80u)
{
first_needle_symbol_is_ascii = true;
l = std::tolower(*needle);
u = std::toupper(*needle);
}
else
{
auto first_u32 = UTF8::convertUTF8ToCodePoint(needle, needle_size);
/// Invalid UTF-8
if (!first_u32)
{
/// Process it verbatim as a sequence of bytes.
size_t src_len = UTF8::seqLength(*needle);
memcpy(l_seq, needle, src_len);
memcpy(u_seq, needle, src_len);
}
else
{
uint32_t first_l_u32 = Poco::Unicode::toLower(*first_u32);
uint32_t first_u_u32 = Poco::Unicode::toUpper(*first_u32);
/// lower and uppercase variants of the first octet of the first character in `needle`
size_t length_l = UTF8::convertCodePointToUTF8(first_l_u32, l_seq, sizeof(l_seq));
size_t length_u = UTF8::convertCodePointToUTF8(first_u_u32, u_seq, sizeof(u_seq));
if (length_l != length_u)
force_fallback = true;
}
l = l_seq[0];
u = u_seq[0];
if (force_fallback)
return;
}
#ifdef __SSE4_1__
/// for detecting leftmost position of the first symbol
patl = _mm_set1_epi8(l);
patu = _mm_set1_epi8(u);
/// lower and uppercase vectors of first 16 octets of `needle`
const auto * needle_pos = needle;
for (size_t i = 0; i < N;)
{
if (needle_pos == needle_end)
{
cachel = _mm_srli_si128(cachel, 1);
cacheu = _mm_srli_si128(cacheu, 1);
++i;
continue;
}
size_t src_len = std::min<size_t>(needle_end - needle_pos, UTF8::seqLength(*needle_pos));
auto c_u32 = UTF8::convertUTF8ToCodePoint(needle_pos, src_len);
if (c_u32)
{
int c_l_u32 = Poco::Unicode::toLower(*c_u32);
int c_u_u32 = Poco::Unicode::toUpper(*c_u32);
size_t dst_l_len = UTF8::convertCodePointToUTF8(c_l_u32, l_seq, sizeof(l_seq));
size_t dst_u_len = UTF8::convertCodePointToUTF8(c_u_u32, u_seq, sizeof(u_seq));
/// @note Unicode standard states it is a rare but possible occasion
if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
{
force_fallback = true;
return;
}
}
2023-02-24 10:13:35 +00:00
cache_actual_len += src_len;
if (cache_actual_len < N)
cache_valid_len += src_len;
2023-02-24 10:13:35 +00:00
for (size_t j = 0; j < src_len && i < N; ++j, ++i)
{
2023-02-24 10:13:35 +00:00
cachel = _mm_srli_si128(cachel, 1);
cacheu = _mm_srli_si128(cacheu, 1);
2023-02-24 10:13:35 +00:00
if (needle_pos != needle_end)
2018-08-26 01:24:21 +00:00
{
2023-02-24 10:13:35 +00:00
cachel = _mm_insert_epi8(cachel, l_seq[j], N - 1);
cacheu = _mm_insert_epi8(cacheu, u_seq[j], N - 1);
cachemask |= 1 << i;
2018-08-26 01:24:21 +00:00
++needle_pos;
}
}
}
2023-02-24 10:13:35 +00:00
#endif
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-24 10:13:35 +00:00
ALWAYS_INLINE bool compareTrivial(const CharT * haystack_pos, const CharT * const haystack_end, const uint8_t * needle_pos) const
{
2023-02-24 10:13:35 +00:00
while (haystack_pos < haystack_end && needle_pos < needle_end)
{
auto haystack_code_point = UTF8::convertUTF8ToCodePoint(haystack_pos, haystack_end - haystack_pos);
auto needle_code_point = UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
2023-02-24 10:13:35 +00:00
/// Invalid UTF-8, should not compare equals
if (!haystack_code_point || !needle_code_point)
break;
2023-02-24 10:13:35 +00:00
/// Not equals case insensitive.
if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
break;
2023-02-24 10:13:35 +00:00
auto len = UTF8::seqLength(*haystack_pos);
haystack_pos += len;
2023-02-24 10:13:35 +00:00
len = UTF8::seqLength(*needle_pos);
needle_pos += len;
}
2023-02-24 10:13:35 +00:00
return needle_pos == needle_end;
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-24 10:13:35 +00:00
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * haystack_end, const CharT * pos) const
{
2023-02-24 10:13:35 +00:00
#ifdef __SSE4_1__
2023-02-24 10:13:35 +00:00
if (isPageSafe(pos) && !force_fallback)
{
2023-02-24 10:13:35 +00:00
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu);
const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
const auto mask = _mm_movemask_epi8(v_against_l_or_u);
if (0xffff == cachemask)
{
2023-02-24 10:13:35 +00:00
if (mask == cachemask)
{
2023-02-24 10:13:35 +00:00
if (compareTrivial(pos, haystack_end, needle))
return true;
}
}
2023-02-24 10:13:35 +00:00
else if ((mask & cachemask) == cachemask)
{
if (compareTrivial(pos, haystack_end, needle))
return true;
}
return false;
}
#endif
2023-02-24 10:13:35 +00:00
if (*pos == l || *pos == u)
{
2023-02-24 10:13:35 +00:00
pos += first_needle_symbol_is_ascii;
const auto * needle_pos = needle + first_needle_symbol_is_ascii;
2023-02-24 10:13:35 +00:00
if (compareTrivial(pos, haystack_end, needle_pos))
return true;
}
return false;
}
2023-02-24 10:13:35 +00:00
/** Returns haystack_end if not found.
*/
template <typename CharT>
requires (sizeof(CharT) == 1)
2020-02-22 05:46:35 +00:00
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
2023-02-24 10:14:58 +00:00
if (needle_size == 0)
return haystack;
2023-02-24 10:13:35 +00:00
while (haystack < haystack_end)
{
2023-02-24 10:13:35 +00:00
#ifdef __SSE4_1__
if (haystack + N <= haystack_end && isPageSafe(haystack) && !force_fallback)
{
2023-02-24 10:13:35 +00:00
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu);
const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
2023-02-24 10:13:35 +00:00
const auto mask = _mm_movemask_epi8(v_against_l_or_u);
2023-02-24 10:13:35 +00:00
if (mask == 0)
{
2023-02-23 12:58:42 +00:00
haystack += N;
2023-02-24 10:13:35 +00:00
UTF8::syncForward(haystack, haystack_end);
continue;
}
2023-02-24 10:13:35 +00:00
const auto offset = __builtin_ctz(mask);
haystack += offset;
2023-02-23 12:58:42 +00:00
if (haystack + N <= haystack_end && isPageSafe(haystack))
{
2023-02-24 10:13:35 +00:00
const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
const auto v_against_l_offset = _mm_cmpeq_epi8(v_haystack_offset, cachel);
const auto v_against_u_offset = _mm_cmpeq_epi8(v_haystack_offset, cacheu);
const auto v_against_l_or_u_offset = _mm_or_si128(v_against_l_offset, v_against_u_offset);
const auto mask_offset_both = _mm_movemask_epi8(v_against_l_or_u_offset);
if (0xffff == cachemask)
{
2023-02-24 10:13:35 +00:00
if (mask_offset_both == cachemask)
{
2023-02-24 10:13:35 +00:00
if (compareTrivial(haystack, haystack_end, needle))
return haystack;
}
}
2023-02-24 10:13:35 +00:00
else if ((mask_offset_both & cachemask) == cachemask)
{
if (compareTrivial(haystack, haystack_end, needle))
return haystack;
}
2023-02-24 10:13:35 +00:00
/// first octet was ok, but not the first 16, move to start of next sequence and reapply
haystack += UTF8::seqLength(*haystack);
continue;
}
}
#endif
if (haystack == haystack_end)
return haystack_end;
2023-02-24 10:13:35 +00:00
if (*haystack == l || *haystack == u)
{
2023-02-24 10:13:35 +00:00
auto haystack_pos = haystack + first_needle_symbol_is_ascii;
const auto * needle_pos = needle + first_needle_symbol_is_ascii;
2023-02-24 10:13:35 +00:00
if (compareTrivial(haystack_pos, haystack_end, needle_pos))
return haystack;
}
2023-02-24 10:13:35 +00:00
/// advance to the start of the next sequence
haystack += UTF8::seqLength(*haystack);
}
return haystack_end;
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-23 12:58:42 +00:00
const CharT * search(const CharT * haystack, size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
}
};
2023-02-24 10:13:35 +00:00
// Searches for needle surrounded by token-separators.
// Separators are anything inside ASCII (0-128) and not alphanum.
// Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings
// should work just fine. But any Unicode whitespace is not considered a token separtor.
template <typename StringSearcher>
2022-01-26 10:45:26 +00:00
class TokenSearcher : public StringSearcherBase
{
StringSearcher searcher;
size_t needle_size;
public:
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-23 12:58:42 +00:00
TokenSearcher(const CharT * needle_, size_t needle_size_)
: searcher(needle_, needle_size_)
, needle_size(needle_size_)
{
2020-01-03 15:28:38 +00:00
if (std::any_of(needle_, needle_ + needle_size_, isTokenSeparator))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Needle must not contain whitespace or separator characters");
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2020-02-22 05:46:35 +00:00
ALWAYS_INLINE bool compare(const CharT * haystack, const CharT * haystack_end, const CharT * pos) const
{
// use searcher only if pos is in the beginning of token and pos + searcher.needle_size is end of token.
if (isToken(haystack, haystack_end, pos))
return searcher.compare(haystack, haystack_end, pos);
return false;
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2020-02-22 05:46:35 +00:00
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
// use searcher.search(), then verify that returned value is a token
// if it is not, skip it and re-run
2020-02-22 05:46:35 +00:00
const auto * pos = haystack;
while (pos < haystack_end)
{
pos = searcher.search(pos, haystack_end);
if (pos == haystack_end || isToken(haystack, haystack_end, pos))
return pos;
// assuming that heendle does not contain any token separators.
pos += needle_size;
}
return haystack_end;
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-23 12:58:42 +00:00
const CharT * search(const CharT * haystack, size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2020-02-22 05:46:35 +00:00
ALWAYS_INLINE bool isToken(const CharT * haystack, const CharT * const haystack_end, const CharT* p) const
{
return (p == haystack || isTokenSeparator(*(p - 1)))
&& (p + needle_size >= haystack_end || isTokenSeparator(*(p + needle_size)));
}
2020-02-22 05:46:35 +00:00
ALWAYS_INLINE static bool isTokenSeparator(const uint8_t c)
{
return !(isAlphaNumericASCII(c) || !isASCII(c));
}
};
2023-02-23 12:58:42 +00:00
}
2023-02-23 12:58:42 +00:00
/// Public names for the searchers implemented in namespace `impl`.
/// The template arguments of impl::StringSearcher are <CaseSensitive, ASCII>.
/// Both case-sensitive aliases resolve to the same partial specialization StringSearcher<true, ASCII>.
using ASCIICaseSensitiveStringSearcher = impl::StringSearcher<true, true>;
using ASCIICaseInsensitiveStringSearcher = impl::StringSearcher<false, true>;
using UTF8CaseSensitiveStringSearcher = impl::StringSearcher<true, false>;
using UTF8CaseInsensitiveStringSearcher = impl::StringSearcher<false, false>;
/// Token searchers: accept a match only when it is delimited by token separators (see impl::TokenSearcher).
using ASCIICaseSensitiveTokenSearcher = impl::TokenSearcher<ASCIICaseSensitiveStringSearcher>;
using ASCIICaseInsensitiveTokenSearcher = impl::TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
/// Use only with short haystacks where cheap initialization is required.
template <bool CaseInsensitive>
2023-02-23 12:58:42 +00:00
struct StdLibASCIIStringSearcher
{
const char * const needle_start;
const char * const needle_end;
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-23 12:58:42 +00:00
StdLibASCIIStringSearcher(const CharT * const needle_start_, size_t needle_size_)
: needle_start(reinterpret_cast<const char *>(needle_start_))
, needle_end(reinterpret_cast<const char *>(needle_start) + needle_size_)
{}
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack_start, const CharT * const haystack_end) const
{
if constexpr (CaseInsensitive)
return std::search(
haystack_start, haystack_end, needle_start, needle_end,
[](char c1, char c2) {return std::toupper(c1) == std::toupper(c2);});
else
return std::search(
haystack_start, haystack_end, needle_start, needle_end,
[](char c1, char c2) {return c1 == c2;});
}
template <typename CharT>
requires (sizeof(CharT) == 1)
2023-02-23 12:58:42 +00:00
const CharT * search(const CharT * haystack_start, size_t haystack_length) const
{
return search(haystack_start, haystack_start + haystack_length);
}
};
}