2015-10-14 12:12:56 +00:00
|
|
|
#pragma once
|
|
|
|
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/getPageSize.h>
|
2019-08-21 08:12:39 +00:00
|
|
|
#include <Common/Exception.h>
|
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Common/UTF8Helpers.h>
|
2019-05-14 09:58:33 +00:00
|
|
|
#include <Core/Defines.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/range.h>
|
2015-10-14 12:12:56 +00:00
|
|
|
#include <Poco/Unicode.h>
|
2022-06-28 11:29:07 +00:00
|
|
|
#include <cstdint>
|
|
|
|
#include <cstring>
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
2017-04-01 07:20:54 +00:00
|
|
|
#include <emmintrin.h>
|
2017-02-07 21:26:32 +00:00
|
|
|
#endif
|
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-04-01 07:20:54 +00:00
|
|
|
#include <smmintrin.h>
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
|
|
|
|
2015-10-14 12:12:56 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2016-01-12 02:21:15 +00:00
|
|
|
/// Error codes referenced by this header. `extern` makes this a declaration only; the value is defined elsewhere.
namespace ErrorCodes
|
|
|
|
{
|
2019-08-21 08:12:39 +00:00
|
|
|
extern const int BAD_ARGUMENTS;
|
2016-01-12 02:21:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
/** Variants for searching a substring in a string.
|
|
|
|
* In most cases, performance is less than Volnitsky (see Volnitsky.h).
|
2016-01-27 03:26:36 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
2022-01-25 18:56:05 +00:00
|
|
|
class StringSearcherBase
|
2016-01-16 00:45:19 +00:00
|
|
|
{
|
2022-01-25 18:56:05 +00:00
|
|
|
public:
|
|
|
|
bool force_fallback = false;
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
2022-01-25 18:56:05 +00:00
|
|
|
protected:
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr auto n = sizeof(__m128i);
|
2020-10-29 19:52:12 +00:00
|
|
|
const int page_size = ::getPageSize();
|
2016-01-16 00:45:19 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
bool pageSafe(const void * const ptr) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
return ((page_size - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <= page_size - n;
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2015-10-14 12:12:56 +00:00
|
|
|
/// Substring searcher. Specializations below select case-sensitive or case-insensitive matching
/// (`CaseSensitive`) and ASCII or UTF-8 handling of letter case (`ASCII`).
|
|
|
|
template <bool CaseSensitive, bool ASCII> class StringSearcher;
|
|
|
|
|
|
|
|
/// Case-insensitive UTF-8 searcher
|
2016-01-16 00:45:19 +00:00
|
|
|
template <>
|
2022-01-25 18:56:05 +00:00
|
|
|
class StringSearcher<false, false> : public StringSearcherBase
|
2015-10-14 12:12:56 +00:00
|
|
|
{
|
2016-01-16 00:45:19 +00:00
|
|
|
private:
|
2020-02-22 05:46:35 +00:00
|
|
|
using UTF8SequenceBuffer = uint8_t[6];
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
/// substring to be searched for
|
2020-02-22 05:46:35 +00:00
|
|
|
const uint8_t * const needle;
|
2017-07-21 06:35:58 +00:00
|
|
|
const size_t needle_size;
|
2020-02-22 05:46:35 +00:00
|
|
|
const uint8_t * const needle_end = needle + needle_size;
|
2017-04-01 07:20:54 +00:00
|
|
|
/// lower and uppercase variants of the first octet of the first character in `needle`
|
|
|
|
bool first_needle_symbol_is_ascii{};
|
2020-02-22 05:46:35 +00:00
|
|
|
uint8_t l{};
|
|
|
|
uint8_t u{};
|
2016-01-16 00:45:19 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-04-01 07:20:54 +00:00
|
|
|
/// vectors filled with `l` and `u`, for determining leftmost position of the first symbol
|
2020-04-26 17:34:22 +00:00
|
|
|
__m128i patl;
|
|
|
|
__m128i patu;
|
2017-04-01 07:20:54 +00:00
|
|
|
/// lower and uppercase vectors of first 16 characters of `needle`
|
2020-04-26 17:34:22 +00:00
|
|
|
__m128i cachel = _mm_setzero_si128();
|
|
|
|
__m128i cacheu = _mm_setzero_si128();
|
2017-04-01 07:20:54 +00:00
|
|
|
int cachemask{};
|
2017-07-21 06:35:58 +00:00
|
|
|
size_t cache_valid_len{};
|
|
|
|
size_t cache_actual_len{};
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2015-10-14 12:12:56 +00:00
|
|
|
|
|
|
|
public:
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
StringSearcher(const CharT * needle_, const size_t needle_size_)
|
|
|
|
: needle{reinterpret_cast<const uint8_t *>(needle_)}, needle_size{needle_size_}
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
if (0 == needle_size)
|
|
|
|
return;
|
|
|
|
|
2020-04-30 13:25:17 +00:00
|
|
|
UTF8SequenceBuffer l_seq;
|
|
|
|
UTF8SequenceBuffer u_seq;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
if (*needle < 0x80u)
|
|
|
|
{
|
|
|
|
first_needle_symbol_is_ascii = true;
|
2018-05-08 19:44:54 +00:00
|
|
|
l = std::tolower(*needle);
|
|
|
|
u = std::toupper(*needle);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2021-01-31 02:36:52 +00:00
|
|
|
auto first_u32 = UTF8::convertUTF8ToCodePoint(needle, needle_size);
|
2021-01-27 17:32:59 +00:00
|
|
|
|
|
|
|
/// Invalid UTF-8
|
2021-01-31 02:36:52 +00:00
|
|
|
if (!first_u32)
|
2021-01-27 17:32:59 +00:00
|
|
|
{
|
2021-01-28 07:16:36 +00:00
|
|
|
/// Process it verbatim as a sequence of bytes.
|
|
|
|
size_t src_len = UTF8::seqLength(*needle);
|
|
|
|
|
|
|
|
memcpy(l_seq, needle, src_len);
|
|
|
|
memcpy(u_seq, needle, src_len);
|
2021-01-27 17:32:59 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2021-01-31 02:36:52 +00:00
|
|
|
uint32_t first_l_u32 = Poco::Unicode::toLower(*first_u32);
|
|
|
|
uint32_t first_u_u32 = Poco::Unicode::toUpper(*first_u32);
|
2021-01-27 17:32:59 +00:00
|
|
|
|
|
|
|
/// lower and uppercase variants of the first octet of the first character in `needle`
|
2021-01-31 02:36:52 +00:00
|
|
|
size_t length_l = UTF8::convertCodePointToUTF8(first_l_u32, l_seq, sizeof(l_seq));
|
2021-10-26 10:32:07 +00:00
|
|
|
size_t length_u = UTF8::convertCodePointToUTF8(first_u_u32, u_seq, sizeof(u_seq));
|
2021-01-31 02:36:52 +00:00
|
|
|
|
2021-10-26 10:32:07 +00:00
|
|
|
if (length_l != length_u)
|
2022-01-25 18:56:05 +00:00
|
|
|
force_fallback = true;
|
2021-01-27 17:32:59 +00:00
|
|
|
}
|
2021-01-28 07:16:36 +00:00
|
|
|
|
|
|
|
l = l_seq[0];
|
|
|
|
u = u_seq[0];
|
2022-01-25 18:56:05 +00:00
|
|
|
|
|
|
|
if (force_fallback)
|
|
|
|
return;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-04-01 07:20:54 +00:00
|
|
|
/// for detecting leftmost position of the first symbol
|
|
|
|
patl = _mm_set1_epi8(l);
|
|
|
|
patu = _mm_set1_epi8(u);
|
|
|
|
/// lower and uppercase vectors of first 16 octets of `needle`
|
|
|
|
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * needle_pos = needle;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
for (size_t i = 0; i < n;)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
if (needle_pos == needle_end)
|
|
|
|
{
|
|
|
|
cachel = _mm_srli_si128(cachel, 1);
|
|
|
|
cacheu = _mm_srli_si128(cacheu, 1);
|
|
|
|
++i;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-01-29 04:54:46 +00:00
|
|
|
size_t src_len = std::min<size_t>(needle_end - needle_pos, UTF8::seqLength(*needle_pos));
|
2021-01-31 02:36:52 +00:00
|
|
|
auto c_u32 = UTF8::convertUTF8ToCodePoint(needle_pos, src_len);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-01-31 02:36:52 +00:00
|
|
|
if (c_u32)
|
2021-01-27 17:32:59 +00:00
|
|
|
{
|
2021-01-31 02:36:52 +00:00
|
|
|
int c_l_u32 = Poco::Unicode::toLower(*c_u32);
|
|
|
|
int c_u_u32 = Poco::Unicode::toUpper(*c_u32);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-02-17 12:36:48 +00:00
|
|
|
size_t dst_l_len = UTF8::convertCodePointToUTF8(c_l_u32, l_seq, sizeof(l_seq));
|
|
|
|
size_t dst_u_len = UTF8::convertCodePointToUTF8(c_u_u32, u_seq, sizeof(u_seq));
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-01-27 17:32:59 +00:00
|
|
|
/// @note Unicode standard states it is a rare but possible occasion
|
|
|
|
if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
|
2022-01-25 18:56:05 +00:00
|
|
|
{
|
|
|
|
force_fallback = true;
|
|
|
|
return;
|
|
|
|
}
|
2021-01-27 17:32:59 +00:00
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
cache_actual_len += src_len;
|
|
|
|
if (cache_actual_len < n)
|
|
|
|
cache_valid_len += src_len;
|
|
|
|
|
2017-07-21 06:35:58 +00:00
|
|
|
for (size_t j = 0; j < src_len && i < n; ++j, ++i)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
cachel = _mm_srli_si128(cachel, 1);
|
|
|
|
cacheu = _mm_srli_si128(cacheu, 1);
|
|
|
|
|
|
|
|
if (needle_pos != needle_end)
|
|
|
|
{
|
|
|
|
cachel = _mm_insert_epi8(cachel, l_seq[j], n - 1);
|
|
|
|
cacheu = _mm_insert_epi8(cacheu, u_seq[j], n - 1);
|
|
|
|
|
|
|
|
cachemask |= 1 << i;
|
|
|
|
++needle_pos;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2021-10-26 10:32:07 +00:00
|
|
|
ALWAYS_INLINE bool compareTrivial(const CharT * haystack_pos, const CharT * const haystack_end, const uint8_t * needle_pos) const
|
|
|
|
{
|
|
|
|
while (haystack_pos < haystack_end && needle_pos < needle_end)
|
|
|
|
{
|
|
|
|
auto haystack_code_point = UTF8::convertUTF8ToCodePoint(haystack_pos, haystack_end - haystack_pos);
|
|
|
|
auto needle_code_point = UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
|
|
|
|
|
|
|
|
/// Invalid UTF-8, should not compare equals
|
|
|
|
if (!haystack_code_point || !needle_code_point)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/// Not equals case insensitive.
|
|
|
|
if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
|
|
|
|
break;
|
|
|
|
|
2022-01-25 18:56:05 +00:00
|
|
|
auto len = UTF8::seqLength(*haystack_pos);
|
2021-10-26 10:32:07 +00:00
|
|
|
haystack_pos += len;
|
2022-01-25 18:56:05 +00:00
|
|
|
|
|
|
|
len = UTF8::seqLength(*needle_pos);
|
2021-10-26 10:32:07 +00:00
|
|
|
needle_pos += len;
|
|
|
|
}
|
|
|
|
|
|
|
|
return needle_pos == needle_end;
|
|
|
|
}
|
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2021-01-31 02:36:52 +00:00
|
|
|
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * haystack_end, const CharT * pos) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2022-01-25 18:56:05 +00:00
|
|
|
if (pageSafe(pos) && !force_fallback)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
|
|
|
|
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
|
|
|
|
const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu);
|
|
|
|
const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
|
|
|
|
const auto mask = _mm_movemask_epi8(v_against_l_or_u);
|
|
|
|
|
|
|
|
if (0xffff == cachemask)
|
|
|
|
{
|
|
|
|
if (mask == cachemask)
|
|
|
|
{
|
2021-10-26 10:32:07 +00:00
|
|
|
if (compareTrivial(pos, haystack_end, needle))
|
2017-04-01 07:20:54 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if ((mask & cachemask) == cachemask)
|
2021-10-26 10:32:07 +00:00
|
|
|
{
|
|
|
|
if (compareTrivial(pos, haystack_end, needle))
|
|
|
|
return true;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (*pos == l || *pos == u)
|
|
|
|
{
|
|
|
|
pos += first_needle_symbol_is_ascii;
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * needle_pos = needle + first_needle_symbol_is_ascii;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2021-10-26 10:32:07 +00:00
|
|
|
if (compareTrivial(pos, haystack_end, needle_pos))
|
2017-04-01 07:20:54 +00:00
|
|
|
return true;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2020-02-20 19:38:18 +00:00
|
|
|
/** Returns haystack_end if not found.
|
|
|
|
*/
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
if (0 == needle_size)
|
|
|
|
return haystack;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (haystack < haystack_end)
|
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2022-01-25 18:56:05 +00:00
|
|
|
if (haystack + n <= haystack_end && pageSafe(haystack) && !force_fallback)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
|
|
|
|
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
|
|
|
|
const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu);
|
|
|
|
const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
|
|
|
|
|
|
|
|
const auto mask = _mm_movemask_epi8(v_against_l_or_u);
|
|
|
|
|
|
|
|
if (mask == 0)
|
|
|
|
{
|
|
|
|
haystack += n;
|
2018-05-07 02:01:11 +00:00
|
|
|
UTF8::syncForward(haystack, haystack_end);
|
2017-04-01 07:20:54 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
const auto offset = __builtin_ctz(mask);
|
|
|
|
haystack += offset;
|
|
|
|
|
2020-09-03 18:13:57 +00:00
|
|
|
if (haystack + n <= haystack_end && pageSafe(haystack))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
|
|
|
|
const auto v_against_l_offset = _mm_cmpeq_epi8(v_haystack_offset, cachel);
|
|
|
|
const auto v_against_u_offset = _mm_cmpeq_epi8(v_haystack_offset, cacheu);
|
|
|
|
const auto v_against_l_or_u_offset = _mm_or_si128(v_against_l_offset, v_against_u_offset);
|
2021-10-26 10:32:07 +00:00
|
|
|
const auto mask_offset_both = _mm_movemask_epi8(v_against_l_or_u_offset);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
if (0xffff == cachemask)
|
|
|
|
{
|
2021-10-26 10:32:07 +00:00
|
|
|
if (mask_offset_both == cachemask)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2021-10-26 10:32:07 +00:00
|
|
|
if (compareTrivial(haystack, haystack_end, needle))
|
2017-04-01 07:20:54 +00:00
|
|
|
return haystack;
|
|
|
|
}
|
|
|
|
}
|
2021-10-26 10:32:07 +00:00
|
|
|
else if ((mask_offset_both & cachemask) == cachemask)
|
|
|
|
{
|
|
|
|
if (compareTrivial(haystack, haystack_end, needle))
|
|
|
|
return haystack;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
/// first octet was ok, but not the first 16, move to start of next sequence and reapply
|
2018-05-07 02:01:11 +00:00
|
|
|
haystack += UTF8::seqLength(*haystack);
|
2017-04-01 07:20:54 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (haystack == haystack_end)
|
|
|
|
return haystack_end;
|
|
|
|
|
|
|
|
if (*haystack == l || *haystack == u)
|
|
|
|
{
|
|
|
|
auto haystack_pos = haystack + first_needle_symbol_is_ascii;
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * needle_pos = needle + first_needle_symbol_is_ascii;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-10-26 10:32:07 +00:00
|
|
|
if (compareTrivial(haystack_pos, haystack_end, needle_pos))
|
2017-04-01 07:20:54 +00:00
|
|
|
return haystack;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// advance to the start of the next sequence
|
2018-05-07 02:01:11 +00:00
|
|
|
haystack += UTF8::seqLength(*haystack);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return haystack_end;
|
|
|
|
}
|
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
return search(haystack, haystack + haystack_size);
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
};
|
|
|
|
|
2016-01-16 00:45:19 +00:00
|
|
|
|
2015-10-14 12:12:56 +00:00
|
|
|
/// Case-insensitive ASCII searcher
|
2016-01-16 00:45:19 +00:00
|
|
|
template <>
|
2022-01-25 18:56:05 +00:00
|
|
|
class StringSearcher<false, true> : public StringSearcherBase
|
2015-10-14 12:12:56 +00:00
|
|
|
{
|
2016-01-16 00:45:19 +00:00
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
/// string to be searched for
|
2020-02-22 05:46:35 +00:00
|
|
|
const uint8_t * const needle;
|
|
|
|
const uint8_t * const needle_end;
|
2017-04-01 07:20:54 +00:00
|
|
|
/// lower and uppercase variants of the first character in `needle`
|
2020-02-22 05:46:35 +00:00
|
|
|
uint8_t l{};
|
|
|
|
uint8_t u{};
|
2016-01-16 00:45:19 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-04-01 07:20:54 +00:00
|
|
|
/// vectors filled with `l` and `u`, for determining leftmost position of the first symbol
|
|
|
|
__m128i patl, patu;
|
|
|
|
/// lower and uppercase vectors of first 16 characters of `needle`
|
|
|
|
__m128i cachel = _mm_setzero_si128(), cacheu = _mm_setzero_si128();
|
|
|
|
int cachemask{};
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2015-10-14 12:12:56 +00:00
|
|
|
|
|
|
|
public:
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
StringSearcher(const CharT * needle_, const size_t needle_size)
|
|
|
|
: needle{reinterpret_cast<const uint8_t *>(needle_)}, needle_end{needle + needle_size}
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
if (0 == needle_size)
|
|
|
|
return;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2020-02-22 05:46:35 +00:00
|
|
|
l = static_cast<uint8_t>(std::tolower(*needle));
|
|
|
|
u = static_cast<uint8_t>(std::toupper(*needle));
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-04-01 07:20:54 +00:00
|
|
|
patl = _mm_set1_epi8(l);
|
|
|
|
patu = _mm_set1_epi8(u);
|
|
|
|
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * needle_pos = needle;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-06-15 19:55:21 +00:00
|
|
|
for (const auto i : collections::range(0, n))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
cachel = _mm_srli_si128(cachel, 1);
|
|
|
|
cacheu = _mm_srli_si128(cacheu, 1);
|
|
|
|
|
|
|
|
if (needle_pos != needle_end)
|
|
|
|
{
|
|
|
|
cachel = _mm_insert_epi8(cachel, std::tolower(*needle_pos), n - 1);
|
|
|
|
cacheu = _mm_insert_epi8(cacheu, std::toupper(*needle_pos), n - 1);
|
|
|
|
cachemask |= 1 << i;
|
|
|
|
++needle_pos;
|
|
|
|
}
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-05-10 04:00:19 +00:00
|
|
|
if (pageSafe(pos))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
|
|
|
|
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
|
|
|
|
const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu);
|
|
|
|
const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
|
|
|
|
const auto mask = _mm_movemask_epi8(v_against_l_or_u);
|
|
|
|
|
|
|
|
if (0xffff == cachemask)
|
|
|
|
{
|
|
|
|
if (mask == cachemask)
|
|
|
|
{
|
|
|
|
pos += n;
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * needle_pos = needle + n;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
while (needle_pos < needle_end && std::tolower(*pos) == std::tolower(*needle_pos))
|
2018-08-26 01:24:21 +00:00
|
|
|
{
|
|
|
|
++pos;
|
|
|
|
++needle_pos;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
if (needle_pos == needle_end)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if ((mask & cachemask) == cachemask)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (*pos == l || *pos == u)
|
|
|
|
{
|
|
|
|
++pos;
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * needle_pos = needle + 1;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (needle_pos < needle_end && std::tolower(*pos) == std::tolower(*needle_pos))
|
2018-08-26 01:24:21 +00:00
|
|
|
{
|
|
|
|
++pos;
|
|
|
|
++needle_pos;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (needle_pos == needle_end)
|
|
|
|
return true;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-06-12 02:35:25 +00:00
|
|
|
if (needle == needle_end)
|
2017-04-01 07:20:54 +00:00
|
|
|
return haystack;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (haystack < haystack_end)
|
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-05-10 04:00:19 +00:00
|
|
|
if (haystack + n <= haystack_end && pageSafe(haystack))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
|
|
|
|
const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
|
|
|
|
const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu);
|
|
|
|
const auto v_against_l_or_u = _mm_or_si128(v_against_l, v_against_u);
|
|
|
|
|
|
|
|
const auto mask = _mm_movemask_epi8(v_against_l_or_u);
|
|
|
|
|
|
|
|
if (mask == 0)
|
|
|
|
{
|
|
|
|
haystack += n;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
const auto offset = __builtin_ctz(mask);
|
|
|
|
haystack += offset;
|
|
|
|
|
2020-09-03 18:13:57 +00:00
|
|
|
if (haystack + n <= haystack_end && pageSafe(haystack))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
|
|
|
|
const auto v_against_l_offset = _mm_cmpeq_epi8(v_haystack_offset, cachel);
|
|
|
|
const auto v_against_u_offset = _mm_cmpeq_epi8(v_haystack_offset, cacheu);
|
|
|
|
const auto v_against_l_or_u_offset = _mm_or_si128(v_against_l_offset, v_against_u_offset);
|
|
|
|
const auto mask_offset = _mm_movemask_epi8(v_against_l_or_u_offset);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
if (0xffff == cachemask)
|
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
if (mask_offset == cachemask)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * haystack_pos = haystack + n;
|
|
|
|
const auto * needle_pos = needle + n;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
while (haystack_pos < haystack_end && needle_pos < needle_end &&
|
|
|
|
std::tolower(*haystack_pos) == std::tolower(*needle_pos))
|
2018-08-26 01:24:21 +00:00
|
|
|
{
|
|
|
|
++haystack_pos;
|
|
|
|
++needle_pos;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
if (needle_pos == needle_end)
|
|
|
|
return haystack;
|
|
|
|
}
|
|
|
|
}
|
2019-01-04 12:10:00 +00:00
|
|
|
else if ((mask_offset & cachemask) == cachemask)
|
2017-04-01 07:20:54 +00:00
|
|
|
return haystack;
|
|
|
|
|
|
|
|
++haystack;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (haystack == haystack_end)
|
|
|
|
return haystack_end;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (*haystack == l || *haystack == u)
|
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * haystack_pos = haystack + 1;
|
|
|
|
const auto * needle_pos = needle + 1;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (haystack_pos < haystack_end && needle_pos < needle_end &&
|
|
|
|
std::tolower(*haystack_pos) == std::tolower(*needle_pos))
|
2018-08-26 01:24:21 +00:00
|
|
|
{
|
|
|
|
++haystack_pos;
|
|
|
|
++needle_pos;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (needle_pos == needle_end)
|
|
|
|
return haystack;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
++haystack;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
return haystack_end;
|
|
|
|
}
|
2016-01-27 03:11:28 +00:00
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
return search(haystack, haystack + haystack_size);
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
};
|
|
|
|
|
2016-01-16 00:45:19 +00:00
|
|
|
|
2015-10-14 12:12:56 +00:00
|
|
|
/// Case-sensitive searcher (both ASCII and UTF-8)
|
2016-01-16 00:45:19 +00:00
|
|
|
template <bool ASCII>
|
2022-01-25 18:56:05 +00:00
|
|
|
class StringSearcher<true, ASCII> : public StringSearcherBase
|
2015-10-14 12:12:56 +00:00
|
|
|
{
|
2016-01-16 00:45:19 +00:00
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
/// string to be searched for
|
2020-02-22 05:46:35 +00:00
|
|
|
const uint8_t * const needle;
|
|
|
|
const uint8_t * const needle_end;
|
2017-04-01 07:20:54 +00:00
|
|
|
/// first character in `needle`
|
2020-02-22 05:46:35 +00:00
|
|
|
uint8_t first{};
|
2016-01-16 00:45:19 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-04-01 07:20:54 +00:00
|
|
|
/// vector filled `first` for determining leftmost position of the first symbol
|
|
|
|
__m128i pattern;
|
|
|
|
/// vector of first 16 characters of `needle`
|
|
|
|
__m128i cache = _mm_setzero_si128();
|
|
|
|
int cachemask{};
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2015-10-14 12:12:56 +00:00
|
|
|
|
|
|
|
public:
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
StringSearcher(const CharT * needle_, const size_t needle_size)
|
|
|
|
: needle{reinterpret_cast<const uint8_t *>(needle_)}, needle_end{needle + needle_size}
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
if (0 == needle_size)
|
|
|
|
return;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
first = *needle;
|
2016-01-16 00:45:19 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-04-01 07:20:54 +00:00
|
|
|
pattern = _mm_set1_epi8(first);
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * needle_pos = needle;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2021-06-15 19:55:21 +00:00
|
|
|
for (const auto i : collections::range(0, n))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
cache = _mm_srli_si128(cache, 1);
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (needle_pos != needle_end)
|
|
|
|
{
|
|
|
|
cache = _mm_insert_epi8(cache, *needle_pos, n - 1);
|
|
|
|
cachemask |= 1 << i;
|
|
|
|
++needle_pos;
|
|
|
|
}
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-05-10 04:00:19 +00:00
|
|
|
if (pageSafe(pos))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
|
|
|
|
const auto v_against_cache = _mm_cmpeq_epi8(v_haystack, cache);
|
|
|
|
const auto mask = _mm_movemask_epi8(v_against_cache);
|
|
|
|
|
|
|
|
if (0xffff == cachemask)
|
|
|
|
{
|
|
|
|
if (mask == cachemask)
|
|
|
|
{
|
|
|
|
pos += n;
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * needle_pos = needle + n;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
while (needle_pos < needle_end && *pos == *needle_pos)
|
|
|
|
++pos, ++needle_pos;
|
|
|
|
|
|
|
|
if (needle_pos == needle_end)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if ((mask & cachemask) == cachemask)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (*pos == first)
|
|
|
|
{
|
|
|
|
++pos;
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * needle_pos = needle + 1;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (needle_pos < needle_end && *pos == *needle_pos)
|
|
|
|
++pos, ++needle_pos;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (needle_pos == needle_end)
|
|
|
|
return true;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-06-12 02:35:25 +00:00
|
|
|
if (needle == needle_end)
|
2017-04-01 07:20:54 +00:00
|
|
|
return haystack;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (haystack < haystack_end)
|
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_1__
|
2017-05-10 04:00:19 +00:00
|
|
|
if (haystack + n <= haystack_end && pageSafe(haystack))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
/// find first character
|
|
|
|
const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
|
|
|
|
const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern);
|
|
|
|
|
|
|
|
const auto mask = _mm_movemask_epi8(v_against_pattern);
|
|
|
|
|
|
|
|
/// first character not present in 16 octets starting at `haystack`
|
|
|
|
if (mask == 0)
|
|
|
|
{
|
|
|
|
haystack += n;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
const auto offset = __builtin_ctz(mask);
|
|
|
|
haystack += offset;
|
|
|
|
|
2020-09-03 18:13:57 +00:00
|
|
|
if (haystack + n <= haystack_end && pageSafe(haystack))
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
/// check for first 16 octets
|
2019-01-04 12:10:00 +00:00
|
|
|
const auto v_haystack_offset = _mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
|
|
|
|
const auto v_against_cache = _mm_cmpeq_epi8(v_haystack_offset, cache);
|
|
|
|
const auto mask_offset = _mm_movemask_epi8(v_against_cache);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
if (0xffff == cachemask)
|
|
|
|
{
|
2019-01-04 12:10:00 +00:00
|
|
|
if (mask_offset == cachemask)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * haystack_pos = haystack + n;
|
|
|
|
const auto * needle_pos = needle + n;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
while (haystack_pos < haystack_end && needle_pos < needle_end &&
|
|
|
|
*haystack_pos == *needle_pos)
|
|
|
|
++haystack_pos, ++needle_pos;
|
|
|
|
|
|
|
|
if (needle_pos == needle_end)
|
|
|
|
return haystack;
|
|
|
|
}
|
|
|
|
}
|
2019-01-04 12:10:00 +00:00
|
|
|
else if ((mask_offset & cachemask) == cachemask)
|
2017-04-01 07:20:54 +00:00
|
|
|
return haystack;
|
|
|
|
|
|
|
|
++haystack;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2016-01-16 00:45:19 +00:00
|
|
|
#endif
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (haystack == haystack_end)
|
|
|
|
return haystack_end;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (*haystack == first)
|
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
const auto * haystack_pos = haystack + 1;
|
|
|
|
const auto * needle_pos = needle + 1;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (haystack_pos < haystack_end && needle_pos < needle_end &&
|
|
|
|
*haystack_pos == *needle_pos)
|
|
|
|
++haystack_pos, ++needle_pos;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (needle_pos == needle_end)
|
|
|
|
return haystack;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
++haystack;
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
return haystack_end;
|
|
|
|
}
|
2016-01-27 03:11:28 +00:00
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
return search(haystack, haystack + haystack_size);
|
|
|
|
}
|
2015-10-14 12:12:56 +00:00
|
|
|
};
|
|
|
|
|
2019-08-21 08:12:39 +00:00
|
|
|
// Searches for needle surrounded by token-separators.
|
|
|
|
// Separators are anything inside ASCII (0-128) and not alphanum.
|
|
|
|
// Any value outside of basic ASCII (>=128) is considered a non-separator symbol, hence UTF-8 strings
|
|
|
|
// should work just fine. But any Unicode whitespace is not considered a token separtor.
|
|
|
|
template <typename StringSearcher>
|
2022-01-26 10:45:26 +00:00
|
|
|
class TokenSearcher : public StringSearcherBase
|
2019-08-21 08:12:39 +00:00
|
|
|
{
|
|
|
|
StringSearcher searcher;
|
|
|
|
size_t needle_size;
|
|
|
|
|
|
|
|
public:
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
TokenSearcher(const CharT * needle_, const size_t needle_size_)
|
2019-08-21 08:12:39 +00:00
|
|
|
: searcher{needle_, needle_size_},
|
|
|
|
needle_size(needle_size_)
|
|
|
|
{
|
2020-01-03 15:28:38 +00:00
|
|
|
if (std::any_of(needle_, needle_ + needle_size_, isTokenSeparator))
|
2019-08-21 08:12:39 +00:00
|
|
|
{
|
2019-08-23 16:08:27 +00:00
|
|
|
throw Exception{"Needle must not contain whitespace or separator characters", ErrorCodes::BAD_ARGUMENTS};
|
2019-08-21 08:12:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
ALWAYS_INLINE bool compare(const CharT * haystack, const CharT * haystack_end, const CharT * pos) const
|
2019-08-21 08:12:39 +00:00
|
|
|
{
|
|
|
|
// use searcher only if pos is in the beginning of token and pos + searcher.needle_size is end of token.
|
|
|
|
if (isToken(haystack, haystack_end, pos))
|
|
|
|
return searcher.compare(haystack, haystack_end, pos);
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
2019-08-21 08:12:39 +00:00
|
|
|
{
|
|
|
|
// use searcher.search(), then verify that returned value is a token
|
|
|
|
// if it is not, skip it and re-run
|
|
|
|
|
2020-02-22 05:46:35 +00:00
|
|
|
const auto * pos = haystack;
|
2019-08-21 08:12:39 +00:00
|
|
|
while (pos < haystack_end)
|
|
|
|
{
|
|
|
|
pos = searcher.search(pos, haystack_end);
|
|
|
|
if (pos == haystack_end || isToken(haystack, haystack_end, pos))
|
|
|
|
return pos;
|
|
|
|
|
|
|
|
// assuming that heendle does not contain any token separators.
|
|
|
|
pos += needle_size;
|
|
|
|
}
|
|
|
|
return haystack_end;
|
|
|
|
}
|
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
2019-08-21 08:12:39 +00:00
|
|
|
{
|
|
|
|
return search(haystack, haystack + haystack_size);
|
|
|
|
}
|
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
2020-02-22 05:46:35 +00:00
|
|
|
ALWAYS_INLINE bool isToken(const CharT * haystack, const CharT * const haystack_end, const CharT* p) const
|
2019-08-21 08:12:39 +00:00
|
|
|
{
|
|
|
|
return (p == haystack || isTokenSeparator(*(p - 1)))
|
|
|
|
&& (p + needle_size >= haystack_end || isTokenSeparator(*(p + needle_size)));
|
|
|
|
}
|
|
|
|
|
2020-02-22 05:46:35 +00:00
|
|
|
ALWAYS_INLINE static bool isTokenSeparator(const uint8_t c)
|
2019-08-21 08:12:39 +00:00
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
return !(isAlphaNumericASCII(c) || !isASCII(c));
|
2019-08-21 08:12:39 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2015-10-14 12:12:56 +00:00
|
|
|
|
|
|
|
/// Convenience aliases for the searcher instantiations.
/// NOTE(review): judging by the alias names, the first StringSearcher template argument selects
/// case sensitivity and the second selects ASCII (true) vs UTF-8 (false) handling — confirm against
/// the StringSearcher declaration. The token searchers wrap the corresponding ASCII string searchers.
using ASCIICaseSensitiveStringSearcher = StringSearcher<true, true>;
|
|
|
|
using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>;
|
|
|
|
using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>;
|
|
|
|
using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
|
2019-08-21 08:12:39 +00:00
|
|
|
using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
|
2019-08-26 08:00:48 +00:00
|
|
|
using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
|
2015-10-14 12:12:56 +00:00
|
|
|
|
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use fallback sorters
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher
which call ::strstr(), resp. ::strcasestr(). These functions assume that
the haystack is 0-terminated and they even document that. However, the
callers did not check if the haystack contains 0-byte (perhaps because
its sort of expensive). As a consequence, if the haystack contained a
zero byte in it's payload, matches behind this zero byte were ignored.
create table t (id UInt32, pattern String) engine = MergeTree() order by id;
insert into t values (1, 'x');
select countSubstrings('aaaxxxaa\0xxx', pattern) from t;
We returned 3 before this commit, now we return 6
2022-06-29 15:08:16 +00:00
|
|
|
/// Use only with short haystacks where cheap initialization is required.
|
|
|
|
template <bool CaseInsensitive>
|
|
|
|
struct StdLibASCIIStringSearcher : public StringSearcherBase
|
2016-01-27 03:26:36 +00:00
|
|
|
{
|
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use fallback sorters
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher
which call ::strstr(), resp. ::strcasestr(). These functions assume that
the haystack is 0-terminated and they even document that. However, the
callers did not check if the haystack contains 0-byte (perhaps because
its sort of expensive). As a consequence, if the haystack contained a
zero byte in it's payload, matches behind this zero byte were ignored.
create table t (id UInt32, pattern String) engine = MergeTree() order by id;
insert into t values (1, 'x');
select countSubstrings('aaaxxxaa\0xxx', pattern) from t;
We returned 3 before this commit, now we return 6
2022-06-29 15:08:16 +00:00
|
|
|
const char * const needle_start;
|
|
|
|
const char * const needle_end;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use fallback sorters
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher
which call ::strstr(), resp. ::strcasestr(). These functions assume that
the haystack is 0-terminated and they even document that. However, the
callers did not check if the haystack contains 0-byte (perhaps because
its sort of expensive). As a consequence, if the haystack contained a
zero byte in it's payload, matches behind this zero byte were ignored.
create table t (id UInt32, pattern String) engine = MergeTree() order by id;
insert into t values (1, 'x');
select countSubstrings('aaaxxxaa\0xxx', pattern) from t;
We returned 3 before this commit, now we return 6
2022-06-29 15:08:16 +00:00
|
|
|
StdLibASCIIStringSearcher(const CharT * const needle_start_, const size_t needle_size_)
|
|
|
|
: needle_start{reinterpret_cast<const char *>(needle_start_)}
|
|
|
|
, needle_end{reinterpret_cast<const char *>(needle_start) + needle_size_}
|
|
|
|
{}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use fallback sorters
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher
which call ::strstr(), resp. ::strcasestr(). These functions assume that
the haystack is 0-terminated and they even document that. However, the
callers did not check if the haystack contains 0-byte (perhaps because
its sort of expensive). As a consequence, if the haystack contained a
zero byte in it's payload, matches behind this zero byte were ignored.
create table t (id UInt32, pattern String) engine = MergeTree() order by id;
insert into t values (1, 'x');
select countSubstrings('aaaxxxaa\0xxx', pattern) from t;
We returned 3 before this commit, now we return 6
2022-06-29 15:08:16 +00:00
|
|
|
const CharT * search(const CharT * haystack_start, const CharT * const haystack_end) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use fallback sorters
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher
which call ::strstr(), resp. ::strcasestr(). These functions assume that
the haystack is 0-terminated and they even document that. However, the
callers did not check if the haystack contains 0-byte (perhaps because
its sort of expensive). As a consequence, if the haystack contained a
zero byte in it's payload, matches behind this zero byte were ignored.
create table t (id UInt32, pattern String) engine = MergeTree() order by id;
insert into t values (1, 'x');
select countSubstrings('aaaxxxaa\0xxx', pattern) from t;
We returned 3 before this commit, now we return 6
2022-06-29 15:08:16 +00:00
|
|
|
if constexpr (CaseInsensitive)
|
|
|
|
{
|
|
|
|
return std::search(
|
|
|
|
haystack_start, haystack_end, needle_start, needle_end,
|
|
|
|
[](char c1, char c2) {return std::toupper(c1) == std::toupper(c2);});
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
return std::search(
|
|
|
|
haystack_start, haystack_end, needle_start, needle_end);
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2022-03-15 19:37:28 +00:00
|
|
|
template <typename CharT>
|
|
|
|
requires (sizeof(CharT) == 1)
|
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use fallback sorters
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher
which call ::strstr(), resp. ::strcasestr(). These functions assume that
the haystack is 0-terminated and they even document that. However, the
callers did not check if the haystack contains 0-byte (perhaps because
its sort of expensive). As a consequence, if the haystack contained a
zero byte in it's payload, matches behind this zero byte were ignored.
create table t (id UInt32, pattern String) engine = MergeTree() order by id;
insert into t values (1, 'x');
select countSubstrings('aaaxxxaa\0xxx', pattern) from t;
We returned 3 before this commit, now we return 6
2022-06-29 15:08:16 +00:00
|
|
|
const CharT * search(const CharT * haystack_start, const size_t haystack_length) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use fallback sorters
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher
which call ::strstr(), resp. ::strcasestr(). These functions assume that
the haystack is 0-terminated and they even document that. However, the
callers did not check if the haystack contains 0-byte (perhaps because
its sort of expensive). As a consequence, if the haystack contained a
zero byte in it's payload, matches behind this zero byte were ignored.
create table t (id UInt32, pattern String) engine = MergeTree() order by id;
insert into t values (1, 'x');
select countSubstrings('aaaxxxaa\0xxx', pattern) from t;
We returned 3 before this commit, now we return 6
2022-06-29 15:08:16 +00:00
|
|
|
return search(haystack_start, haystack_start + haystack_length);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
2016-01-27 03:26:36 +00:00
|
|
|
};
|
|
|
|
|
2015-10-14 12:12:56 +00:00
|
|
|
}
|