mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-15 10:52:30 +00:00
244 lines
8.0 KiB
C++
244 lines
8.0 KiB
C++
#include "ITokenExtractor.h"
|
|
|
|
#include <boost/algorithm/string.hpp>
|
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
|
#include <Common/UTF8Helpers.h>
|
|
#include <bit>
|
|
|
|
#if defined(__SSE2__)
|
|
#include <emmintrin.h>
|
|
|
|
#if defined(__SSE4_2__)
|
|
#include <nmmintrin.h>
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
namespace DB
|
|
{
|
|
|
|
bool NgramTokenExtractor::nextInString(const char * data, size_t length, size_t * __restrict pos, size_t * __restrict token_start, size_t * __restrict token_length) const
|
|
{
|
|
*token_start = *pos;
|
|
*token_length = 0;
|
|
size_t code_points = 0;
|
|
for (; code_points < n && *token_start + *token_length < length; ++code_points)
|
|
{
|
|
size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*token_start + *token_length]));
|
|
*token_length += sz;
|
|
}
|
|
*pos += UTF8::seqLength(static_cast<UInt8>(data[*pos]));
|
|
return code_points == n;
|
|
}
|
|
|
|
bool NgramTokenExtractor::nextInStringLike(const char * data, size_t length, size_t * pos, String & token) const
|
|
{
|
|
token.clear();
|
|
|
|
size_t code_points = 0;
|
|
bool escaped = false;
|
|
for (size_t i = *pos; i < length;)
|
|
{
|
|
if (escaped && (data[i] == '%' || data[i] == '_' || data[i] == '\\'))
|
|
{
|
|
token += data[i];
|
|
++code_points;
|
|
escaped = false;
|
|
++i;
|
|
}
|
|
else if (!escaped && (data[i] == '%' || data[i] == '_'))
|
|
{
|
|
/// This token is too small, go to the next.
|
|
token.clear();
|
|
code_points = 0;
|
|
escaped = false;
|
|
*pos = ++i;
|
|
}
|
|
else if (!escaped && data[i] == '\\')
|
|
{
|
|
escaped = true;
|
|
++i;
|
|
}
|
|
else
|
|
{
|
|
const size_t sz = UTF8::seqLength(static_cast<UInt8>(data[i]));
|
|
for (size_t j = 0; j < sz; ++j)
|
|
token += data[i + j];
|
|
i += sz;
|
|
++code_points;
|
|
escaped = false;
|
|
}
|
|
|
|
if (code_points == n)
|
|
{
|
|
*pos += UTF8::seqLength(static_cast<UInt8>(data[*pos]));
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool SplitTokenExtractor::nextInString(const char * data, size_t length, size_t * __restrict pos, size_t * __restrict token_start, size_t * __restrict token_length) const
|
|
{
|
|
*token_start = *pos;
|
|
*token_length = 0;
|
|
|
|
while (*pos < length)
|
|
{
|
|
if (isASCII(data[*pos]) && !isAlphaNumericASCII(data[*pos]))
|
|
{
|
|
/// Finish current token if any
|
|
if (*token_length > 0)
|
|
return true;
|
|
*token_start = ++*pos;
|
|
}
|
|
else
|
|
{
|
|
/// Note that UTF-8 sequence is completely consisted of non-ASCII bytes.
|
|
++*pos;
|
|
++*token_length;
|
|
}
|
|
}
|
|
|
|
return *token_length > 0;
|
|
}
|
|
|
|
bool SplitTokenExtractor::nextInStringPadded(const char * data, size_t length, size_t * __restrict pos, size_t * __restrict token_start, size_t * __restrict token_length) const
|
|
{
|
|
*token_start = *pos;
|
|
*token_length = 0;
|
|
|
|
while (*pos < length)
|
|
{
|
|
#if defined(__SSE2__) && !defined(MEMORY_SANITIZER) /// We read uninitialized bytes and decide on the calculated mask
|
|
// NOTE: we assume that `data` string is padded from the right with 15 bytes.
|
|
const __m128i haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + *pos));
|
|
const size_t haystack_length = 16;
|
|
|
|
#if defined(__SSE4_2__)
|
|
// With the help of https://www.strchr.com/strcmp_and_strlen_using_sse_4.2
|
|
const auto alnum_chars_ranges = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
|
|
'\xFF', '\x80', 'z', 'a', 'Z', 'A', '9', '0');
|
|
// Every bit represents if `haystack` character is in the ranges (1) or not (0)
|
|
const unsigned result_bitmask = _mm_cvtsi128_si32(_mm_cmpestrm(alnum_chars_ranges, 8, haystack, haystack_length, _SIDD_CMP_RANGES));
|
|
#else
|
|
// NOTE: -1 and +1 required since SSE2 has no `>=` and `<=` instructions on packed 8-bit integers (epi8).
|
|
const auto number_begin = _mm_set1_epi8('0' - 1);
|
|
const auto number_end = _mm_set1_epi8('9' + 1);
|
|
const auto alpha_lower_begin = _mm_set1_epi8('a' - 1);
|
|
const auto alpha_lower_end = _mm_set1_epi8('z' + 1);
|
|
const auto alpha_upper_begin = _mm_set1_epi8('A' - 1);
|
|
const auto alpha_upper_end = _mm_set1_epi8('Z' + 1);
|
|
const auto zero = _mm_set1_epi8(0);
|
|
|
|
// every bit represents if `haystack` character `c` satisfies condition:
|
|
// (c < 0) || (c > '0' - 1 && c < '9' + 1) || (c > 'a' - 1 && c < 'z' + 1) || (c > 'A' - 1 && c < 'Z' + 1)
|
|
// < 0 since _mm_cmplt_epi8 threats chars as SIGNED, and so all chars > 0x80 are negative.
|
|
const unsigned result_bitmask = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(_mm_or_si128(
|
|
_mm_cmplt_epi8(haystack, zero),
|
|
_mm_and_si128(_mm_cmpgt_epi8(haystack, number_begin), _mm_cmplt_epi8(haystack, number_end))),
|
|
_mm_and_si128(_mm_cmpgt_epi8(haystack, alpha_lower_begin), _mm_cmplt_epi8(haystack, alpha_lower_end))),
|
|
_mm_and_si128(_mm_cmpgt_epi8(haystack, alpha_upper_begin), _mm_cmplt_epi8(haystack, alpha_upper_end))));
|
|
#endif
|
|
if (result_bitmask == 0)
|
|
{
|
|
if (*token_length != 0)
|
|
// end of token started on previous haystack
|
|
return true;
|
|
|
|
*pos += haystack_length;
|
|
continue;
|
|
}
|
|
|
|
const auto token_start_pos_in_current_haystack = std::countr_zero(result_bitmask);
|
|
if (*token_length == 0)
|
|
// new token
|
|
*token_start = *pos + token_start_pos_in_current_haystack;
|
|
else if (token_start_pos_in_current_haystack != 0)
|
|
// end of token starting in one of previous haystacks
|
|
return true;
|
|
|
|
const auto token_bytes_in_current_haystack = std::countr_zero(~(result_bitmask >> token_start_pos_in_current_haystack));
|
|
*token_length += token_bytes_in_current_haystack;
|
|
|
|
*pos += token_start_pos_in_current_haystack + token_bytes_in_current_haystack;
|
|
if (token_start_pos_in_current_haystack + token_bytes_in_current_haystack == haystack_length)
|
|
// check if there are leftovers in next `haystack`
|
|
continue;
|
|
|
|
break;
|
|
#else
|
|
if (isASCII(data[*pos]) && !isAlphaNumericASCII(data[*pos]))
|
|
{
|
|
/// Finish current token if any
|
|
if (*token_length > 0)
|
|
return true;
|
|
*token_start = ++*pos;
|
|
}
|
|
else
|
|
{
|
|
/// Note that UTF-8 sequence is completely consisted of non-ASCII bytes.
|
|
++*pos;
|
|
++*token_length;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
#if defined(__SSE2__) && !defined(MEMORY_SANITIZER)
|
|
// Could happen only if string is not padded with zeros, and we accidentally hopped over the end of data.
|
|
if (*token_start > length)
|
|
return false;
|
|
*token_length = std::min(length - *token_start, *token_length);
|
|
#endif
|
|
|
|
return *token_length > 0;
|
|
}
|
|
|
|
bool SplitTokenExtractor::nextInStringLike(const char * data, size_t length, size_t * pos, String & token) const
|
|
{
|
|
token.clear();
|
|
bool bad_token = false; // % or _ before token
|
|
bool escaped = false;
|
|
while (*pos < length)
|
|
{
|
|
if (!escaped && (data[*pos] == '%' || data[*pos] == '_'))
|
|
{
|
|
token.clear();
|
|
bad_token = true;
|
|
++*pos;
|
|
}
|
|
else if (!escaped && data[*pos] == '\\')
|
|
{
|
|
escaped = true;
|
|
++*pos;
|
|
}
|
|
else if (isASCII(data[*pos]) && !isAlphaNumericASCII(data[*pos]))
|
|
{
|
|
if (!bad_token && !token.empty())
|
|
return true;
|
|
|
|
token.clear();
|
|
bad_token = false;
|
|
escaped = false;
|
|
++*pos;
|
|
}
|
|
else
|
|
{
|
|
const size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*pos]));
|
|
for (size_t j = 0; j < sz; ++j)
|
|
{
|
|
token += data[*pos];
|
|
++*pos;
|
|
}
|
|
escaped = false;
|
|
}
|
|
}
|
|
|
|
return !bad_token && !token.empty();
|
|
}
|
|
|
|
}
|