mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-01 03:52:15 +00:00
Fixed compilation issues and fixed several bugs in SplitTokenExtractor::next
* Handling all characters at or above 0x80 as symbols (fixes UTF-8 tokens) * Properly handling tokens that end exactly on a haystack boundary.
This commit is contained in:
parent
9d6c88c78e
commit
90cb6a25cf
@ -19,6 +19,10 @@
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <nmmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -609,16 +613,17 @@ bool SplitTokenExtractor::next(const char * data, size_t len, size_t * pos, size
|
||||
|
||||
while (*pos < len)
|
||||
{
|
||||
#if __SSE2__
|
||||
#if defined(__SSE2__)
|
||||
// NOTE: we assume that `data` string is padded from the right with 15 zero-bytes.
|
||||
const __m128i haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + *pos));
|
||||
const size_t haystack_length = 16;
|
||||
|
||||
#if __SSE4_2__
|
||||
#if defined(__SSE4_2__)
|
||||
// With the help of https://www.strchr.com/strcmp_and_strlen_using_sse_4.2
|
||||
static const auto alnum_chars_ranges = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'Z', 'A', 'z', 'a', '9', '0');
|
||||
static const auto alnum_chars_ranges = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
|
||||
'\xFF', '\x80', 'z', 'a', 'Z', 'A', '9', '0');
|
||||
// Every bit represents whether the corresponding `haystack` character is in the ranges (1) or not (0)
|
||||
const auto result_bitmask = _mm_cvtsi128_si32(_mm_cmpestrm(alnum_chars_ranges, 6, haystack, haystack_length, _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS));
|
||||
const int result_bitmask = _mm_cvtsi128_si32(_mm_cmpestrm(alnum_chars_ranges, 8, haystack, haystack_length, _SIDD_CMP_RANGES));
|
||||
#else
|
||||
// NOTE: -1 and +1 required since SSE2 has no `>=` and `<=` instructions on packed 8-bit integers (epi8).
|
||||
static const auto number_begin = _mm_set1_epi8('0' - 1);
|
||||
@ -627,13 +632,16 @@ bool SplitTokenExtractor::next(const char * data, size_t len, size_t * pos, size
|
||||
static const auto alpha_lower_end = _mm_set1_epi8('z' + 1);
|
||||
static const auto alpha_upper_begin = _mm_set1_epi8('A' - 1);
|
||||
static const auto alpha_upper_end = _mm_set1_epi8('Z' + 1);
|
||||
static const auto zero = _mm_set1_epi8(0);
|
||||
|
||||
// every bit represents whether `haystack` character `c` satisfies the condition:
|
||||
// (c > '0' - 1 && c < '9' + 1) || (c > 'a' - 1 && c < 'z' + 1) || (c > 'A' - 1 && c < 'Z' + 1)
|
||||
const int result_bitmask = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(
|
||||
_mm_and_si128(_mm_cmpgt_epi8(haystack, number_begin), _mm_cmplt_epi8(haystack, number_end)),
|
||||
_mm_and_si128(_mm_cmpgt_epi8(haystack, alpha_lower_begin), _mm_cmplt_epi8(haystack, alpha_lower_end))),
|
||||
_mm_and_si128(_mm_cmpgt_epi8(haystack, alpha_upper_begin), _mm_cmplt_epi8(haystack, alpha_upper_end))));
|
||||
// (c < 0) || (c > '0' - 1 && c < '9' + 1) || (c > 'a' - 1 && c < 'z' + 1) || (c > 'A' - 1 && c < 'Z' + 1)
|
||||
// < 0 since _mm_cmplt_epi8 treats chars as SIGNED, and hence all chars >= 0x80 are negative.
|
||||
const int result_bitmask = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(_mm_or_si128(
|
||||
_mm_cmplt_epi8(haystack, zero),
|
||||
_mm_and_si128(_mm_cmpgt_epi8(haystack, number_begin), _mm_cmplt_epi8(haystack, number_end))),
|
||||
_mm_and_si128(_mm_cmpgt_epi8(haystack, alpha_lower_begin), _mm_cmplt_epi8(haystack, alpha_lower_end))),
|
||||
_mm_and_si128(_mm_cmpgt_epi8(haystack, alpha_upper_begin), _mm_cmplt_epi8(haystack, alpha_upper_end))));
|
||||
#endif
|
||||
// NOTE: the __builtin_ctz family explicitly states that the result is UNDEFINED if the argument is 0
|
||||
if (result_bitmask == 0)
|
||||
@ -649,12 +657,15 @@ bool SplitTokenExtractor::next(const char * data, size_t len, size_t * pos, size
|
||||
const auto start = getTrailingZeroBits(result_bitmask);
|
||||
if (*token_len == 0)
|
||||
*token_start = *pos + start;
|
||||
else if (start != 0)
|
||||
// token is not continued in this haystack
|
||||
return true;
|
||||
|
||||
const auto l = getTrailingZeroBits(~(result_bitmask >> start));
|
||||
*token_len += l;
|
||||
|
||||
*pos += start + l;
|
||||
if (start + l == 16)
|
||||
if (start + l == haystack_length)
|
||||
// check if there are leftovers in next `haystack`
|
||||
continue;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user