2019-11-06 10:34:13 +00:00
|
|
|
#include <Common/UTF8Helpers.h>
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <climits>
|
|
|
|
#include <cstring>
|
|
|
|
#include <memory>
|
|
|
|
#include <utility>
|
|
|
|
|
|
|
|
#ifdef __SSE4_2__
|
|
|
|
# include <nmmintrin.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2019-11-06 10:35:55 +00:00
|
|
|
// used by FunctionsStringSimilarity and FunctionsStringHash
|
2019-12-05 03:48:40 +00:00
|
|
|
// includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word
|
2019-11-06 10:34:13 +00:00
|
|
|
template <size_t N, bool CaseInsensitive>
|
|
|
|
struct ExtractStringImpl
|
|
|
|
{
|
|
|
|
static constexpr size_t default_padding = 16;
|
|
|
|
|
|
|
|
static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end)
|
|
|
|
{
|
|
|
|
/// Offset before which we copy some data.
|
|
|
|
constexpr size_t padding_offset = default_padding - N + 1;
|
|
|
|
/// We have an array like this for ASCII (N == 4, other cases are similar)
|
|
|
|
/// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
|
|
|
|
/// And we copy ^^^^^^^^^^^^^^^ these bytes to the start
|
|
|
|
/// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
|
|
|
|
memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8));
|
|
|
|
/// Now we have an array
|
|
|
|
/// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
|
|
|
|
/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
|
|
/// Doing unaligned read of 16 bytes and copy them like above
|
|
|
|
/// 16 is also chosen to do two `movups`.
|
|
|
|
/// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them.
|
|
|
|
memcpy(code_points + (N - 1), pos, default_padding * sizeof(UInt8));
|
|
|
|
|
|
|
|
if constexpr (CaseInsensitive)
|
|
|
|
{
|
|
|
|
/// We really need template lambdas with C++20 to do it inline
|
|
|
|
unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
|
|
|
|
}
|
|
|
|
pos += padding_offset;
|
|
|
|
if (pos > end)
|
|
|
|
return default_padding - (pos - end);
|
|
|
|
return default_padding;
|
|
|
|
}
|
|
|
|
|
2019-11-06 10:35:55 +00:00
|
|
|
// used by FunctionsStringHash
|
|
|
|
// it's not easy to add padding for ColumnString, so we need safety check each memcpy
|
2019-11-06 10:34:13 +00:00
|
|
|
static ALWAYS_INLINE size_t readASCIICodePointsNoPadding(UInt8 * code_points, const char *& pos, const char * end)
|
|
|
|
{
|
|
|
|
constexpr size_t padding_offset = default_padding - N + 1;
|
|
|
|
memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8));
|
|
|
|
|
2019-11-06 10:35:55 +00:00
|
|
|
// safety check
|
2019-11-06 10:34:13 +00:00
|
|
|
size_t cpy_size = (pos + padding_offset > end) ? end - pos : padding_offset;
|
|
|
|
|
|
|
|
memcpy(code_points + (N - 1), pos, cpy_size * sizeof(UInt8));
|
|
|
|
|
|
|
|
if constexpr (CaseInsensitive)
|
|
|
|
{
|
|
|
|
unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
|
|
|
|
}
|
|
|
|
pos += padding_offset;
|
|
|
|
if (pos > end)
|
|
|
|
return default_padding - (pos - end);
|
|
|
|
return default_padding;
|
|
|
|
}
|
|
|
|
|
2019-11-06 10:35:55 +00:00
|
|
|
// read a ASCII word from pos to word
|
|
|
|
// if the word size exceeds max_word_size, only read max_word_size byte
|
|
|
|
// in FuntionsStringHash, the default value of max_word_size is 128
|
2019-11-06 10:34:13 +00:00
|
|
|
static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, const size_t & max_word_size)
|
|
|
|
{
|
2019-11-06 10:35:55 +00:00
|
|
|
// jump seperators
|
2019-11-06 10:34:13 +00:00
|
|
|
while (pos < end && !isAlphaNum(*pos))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
// word start from here
|
|
|
|
const char * word_start = pos;
|
|
|
|
while (pos < end && isAlphaNum(*pos))
|
|
|
|
++pos;
|
|
|
|
|
|
|
|
size_t word_size = (static_cast<size_t>(pos - word_start) <= max_word_size) ? pos - word_start : max_word_size;
|
|
|
|
|
|
|
|
memcpy(word, word_start, word_size);
|
|
|
|
if (CaseInsensitive)
|
|
|
|
{
|
|
|
|
std::transform(word, word + word_size, word, [](UInt8 c) { return std::tolower(c); });
|
|
|
|
}
|
|
|
|
return word_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * end)
|
|
|
|
{
|
|
|
|
memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt32));
|
|
|
|
|
|
|
|
size_t num = N - 1;
|
|
|
|
while (num < default_padding && pos < end)
|
|
|
|
{
|
|
|
|
code_points[num++] = readOneUTF8Code(pos, end);
|
|
|
|
}
|
|
|
|
return num;
|
|
|
|
}
|
|
|
|
|
2019-11-06 10:35:55 +00:00
|
|
|
// read one UTF8 word from pos to word
|
|
|
|
// also, we assume that one word size cann't exceed max_word_size with default value 128
|
2019-11-06 10:34:13 +00:00
|
|
|
static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, const size_t & max_word_size)
|
|
|
|
{
|
|
|
|
// jump UTF8 seperator
|
|
|
|
while (pos < end && isUTF8Sep(*pos))
|
|
|
|
++pos;
|
2019-11-06 10:35:55 +00:00
|
|
|
// UTF8 word's character number
|
2019-11-06 10:34:13 +00:00
|
|
|
size_t num = 0;
|
|
|
|
while (pos < end && num < max_word_size && !isUTF8Sep(*pos))
|
|
|
|
{
|
|
|
|
word[num++] = readOneUTF8Code(pos, end);
|
|
|
|
}
|
|
|
|
return num;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
static ALWAYS_INLINE inline bool isAlphaNum(const UInt8 c)
|
|
|
|
{
|
|
|
|
return (c >= 48 && c <= 57) || (c >= 65 && c <= 90) || (c >= 97 && c <= 122);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <size_t Offset, typename Container, size_t... I>
|
|
|
|
static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
|
|
|
|
{
|
|
|
|
((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
|
|
|
|
}
|
|
|
|
|
2019-11-06 10:35:55 +00:00
|
|
|
// we use ASCII non-alphanum character as UTF8 seperator
|
2019-11-06 10:34:13 +00:00
|
|
|
static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNum(c); }
|
|
|
|
|
|
|
|
// read one UTF8 character and return it
|
|
|
|
static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end)
|
|
|
|
{
|
|
|
|
size_t length = UTF8::seqLength(*pos);
|
|
|
|
|
|
|
|
if (pos + length > end)
|
|
|
|
length = end - pos;
|
|
|
|
UInt32 res;
|
|
|
|
switch (length)
|
|
|
|
{
|
|
|
|
case 1:
|
|
|
|
res = 0;
|
|
|
|
memcpy(&res, pos, 1);
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
res = 0;
|
|
|
|
memcpy(&res, pos, 2);
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
res = 0;
|
|
|
|
memcpy(&res, pos, 3);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
memcpy(&res, pos, 4);
|
|
|
|
}
|
|
|
|
|
|
|
|
if constexpr (CaseInsensitive)
|
|
|
|
{
|
|
|
|
switch (length)
|
|
|
|
{
|
|
|
|
case 4:
|
|
|
|
res &= ~(1u << (5 + 3 * CHAR_BIT));
|
|
|
|
[[fallthrough]];
|
|
|
|
case 3:
|
|
|
|
res &= ~(1u << (5 + 2 * CHAR_BIT));
|
|
|
|
[[fallthrough]];
|
|
|
|
case 2:
|
|
|
|
res &= ~(1u);
|
|
|
|
res &= ~(1u << (5 + CHAR_BIT));
|
|
|
|
[[fallthrough]];
|
|
|
|
default:
|
|
|
|
res &= ~(1u << 5);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pos += length;
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|