ClickHouse/src/Functions/ExtractString.h

167 lines
5.8 KiB
C++
Raw Normal View History

2020-12-22 16:07:04 +00:00
#pragma once
2020-06-10 15:02:58 +00:00
#include <Common/PODArray.h>
#include <Common/StringUtils/StringUtils.h>
2019-11-06 10:34:13 +00:00
#include <Common/UTF8Helpers.h>
#include <algorithm>
#include <climits>
#include <cstring>
#include <memory>
#include <utility>
#ifdef __SSE4_2__
# include <nmmintrin.h>
#endif
namespace DB
{
// used by FunctionsStringSimilarity and FunctionsStringHash
2019-12-05 03:48:40 +00:00
// Provides extraction of ASCII n-grams, UTF-8 n-grams, ASCII words and UTF-8 words.
2019-11-06 10:34:13 +00:00
/** Helpers that extract n-grams and words from a string, for single-byte (ASCII) and UTF-8 input.
  *
  * N               - size of the n-gram window; the last N - 1 code points of a portion are carried
  *                   over to the beginning of the next portion.
  * CaseInsensitive - when true, the extracted code points are case-folded while they are read.
  */
template <size_t N, bool CaseInsensitive>
struct ExtractStringImpl
{
    /// Padding from ColumnString. It is a number of bytes we can always read starting from pos if pos < end.
    static constexpr size_t default_padding = 16;

    /// Every read call advances over at most `default_padding - (N - 1)` new code points. Window of size N is used.
    /// A read copies the `N - 1` last code points of the buffer into its beginning, and then reads new ones.
    /// The `code_points` buffers passed to the read functions must hold `buffer_size` elements.
    static constexpr size_t buffer_size = default_padding + N - 1;

    /** Reads the next portion of single-byte code points.
      *
      * code_points - sliding buffer of `buffer_size` elements
      * pos         - the current beginning location that we want to copy data from;
      *               it is advanced by `default_padding - (N - 1)` (it may end up past `end`)
      * end         - the end location of the string
      *
      * Returns the number of meaningful elements in `code_points`: `default_padding`, or less
      * (by the distance `pos` went past `end`) when the string ended inside this portion.
      */
    static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end)
    {
        /// Offset before which we copy some data.
        constexpr size_t padding_offset = default_padding - N + 1;

        /// We have an array like this for ASCII (N == 4, other cases are similar)
        /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
        /// And we copy                                ^^^^^^^^^^^^^^^ these bytes to the start
        /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
        memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8));

        /// Now we have an array
        /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
        ///              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        /// Doing unaligned read of 16 bytes and copy them like above
        /// 16 is also chosen to do two `movups`.
        /// Such copying allows us to have 3 codepoints from the previous read to produce the 4-grams with them.
        memcpy(code_points + (N - 1), pos, default_padding * sizeof(UInt8));

        if constexpr (CaseInsensitive)
        {
            /// We really need template lambdas with C++20 to do it inline.
            /// Only the `padding_offset` freshly consumed elements (starting at index N - 1) are lowered.
            unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
        }

        pos += padding_offset;
        if (pos > end)
            return default_padding - (pos - end);
        return default_padding;
    }

    /** Reads one ASCII word.
      *
      * Skips bytes that are not ASCII alphanumeric, then assigns the following run of alphanumeric
      * bytes to `word_buf`, lower-casing it when CaseInsensitive is set.
      * `pos` is left right after the word (or at `end`).
      *
      * Returns the size of `word_buf`, i.e. the length of the word (0 if no word was found before `end`).
      */
    static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray<UInt8> & word_buf, const char *& pos, const char * end)
    {
        /// Jump over separators.
        while (pos < end && !isAlphaNumericASCII(*pos))
            ++pos;

        /// The word starts from here.
        const char * word_start = pos;
        while (pos < end && isAlphaNumericASCII(*pos))
            ++pos;

        word_buf.assign(word_start, pos);

        /// CaseInsensitive is a template parameter, so the branch is resolved at compile time,
        /// the same way as in the other read functions.
        if constexpr (CaseInsensitive)
        {
            std::transform(word_buf.begin(), word_buf.end(), word_buf.begin(), [](UInt8 c) { return std::tolower(c); });
        }

        return word_buf.size();
    }

    /** Reads the next portion of UTF-8 code points.
      *
      * code_points - sliding buffer of `buffer_size` elements
      * pos         - the current read position, advanced past every sequence that was read
      * end         - the end location of the string
      *
      * Returns the number of meaningful elements in `code_points`: the N - 1 carried over
      * elements plus the newly read ones, `default_padding` at most.
      */
    static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * end)
    {
        /// Offset before which we copy some data (the same as in readASCIICodePoints).
        constexpr size_t padding_offset = default_padding - N + 1;

        /// Move the tail of the previous portion to the start of the buffer.
        memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt32));

        /// The first N - 1 elements are taken from the previous portion, new ones are placed after them.
        size_t num = N - 1;
        while (num < default_padding && pos < end)
            code_points[num++] = readOneUTF8Code(pos, end);

        return num;
    }

    /** Reads one UTF-8 word from `pos` into `word_buf`.
      *
      * Skips separators (see isUTF8Sep), clears `word_buf` and appends code points to it
      * until the next separator or `end`. `pos` is left right after the word (or at `end`).
      *
      * Returns the size of `word_buf`, i.e. the number of code points in the word.
      */
    static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray<UInt32> & word_buf, const char *& pos, const char * end)
    {
        /// Jump over UTF-8 separators.
        while (pos < end && isUTF8Sep(*pos))
            ++pos;

        word_buf.clear();

        /// Collect the code points of the word.
        while (pos < end && !isUTF8Sep(*pos))
            word_buf.push_back(readOneUTF8Code(pos, end));

        return word_buf.size();
    }

private:
    /// Applies std::tolower to the elements cont[Offset + I] for every I of the index sequence.
    /// It is written as a fold expression, so there is no runtime loop.
    template <size_t Offset, typename Container, size_t... I>
    static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
    {
        ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
    }

    /// We use ASCII non-alphanumeric characters as UTF-8 separators.
    /// Bytes >= 128 (parts of multi-byte sequences) are never separators.
    static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }

    /** Reads one UTF-8 sequence and returns its raw bytes packed into UInt32 (copied as they lie in memory).
      *
      * The length of the sequence is determined by UTF8::seqLength from the first byte
      * and is truncated if the sequence would cross `end`. `pos` is advanced by that length.
      * The callers in this struct invoke it only when pos < end.
      *
      * If CaseInsensitive is set, bit 5 is cleared in every byte of the sequence and, for multi-byte
      * sequences, bit 0 of the lowest byte is cleared as well (bit positions are given for the
      * resulting integer, so they correspond to consecutive bytes on a little-endian target).
      * For ASCII letters bit 5 is the only difference between upper and lower case.
      * NOTE(review): for multi-byte sequences this looks like an approximate case folding
      * rather than exact Unicode lower-casing - confirm against the callers' expectations.
      */
    static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end)
    {
        size_t length = UTF8::seqLength(*pos);

        /// Do not read past the end of the string if the last sequence is truncated.
        if (pos + length > end)
            length = end - pos;

        /// Unused high bytes stay zero for sequences shorter than 4 bytes.
        UInt32 res = 0;

        /// Every branch calls memcpy with a constant size.
        switch (length)
        {
            case 1:
                memcpy(&res, pos, 1);
                break;
            case 2:
                memcpy(&res, pos, 2);
                break;
            case 3:
                memcpy(&res, pos, 3);
                break;
            default:
                memcpy(&res, pos, 4);
        }

        if constexpr (CaseInsensitive)
        {
            /// The cases intentionally fall through: a longer sequence gets all the masks of the shorter ones.
            switch (length)
            {
                case 4:
                    res &= ~(1u << (5 + 3 * CHAR_BIT));
                    [[fallthrough]];
                case 3:
                    res &= ~(1u << (5 + 2 * CHAR_BIT));
                    [[fallthrough]];
                case 2:
                    res &= ~(1u);
                    res &= ~(1u << (5 + CHAR_BIT));
                    [[fallthrough]];
                default:
                    res &= ~(1u << 5);
            }
        }

        pos += length;
        return res;
    }
};
}