2020-12-22 16:07:04 +00:00
|
|
|
#pragma once
|
2020-06-10 15:02:58 +00:00
|
|
|
#include <Common/PODArray.h>
|
|
|
|
#include <Common/StringUtils/StringUtils.h>
|
2019-11-06 10:34:13 +00:00
|
|
|
#include <Common/UTF8Helpers.h>
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <climits>
|
|
|
|
#include <cstring>
|
|
|
|
#include <memory>
|
|
|
|
#include <utility>
|
|
|
|
|
|
|
|
#ifdef __SSE4_2__
|
|
|
|
# include <nmmintrin.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2020-12-24 18:57:31 +00:00
|
|
|
|
2019-11-06 10:35:55 +00:00
|
|
|
// used by FunctionsStringSimilarity and FunctionsStringHash
|
2019-12-05 03:48:40 +00:00
|
|
|
// Provides helpers for extracting ASCII n-grams, UTF-8 n-grams, ASCII words and UTF-8 words.
|
2019-11-06 10:34:13 +00:00
|
|
|
struct ExtractStringImpl
|
|
|
|
{
|
2020-12-29 10:16:22 +00:00
|
|
|
static ALWAYS_INLINE inline const UInt8 * readOneWord(const UInt8 *& pos, const UInt8 * end)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-24 10:48:10 +00:00
|
|
|
// jump separators
|
2020-12-25 17:58:44 +00:00
|
|
|
while (pos < end && isUTF8Sep(*pos))
|
2019-11-06 10:34:13 +00:00
|
|
|
++pos;
|
|
|
|
|
|
|
|
// word start from here
|
2020-12-25 11:22:48 +00:00
|
|
|
const UInt8 * word_start = pos;
|
2020-12-25 17:58:44 +00:00
|
|
|
while (pos < end && !isUTF8Sep(*pos))
|
2019-11-06 10:34:13 +00:00
|
|
|
++pos;
|
|
|
|
|
2020-12-25 11:22:48 +00:00
|
|
|
return word_start;
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
|
|
|
|
2020-12-24 10:48:10 +00:00
|
|
|
// we use ASCII non-alphanum character as UTF8 separator
|
2020-05-22 13:23:49 +00:00
|
|
|
static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }
|
2019-11-06 10:34:13 +00:00
|
|
|
|
2020-12-25 11:22:48 +00:00
|
|
|
// read one UTF8 character
|
|
|
|
static ALWAYS_INLINE inline void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
|
|
|
size_t length = UTF8::seqLength(*pos);
|
|
|
|
|
|
|
|
if (pos + length > end)
|
|
|
|
length = end - pos;
|
|
|
|
|
|
|
|
pos += length;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|