ClickHouse/src/Functions/ExtractString.h

#pragma once
#include <Common/PODArray.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/UTF8Helpers.h>

#include <algorithm>
#include <climits>
#include <cstring>
#include <memory>
#include <utility>

#ifdef __SSE4_2__
#    include <nmmintrin.h>
#endif

namespace DB
{
/// Helpers that cut a string either into sliding windows of code points (n-grams) or into words.
/// Used by FunctionsStringSimilarity and FunctionsStringHash.
/// Four extraction modes are provided: ASCII n-gram, UTF-8 n-gram, ASCII word and UTF-8 word.
///
/// Template parameters:
///   N               - size of the n-gram window. The n-gram readers keep the last `N - 1` code points
///                     of the previous read at the beginning of the buffer. The word readers do not use N.
///   CaseInsensitive - when true, the data is case-folded while it is being read.
template <size_t N, bool CaseInsensitive>
struct ExtractStringImpl
{
    /// Padding from ColumnsString. It is a number of bytes we can always read starting from pos if pos < end.
    static constexpr size_t default_padding = 16;

    /// The n-gram readers consume `default_padding - (N - 1)` new code points per call. Window of size N is used.
    /// A read copies the last `N - 1` code points of the buffer to its beginning, and then reads new data after them.
    static constexpr size_t buffer_size = default_padding + N - 1;

    /// Refills the ASCII n-gram buffer with the next portion of the string.
    ///
    /// code_points - buffer of `buffer_size` bytes; it is both read (carry-over of the previous call)
    ///               and written by this function.
    /// pos         - current read position; advanced by `default_padding - N + 1` bytes,
    ///               so it may end up past `end`.
    /// end         - end of the string.
    ///
    /// Returns `default_padding` if the advanced `pos` did not pass `end`, otherwise `default_padding`
    /// minus the overshoot `pos - end`, i.e. the number of leading buffer entries that are either
    /// carried over from the previous call or were read from before `end`.
    ///
    /// Always reads `default_padding` bytes starting at `pos`, even beyond `end`;
    /// this relies on the padding described at `default_padding`.
    static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end)
    {
        /// Otherwise `N - 1` or `padding_offset` below wraps around (size_t) or `pos` never advances.
        static_assert(N >= 1 && N <= default_padding, "N-gram size must be in range [1, default_padding]");

        /// Offset before which we copy some data.
        constexpr size_t padding_offset = default_padding - N + 1;
        /// We have an array like this for ASCII (N == 4, other cases are similar)
        /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
        /// And we copy                                ^^^^^^^^^^^^^^^ these bytes to the start
        /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
        /// NOTE(review): memcpy requires the two ranges not to overlap; that holds for small N such as 4 - confirm for larger N.
        memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8));
        /// Now we have an array
        /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
        ///              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        /// Doing unaligned read of 16 bytes and copy them like above
        /// 16 is also chosen to do two `movups`.
        /// Such copying allows us to have 3 codepoints from the previous read to produce the 4-grams with them.
        memcpy(code_points + (N - 1), pos, default_padding * sizeof(UInt8));

        if constexpr (CaseInsensitive)
        {
            /// We really need template lambdas with C++20 to do it inline.
            /// Only entries [N - 1, default_padding) are lowered: these are the bytes consumed by this call.
            /// The remaining tail is read again (and lowered) by the next call, because pos advances
            /// by padding_offset only.
            unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
        }
        pos += padding_offset;
        if (pos > end)
            return default_padding - (pos - end);
        return default_padding;
    }

    /// Reads one ASCII word.
    ///
    /// Skips bytes for which isAlphaNumericASCII is false, then takes the following run of bytes
    /// for which it is true. The run is stored into `word_buf` (previous content is replaced),
    /// lowered with std::tolower if CaseInsensitive.
    /// `pos` is left at the first byte after the word (or at `end`).
    /// Returns the size of the word in bytes; 0 means that no word was found before `end`.
    static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray<UInt8> & word_buf, const char *& pos, const char * end)
    {
        // jump separators
        while (pos < end && !isAlphaNumericASCII(*pos))
            ++pos;

        // word starts from here
        const char * word_start = pos;
        while (pos < end && isAlphaNumericASCII(*pos))
            ++pos;

        word_buf.assign(word_start, pos);
        /// CaseInsensitive is a template parameter, so the branch is resolved at compile time
        /// like in the other readers.
        if constexpr (CaseInsensitive)
        {
            std::transform(word_buf.begin(), word_buf.end(), word_buf.begin(), [](UInt8 c) { return std::tolower(c); });
        }
        return word_buf.size();
    }

    /// Refills the UTF-8 n-gram buffer with the next portion of the string.
    ///
    /// code_points - buffer of `buffer_size` elements; the carry-over of the previous call is moved
    ///               to the beginning, then entries starting from `N - 1` are filled.
    /// pos         - current read position; advanced past every sequence that was read.
    /// end         - end of the string.
    ///
    /// Reading stops when `default_padding` entries are filled or `pos` reaches `end`.
    /// Returns the number of filled entries including the `N - 1` carried over ones,
    /// so the result is in range [N - 1, default_padding].
    static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * end)
    {
        /// Otherwise `N - 1` or `padding_offset` below wraps around (size_t).
        static_assert(N >= 1 && N <= default_padding, "N-gram size must be in range [1, default_padding]");

        /// Same offset as in readASCIICodePoints: the tail starting here is moved to the beginning.
        constexpr size_t padding_offset = default_padding - N + 1;
        /// More than N - 1 elements may be copied (the size is rounded up), the extra ones are overwritten below.
        memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt32));

        size_t num = N - 1;
        while (num < default_padding && pos < end)
        {
            code_points[num++] = readOneUTF8Code(pos, end);
        }
        return num;
    }

    /// Reads one UTF-8 word from pos into word_buf.
    ///
    /// Skips separator bytes (see isUTF8Sep), then reads code points with readOneUTF8Code until
    /// the next separator byte or `end`. `word_buf` is cleared first, so it contains only this word.
    /// `pos` is left at the separator that terminated the word (or at `end`).
    /// Returns the number of code points in the word; 0 means that no word was found before `end`.
    static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray<UInt32> & word_buf, const char *& pos, const char * end)
    {
        // jump UTF8 separator
        while (pos < end && isUTF8Sep(*pos))
            ++pos;
        word_buf.clear();
        // collect the code points of the word
        while (pos < end && !isUTF8Sep(*pos))
        {
            word_buf.push_back(readOneUTF8Code(pos, end));
        }
        return word_buf.size();
    }

private:
    /// Applies std::tolower in place to elements cont[Offset + I] for every I of the index sequence.
    /// The fold expression is expanded at compile time, so there is no loop at runtime.
    template <size_t Offset, typename Container, size_t... I>
    static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
    {
        ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
    }

    /// We use ASCII non-alphanumeric characters as UTF8 separators.
    /// Bytes >= 128 are never separators, so they always belong to a word.
    static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }

    /// Reads one UTF-8 sequence starting at pos and returns it.
    ///
    /// The sequence length is taken from UTF8::seqLength of the first byte and is truncated
    /// if the sequence would cross `end`. The bytes are copied as is into the result with memcpy:
    /// the result is the raw encoded sequence, not the decoded code point number.
    /// `pos` is advanced by the (possibly truncated) length.
    ///
    /// Expects pos < end; both callers in this struct check it before the call.
    static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end)
    {
        size_t length = UTF8::seqLength(*pos);

        if (pos + length > end)
            length = end - pos;

        /// Zero initialized, so that the bytes not covered by a short sequence are zero.
        UInt32 res = 0;
        /// Constant sizes are used on purpose: memcpy with a constant size is a plain load.
        switch (length)
        {
            case 1:
                memcpy(&res, pos, 1);
                break;
            case 2:
                memcpy(&res, pos, 2);
                break;
            case 3:
                memcpy(&res, pos, 3);
                break;
            default:
                memcpy(&res, pos, 4);
        }

        if constexpr (CaseInsensitive)
        {
            /// Clears bit 5 in every byte of the sequence (for ASCII letters it is the bit that
            /// distinguishes lower case from upper case) and, for sequences of 2 and more bytes,
            /// also bit 0 of the result.
            /// NOTE(review): for non-ASCII sequences it looks like an approximate case folding
            /// that is good enough for hashing - confirm.
            switch (length)
            {
                case 4:
                    res &= ~(1u << (5 + 3 * CHAR_BIT));
                    [[fallthrough]];
                case 3:
                    res &= ~(1u << (5 + 2 * CHAR_BIT));
                    [[fallthrough]];
                case 2:
                    res &= ~(1u);
                    res &= ~(1u << (5 + CHAR_BIT));
                    [[fallthrough]];
                default:
                    res &= ~(1u << 5);
            }
        }
        pos += length;
        return res;
    }
};
}
Fix style. 2020-12-22 16:07:04 +00:00			`#pragma once`
update fix fix fix 2020-06-10 15:02:58 +00:00			`#include <Common/PODArray.h>`
			`#include <Common/StringUtils/StringUtils.h>`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`#include <Common/UTF8Helpers.h>`

			`#include <algorithm>`
			`#include <climits>`
			`#include <cstring>`
			`#include <memory>`
			`#include <utility>`

			`#ifdef __SSE4_2__`
			`# include <nmmintrin.h>`
			`#endif`

			`namespace DB`
			`{`
add test fix comment style fix lambda function style 2019-11-06 10:35:55 +00:00			`// used by FunctionsStringSimilarity and FunctionsStringHash`
Update ExtractString.h 2019-12-05 03:48:40 +00:00			`// includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`template <size_t N, bool CaseInsensitive>`
			`struct ExtractStringImpl`
			`{`
Refactor cose a little bit more. 2020-12-21 14:14:05 +00:00			`/// Padding form ColumnsString. It is a number of bytes we can always read starting from pos if pos < end.`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`static constexpr size_t default_padding = 16;`

Refactor cose a little bit more. 2020-12-21 14:14:05 +00:00			/// Functions are read `default_padding - (N - 1)` bytes into the buffer. Window of size N is used.
			/// Read copies `N - 1` last bytes from buffer into beginning, and then reads new bytes.
			`static constexpr size_t buffer_size = default_padding + N - 1;`

			`// the length of code_points = buffer_size`
update fix fix fix 2020-06-10 15:02:58 +00:00			`// pos: the current beginning location that we want to copy data`
Refactor cose a little bit more. 2020-12-21 14:14:05 +00:00			`// end: the end location of the string`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char & pos, const char end)`
			`{`
			`/// Offset before which we copy some data.`
			`constexpr size_t padding_offset = default_padding - N + 1;`
			`/// We have an array like this for ASCII (N == 4, other cases are similar)`
			`/// \|a0\|a1\|a2\|a3\|a4\|a5\|a6\|a7\|a8\|a9\|a10\|a11\|a12\|a13\|a14\|a15\|a16\|a17\|a18\|`
			`/// And we copy ^^^^^^^^^^^^^^^ these bytes to the start`
			`/// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction`
			`memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8));`
			`/// Now we have an array`
			`/// \|a13\|a14\|a15\|a16\|a4\|a5\|a6\|a7\|a8\|a9\|a10\|a11\|a12\|a13\|a14\|a15\|a16\|a17\|a18\|`
			`/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^`
			`/// Doing unaligned read of 16 bytes and copy them like above`
			/// 16 is also chosen to do two `movups`.
			`/// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them.`
			`memcpy(code_points + (N - 1), pos, default_padding * sizeof(UInt8));`

			`if constexpr (CaseInsensitive)`
			`{`
			`/// We really need template lambdas with C++20 to do it inline`
			`unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());`
			`}`
			`pos += padding_offset;`
			`if (pos > end)`
			`return default_padding - (pos - end);`
			`return default_padding;`
			`}`

update fix fix fix 2020-06-10 15:02:58 +00:00			`// read a ASCII word`
			`static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray<UInt8> & word_buf, const char & pos, const char end)`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`{`
more fixes after rebase 2020-12-24 10:48:10 +00:00			`// jump separators`
update update name 2020-05-22 13:23:49 +00:00			`while (pos < end && !isAlphaNumericASCII(*pos))`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`++pos;`

			`// word start from here`
			`const char * word_start = pos;`
update update name 2020-05-22 13:23:49 +00:00			`while (pos < end && isAlphaNumericASCII(*pos))`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`++pos;`

update fix fix fix 2020-06-10 15:02:58 +00:00			`word_buf.assign(word_start, pos);`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`if (CaseInsensitive)`
			`{`
update fix fix fix 2020-06-10 15:02:58 +00:00			`std::transform(word_buf.begin(), word_buf.end(), word_buf.begin(), [](UInt8 c) { return std::tolower(c); });`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`}`
update fix fix fix 2020-06-10 15:02:58 +00:00			`return word_buf.size();`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`}`

			`static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char & pos, const char end)`
			`{`
			`memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt32));`

			`size_t num = N - 1;`
			`while (num < default_padding && pos < end)`
			`{`
			`code_points[num++] = readOneUTF8Code(pos, end);`
			`}`
			`return num;`
			`}`

add test fix comment style fix lambda function style 2019-11-06 10:35:55 +00:00			`// read one UTF8 word from pos to word`
update fix fix fix 2020-06-10 15:02:58 +00:00			`static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray<UInt32> & word_buf, const char & pos, const char end)`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`{`
more fixes after rebase 2020-12-24 10:48:10 +00:00			`// jump UTF8 separator`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`while (pos < end && isUTF8Sep(*pos))`
			`++pos;`
update fix fix fix 2020-06-10 15:02:58 +00:00			`word_buf.clear();`
add test fix comment style fix lambda function style 2019-11-06 10:35:55 +00:00			`// UTF8 word's character number`
update fix fix fix 2020-06-10 15:02:58 +00:00			`while (pos < end && !isUTF8Sep(*pos))`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`{`
update fix fix fix 2020-06-10 15:02:58 +00:00			`word_buf.push_back(readOneUTF8Code(pos, end));`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`}`
update fix fix fix 2020-06-10 15:02:58 +00:00			`return word_buf.size();`
add simhash and minhash 2019-11-06 10:34:13 +00:00			`}`

			`private:`
			`template <size_t Offset, typename Container, size_t... I>`
			`static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)`
			`{`
			`((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);`
			`}`

more fixes after rebase 2020-12-24 10:48:10 +00:00			`// we use ASCII non-alphanum character as UTF8 separator`
update update name 2020-05-22 13:23:49 +00:00			`static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }`
add simhash and minhash 2019-11-06 10:34:13 +00:00
			`// read one UTF8 character and return it`
			`static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char & pos, const char end)`
			`{`
			`size_t length = UTF8::seqLength(*pos);`

			`if (pos + length > end)`
			`length = end - pos;`
			`UInt32 res;`
			`switch (length)`
			`{`
			`case 1:`
			`res = 0;`
			`memcpy(&res, pos, 1);`
			`break;`
			`case 2:`
			`res = 0;`
			`memcpy(&res, pos, 2);`
			`break;`
			`case 3:`
			`res = 0;`
			`memcpy(&res, pos, 3);`
			`break;`
			`default:`
			`memcpy(&res, pos, 4);`
			`}`

			`if constexpr (CaseInsensitive)`
			`{`
			`switch (length)`
			`{`
			`case 4:`
			`res &= ~(1u << (5 + 3 * CHAR_BIT));`
			`[[fallthrough]];`
			`case 3:`
			`res &= ~(1u << (5 + 2 * CHAR_BIT));`
			`[[fallthrough]];`
			`case 2:`
			`res &= ~(1u);`
			`res &= ~(1u << (5 + CHAR_BIT));`
			`[[fallthrough]];`
			`default:`
			`res &= ~(1u << 5);`
			`}`
			`}`
			`pos += length;`
			`return res;`
			`}`
			`};`
			`}`