// ClickHouse/dbms/src/Functions/FunctionsStringSimilarity.cpp

#include <Functions/FunctionsStringSimilarity.h>

#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsHashing.h>
#include <Common/HashTable/ClearableHashMap.h>
#include <Common/HashTable/Hash.h>
#include <Common/UTF8Helpers.h>

#include <Core/Defines.h>

#include <common/unaligned.h>

#include <algorithm>
#include <climits>
#include <cstring>
#include <limits>
#include <memory>
#include <utility>

#ifdef __SSE4_2__
#    include <nmmintrin.h>
#endif

namespace DB
{
/** Distance function implementation.
  * We calculate all the n-grams from the left string and count them in a map
  * indexed by their 16-bit hash.
  * Then we calculate all the n-grams from the right string and compute
  * the n-gram distance on the fly by adding to and subtracting from the hashmap.
  * Afterwards we restore the map to the state it was in after the left string
  * calculation. If the right string size is big (more than 2**15 bytes),
  * the strings are not similar at all and we return 1.
  */
template <size_t N, class CodePoint, bool UTF8, bool CaseInsensitive, bool Symmetric>
struct NgramDistanceImpl
{
    /// Type of the value produced for every pair of strings.
    using ResultType = Float32;

    /// Number of counters in the n-gram statistics table: one per possible 16-bit hash value.
    static constexpr size_t map_size = 1u << 16;

    /// If the haystack size is bigger than this, behaviour is unspecified for this function.
    /// The vector_* functions below do not compare strings longer than this and write a fixed value
    /// instead; the heap buffers that remember n-gram hashes are also sized by this constant.
    static constexpr size_t max_string_size = 1u << 15;

    /// Default padding to read safely: the ASCII reader copies this many bytes at once, and the
    /// constant string arguments are extended by this many bytes before being scanned.
    static constexpr size_t default_padding = 16;

    /// Max codepoints to store at once: default_padding slots plus N - 1 more for the overlap between
    /// consecutive portions. 16 is for batching usage and PODArray has this padding.
    static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;

    /** Table of n-gram counters indexed by hash (map_size * sizeof(UInt16) bytes).
      * This fits mostly in L2 cache all the time.
      * UInt16 is used because unsigned additions and subtractions wrap around instead of being
      * undefined behaviour on overflow. But think of it as a signed integer array: the haystack
      * pass reads the counters through a cast to Int16.
      */
    using NgramStats = UInt16[map_size];

    /// Hashes one ASCII n-gram: the bytes at code_points are loaded as a single UInt32 via
    /// unalignedLoad, passed through intHashCRC32, and the low 16 bits of the result are returned.
    static ALWAYS_INLINE UInt16 ASCIIHash(const CodePoint * code_points)
    {
        const UInt32 packed_ngram = unalignedLoad<UInt32>(code_points);
        const auto full_hash = intHashCRC32(packed_ngram);
        return full_hash & 0xFFFFu;
    }

    /// Hashes three consecutive code points (code_points[0..2]) down to 16 bits.
    /// The first two are packed into one 64-bit word. The third is mixed in either with the
    /// _mm_crc32_u64 instruction (when compiled with SSE 4.2) or by XOR-ing two intHashCRC32 values.
    static ALWAYS_INLINE UInt16 UTF8Hash(const CodePoint * code_points)
    {
        const UInt64 upper_half = static_cast<UInt64>(code_points[0]) << 32;
        const UInt64 combined = upper_half | code_points[1];
#ifdef __SSE4_2__
        const auto mixed = _mm_crc32_u64(code_points[2], combined);
        return mixed & 0xFFFFu;
#else
        const auto mixed = intHashCRC32(combined) ^ intHashCRC32(code_points[2]);
        return mixed & 0xFFFFu;
#endif
    }

    /// Passes the elements cont[Offset + I], for every I of the index sequence in pack order,
    /// through std::tolower and writes the results back in place.
    template <size_t Offset, class Container, size_t... I>
    static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
    {
        const auto lower_in_place = [&cont](const size_t position) { cont[position] = std::tolower(cont[position]); };
        (lower_in_place(Offset + I), ...);
    }

    /** Refills the window `code_points` with the next portion of an ASCII string.
      *
      * Each call
      *  1. moves the slots starting at index default_padding - N + 1 to the front of the window,
      *  2. copies default_padding elements from `pos` into the window starting at slot N - 1
      *     (this can read past `end`, so the caller has to provide readable padding there),
      *  3. lowercases the freshly copied slots that can take part in n-grams when CaseInsensitive is set,
      *  4. advances `pos` by default_padding - N + 1.
      *
      * Returns default_padding, reduced by the distance `pos` ended up beyond `end` (if any).
      * The callers use the result as the upper bound for n-gram start positions inside the window.
      */
    static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end)
    {
        /// Distance the input advances per call; also the window index the carried-over slots start at.
        constexpr size_t step = default_padding - N + 1;
        /// Copying N - 1 elements would be enough, but memcpy of a power-of-two size (4 bytes for N == 4)
        /// translates into one instruction.
        const size_t carried_bytes = roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint);
        const size_t fresh_bytes = default_padding * sizeof(CodePoint);

        /// The window looks like this for ASCII (N == 4, other cases are similar):
        /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
        /// and the slots a13..a16 are copied to the start, which gives
        /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
        memcpy(code_points, code_points + step, carried_bytes);

        /// Then an unaligned read of 16 bytes overwrites everything from slot N - 1 onwards
        /// (16 is also chosen to do two `movups`). As a result the N - 1 codepoints in front come from
        /// the previous read and can form n-grams together with the new ones.
        memcpy(code_points + (N - 1), pos, fresh_bytes);

        if constexpr (CaseInsensitive)
        {
            /// We really need template lambdas with C++20 to do it inline
            unrollLowering<N - 1>(code_points, std::make_index_sequence<step>());
        }

        pos += step;
        if (pos <= end)
            return default_padding;

        /// The read went past the end of the string: shrink the result by the overshoot.
        return default_padding - (pos - end);
    }

    /** Refills the window `code_points` with the next portion of a UTF-8 string.
      *
      * The slots starting at index default_padding - N + 1 are first moved to the front (same scheme as
      * in the ASCII reader). Then sequences are decoded one by one from `pos` into the slots starting at
      * N - 1, until either default_padding slots are in use or `pos` reaches `end`.
      * A "code point" here is just the raw bytes of the sequence packed into one integer.
      *
      * Returns the number of used slots (including the N - 1 leading ones) and leaves `pos` behind the
      * last consumed sequence.
      */
    static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end)
    {
        /// The same copying as described in the ASCII reader.
        memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));

        size_t filled = N - 1;
        for (; filled < default_padding && pos < end; ++filled)
        {
            /// Length announced by the lead byte, clamped to the bytes that are really left.
            const size_t announced = UTF8::seqLength(*pos);
            const size_t remaining = end - pos;
            const size_t length = std::min(announced, remaining);

            CodePoint code_point = 0;
            /// Constant-size copies are faster than one memcpy with a runtime size
            /// because of compiler optimizations with moving bytes.
            switch (length)
            {
                case 1:
                    memcpy(&code_point, pos, 1);
                    break;
                case 2:
                    memcpy(&code_point, pos, 2);
                    break;
                case 3:
                    memcpy(&code_point, pos, 3);
                    break;
                default:
                    memcpy(&code_point, pos, 4);
            }

            /// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte.
            /// And first bit of first byte if there are two bytes.
            /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most Cyrillic letters it also does.
            /// For others, we don't care now. Lowering UTF is not a cheap operation.
            if constexpr (CaseInsensitive)
            {
                /// Collect all bits to clear first, then apply them in one go.
                UInt32 cleared_bits = 1u << 5;
                switch (length)
                {
                    case 4:
                        cleared_bits |= 1u << (5 + 3 * CHAR_BIT);
                        [[fallthrough]];
                    case 3:
                        cleared_bits |= 1u << (5 + 2 * CHAR_BIT);
                        [[fallthrough]];
                    case 2:
                        cleared_bits |= 1u;
                        cleared_bits |= 1u << (5 + CHAR_BIT);
                        break;
                    default:
                        break;
                }
                code_point &= ~cleared_bits;
            }

            pos += length;
            code_points[filled] = code_point;
        }
        return filled;
    }

    /** Walks over `size` bytes at `data`, hashes every n-gram and increments its counter in `ngram_stats`.
      * With SaveNgrams == true every hash is additionally appended to `ngram_storage`
      * (one entry per n-gram, in encounter order); otherwise `ngram_storage` is not touched.
      * `read_code_points` refills the window, `hash_functor` hashes the n-gram starting at a window slot.
      * Returns the number of n-grams processed.
      */
    template <bool SaveNgrams>
    static ALWAYS_INLINE inline size_t calculateNeedleStats(
        const char * data,
        const size_t size,
        NgramStats & ngram_stats,
        [[maybe_unused]] UInt16 * ngram_storage,
        size_t (*read_code_points)(CodePoint *, const char *&, const char *),
        UInt16 (*hash_functor)(const CodePoint *))
    {
        const char * cursor = data;
        const char * const end = data + size;
        CodePoint window[simultaneously_codepoints_num] = {};

        size_t ngram_count = 0;
        /// In the very first window the leading N - 1 slots hold zero filler instead of input,
        /// so n-grams may only start at slot N - 1. Later windows are used from slot 0.
        size_t first_slot = N - 1;
        /// The reader returns the position in the window where it stopped.
        size_t filled = read_code_points(window, cursor, end);

        while (true)
        {
            for (size_t slot = first_slot; slot + N <= filled; ++slot)
            {
                const UInt16 hash = hash_functor(window + slot);
                if constexpr (SaveNgrams)
                    ngram_storage[ngram_count] = hash;
                ++ngram_count;
                ++ngram_stats[hash];
            }
            first_slot = 0;

            if (cursor >= end)
                break;

            filled = read_code_points(window, cursor, end);
            if (filled == 0)
                break;
        }

        return ngram_count;
    }

    /** Walks over the n-grams of the haystack and updates `distance` against the counters in `ngram_stats`.
      *
      * For every haystack n-gram: if its counter, read as Int16, is positive, `distance` is decremented;
      * otherwise, in the Symmetric version only, `distance` is incremented. The counter is then decremented.
      * With ReuseStats == true every hash is remembered in `ngram_storage` and all counter decrements
      * are undone before returning, so `ngram_stats` is left as it was on entry.
      * Returns the number of haystack n-grams.
      */
    template <bool ReuseStats>
    static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric(
        const char * data,
        const size_t size,
        NgramStats & ngram_stats,
        size_t & distance,
        [[maybe_unused]] UInt16 * ngram_storage,
        size_t (*read_code_points)(CodePoint *, const char *&, const char *),
        UInt16 (*hash_functor)(const CodePoint *))
    {
        const char * cursor = data;
        const char * const end = data + size;
        CodePoint window[simultaneously_codepoints_num] = {};

        size_t seen_ngrams = 0;
        /// In the very first window the leading N - 1 slots hold zero filler instead of input,
        /// so n-grams may only start at slot N - 1. Later windows are used from slot 0.
        size_t first_slot = N - 1;
        /// The reader returns the position in the window where it stopped.
        size_t filled = read_code_points(window, cursor, end);

        while (true)
        {
            for (size_t slot = first_slot; slot + N <= filled; ++slot)
            {
                const UInt16 hash = hash_functor(window + slot);
                const bool counter_is_positive = static_cast<Int16>(ngram_stats[hash]) > 0;

                /// For symmetric version we should add when we can't subtract to get symmetric difference.
                if (counter_is_positive)
                    --distance;
                else if constexpr (Symmetric)
                    ++distance;

                if constexpr (ReuseStats)
                    ngram_storage[seen_ngrams] = hash;
                ++seen_ngrams;
                --ngram_stats[hash];
            }
            first_slot = 0;

            if (cursor >= end)
                break;

            filled = read_code_points(window, cursor, end);
            if (filled == 0)
                break;
        }

        /// Return the state of hash map to its initial.
        if constexpr (ReuseStats)
        {
            const UInt16 * const stored_end = ngram_storage + seen_ngrams;
            for (const UInt16 * stored = ngram_storage; stored != stored_end; ++stored)
                ++ngram_stats[*stored];
        }
        return seen_ngrams;
    }

    /// Calls `callback` with the given arguments followed by the code point reader and the hash function
    /// that match the UTF8 template parameter, and returns whatever the callback returns.
    template <class Callback, class... Args>
    static inline auto dispatchSearcher(Callback callback, Args &&... args)
    {
        if constexpr (UTF8)
            return callback(std::forward<Args>(args)..., readUTF8CodePoints, UTF8Hash);
        else
            return callback(std::forward<Args>(args)..., readASCIICodePoints, ASCIIHash);
    }

    /** Computes the metric for two constant strings and stores it in `res`.
      * If `data` is longer than max_string_size no comparison is made and `res` is set to
      * 1 (Symmetric) or 0 (!Symmetric).
      */
    static void constant_constant(std::string data, std::string needle, Float32 & res)
    {
        /// We use unsafe versions of getting ngrams, so the strings are padded. Remember the real sizes first.
        const size_t needle_size = needle.size();
        const size_t data_size = data.size();
        needle.resize(needle_size + default_padding);
        data.resize(data_size + default_padding);

        NgramStats common_stats = {};
        const size_t needle_ngrams
            = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr);

        if (data_size > max_string_size)
        {
            res = Symmetric ? 1.f : 0.f;
            return;
        }

        size_t distance = needle_ngrams;
        const size_t data_ngrams
            = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats, distance, nullptr);

        /// For !Symmetric version we should not use data_ngrams.
        if constexpr (Symmetric)
            res = distance * 1.f / std::max(data_ngrams + needle_ngrams, size_t(1));
        else
            res = 1.f - distance * 1.f / std::max(needle_ngrams, size_t(1));
    }

    /** Computes the metric row by row for a column of haystacks and a column of needles.
      * Row i of each column spans from the previous offset up to offsets[i], with one trailing byte
      * excluded from the string size. `res[i]` receives the result; `res` is indexed but not resized here.
      * Rows where either string is longer than max_string_size get 1 (Symmetric) or 0 (!Symmetric).
      */
    static void vector_vector(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const ColumnString::Chars & needle_data,
        const ColumnString::Offsets & needle_offsets,
        PaddedPODArray<Float32> & res)
    {
        NgramStats common_stats = {};

        /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
        /// And we can reuse these storages in one thread because we care only about what was written to first places.
        std::unique_ptr<UInt16[]> needle_ngram_storage{new UInt16[max_string_size]};
        std::unique_ptr<UInt16[]> haystack_ngram_storage{new UInt16[max_string_size]};

        const size_t rows = haystack_offsets.size();
        size_t haystack_begin = 0;
        size_t needle_begin = 0;

        for (size_t row = 0; row < rows; ++row)
        {
            const char * haystack = reinterpret_cast<const char *>(&haystack_data[haystack_begin]);
            const size_t haystack_size = haystack_offsets[row] - haystack_begin - 1;
            const char * needle = reinterpret_cast<const char *>(&needle_data[needle_begin]);
            const size_t needle_size = needle_offsets[row] - needle_begin - 1;

            /// The next row starts where this one ends.
            needle_begin = needle_offsets[row];
            haystack_begin = haystack_offsets[row];

            if (needle_size > max_string_size || haystack_size > max_string_size)
            {
                /// Strings are too big, we are assuming they are not the same. This is done because of limiting number
                /// of bigrams added and not allocating too much memory.
                res[row] = Symmetric ? 1.f : 0.f;
                continue;
            }

            /// Get needle stats.
            const size_t needle_ngrams = dispatchSearcher(
                calculateNeedleStats<true>, needle, needle_size, common_stats, needle_ngram_storage.get());

            size_t distance = needle_ngrams;

            /// Combine with haystack stats, return to initial needle stats.
            const size_t haystack_ngrams = dispatchSearcher(
                calculateHaystackStatsAndMetric<true>, haystack, haystack_size, common_stats, distance, haystack_ngram_storage.get());

            /// Undo the needle counters as well, so common_stats is a zero array again for the next row.
            const UInt16 * const needle_hashes = needle_ngram_storage.get();
            for (size_t k = 0; k < needle_ngrams; ++k)
                --common_stats[needle_hashes[k]];

            /// For !Symmetric version we should not use haystack_ngrams.
            if constexpr (Symmetric)
                res[row] = distance * 1.f / std::max(haystack_ngrams + needle_ngrams, size_t(1));
            else
                res[row] = 1.f - distance * 1.f / std::max(needle_ngrams, size_t(1));
        }
    }

    /** Computes the metric for a constant haystack against a column of needles.
      * The Symmetric version simply forwards to vector_constant with the arguments swapped.
      * Otherwise each row is processed like in vector_vector; rows where the needle or the haystack
      * is longer than max_string_size get 0.
      */
    static void constant_vector(
        std::string haystack,
        const ColumnString::Chars & needle_data,
        const ColumnString::Offsets & needle_offsets,
        PaddedPODArray<Float32> & res)
    {
        if constexpr (!Symmetric)
        {
            /// The readers may fetch bytes beyond the logical end, hence the padding.
            const size_t haystack_size = haystack.size();
            haystack.resize(haystack_size + default_padding);
            const bool haystack_fits = haystack_size <= max_string_size;

            /// For logic explanation see vector_vector function.
            NgramStats common_stats = {};

            std::unique_ptr<UInt16[]> needle_ngram_storage{new UInt16[max_string_size]};
            std::unique_ptr<UInt16[]> haystack_ngram_storage{new UInt16[max_string_size]};

            const size_t rows = needle_offsets.size();
            size_t needle_begin = 0;

            for (size_t row = 0; row < rows; ++row)
            {
                const char * needle = reinterpret_cast<const char *>(&needle_data[needle_begin]);
                const size_t needle_size = needle_offsets[row] - needle_begin - 1;
                needle_begin = needle_offsets[row];

                if (!haystack_fits || needle_size > max_string_size)
                {
                    res[row] = 0.f;
                    continue;
                }

                const size_t needle_ngrams = dispatchSearcher(
                    calculateNeedleStats<true>, needle, needle_size, common_stats, needle_ngram_storage.get());

                size_t distance = needle_ngrams;

                dispatchSearcher(
                    calculateHaystackStatsAndMetric<true>,
                    haystack.data(),
                    haystack_size,
                    common_stats,
                    distance,
                    haystack_ngram_storage.get());

                /// Undo the needle counters so common_stats is all zeros for the next row.
                const UInt16 * const needle_hashes = needle_ngram_storage.get();
                for (size_t k = 0; k < needle_ngrams; ++k)
                    --common_stats[needle_hashes[k]];

                res[row] = 1.f - distance * 1.f / std::max(needle_ngrams, size_t(1));
            }
        }
        else
        {
            /// For symmetric version it is better to use vector_constant
            vector_constant(needle_data, needle_offsets, std::move(haystack), res);
        }
    }

    /** Computes the metric for a column of haystacks against a constant needle.
      * The needle counters are built once; for every row the haystack pass updates a fresh copy of the
      * distance and restores the counters afterwards. Rows with a haystack longer than
      * max_string_size get 1 (Symmetric) or 0 (!Symmetric).
      */
    static void vector_constant(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        std::string needle,
        PaddedPODArray<Float32> & res)
    {
        /// zeroing our map
        NgramStats common_stats = {};

        /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
        /// And we can reuse these storages in one thread because we care only about what was written to first places.
        std::unique_ptr<UInt16[]> ngram_storage{new UInt16[max_string_size]};

        /// We use unsafe versions of getting ngrams, so the needle is padded too.
        const size_t needle_size = needle.size();
        needle.resize(needle_size + default_padding);

        const size_t needle_ngrams
            = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats, nullptr);

        const size_t rows = offsets.size();
        size_t row_begin = 0;

        for (size_t row = 0; row < rows; ++row)
        {
            const char * haystack = reinterpret_cast<const char *>(&data[row_begin]);
            const size_t haystack_size = offsets[row] - row_begin - 1;
            row_begin = offsets[row];

            if (haystack_size > max_string_size)
            {
                /// if the strings are too big, we say they are completely not the same
                res[row] = Symmetric ? 1.f : 0.f;
                continue;
            }

            /// Every row starts from the needle n-gram count.
            size_t distance = needle_ngrams;
            const size_t haystack_ngrams = dispatchSearcher(
                calculateHaystackStatsAndMetric<true>, haystack, haystack_size, common_stats, distance, ngram_storage.get());

            /// For !Symmetric version we should not use haystack_ngrams.
            if constexpr (Symmetric)
                res[row] = distance * 1.f / std::max(haystack_ngrams + needle_ngrams, size_t(1));
            else
                res[row] = 1.f - distance * 1.f / std::max(needle_ngrams, size_t(1));
        }
    }
};


/// Supplies the `name` constant used by FunctionNgramDistance.
struct NameNgramDistance
{
    static constexpr const char * name = "ngramDistance";
};
/// Supplies the `name` constant used by FunctionNgramDistanceCaseInsensitive.
struct NameNgramDistanceCaseInsensitive
{
    static constexpr const char * name = "ngramDistanceCaseInsensitive";
};

/// Supplies the `name` constant used by FunctionNgramDistanceUTF8.
struct NameNgramDistanceUTF8
{
    static constexpr const char * name = "ngramDistanceUTF8";
};

/// Supplies the `name` constant used by FunctionNgramDistanceCaseInsensitiveUTF8.
/// Note that in the string "CaseInsensitive" comes before "UTF8", unlike in the struct name.
struct NameNgramDistanceUTF8CaseInsensitive
{
    static constexpr const char * name = "ngramDistanceCaseInsensitiveUTF8";
};

/// Supplies the `name` constant used by FunctionNgramSearch.
struct NameNgramSearch
{
    static constexpr const char * name = "ngramSearch";
};
/// Supplies the `name` constant used by FunctionNgramSearchCaseInsensitive.
struct NameNgramSearchCaseInsensitive
{
    static constexpr const char * name = "ngramSearchCaseInsensitive";
};
/// Supplies the `name` constant used by FunctionNgramSearchUTF8.
struct NameNgramSearchUTF8
{
    static constexpr const char * name = "ngramSearchUTF8";
};

/// Supplies the `name` constant used by FunctionNgramSearchCaseInsensitiveUTF8.
/// Note that in the string "CaseInsensitive" comes before "UTF8", unlike in the struct name.
struct NameNgramSearchUTF8CaseInsensitive
{
    static constexpr const char * name = "ngramSearchCaseInsensitiveUTF8";
};

/// Template arguments of NgramDistanceImpl, in order: n-gram length N, code point type,
/// UTF8, CaseInsensitive, Symmetric.
/// Symmetric variants (ngramDistance*): 4-grams over UInt8 for the non-UTF8 versions,
/// 3-grams over UInt32 code points for the UTF8 versions.
using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, true>, NameNgramDistance>;
using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, true>, NameNgramDistanceCaseInsensitive>;
using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, true>, NameNgramDistanceUTF8>;
using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, true>, NameNgramDistanceUTF8CaseInsensitive>;

/// Non-symmetric variants (ngramSearch*): the same four combinations with Symmetric = false.
using FunctionNgramSearch = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, false>, NameNgramSearch>;
using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, false>, NameNgramSearchCaseInsensitive>;
using FunctionNgramSearchUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, false>, NameNgramSearchUTF8>;
using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, false>, NameNgramSearchUTF8CaseInsensitive>;


void registerFunctionsStringSimilarity(FunctionFactory & factory)
{
    factory.registerFunction<FunctionNgramDistance>();
    factory.registerFunction<FunctionNgramDistanceCaseInsensitive>();
    factory.registerFunction<FunctionNgramDistanceUTF8>();
    factory.registerFunction<FunctionNgramDistanceCaseInsensitiveUTF8>();

    factory.registerFunction<FunctionNgramSearch>();
    factory.registerFunction<FunctionNgramSearchCaseInsensitive>();
    factory.registerFunction<FunctionNgramSearchUTF8>();
    factory.registerFunction<FunctionNgramSearchCaseInsensitiveUTF8>();
}

}