ClickHouse/src/Functions/FunctionsCharsetClassification.cpp

#include <Functions/FunctionsTextClassification.h>
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Common/UTF8Helpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>

#include <algorithm>
#include <cstring>
#include <cmath>
#include <limits>
#include <unordered_map>
#include <memory>
#include <utility>
#include <sstream>
#include <set>

namespace DB
{


template <size_t N, bool detect_language>
struct CharsetClassificationImpl
{

    using ResultType = String;
    using CodePoint = UInt8;

    static constexpr Float64 zero_frequency = 0.000001;
    /// map_size for ngram count.
    static constexpr size_t map_size = 1u << 16;

    /// If the data size is bigger than this, behaviour is unspecified for this function.
    static constexpr size_t max_string_size = 1u << 15;

    /// Default padding to read safely.
    static constexpr size_t default_padding = 16;

    /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
    static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;

    /** map_size of this fits mostly in L2 cache all the time.
      * Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed
      * integer array.
      */
    using NgramCount = UInt16;

    static ALWAYS_INLINE inline Float64 Naive_bayes(std::unordered_map<UInt16, Float64> standart, std::unordered_map<UInt16, Float64> model)
    {
        Float64 res = 0;
        for (auto & el : model)
        {
            if (standart[el.first] != 0)
            {
                res += el.second * log(standart[el.first]);
            } else
            {
                res += el.second * log(zero_frequency);
            }
        }
        return res;
    }


    static ALWAYS_INLINE size_t readCodePoints(CodePoint * code_points, const char *& pos, const char * end)
    {
        constexpr size_t padding_offset = default_padding - N + 1;
        memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
        memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint));
        pos += padding_offset;
        if (pos > end)
            return default_padding - (pos - end);
        return default_padding;
    }


    static ALWAYS_INLINE inline size_t calculateStats(
        const char * data,
        const size_t size,
        size_t (*read_code_points)(CodePoint *, const char *&, const char *),
        std::unordered_map<UInt16, Float64>& model)
    {

        const char * start = data;
        const char * end = data + size;
        CodePoint cp[simultaneously_codepoints_num] = {};
        /// read_code_points returns the position of cp where it stopped reading codepoints.
        size_t found = read_code_points(cp, start, end);
        /// We need to start for the first time here, because first N - 1 codepoints mean nothing.
        size_t i = N - 1;
        size_t len = 0;
        do
        {
            for (; i + N <= found; ++i)
            {
                UInt32 hash = 0;
                for (size_t j = 0; j < N; ++j) {
                    hash <<= 8;
                    hash += *(cp + i + j);
                }
                if (model[hash] == 0) {
                    model[hash] = 1;
                    ++len;
                }
                ++model[hash];
            }
            i = 0;
        } while (start < end && (found = read_code_points(cp, start, end)));

        return len;
    }

    
    static void constant(String data, String & res)
    {
        static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
        static std::unordered_map<String, std::unordered_map<UInt16, Float64>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

        std::unordered_map<UInt16, Float64> model;
        calculateStats(data.data(), data.size(), readCodePoints, model);

        Float64 max_result = 0;
        String poss_ans;
        for (const auto& item : encodings_freq)
        {
            const Float64 score = Naive_bayes(item.second, model);
            if (max_result == 0 || max_result < score)
            {
                poss_ans = item.first;
                max_result = score;
            }
        }
        size_t sep = poss_ans.find('_');
        if (detect_language)
        {
            res = poss_ans.erase(0, sep + 1);
        }
        else
        {
            res = poss_ans.erase(sep, poss_ans.size() - sep);
        }
    }


    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        static std::unordered_map<String, std::unordered_map<UInt16, Float64>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
        static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();

        res_data.reserve(1024);
        res_offsets.resize(offsets.size());

        size_t prev_offset = 0;
        size_t res_offset = 0;

        for (size_t i = 0; i < offsets.size(); ++i)
        {
            const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
            String str = haystack;

            String poss_ans;

            std::unordered_map<UInt16, Float64> model;
            calculateStats(str.data(), str.size(), readCodePoints, model);

           Float64 max_result = 0;
           for (const auto& item : encodings_freq)
            {
                Float64 score = Naive_bayes(item.second, model);
                if (max_result == 0 || max_result < score)
                {
                    max_result = score;
                    poss_ans = item.first;
                }
            }
            
            size_t sep = poss_ans.find('_');
            String ans_str;
            if (detect_language)
            {
                ans_str = poss_ans.erase(0, sep + 1);
            }
            else
            {
                ans_str = poss_ans.erase(sep, poss_ans.size() - sep);
            }

            const auto ans = ans_str.c_str();
            size_t cur_offset = offsets[i];

            size_t ans_size = strlen(ans);
            res_data.resize(res_offset + ans_size + 1);
            memcpy(&res_data[res_offset], ans, ans_size);
            res_offset += ans_size;

            res_data[res_offset] = 0;
            ++res_offset;

            res_offsets[i] = res_offset;
            prev_offset = cur_offset;
        }
    }


};


/// Name tag used as the second template argument of FunctionCharsetDetect.
struct NameCharsetDetect
{
    static constexpr const char * name = "detectCharset";
};

/// Name tag used as the second template argument of FunctionLanguageDetect.
struct NameLanguageDetect
{
    static constexpr const char * name = "detectLanguage";
};


/// Both functions use byte bigrams (N = 2) and differ only in the `detect_language` flag.
/// NOTE(review): CharsetClassificationImpl returns the part of the winning key AFTER '_' when the flag
/// is true and the part BEFORE '_' when it is false. Here detectCharset is instantiated with `true`
/// and detectLanguage with `false` - confirm against the key format used by FrequencyHolder that the
/// two flags are not swapped.
using FunctionCharsetDetect = FunctionsTextClassification<CharsetClassificationImpl<2, true>, NameCharsetDetect>;
using FunctionLanguageDetect = FunctionsTextClassification<CharsetClassificationImpl<2, false>, NameLanguageDetect>;

/// Registers the two functions defined in this file (FunctionCharsetDetect and FunctionLanguageDetect)
/// in the given factory.
void registerFunctionsCharsetClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionCharsetDetect>();
    factory.registerFunction<FunctionLanguageDetect>();
}

}
First step of work 2021-02-07 18:40:55 +00:00			`#include <Functions/FunctionsTextClassification.h>`
Fix 2.0 2021-03-19 10:06:21 +00:00			`#include <Common/FrequencyHolder.h>`
First step of work 2021-02-07 18:40:55 +00:00			`#include <Functions/FunctionFactory.h>`
			`#include <Common/UTF8Helpers.h>`
Remove stringstream 2021-03-23 19:32:54 +00:00			`#include <IO/ReadBufferFromString.h>`
			`#include <IO/ReadHelpers.h>`
First step of work 2021-02-07 18:40:55 +00:00
			`#include <algorithm>`
			`#include <cstring>`
Major improvements 2021-03-18 14:05:28 +00:00			`#include <cmath>`
First step of work 2021-02-07 18:40:55 +00:00			`#include <limits>`
Major improvements 2021-03-18 14:05:28 +00:00			`#include <unordered_map>`
First step of work 2021-02-07 18:40:55 +00:00			`#include <memory>`
			`#include <utility>`
Major improvements 2021-03-18 14:05:28 +00:00			`#include <sstream>`
			`#include <set>`
First step of work 2021-02-07 18:40:55 +00:00
			`namespace DB`
			`{`
Major improvements 2021-03-18 14:05:28 +00:00
Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00
Add detectLanguage 2021-05-07 14:18:06 +00:00			`template <size_t N, bool detect_language>`
Minor fixes 2021-04-15 17:16:32 +00:00			`struct CharsetClassificationImpl`
First step of work 2021-02-07 18:40:55 +00:00			`{`

Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00			`using ResultType = String;`
First step of work 2021-02-07 18:40:55 +00:00			`using CodePoint = UInt8;`
Fix git submodules 2021-05-06 07:04:00 +00:00
			`static constexpr Float64 zero_frequency = 0.000001;`
First step of work 2021-02-07 18:40:55 +00:00			`/// map_size for ngram count.`
			`static constexpr size_t map_size = 1u << 16;`

			`/// If the data size is bigger than this, behaviour is unspecified for this function.`
			`static constexpr size_t max_string_size = 1u << 15;`

			`/// Default padding to read safely.`
			`static constexpr size_t default_padding = 16;`

			`/// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.`
			`static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;`

			`/** map_size of this fits mostly in L2 cache all the time.`
			`* Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed`
			`* integer array.`
			`*/`
			`using NgramCount = UInt16;`

Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00			`static ALWAYS_INLINE inline Float64 Naive_bayes(std::unordered_map<UInt16, Float64> standart, std::unordered_map<UInt16, Float64> model)`
Major improvements 2021-03-18 14:05:28 +00:00			`{`
Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00			`Float64 res = 0;`
Fix git submodules 2021-05-06 07:04:00 +00:00			`for (auto & el : model)`
			`{`
			`if (standart[el.first] != 0)`
			`{`
Major improvements 2021-03-18 14:05:28 +00:00			`res += el.second * log(standart[el.first]);`
Fix git submodules 2021-05-06 07:04:00 +00:00			`} else`
			`{`
			`res += el.second * log(zero_frequency);`
Major improvements 2021-03-18 14:05:28 +00:00			`}`
			`}`
			`return res;`
			`}`

Add charset_freq 2021-03-18 21:57:42 +00:00
Major improvements 2021-03-18 14:05:28 +00:00
First step of work 2021-02-07 18:40:55 +00:00			`static ALWAYS_INLINE size_t readCodePoints(CodePoint * code_points, const char & pos, const char end)`
			`{`
			`constexpr size_t padding_offset = default_padding - N + 1;`
			`memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));`
			`memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint));`
			`pos += padding_offset;`
			`if (pos > end)`
			`return default_padding - (pos - end);`
			`return default_padding;`
			`}`

Major improvements 2021-03-18 14:05:28 +00:00
First step of work 2021-02-07 18:40:55 +00:00			`static ALWAYS_INLINE inline size_t calculateStats(`
			`const char * data,`
			`const size_t size,`
			`size_t (read_code_points)(CodePoint , const char &, const char ),`
Fix git submodules 2021-05-06 07:04:00 +00:00			`std::unordered_map<UInt16, Float64>& model)`
First step of work 2021-02-07 18:40:55 +00:00			`{`
Fix git submodules 2021-05-06 07:04:00 +00:00
First step of work 2021-02-07 18:40:55 +00:00			`const char * start = data;`
			`const char * end = data + size;`
			`CodePoint cp[simultaneously_codepoints_num] = {};`
			`/// read_code_points returns the position of cp where it stopped reading codepoints.`
			`size_t found = read_code_points(cp, start, end);`
			`/// We need to start for the first time here, because first N - 1 codepoints mean nothing.`
			`size_t i = N - 1;`
			`size_t len = 0;`
			`do`
			`{`
			`for (; i + N <= found; ++i)`
			`{`
Delete triGramcount 2021-02-08 12:23:51 +00:00			`UInt32 hash = 0;`
First step of work 2021-02-07 18:40:55 +00:00			`for (size_t j = 0; j < N; ++j) {`
			`hash <<= 8;`
			`hash += *(cp + i + j);`
			`}`
Fix git submodules 2021-05-06 07:04:00 +00:00			`if (model[hash] == 0) {`
			`model[hash] = 1;`
First step of work 2021-02-07 18:40:55 +00:00			`++len;`
			`}`
Fix git submodules 2021-05-06 07:04:00 +00:00			`++model[hash];`
First step of work 2021-02-07 18:40:55 +00:00			`}`
			`i = 0;`
			`} while (start < end && (found = read_code_points(cp, start, end)));`

			`return len;`
			`}`
Major improvements 2021-03-18 14:05:28 +00:00

Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00			`static void constant(String data, String & res)`
Major improvements 2021-03-18 14:05:28 +00:00			`{`
Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00			`static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();`
			`static std::unordered_map<String, std::unordered_map<UInt16, Float64>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();`
Major improvements 2021-03-18 14:05:28 +00:00
Fix git submodules 2021-05-06 07:04:00 +00:00			`std::unordered_map<UInt16, Float64> model;`
			`calculateStats(data.data(), data.size(), readCodePoints, model);`
Minor fixes 2021-04-15 17:16:32 +00:00
Add detectLanguage 2021-05-07 14:18:06 +00:00			`Float64 max_result = 0;`
			`String poss_ans;`
Minor fixes 2021-04-15 17:16:32 +00:00			`for (const auto& item : encodings_freq)`
			`{`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`const Float64 score = Naive_bayes(item.second, model);`
			`if (max_result == 0 \|\| max_result < score)`
Fix git submodules 2021-05-06 07:04:00 +00:00			`{`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`poss_ans = item.first;`
			`max_result = score;`
Fix git submodules 2021-05-06 07:04:00 +00:00			`}`
Major improvements 2021-03-18 14:05:28 +00:00			`}`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`size_t sep = poss_ans.find('_');`
			`if (detect_language)`
			`{`
			`res = poss_ans.erase(0, sep + 1);`
			`}`
			`else`
			`{`
			`res = poss_ans.erase(sep, poss_ans.size() - sep);`
			`}`
First step of work 2021-02-07 18:40:55 +00:00			`}`

add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00
First step of work 2021-02-07 18:40:55 +00:00			`static void vector(`
			`const ColumnString::Chars & data,`
			`const ColumnString::Offsets & offsets,`
Major improvements 2021-03-18 14:05:28 +00:00			`ColumnString::Chars & res_data,`
			`ColumnString::Offsets & res_offsets)`
First step of work 2021-02-07 18:40:55 +00:00			`{`
Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00			`static std::unordered_map<String, std::unordered_map<UInt16, Float64>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();`
			`static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();`
Major improvements 2021-03-18 14:05:28 +00:00
			`res_data.reserve(1024);`
			`res_offsets.resize(offsets.size());`

First step of work 2021-02-07 18:40:55 +00:00			`size_t prev_offset = 0;`
Major improvements 2021-03-18 14:05:28 +00:00			`size_t res_offset = 0;`
First step of work 2021-02-07 18:40:55 +00:00
Major improvements 2021-03-18 14:05:28 +00:00			`for (size_t i = 0; i < offsets.size(); ++i)`
First step of work 2021-02-07 18:40:55 +00:00			`{`
			`const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);`
Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00			`String str = haystack;`
Major improvements 2021-03-18 14:05:28 +00:00
Add detectLanguage 2021-05-07 14:18:06 +00:00			`String poss_ans;`
Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`std::unordered_map<UInt16, Float64> model;`
Fix git submodules 2021-05-06 07:04:00 +00:00			`calculateStats(str.data(), str.size(), readCodePoints, model);`
Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00
Add detectLanguage 2021-05-07 14:18:06 +00:00			`Float64 max_result = 0;`
			`for (const auto& item : encodings_freq)`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`{`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`Float64 score = Naive_bayes(item.second, model);`
			`if (max_result == 0 \|\| max_result < score)`
Fix git submodules 2021-05-06 07:04:00 +00:00			`{`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`max_result = score;`
			`poss_ans = item.first;`
Fix git submodules 2021-05-06 07:04:00 +00:00			`}`
add FunctionsTonalityClassification.cpp 2021-04-14 18:42:33 +00:00			`}`
Add detectLanguage 2021-05-07 14:18:06 +00:00
			`size_t sep = poss_ans.find('_');`
			`String ans_str;`
			`if (detect_language)`
Update 2021-05-06 10:04:38 +00:00			`{`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`ans_str = poss_ans.erase(0, sep + 1);`
Update 2021-05-06 10:04:38 +00:00			`}`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`else`
Update 2021-05-06 10:04:38 +00:00			`{`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`ans_str = poss_ans.erase(sep, poss_ans.size() - sep);`
			`}`
Update 2021-05-06 10:04:38 +00:00
Add detectLanguage 2021-05-07 14:18:06 +00:00			`const auto ans = ans_str.c_str();`
Major improvements 2021-03-18 14:05:28 +00:00			`size_t cur_offset = offsets[i];`

Add detectLanguage 2021-05-07 14:18:06 +00:00			`size_t ans_size = strlen(ans);`
			`res_data.resize(res_offset + ans_size + 1);`
			`memcpy(&res_data[res_offset], ans, ans_size);`
			`res_offset += ans_size;`
Major improvements 2021-03-18 14:05:28 +00:00
			`res_data[res_offset] = 0;`
			`++res_offset;`

			`res_offsets[i] = res_offset;`
			`prev_offset = cur_offset;`
First step of work 2021-02-07 18:40:55 +00:00			`}`
			`}`

Major improvements 2021-03-18 14:05:28 +00:00
First step of work 2021-02-07 18:40:55 +00:00			`};`


Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00			`struct NameCharsetDetect`
First step of work 2021-02-07 18:40:55 +00:00			`{`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`static constexpr auto name = "detectCharset";`
			`};`

			`struct NameLanguageDetect`
			`{`
			`static constexpr auto name = "detectLanguage";`
First step of work 2021-02-07 18:40:55 +00:00			`};`
Fix biGramcount and triGramcount 2021-02-07 19:46:33 +00:00
First step of work 2021-02-07 18:40:55 +00:00
Add detectLanguage 2021-05-07 14:18:06 +00:00			`using FunctionCharsetDetect = FunctionsTextClassification<CharsetClassificationImpl<2, true>, NameCharsetDetect>;`
			`using FunctionLanguageDetect = FunctionsTextClassification<CharsetClassificationImpl<2, false>, NameLanguageDetect>;`
Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00
Minor fixes 2021-04-15 17:16:32 +00:00			`void registerFunctionsCharsetClassification(FunctionFactory & factory)`
First step of work 2021-02-07 18:40:55 +00:00			`{`
Fix FrequencyHolder.h 2021-03-23 18:55:14 +00:00			`factory.registerFunction<FunctionCharsetDetect>();`
Add detectLanguage 2021-05-07 14:18:06 +00:00			`factory.registerFunction<FunctionLanguageDetect>();`
First step of work 2021-02-07 18:40:55 +00:00			`}`

			`}`