ClickHouse/src/Functions/FunctionsCharsetClassification.cpp

#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>

#include <memory>
#include <unordered_map>

namespace DB
{

namespace
{
    /* We need to solve the zero-frequency problem for the Naive Bayes classifier.
     * If a bigram from the text is not found in the dictionary, we assume that its probability is 1e-06.
     * 1e-06 is the minimal value in our marked-up dictionary.
     */
    constexpr Float64 zero_frequency = 1e-06;

    /// If the data size is bigger than this, behaviour is unspecified for this function.
    constexpr size_t max_string_size = 1UL << 15;

    /** Scores the bigram counts of a text (`model`) against one dictionary (`standard`).
      * Every bigram adds `count * log(frequency)` to the score; a bigram that is absent
      * from the dictionary uses `zero_frequency` as its frequency.
      * Accumulation stops as soon as the running score drops below `max_result`,
      * so in that case the returned value is only a partial sum.
      */
    template <typename ModelMap>
    ALWAYS_INLINE inline Float64 naiveBayes(
        const FrequencyHolder::EncodingMap & standard,
        const ModelMap & model,
        Float64 max_result)
    {
        Float64 score = 0;
        for (const auto & bigram : model)
        {
            /// Look the bigram up in the dictionary; fall back to the zero-frequency value when it is missing.
            if (const auto * found = standard.find(bigram.getKey()); found != standard.end())
                score += bigram.getMapped() * log(found->getMapped());
            else
                score += bigram.getMapped() * log(zero_frequency);

            /// Once the score is below the best one seen so far there is no point in finishing the sum.
            if (score < max_result)
                break;
        }
        return score;
    }

    /// Count how many times each byte bigram occurs in the text.
    /// The map key is a sliding 16-bit window: the previous byte in the high half, the current byte in the low half.
    /// The window starts at zero, so the very first byte is counted as the bigram (0x00, first byte).
    template <typename ModelMap>
    ALWAYS_INLINE inline void calculateStats(
        const UInt8 * data,
        const size_t size,
        ModelMap & model)
    {
        UInt16 window = 0;
        const UInt8 * const end = data + size;
        for (const UInt8 * pos = data; pos != end; ++pos)
        {
            /// Shift the previous byte up and append the current one; the cast drops the byte that fell out of the window.
            window = static_cast<UInt16>((window << 8) + *pos);
            ++model[window];
        }
    }
}

/* Determine language and charset of text data. For each text, we build the distribution of byte bigrams.
 * Then we use marked-up dictionaries with distributions of byte bigrams of various languages and charsets.
 * Using a naive Bayesian classifier, we find the most likely dictionary and return either its language
 * (detect_language == true) or its charset name (detect_language == false).
 */
template <bool detect_language>
struct CharsetClassificationImpl
{
    /** Classifies every string of the input column and writes one result string per row.
      *
      * @param data         concatenated bytes of the input strings
      * @param offsets      end offset of each input string inside `data`
      * @param res_data     receives the result strings, each followed by a zero byte
      * @param res_offsets  receives the end offset of each result string inside `res_data`
      *
      * If no dictionary scores above the initial floor (see max_string_size), the result for that row
      * is an empty string.
      */
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

        if constexpr (detect_language)
            /// 2 chars for ISO code + 1 zero byte
            res_data.reserve(offsets.size() * 3);
        else
            /// Mean charset length is 8
            res_data.reserve(offsets.size() * 8);

        res_offsets.resize(offsets.size());

        size_t current_result_offset = 0;

        /// Score floor every candidate has to beat: max_string_size bigrams, none of them found in the dictionary.
        /// It is the same for every row, so it is computed once.
        const double zero_frequency_log = log(zero_frequency);
        const Float64 lowest_score = zero_frequency_log * (max_string_size);

        for (size_t i = 0; i < offsets.size(); ++i)
        {
            /// NOTE(review): for i == 0 this reads offsets[-1]; presumably the offsets container
            /// guarantees a readable zero there - confirm against its definition.
            const UInt8 * str = data.data() + offsets[i - 1];
            /// The trailing byte of every stored string is excluded from the statistics.
            const size_t str_len = offsets[i] - offsets[i - 1] - 1;

            HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
            calculateStats(str, str_len, model);

            std::string_view result_value;

            /// Go through the dictionary and find the charset with the highest weight
            Float64 max_result = lowest_score;
            for (const auto & item : encodings_freq)
            {
                Float64 score = naiveBayes(item.map, model, max_result);
                if (max_result < score)
                {
                    max_result = score;

                    if constexpr (detect_language)
                        result_value = item.lang;
                    else
                        result_value = item.name;
                }
            }

            const size_t result_value_size = result_value.size();
            res_data.resize(current_result_offset + result_value_size + 1);

            /// result_value is still default-constructed (null data()) when no dictionary beat the floor.
            /// Passing a null source to memcpy is undefined behaviour even for zero bytes, so skip the copy.
            if (result_value_size != 0)
                memcpy(&res_data[current_result_offset], result_value.data(), result_value_size);

            res_data[current_result_offset + result_value_size] = '\0';
            current_result_offset += result_value_size + 1;

            res_offsets[i] = current_result_offset;
        }
    }
};


/// Name tag for the charset-detecting function.
struct NameDetectCharset
{
    static constexpr const char * name = "detectCharset";
};

/// Name tag for the language-detecting variant of the function.
struct NameDetectLanguageUnknown
{
    static constexpr const char * name = "detectLanguageUnknown";
};


/// Function named by NameDetectCharset, implemented by CharsetClassificationImpl with detect_language = false
/// (the result is taken from the dictionary's `name` field).
using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
/// Function named by NameDetectLanguageUnknown, implemented by CharsetClassificationImpl with detect_language = true
/// (the result is taken from the dictionary's `lang` field).
using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;

/// Registers both functions in the function factory.
REGISTER_FUNCTION(DetectCharset)
{
    factory.registerFunction<FunctionDetectCharset>();
    factory.registerFunction<FunctionDetectLanguageUnknown>();
}

}
-												Fix 2.0

											
										
										
											2021-03-19 10:06:21 +00:00
+								#include <Common/FrequencyHolder.h>
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								#include <Functions/FunctionFactory.h>
-												Refactor

											
										
										
											2022-01-12 16:32:17 +00:00
+								#include <Functions/FunctionsTextClassification.h>
-												Change cmake files

											
										
										
											2021-05-31 13:38:51 +00:00
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								#include <memory>
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								#include <unordered_map>
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
 								namespace DB
 								{
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								namespace
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								{
-												Add new programming languages and update detection

											
										
										
											2021-05-21 13:48:18 +00:00
+								    /* We need to solve zero-frequency problem for Naive Bayes Classifier
 								     * If the bigram is not found in the text, we assume that the probability of its meeting is 1e-06.
 								     * 1e-06 is minimal value in our marked-up dictionary.
 								     */
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								    constexpr Float64 zero_frequency = 1e-06;
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
 								    /// If the data size is bigger than this, behaviour is unspecified for this function.
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								    constexpr size_t max_string_size = 1UL << 15;
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								    template <typename ModelMap>
-												Fixed tests

											
										
										
											2022-03-15 15:43:31 +00:00
+								    ALWAYS_INLINE inline Float64 naiveBayes(
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								        const FrequencyHolder::EncodingMap & standard,
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								        const ModelMap & model,
-												Fix bayes

											
										
										
											2021-05-19 10:01:09 +00:00
+								        Float64 max_result)
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
+								    {
-												Fix FrequencyHolder.h

											
										
										
											2021-03-23 18:55:14 +00:00
+								        Float64 res = 0;
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								        for (const auto & el : model)
-												Fix git submodules

											
										
										
											2021-05-06 07:04:00 +00:00
+								        {
-												Add new programming languages and update detection

											
										
										
											2021-05-21 13:48:18 +00:00
+								            /// Try to find bigram in the dictionary.
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								            const auto * it = standard.find(el.getKey());
-												Change DetectLanguageMixed output type

											
										
										
											2021-12-30 02:14:57 +00:00
+								            if (it != standard.end())
-												Fix git submodules

											
										
										
											2021-05-06 07:04:00 +00:00
+								            {
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								                res += el.getMapped() * log(it->getMapped());
-												Fix git submodules

											
										
										
											2021-05-06 07:04:00 +00:00
+								            } else
 								            {
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								                res += el.getMapped() * log(zero_frequency);
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
+								            }
-												Add new programming languages and update detection

											
										
										
											2021-05-21 13:48:18 +00:00
+								            /// If at some step the result has become less than the current maximum, then it makes no sense to count it fully.
-												Fix style

											
										
										
											2021-05-21 13:58:48 +00:00
+								            if (res < max_result)
 								            {
-												Fix bayes

											
										
										
											2021-05-19 10:01:09 +00:00
+								                return res;
 								            }
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
+								        }
 								        return res;
 								    }
-												Add new programming languages and update detection

											
										
										
											2021-05-21 13:48:18 +00:00
+								    /// Сount how many times each bigram occurs in the text.
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								    template <typename ModelMap>
-												Fixed tests

											
										
										
											2022-03-15 15:43:31 +00:00
+								    ALWAYS_INLINE inline void calculateStats(
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								        const UInt8 * data,
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								        const size_t size,
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								        ModelMap & model)
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
+								    {
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								        UInt16 hash = 0;
 								        for (size_t i = 0; i < size; ++i)
-												Add detectLanguage

											
										
										
											2021-05-07 14:18:06 +00:00
+								        {
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								            hash <<= 8;
 								            hash += *(data + i);
 								            ++model[hash];
-												Add detectLanguage

											
										
										
											2021-05-07 14:18:06 +00:00
+								        }
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								    }
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								}
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes.
 								 * Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
 								 * Using a naive Bayesian classifier, find the most likely charset and language and return it
 								 */
 								template <bool detect_language>
 								struct CharsetClassificationImpl
 								{
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								    static void vector(
 								        const ColumnString::Chars & data,
 								        const ColumnString::Offsets & offsets,
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
+								        ColumnString::Chars & res_data,
 								        ColumnString::Offsets & res_offsets)
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								    {
-												Change DetectLanguageMixed output type

											
										
										
											2021-12-30 02:14:57 +00:00
+								        const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								        if constexpr (detect_language)
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								            /// 2 chars for ISO code + 1 zero byte
 								            res_data.reserve(offsets.size() * 3);
 								        else
 								            /// Mean charset length is 8
 								            res_data.reserve(offsets.size() * 8);
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
+								        res_offsets.resize(offsets.size());
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								        size_t current_result_offset = 0;
 								        double zero_frequency_log = log(zero_frequency);
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
+								        for (size_t i = 0; i < offsets.size(); ++i)
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								        {
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								            const UInt8 * str = data.data() + offsets[i - 1];
 								            const size_t str_len = offsets[i] - offsets[i - 1] - 1;
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								            HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								            calculateStats(str, str_len, model);
-												Fix FrequencyHolder.h

											
										
										
											2021-03-23 18:55:14 +00:00
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								            std::string_view result_value;
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								            /// Go through the dictionary and find the charset with the highest weight
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								            Float64 max_result = zero_frequency_log * (max_string_size);
-												Better

											
										
										
											2022-01-10 15:36:32 +00:00
+								            for (const auto & item : encodings_freq)
-												add FunctionsTonalityClassification.cpp

											
										
										
											2021-04-14 18:42:33 +00:00
+								            {
-												Change DetectLanguageMixed output type

											
										
										
											2021-12-30 02:14:57 +00:00
+								                Float64 score = naiveBayes(item.map, model, max_result);
-												Fix bayes

											
										
										
											2021-05-19 10:01:09 +00:00
+								                if (max_result < score)
-												Fix git submodules

											
										
										
											2021-05-06 07:04:00 +00:00
+								                {
-												Add detectLanguage

											
										
										
											2021-05-07 14:18:06 +00:00
+								                    max_result = score;
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
 								                    if constexpr (detect_language)
 								                        result_value = item.lang;
 								                    else
 								                        result_value = item.name;
-												Fix git submodules

											
										
										
											2021-05-06 07:04:00 +00:00
+								                }
-												add FunctionsTonalityClassification.cpp

											
										
										
											2021-04-14 18:42:33 +00:00
+								            }
-												New features draft

											
										
										
											2021-05-23 16:39:40 +00:00
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								            size_t result_value_size = result_value.size();
 								            res_data.resize(current_result_offset + result_value_size + 1);
 								            memcpy(&res_data[current_result_offset], result_value.data(), result_value_size);
 								            res_data[current_result_offset + result_value_size] = '\0';
 								            current_result_offset += result_value_size + 1;
-												Major improvements

											
										
										
											2021-03-18 14:05:28 +00:00
-												Fixed performance tests

											
										
										
											2022-03-02 14:46:06 +00:00
+								            res_offsets[i] = current_result_offset;
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								        }
 								    }
 								};
-												Refactor

											
										
										
											2022-01-12 16:32:17 +00:00
+								struct NameDetectCharset
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								{
-												Add detectLanguage

											
										
										
											2021-05-07 14:18:06 +00:00
+								    static constexpr auto name = "detectCharset";
 								};
-												Refactor

											
										
										
											2022-01-12 16:32:17 +00:00
+								struct NameDetectLanguageUnknown
-												Add detectLanguage

											
										
										
											2021-05-07 14:18:06 +00:00
+								{
-												Refactor

											
										
										
											2021-12-22 21:03:42 +00:00
+								    static constexpr auto name = "detectLanguageUnknown";
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								};
-												Fix biGramcount and triGramcount

											
										
										
											2021-02-07 19:46:33 +00:00
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
-												Refactor

											
										
										
											2022-01-12 16:32:17 +00:00
+								using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
 								using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;
-												Fix FrequencyHolder.h

											
										
										
											2021-03-23 18:55:14 +00:00
-												Automated function registration

Automated register all functions with below naming convention by
iterating through the symbols:
void DB::registerXXX(DB::FunctionFactory &)

											
										
										
											2022-07-04 07:01:39 +00:00
+								REGISTER_FUNCTION(DetectCharset)
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								{
-												Refactor

											
										
										
											2022-01-12 16:32:17 +00:00
+								    factory.registerFunction<FunctionDetectCharset>();
 								    factory.registerFunction<FunctionDetectLanguageUnknown>();
-												First step of work

											
										
										
											2021-02-07 18:40:55 +00:00
+								}
 								}