Add charset_freq

2024-11-29 19:12:03 +00:00 · 2021-03-19 00:57:42 +03:00 · 2021-03-19 00:57:42 +03:00 · 473c47abcc
commit 473c47abcc
parent 274601232b
4 changed files with 35783 additions and 57 deletions
--- a/src/Functions/ClassificationDictionaries/charset_freq.txt
+++ b/src/Functions/ClassificationDictionaries/charset_freq.txt
--- a/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt
+++ b/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt
--- a/src/Functions/FrequencyHolder.h
+++ b/src/Functions/FrequencyHolder.h
@ -1,9 +1,10 @@
 #pragma once

-#include <Functions/FunctionsTextClassification.h>
-
+#include <Common/TLDListsHolder.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <IO/ReadBufferFromFile.h>
+#include <string_view>
 #include <string>
-#include <Functions/FunctionFactory.h>
 #include <fstream>
 #include <algorithm>
 #include <cstring>
@ -31,29 +32,44 @@ public:
    void parseDictionaries(const std::string& pt)
    {
        is_true = pt;
-        loadEmotionalDict("/home/sergey/datadump/myemo2.txt");
-        loadEncodingsFrequency("/home/sergey/data/dumps/encodings/russian/freq_enc/");
+        loadEmotionalDict("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt"); // Loads the emotional dictionary. NOTE(review): absolute path under one developer's home directory; `pt` is only stored in is_true and is not used to locate this file.
+        loadEncodingsFrequency("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/charset_freq.txt"); // Loads the per-charset bigram frequencies. NOTE(review): same hard-coded absolute path concern as the line above.
    }


-    void loadEncodingsFrequency(const std::string path_to_encodings_freq)
+    // Loads per-charset bigram frequencies from a text file into encodings_freq.
+    //
+    // Expected line formats:
+    //   "//<name>"              (longer than two characters) starts a new charset section;
+    //                           the name is the first whitespace-delimited token after the
+    //                           slashes, truncated to 39 characters.
+    //   "<bigram> <frequency>"  stores frequency for that bigram in the current section.
+    // Empty lines and lines that do not match either format are skipped.
+    void loadEncodingsFrequency(const std::string path_to_charset_freq)
    {
-        std::vector<std::string> languages = {"freq_CP866", "freq_ISO", "freq_WINDOWS-1251", "freq_UTF-8"};
-        for (std::string & lang : languages) {
-            std::ifstream file(path_to_encodings_freq + lang + ".txt");
-            Map new_lang;
-            UInt16 bigram;
-            double count;
-            double total = 0;
-            while (file >> bigram >> count) {
-                new_lang[bigram] = count;
-                total += count;
-            }
-            for (auto & el : new_lang) {
-                el.second /= total;
-            }
-            encodings_freq[lang] = new_lang;
-            file.close();
-        }
+        ReadBufferFromFile in(path_to_charset_freq);
+        std::string charset_name;
+
+        // Parses one complete line. It takes a std::string so that sscanf always
+        // sees a NUL-terminated buffer.
+        auto parse_line = [&](const std::string & text)
+        {
+            if (text.empty())
+                return;
+
+            // Start load new charset
+            if (text.size() > 2 && text[0] == '/' && text[1] == '/')
+            {
+                char charset_name_buf[40];
+                // Keep the previous name when the header carries no token at all.
+                if (sscanf(text.c_str() + 2, "%39s", charset_name_buf) == 1)
+                    charset_name = charset_name_buf;
+                return;
+            }
+
+            UInt16 bigram;
+            double frequency;
+            // %hu matches the unsigned 16-bit bigram. Store only when both fields were read,
+            // so a malformed line never writes uninitialised values into the map.
+            if (sscanf(text.c_str(), "%hu %lg", &bigram, &frequency) == 2)
+                encodings_freq[charset_name][bigram] = frequency;
+        };
+
+        // A line may straddle two buffer refills, so it is accumulated piecewise
+        // instead of stopping at the first buffer boundary.
+        std::string line;
+        while (!in.eof())
+        {
+            char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
+            line.append(in.position(), newline - in.position());
+
+            if (newline < in.buffer().end())
+            {
+                parse_line(line);
+                line.clear();
+                in.position() = newline + 1;
+            }
+            else
+            {
+                // Whole buffer consumed without a newline; the next eof() call refills it.
+                in.position() = newline;
+            }
+        }
+
+        // The last line of the file may lack a trailing newline.
+        parse_line(line);
    }

--- a/src/Functions/FunctionsTextClassification.cpp
+++ b/src/Functions/FunctionsTextClassification.cpp
@ -71,15 +71,26 @@ struct TextClassificationImpl

    static double Naive_bayes(std::unordered_map<UInt16, double> standart, std::unordered_map<UInt16, double> model)
    {
-        double res = 1;
+        double res = 0;
        for (auto & el : model) {
            if (standart[el.first] != 0) {
                res += el.second * log(standart[el.first]);
+            } else {
+                res += el.second * log(0.00001);
            }
        }
        return res;
    }

+    // Sums model value * reference value over every key of `model`; a key that is
+    // absent from `standart` is weighted by 0.
+    // Both maps are taken by const reference so a call neither copies them nor inserts
+    // default entries into the reference map.
+    static double Simple(const std::unordered_map<UInt16, double> & standart, const std::unordered_map<UInt16, double> & model)
+    {
+        double res = 0;
+        for (const auto & el : model) {
+            const auto it = standart.find(el.first);
+            res += el.second * (it != standart.end() ? it->second : 0.0);
+        }
+        return res;
+    }
+

    static ALWAYS_INLINE size_t readCodePoints(CodePoint * code_points, const char *& pos, const char * end)
    {
@ -151,11 +162,6 @@ struct TextClassificationImpl
        static std::unordered_map<std::string, std::vector<double>> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

-        /*
-        static TextClassificationDictionaries classification_dictionaries;
-        static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.emotional_dict;
-        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.encodings_frequency;
-        */
        if (!Emo)
        {
            
@ -163,23 +169,20 @@ struct TextClassificationImpl
            std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
            size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
            std::string ans;
-            double count_bigram = data.size() - 1;
+            // double count_bigram = data.size() - 1;
            std::unordered_map<UInt16, double> model;
            for (size_t i = 0; i < len; ++i) {
-                ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram) + "\n";
-                model[ngram_storage.get()[i]] = static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram;
+                ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<double>(common_stats.get()[ngram_storage.get()[i]])) + "\n";
+                model[ngram_storage.get()[i]] = static_cast<double>(common_stats.get()[ngram_storage.get()[i]]);
            }

-            double res1 = L2_distance(encodings_freq["freq_CP866"], model);
-            double res2 =  L2_distance(encodings_freq["freq_ISO"], model);
-            double res3 =  L2_distance(encodings_freq["freq_WINDOWS-1251"], model);
-            double res4 =  L2_distance(encodings_freq["freq_UTF-8"], model);
-            ans += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";
+            for (const auto& item : encodings_freq) {
+                ans += item.first + " " + std::to_string(Simple(item.second, model)) + "\n";
+            }
            res = ans;
        }
        else 
        {
-
            double freq = 0;
            double count_words = 0;

@ -245,33 +248,17 @@ struct TextClassificationImpl

            size_t len = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
            std::string prom;
-            double count_bigram = data.size() - 1;
-            std::unordered_map<UInt16, double> model1;
-
-            std::unordered_map<UInt16, double> model2;
+            // double count_bigram = data.size() - 1;
+            std::unordered_map<UInt16, double> model;

            for (size_t j = 0; j < len; ++j)
            {
-                model2[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]);
+                model[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]);
            }

-            for (size_t j = 0; j < len; ++j)
-            {
-                model1[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]) / count_bigram;
+            for (const auto& item : encodings_freq) {
+                prom += item.first + " " + std::to_string(Simple(item.second, model)) + "\n";
            }
-
-            double res1 = L2_distance(encodings_freq["freq_CP866"], model1);
-            double res2 =  L2_distance(encodings_freq["freq_ISO"], model1);
-            double res3 =  L2_distance(encodings_freq["freq_WINDOWS-1251"], model1);
-            double res4 =  L2_distance(encodings_freq["freq_UTF-8"], model1);
-            prom += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";
-
-
-            double res12 = Naive_bayes(encodings_freq["freq_CP866"], model2);
-            double res22 =  Naive_bayes(encodings_freq["freq_ISO"], model2);
-            double res32 =  Naive_bayes(encodings_freq["freq_WINDOWS-1251"], model2);
-            double res42 =  Naive_bayes(encodings_freq["freq_UTF-8"], model2);
-            prom += std::to_string(res12) + " " + std::to_string(res22) + " " + std::to_string(res32) + " " + std::to_string(res42) + "\n";
            
            const auto ans = prom.c_str();