Mirror of https://github.com/ClickHouse/ClickHouse.git

commit ff30b40bf6
parent 4c44e21c29

    Major improvements
 2683  my_data/lang_models/bigram_english   (new file; diff suppressed, too large)
 3776  my_data/lang_models/bigram_french    (new file; diff suppressed, too large)
 3237  my_data/lang_models/bigram_german    (new file; diff suppressed, too large)
 3752  my_data/lang_models/bigram_russian   (new file; diff suppressed, too large)
@@ -70,6 +70,8 @@
#include <Server/MySQLHandlerFactory.h>
#include <Server/PostgreSQLHandlerFactory.h>
#include <Server/ProtocolServerAdapter.h>
#include <Functions/FrequencyHolder.h>
#include <Functions/FunctionsTextClassification.h>


#if !defined(ARCADIA_BUILD)
@@ -631,6 +633,12 @@ int Server::main(const std::vector<std::string> & /*args*/)
        TLDListsHolder::getInstance().parseConfig(top_level_domains_path, config());
    }

    /// my test
    {
        const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "textclassification_frequency/");
        FrequencyHolder::getInstance().parseDictionaries(encode_frequency_path);
    }

    {
        Poco::File(path + "data/").createDirectories();
        Poco::File(path + "metadata/").createDirectories();
@@ -5,4 +5,5 @@
    <format_schema_path replace="replace">./format_schemas/</format_schema_path>
    <access_control_path replace="replace">./access/</access_control_path>
    <top_level_domains_path replace="replace">./top_level_domains/</top_level_domains_path>
    <encodings_frequency_path replace="replace">./encoding_frequency/</encodings_frequency_path>
</yandex>
@@ -130,7 +130,7 @@
        <password></password>
    </interserver_http_credentials>-->

    <!-- Listen specified address.
    /home/sergey/datadump/test.txt <!-- Listen specified address.
         Use :: (wildcard IPv6 address), if you want to accept connections both with IPv4 and IPv6 from everywhere.
         Notes:
         If you open connections from wildcard address, make sure that at least one of the following measures applied:
@@ -842,6 +842,11 @@
         Changes will not be applied w/o server restart.
         Path to the list is under top_level_domains_path (see above).
    -->

    <!-- MY CHANGES -->

    <encodings_frequency_path>/var/lib/clickhouse/encodings_frequency/</encodings_frequency_path>

    <top_level_domains_lists>
        <!--
        <public_suffix_list>/path/to/public_suffix_list.dat</public_suffix_list>
 1  programs/server/textclassification_frequency/test.txt  (new file)
@@ -0,0 +1 @@
Hello!
 11  src/Functions/FrequencyHolder.cpp  (new file)
@@ -0,0 +1,11 @@
/*
#include "FrequencyHolder.h"

namespace DB
{
static FrequencyHolder & getInstance() {
    static FrequencyHolder instance;
    return instance;
}
}
*/
 97  src/Functions/FrequencyHolder.h  (new file)
@@ -0,0 +1,97 @@
#pragma once

#include <Functions/FunctionsTextClassification.h>

#include <string>
#include <Functions/FunctionFactory.h>
#include <fstream>
#include <algorithm>
#include <cstring>
#include <limits>
#include <unordered_map>


namespace DB
{

class FrequencyHolder
{
public:
    using Map = std::unordered_map<UInt16, double>;
    using Container = std::unordered_map<std::string, Map>;


    static FrequencyHolder & getInstance()
    {
        static FrequencyHolder instance;
        return instance;
    }


    void parseDictionaries(const std::string& pt)
    {
        is_true = pt;
        loadEmotionalDict("/home/sergey/datadump/myemo2.txt");
        loadEncodingsFrequency("/home/sergey/data/dumps/encodings/russian/freq_enc/");
    }


    void loadEncodingsFrequency(const std::string path_to_encodings_freq)
    {
        std::vector<std::string> languages = {"freq_CP866", "freq_ISO", "freq_WINDOWS-1251", "freq_UTF-8"};
        for (std::string & lang : languages) {
            std::ifstream file(path_to_encodings_freq + lang + ".txt");
            Map new_lang;
            UInt16 bigram;
            double count;
            double total = 0;
            while (file >> bigram >> count) {
                new_lang[bigram] = count;
                total += count;
            }
            for (auto & el : new_lang) {
                el.second /= total;
            }
            encodings_freq[lang] = new_lang;
            file.close();
        }
    }


    void loadEmotionalDict(const std::string path_to_emotional_dict)
    {
        std::ifstream file(path_to_emotional_dict);
        std::string term, tag;
        double val;
        while (file >> term >> tag >> val) {
            std::vector<double> cur = {val};
            emotional_dict[term] = cur;
        }
        file.close();
    }


    const std::string & get_path()
    {
        return is_true;
    }

    const std::unordered_map<std::string, std::vector<double>> getEmotionalDict()
    {
        return emotional_dict;
    }

    const Container getEncodingsFrequency()
    {
        return encodings_freq;
    }


protected:

    std::string is_true;
    std::unordered_map<std::string, std::vector<double>> emotional_dict;
    Container encodings_freq;
};
}
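As read by loadEncodingsFrequency and loadEmotionalDict above, the frequency files are plain whitespace-separated "<bigram code> <count>" pairs, and the emotional dictionary is read as "<term> <tag> <value>" triples. A minimal standalone sketch of that parsing and of the count-to-frequency normalisation (the file name is hypothetical and not part of the commit):

// Standalone sketch, not part of the commit: reads "<bigram> <count>" lines the
// same way loadEncodingsFrequency does and normalises raw counts to frequencies.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <unordered_map>

int main()
{
    std::unordered_map<uint16_t, double> freq;
    double total = 0;

    std::ifstream file("freq_UTF-8.txt"); // hypothetical file name, same naming scheme as the loader
    uint16_t bigram;
    double count;
    while (file >> bigram >> count)
    {
        freq[bigram] = count;
        total += count;
    }

    for (auto & el : freq)
        el.second /= total; // raw counts -> relative frequencies, as FrequencyHolder does

    std::cout << "bigrams: " << freq.size() << ", total count: " << total << "\n";
    return 0;
}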
@@ -1,29 +1,42 @@
#include <Functions/FunctionsTextClassification.h>
#include "FrequencyHolder.h"
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsHashing.h>
#include <Common/HashTable/ClearableHashMap.h>
#include <Common/HashTable/Hash.h>
#include <Common/UTF8Helpers.h>

#include <Core/Defines.h>

#include <common/unaligned.h>

#include <algorithm>
#include <climits>
#include <cstring>
#include <cmath>
#include <limits>
#include <map>
#include <unordered_map>
#include <memory>
#include <utility>
#include <sstream>
#include <set>

namespace DB
{
template <size_t N>
/*
struct TextClassificationDictionaries
{
    const std::unordered_map<std::string, std::vector<double>> emotional_dict;
    const std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_frequency;
    const std::string path;
    TextClassificationDictionaries()
        : emotional_dict(FrequencyHolder::getInstance().getEmotionalDict()),
        encodings_frequency(FrequencyHolder::getInstance().getEncodingsFrequency()),
        path(FrequencyHolder::getInstance().get_path())
    {
    }
};
*/
// static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.getEncodingsFrequency();
// static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.getEmotionalDict();

template <size_t N, bool Emo>
struct TextClassificationImpl
{

    using ResultType = Float32;
    using ResultType = std::string;
    using CodePoint = UInt8;
    /// map_size for ngram count.
    static constexpr size_t map_size = 1u << 16;
@@ -43,6 +56,31 @@ struct TextClassificationImpl
    */
    using NgramCount = UInt16;


    static double L2_distance(std::unordered_map<UInt16, double> standart, std::unordered_map<UInt16, double> model)
    {
        double res = 0;
        for (auto& el : standart) {
            if (model.find(el.first) != model.end()) {
                res += ((model[el.first] - el.second) * (model[el.first] - el.second));
            }
        }
        return res;
    }


    static double Naive_bayes(std::unordered_map<UInt16, double> standart, std::unordered_map<UInt16, double> model)
    {
        double res = 1;
        for (auto & el : model) {
            if (standart[el.first] != 0) {
                res += el.second * log(standart[el.first]);
            }
        }
        return res;
    }


    static ALWAYS_INLINE size_t readCodePoints(CodePoint * code_points, const char *& pos, const char * end)
    {
        constexpr size_t padding_offset = default_padding - N + 1;
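The two helpers above do the actual encoding scoring: L2_distance sums squared frequency differences over the bigrams of the reference table (smaller means closer), and Naive_bayes adds the observed bigram weight times the log of the reference frequency for bigrams the reference knows about (larger means more likely). A simplified standalone sketch of this kind of scoring on toy tables (assumed names, not part of the commit):

// Standalone sketch of the two scores used above, applied to toy bigram tables.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <unordered_map>

using Map = std::unordered_map<uint16_t, double>;

// Sum of squared frequency differences over the reference keys (lower = closer).
static double l2_distance(const Map & reference, const Map & sample)
{
    double res = 0;
    for (const auto & el : reference)
    {
        auto it = sample.find(el.first);
        if (it != sample.end())
            res += (it->second - el.second) * (it->second - el.second);
    }
    return res;
}

// Weight-times-log-probability score of the sample under the reference (higher = more likely).
static double naive_bayes_score(const Map & reference, const Map & sample)
{
    double res = 0;
    for (const auto & el : sample)
    {
        auto it = reference.find(el.first);
        if (it != reference.end() && it->second > 0)
            res += el.second * std::log(it->second);
    }
    return res;
}

int main()
{
    Map reference = {{1, 0.5}, {2, 0.3}, {3, 0.2}};   // hypothetical reference frequencies
    Map close     = {{1, 0.45}, {2, 0.35}, {3, 0.2}}; // similar distribution
    Map far       = {{1, 0.1}, {2, 0.1}, {3, 0.8}};   // dissimilar distribution

    std::cout << "L2 close: " << l2_distance(reference, close)
              << "  L2 far: " << l2_distance(reference, far) << "\n";
    std::cout << "NB close: " << naive_bayes_score(reference, close)
              << "  NB far: " << naive_bayes_score(reference, far) << "\n";
    return 0;
}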
@@ -54,6 +92,7 @@ struct TextClassificationImpl
        return default_padding;
    }


    static ALWAYS_INLINE inline size_t calculateStats(
        const char * data,
        const size_t size,
@@ -91,34 +130,166 @@ struct TextClassificationImpl
    }


    static void constant(std::string data, Float32 & res)
    static void word_processing(std::string & word)
    {
        std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
        std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
        res = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
        std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};

        while (to_skip.find(word.back()) != to_skip.end())
        {
            word.pop_back();
        }

        while (to_skip.find(word.front()) != to_skip.end())
        {
            word.erase(0, 1);
        }
    }


    static void constant(std::string data, std::string & res)
    {
        static std::unordered_map<std::string, std::vector<double>> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

        /*
        static TextClassificationDictionaries classification_dictionaries;
        static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.emotional_dict;
        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.encodings_frequency;
        */
        if (!Emo)
        {

            std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
            std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
            size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
            std::string ans;
            double count_bigram = data.size() - 1;
            std::unordered_map<UInt16, double> model;
            for (size_t i = 0; i < len; ++i) {
                ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram) + "\n";
                model[ngram_storage.get()[i]] = static_cast<double>(common_stats.get()[ngram_storage.get()[i]]) / count_bigram;
            }

            double res1 = L2_distance(encodings_freq["freq_CP866"], model);
            double res2 = L2_distance(encodings_freq["freq_ISO"], model);
            double res3 = L2_distance(encodings_freq["freq_WINDOWS-1251"], model);
            double res4 = L2_distance(encodings_freq["freq_UTF-8"], model);
            ans += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";
            res = ans;
        }
        else
        {

            double freq = 0;
            double count_words = 0;

            std::string ans;
            std::stringstream ss;
            ss << data;
            std::string to_check;

            while (ss >> to_check)
            {
                word_processing(to_check);

                if (emotional_dict.find(to_check) != emotional_dict.cend())
                {
                    count_words += 1;
                    ans += to_check + " " + std::to_string(emotional_dict[to_check][0]) + "\n";
                    freq += emotional_dict[to_check][0];
                }
            }
            double total_tonality = freq / count_words;
            if (total_tonality < 0.5)
            {
                ans += "NEG";
            }
            else if (total_tonality > 1)
            {
                ans += "POS";
            }
            else
            {
                ans += "NEUT";
            }
            ans += " " + std::to_string(total_tonality) + "\n";
            res = ans;
        }
    }

    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        PaddedPODArray<Float32> & res)
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        const size_t offsets_size = offsets.size();
        size_t prev_offset = 0;
        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();

        for (size_t i = 0; i < offsets_size; ++i)
        /*
        static TextClassificationDictionaries classification_dictionaries;
        static std::unordered_map<std::string, std::vector<double>> emotional_dict = classification_dictionaries.emotional_dict;
        static std::unordered_map<std::string, std::unordered_map<UInt16, double>> encodings_freq = classification_dictionaries.encodings_frequency;
        */
        res_data.reserve(1024);
        res_offsets.resize(offsets.size());

        size_t prev_offset = 0;
        size_t res_offset = 0;

        for (size_t i = 0; i < offsets.size(); ++i)
        {
            const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
            std::string str = haystack;

            std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams
            std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams

            res[i] = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
            prev_offset = offsets[i];
            size_t len = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
            std::string prom;
            double count_bigram = data.size() - 1;
            std::unordered_map<UInt16, double> model1;

            std::unordered_map<UInt16, double> model2;

            for (size_t j = 0; j < len; ++j)
            {
                model2[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]);
            }

            for (size_t j = 0; j < len; ++j)
            {
                model1[ngram_storage.get()[j]] = static_cast<double>(common_stats.get()[ngram_storage.get()[j]]) / count_bigram;
            }

            double res1 = L2_distance(encodings_freq["freq_CP866"], model1);
            double res2 = L2_distance(encodings_freq["freq_ISO"], model1);
            double res3 = L2_distance(encodings_freq["freq_WINDOWS-1251"], model1);
            double res4 = L2_distance(encodings_freq["freq_UTF-8"], model1);
            prom += std::to_string(res1) + " " + std::to_string(res2) + " " + std::to_string(res3) + " " + std::to_string(res4) + "\n";


            double res12 = Naive_bayes(encodings_freq["freq_CP866"], model2);
            double res22 = Naive_bayes(encodings_freq["freq_ISO"], model2);
            double res32 = Naive_bayes(encodings_freq["freq_WINDOWS-1251"], model2);
            double res42 = Naive_bayes(encodings_freq["freq_UTF-8"], model2);
            prom += std::to_string(res12) + " " + std::to_string(res22) + " " + std::to_string(res32) + " " + std::to_string(res42) + "\n";

            const auto ans = prom.c_str();

            size_t cur_offset = offsets[i];

            res_data.resize(res_offset + strlen(ans) + 1);
            memcpy(&res_data[res_offset], ans, strlen(ans));
            res_offset += strlen(ans);

            res_data[res_offset] = 0;
            ++res_offset;

            res_offsets[i] = res_offset;
            prev_offset = cur_offset;
        }
    }


};


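In the Emo branch of constant above, the label comes from the average dictionary weight of the recognised words: each hit contributes its first weight, and the mean is bucketed as NEG below 0.5, POS above 1, and NEUT otherwise. A small standalone sketch of that averaging and bucketing with a hypothetical three-word dictionary (illustration only, not part of the commit):

// Standalone sketch of the tonality averaging used by the Emo branch above.
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>

int main()
{
    // Hypothetical emotional dictionary: word -> weight (the real one is loaded
    // by FrequencyHolder::loadEmotionalDict from "term tag value" lines).
    std::unordered_map<std::string, double> emotional_dict = {
        {"good", 1.5}, {"bad", 0.2}, {"fine", 0.8}};

    std::string text = "good fine unknown";
    std::stringstream ss(text);

    double freq = 0;
    double count_words = 0;
    std::string word;
    while (ss >> word)
    {
        auto it = emotional_dict.find(word);
        if (it != emotional_dict.end())
        {
            count_words += 1; // only dictionary hits are counted
            freq += it->second;
        }
    }

    double total_tonality = freq / count_words; // (1.5 + 0.8) / 2 = 1.15
    std::string label = total_tonality < 0.5 ? "NEG" : (total_tonality > 1 ? "POS" : "NEUT");
    std::cout << label << " " << total_tonality << "\n"; // prints: POS 1.15
    return 0;
}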
@@ -126,13 +297,18 @@ struct NameBiGramcount
{
    static constexpr auto name = "biGramcount";
};
struct NameGetEmo
{
    static constexpr auto name = "getEmo";
};


using FunctionBiGramcount = FunctionsTextClassification<TextClassificationImpl<2>, NameBiGramcount>;

using FunctionBiGramcount = FunctionsTextClassification<TextClassificationImpl<2, false>, NameBiGramcount>;
using FunctionGetEmo = FunctionsTextClassification<TextClassificationImpl<2, true>, NameGetEmo>;
void registerFunctionsTextClassification(FunctionFactory & factory)
{
    factory.registerFunction<FunctionBiGramcount>();
    factory.registerFunction<FunctionGetEmo>();
}

}
@@ -34,7 +34,7 @@ public:
        if (!isString(arguments[0]))
            throw Exception(
                "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
        return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
        return arguments[0];
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
@@ -47,21 +47,19 @@ public:

        if (col_const)
        {
            ResultType res{};
            ResultType res;
            Impl::constant(col_const->getValue<String>(), res);
            return result_type->createColumnConst(col_const->size(), toField(res));
        }

        auto col_res = ColumnVector<ResultType>::create();

        typename ColumnVector<ResultType>::Container & vec_res = col_res->getData();
        vec_res.resize(column->size());

        const ColumnString * col_vector = checkAndGetColumn<ColumnString>(&*column);

        if (col_vector)
        if (const ColumnString * col = checkAndGetColumn<ColumnString>(column.get()))
        {
            Impl::vector(col_vector->getChars(), col_vector->getOffsets(), vec_res);
            auto col_res = ColumnString::create();
            ColumnString::Chars & vec_res = col_res->getChars();
            ColumnString::Offsets & offsets_res = col_res->getOffsets();
            Impl::vector(col->getChars(), col->getOffsets(), vec_res, offsets_res);
            return col_res;
        }
        else
        {
@@ -69,8 +67,6 @@ public:
                "Illegal columns " + arguments[0].column->getName() + " of arguments of function " + getName(),
                ErrorCodes::ILLEGAL_COLUMN);
        }

        return col_res;
    }
};

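The executeImpl change above switches the result from a numeric ColumnVector to a ColumnString built from a flat char buffer plus running offsets, which is exactly what the rewritten vector() fills. A standalone sketch of that buffer/offset layout using plain std::vector stand-ins (illustration only, not the ClickHouse column types):

// Standalone sketch of how vector() assembles a ColumnString-style result:
// a flat char buffer with zero-terminated values plus cumulative offsets.
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<char> res_data;      // stands in for ColumnString::Chars
    std::vector<size_t> res_offsets; // stands in for ColumnString::Offsets

    std::vector<std::string> answers = {"0.12 0.03 0.95 0.40", "NEUT 0.8"}; // per-row result strings
    size_t res_offset = 0;

    for (const std::string & ans : answers)
    {
        res_data.resize(res_offset + ans.size() + 1);
        std::memcpy(res_data.data() + res_offset, ans.data(), ans.size());
        res_offset += ans.size();
        res_data[res_offset] = 0; // every value ends with a zero byte
        ++res_offset;
        res_offsets.push_back(res_offset); // offset points just past the terminator
    }

    std::cout << "values: " << res_offsets.size() << ", bytes: " << res_data.size() << "\n";
    return 0;
}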
 3  src/Functions/test.txt  (new file)
@@ -0,0 +1,3 @@
12 123
54 2323
abcd 123