add FunctionsTonalityClassification.cpp

2024-11-27 01:51:59 +00:00 · 2021-04-14 21:42:33 +03:00 · 2021-04-14 21:42:33 +03:00 · cdf8ab71d2
commit cdf8ab71d2
parent 4412aa39bb
10 changed files with 91478 additions and 134 deletions
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -634,7 +634,7 @@ int Server::main(const std::vector<std::string> & /*args*/)

    /// my test
    {
-        const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "textclassification_frequency/");
+        const std::string & encode_frequency_path = config().getString("encodings_frequency_path", path + "encodings_frequency/");
        FrequencyHolder::getInstance().parseDictionaries(encode_frequency_path);
    }

--- a/programs/server/config.d/path.xml
+++ b/programs/server/config.d/path.xml
@ -5,5 +5,4 @@
    <format_schema_path replace="replace">./format_schemas/</format_schema_path>
    <access_control_path replace="replace">./access/</access_control_path>
    <top_level_domains_path replace="replace">./top_level_domains/</top_level_domains_path>
-    <encodings_frequency_path replace="replace">./encoding_frequency/</encodings_frequency_path>
 </yandex>
--- a/src/Common/ClassificationDictionaries/charset_freq.txt
+++ b/src/Common/ClassificationDictionaries/charset_freq.txt
--- a/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt
+++ b/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt
--- a/src/Common/FrequencyHolder.h
+++ b/src/Common/FrequencyHolder.h
@ -14,7 +14,7 @@
 #include <cstring>
 #include <limits>
 #include <unordered_map>
-
+#include <common/logger_useful.h>


 namespace DB
@ -22,11 +22,11 @@ namespace DB

 class FrequencyHolder
 {
+
 public:
    using Map = std::unordered_map<UInt16, Float64>;
    using Container = std::unordered_map<String, Map>;

-
    static FrequencyHolder & getInstance()
    {
        static FrequencyHolder instance;
@ -37,8 +37,8 @@ public:
    void parseDictionaries(const String & pt)
    {
        is_true = pt;
-        loadEmotionalDict("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/emotional_dictionary_rus.txt");
-        loadEncodingsFrequency("/home/sergey/ClickHouse/src/Functions/ClassificationDictionaries/charset_freq.txt");
+        loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
+        loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
    }


@ -48,6 +48,10 @@ public:
        Float64 frequency;
        String charset_name;

+        Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
+
+        LOG_TRACE(log, "Charset frequencies loading from {}", path_to_charset_freq);
+
        ReadBufferFromFile in(path_to_charset_freq);
        while (!in.eof())
        {
@ -75,6 +79,7 @@ public:
            }
            in.position() = newline + 1;
        }
+        LOG_TRACE(log, "Charset frequencies was added");
    }


@ -84,6 +89,9 @@ public:
        String word;
        Float64 tonality;

+        Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
+        LOG_TRACE(log, "Emotional dictionary loading from {}", path_to_emotional_dict);
+
        ReadBufferFromFile in(path_to_emotional_dict);
        while (!in.eof())
        {
@ -101,6 +109,7 @@ public:
            emotional_dict[word] = tonality;

        }
+        LOG_TRACE(log, "Emotional dictionary was added");
    }


@ -110,13 +119,13 @@ public:
    }


-    const std::unordered_map<String, Float64> getEmotionalDict()
+    const std::unordered_map<String, Float64> & getEmotionalDict()
    {
        return emotional_dict;
    }


-    const Container getEncodingsFrequency()
+    const Container & getEncodingsFrequency()
    {
        return encodings_freq;
    }
--- a/src/Functions/FunctionsTextClassification.cpp
+++ b/src/Functions/FunctionsTextClassification.cpp
@ -19,7 +19,7 @@ namespace DB
 {


-template <size_t N, bool Tonality>
+template <size_t N>
 struct TextClassificationImpl
 {

@ -43,19 +43,6 @@ struct TextClassificationImpl
      */
    using NgramCount = UInt16;

-
-    static ALWAYS_INLINE inline Float64 L2_distance(std::unordered_map<UInt16, Float64> standart, std::unordered_map<UInt16, Float64> model)
-    {
-        Float64 res = 0;
-        for (auto& el : standart) {
-            if (model.find(el.first) != model.end()) {
-                res += ((model[el.first] - el.second) * (model[el.first] - el.second));
-            }
-        }
-        return res;
-    }
-
-
    static ALWAYS_INLINE inline Float64 Naive_bayes(std::unordered_map<UInt16, Float64> standart, std::unordered_map<UInt16, Float64> model)
    {
        Float64 res = 0;
@ -63,7 +50,7 @@ struct TextClassificationImpl
            if (standart[el.first] != 0) {
                res += el.second * log(standart[el.first]);
            } else {
-                res += el.second * log(0.0000001);
+                res += el.second * log(0.000001);
            }
        }
        return res;
@ -118,82 +105,32 @@ struct TextClassificationImpl

        return len;
    }
-    
-    
-    static ALWAYS_INLINE inline void word_processing(String & word)
-    {
-        std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};

-        while (to_skip.find(word.back()) != to_skip.end())
-        {
-            word.pop_back();
-        }
-
-        while (to_skip.find(word.front()) != to_skip.end())
-        {
-            word.erase(0, 1);
-        }
-    }
-
-    static String get_tonality(const Float64 & tonality_level)
-    {
-        if (tonality_level < 0.5) { return "NEG"; }
-        if (tonality_level > 1) { return "POS"; }
-        return "NEUT";
-    } 
    
    static void constant(String data, String & res)
    {
        static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
        static std::unordered_map<String, std::unordered_map<UInt16, Float64>> encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
-
-        if (!Tonality)
-        {
            
-            std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams 
-            std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
-            size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
-            String ans;
-            // Float64 count_bigram = data.size() - 1;
-            std::unordered_map<UInt16, Float64> model;
-            for (size_t i = 0; i < len; ++i) {
-                ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<Float64>(common_stats.get()[ngram_storage.get()[i]])) + "\n";
-                model[ngram_storage.get()[i]] = static_cast<Float64>(common_stats.get()[ngram_storage.get()[i]]);
-            }
-
-            for (const auto& item : encodings_freq) {
-                ans += item.first + " " + std::to_string(Naive_bayes(item.second, model)) + "\n";
-            }
-            res = ans;
+        std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams 
+        std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
+            
+        size_t len = calculateStats(data.data(), data.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
+        String ans;
+        // Float64 count_bigram = data.size() - 1;
+        std::unordered_map<UInt16, Float64> model;
+        for (size_t i = 0; i < len; ++i) {
+            ans += std::to_string(ngram_storage.get()[i]) + " " + std::to_string(static_cast<Float64>(common_stats.get()[ngram_storage.get()[i]])) + "\n";
+            model[ngram_storage.get()[i]] = static_cast<Float64>(common_stats.get()[ngram_storage.get()[i]]);
        }
-        else 
-        {
-            Float64 freq = 0;
-            Float64 count_words = 0;

-            String ans;
-
-            String to_check;
-            ReadBufferFromString in(data);
-
-            while (!in.eof())
-            {
-                readString(to_check, in);
-                word_processing(to_check);
-
-                if (emotional_dict.find(to_check) != emotional_dict.cend())
-                {
-                    count_words += 1;
-                    ans += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
-                    freq += emotional_dict[to_check];
-                }                
-            }
-            Float64 total_tonality = freq / count_words;
-            ans += get_tonality(total_tonality) + std::to_string(total_tonality) + std::to_string(emotional_dict.size()) + "\n";
-            res = ans;
+        for (const auto& item : encodings_freq) {
+            ans += item.first + " " + std::to_string(Naive_bayes(item.second, model)) + "\n";
        }
+        res = ans;
    }

+
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
@ -215,50 +152,35 @@ struct TextClassificationImpl
            String str = haystack;

            String prom;
-            if (!Tonality)
+
+            std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams 
+            std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
+
+            size_t len = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
+            // Float64 count_bigram = data.size() - 1;
+            std::unordered_map<UInt16, Float64> model;
+
+            for (size_t j = 0; j < len; ++j)
            {
-                std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}}; // frequency of N-grams 
-                std::unique_ptr<NgramCount[]> ngram_storage{new NgramCount[map_size]{}}; // list of N-grams
+                model[ngram_storage.get()[j]] = static_cast<Float64>(common_stats.get()[ngram_storage.get()[j]]);
+            }

-                size_t len = calculateStats(str.data(), str.size(), common_stats.get(), readCodePoints, ngram_storage.get()); // count of N-grams
-                // Float64 count_bigram = data.size() - 1;
-                std::unordered_map<UInt16, Float64> model;
+            std::vector<std::pair<std::string, Float64>> results;

-                for (size_t j = 0; j < len; ++j)
-                {
-                    model[ngram_storage.get()[j]] = static_cast<Float64>(common_stats.get()[ngram_storage.get()[j]]);
-                }
+            for (const auto& item : encodings_freq)
+            {
+                results.push_back(std::make_pair(item.first, Naive_bayes(item.second, model)));
+            }

-                for (const auto& item : encodings_freq) {
-                    prom += item.first + " " + std::to_string(Naive_bayes(item.second, model)) + "\n";
-                }
+            std::sort(results.begin(), results.end(), [](auto &left, auto &right)
+            {
+                return left.second > right.second;
+            });
+
+            for (size_t ind = 0; ind < 3; ++ind) {
+                prom += results[ind].first + " result=" + std::to_string(results[ind].second) + "\n"; 
+            }
            
-            }
-            else 
-            {
-                Float64 freq = 0;
-                Float64 count_words = 0;
-
-
-                String to_check;
-                ReadBufferFromString in(str);
-
-                while (!in.eof())
-                {
-                    readString(to_check, in);
-
-                    word_processing(to_check);
-
-                    if (emotional_dict.find(to_check) != emotional_dict.cend())
-                    {
-                        count_words += 1;
-                        prom += to_check + " " + std::to_string(emotional_dict[to_check]) + "\n";
-                        freq += emotional_dict[to_check];
-                    }                
-                }
-                Float64 total_tonality = freq / count_words;
-                prom += get_tonality(total_tonality) + std::to_string(total_tonality) + "\n";
-            }

            const auto ans = prom.c_str();
            size_t cur_offset = offsets[i];
@ -283,19 +205,13 @@ struct NameCharsetDetect
 {
    static constexpr auto name = "charsetDetect";
 };
-struct NameGetTonality
-{
-    static constexpr auto name = "getTonality";
-};


-using FunctionCharsetDetect = FunctionsTextClassification<TextClassificationImpl<2, false>, NameCharsetDetect>;
-using FunctionGetTonality = FunctionsTextClassification<TextClassificationImpl<2, true>, NameGetTonality>;
+using FunctionCharsetDetect = FunctionsTextClassification<TextClassificationImpl<2>, NameCharsetDetect>;

 void registerFunctionsTextClassification(FunctionFactory & factory)
 {
    factory.registerFunction<FunctionCharsetDetect>();
-    factory.registerFunction<FunctionGetTonality>();
 }

 }
--- a/src/Functions/FunctionsTextClassification.h
+++ b/src/Functions/FunctionsTextClassification.h
@ -14,7 +14,6 @@ namespace ErrorCodes
 {
 extern const int ILLEGAL_TYPE_OF_ARGUMENT;
 extern const int ILLEGAL_COLUMN;
-extern const int TOO_LARGE_STRING_SIZE;
 }

 template <typename Impl, typename Name>
--- a/src/Functions/FunctionsTonalityClassification.cpp
+++ b/src/Functions/FunctionsTonalityClassification.cpp
@ -0,0 +1,163 @@
+#include <Functions/FunctionsTextClassification.h>
+#include <Common/FrequencyHolder.h>
+#include <Functions/FunctionFactory.h>
+#include <Common/UTF8Helpers.h>
+#include <IO/ReadBufferFromString.h>
+#include <IO/ReadHelpers.h>
+
+#include <algorithm>
+#include <cstring>
+#include <cmath>
+#include <limits>
+#include <unordered_map>
+#include <memory>
+#include <utility>
+#include <sstream>
+#include <set>
+
+namespace DB
+{
+
+
+/** Implementation of getTonality: looks up every word of the text in the emotional dictionary
+  * held by FrequencyHolder and reports the matched words together with the mean of their scores.
+  *
+  * Both entry points (constant() and vector()) share score_text(), so a text is tokenized and
+  * cleaned in exactly the same way regardless of whether it arrives as a constant or as a column.
+  */
+struct TonalityClassificationImpl
+{
+    using ResultType = String;
+    using Dictionary = std::unordered_map<String, Float64>;
+
+    /// Removes leading and trailing punctuation from `word` in place.
+    /// A word that consists only of punctuation (or is empty) becomes empty.
+    static ALWAYS_INLINE inline void word_processing(String & word)
+    {
+        static constexpr const char * to_skip = ",.!?)(\"'[]{}:;";
+
+        const size_t first_kept = word.find_first_not_of(to_skip);
+        if (first_kept == String::npos)
+        {
+            /// Nothing but punctuation: do not touch back()/front() of an empty string.
+            word.clear();
+            return;
+        }
+
+        const size_t last_kept = word.find_last_not_of(to_skip);
+        word.erase(last_kept + 1);
+        word.erase(0, first_kept);
+    }
+
+    /// Maps the mean score to a label: below 0.5 is "NEG", above 1 is "POS", anything else is "NEUT".
+    /// A NaN level (no dictionary word was found) fails both comparisons and therefore yields "NEUT".
+    static String get_tonality(const Float64 & tonality_level)
+    {
+        if (tonality_level < 0.5)
+            return "NEG";
+        if (tonality_level > 1)
+            return "POS";
+        return "NEUT";
+    }
+
+    /** Scores `size` bytes of text starting at `text`.
+      *
+      * The text is split on whitespace, every word is cleaned with word_processing() and looked up
+      * in `dict`. For each hit the line "<word> <score>\n" is appended to `report`.
+      *
+      * Returns the mean score of the matched words. When nothing matched this is 0 / 0, i.e. NaN.
+      */
+    static Float64 score_text(const char * text, size_t size, const Dictionary & dict, String & report)
+    {
+        Float64 score_sum = 0;
+        Float64 matched_words = 0;
+
+        /// The buffer covers exactly the text bytes (no terminating zero), so plain eof() is a
+        /// sufficient stop condition and short trailing words are not lost.
+        ReadBufferFromMemory in(text, size);
+        skipWhitespaceIfAny(in);
+
+        String word;
+        while (!in.eof())
+        {
+            readStringUntilWhitespace(word, in);
+            skipWhitespaceIfAny(in);
+
+            word_processing(word);
+
+            const auto found = dict.find(word);
+            if (found == dict.end())
+                continue;
+
+            matched_words += 1;
+            score_sum += found->second;
+
+            report += word;
+            report += ' ';
+            report += std::to_string(found->second);
+            report += '\n';
+        }
+
+        return score_sum / matched_words;
+    }
+
+    /// Scores a single constant string and stores the textual report in `res`.
+    /// NOTE(review): unlike vector(), the summary line also contains the dictionary size;
+    /// this looks like debugging output - confirm whether it should stay.
+    static void constant(String data, String & res)
+    {
+        /// Use the dictionary owned by the holder instead of keeping a private copy of it.
+        const Dictionary & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
+
+        String ans;
+        const Float64 total_tonality = score_text(data.data(), data.size(), emotional_dict, ans);
+
+        ans += get_tonality(total_tonality);
+        ans += std::to_string(total_tonality);
+        ans += std::to_string(emotional_dict.size());
+        ans += '\n';
+
+        res = std::move(ans);
+    }
+
+    /** Scores every row of a string column.
+      *
+      * `data` / `offsets` describe the input rows, `res_data` / `res_offsets` receive one report per row.
+      * Each result row is written followed by a zero byte and `res_offsets[i]` points just past it.
+      * The input is read with the same layout in mind (row i ends with a zero byte at offsets[i] - 1),
+      * which is also what the original C-string based reading relied on.
+      */
+    static void vector(
+        const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        ColumnString::Chars & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        const Dictionary & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
+
+        res_data.reserve(1024);
+        res_offsets.resize(offsets.size());
+
+        size_t prev_offset = 0;
+        size_t res_offset = 0;
+
+        /// Reused for every row to keep its capacity.
+        String prom;
+
+        for (size_t i = 0; i < offsets.size(); ++i)
+        {
+            const size_t cur_offset = offsets[i];
+            const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
+            /// Length of the row without its terminating zero byte.
+            const size_t haystack_size = cur_offset - prev_offset - 1;
+
+            prom.clear();
+            const Float64 total_tonality = score_text(haystack, haystack_size, emotional_dict, prom);
+
+            prom += get_tonality(total_tonality);
+            prom += std::to_string(total_tonality);
+            prom += '\n';
+
+            res_data.resize(res_offset + prom.size() + 1);
+            memcpy(&res_data[res_offset], prom.data(), prom.size());
+            res_offset += prom.size();
+
+            res_data[res_offset] = 0;
+            ++res_offset;
+
+            res_offsets[i] = res_offset;
+            prev_offset = cur_offset;
+        }
+    }
+};
+
+/// Name tag for getTonality; passed as the `Name` template argument of FunctionsTextClassification.
+struct NameGetTonality
+{
+    static constexpr const char * name = "getTonality";
+};
+
+
+/// The generic text-classification function wrapper instantiated with the tonality implementation and its name tag.
+using FunctionGetTonality = FunctionsTextClassification<TonalityClassificationImpl, NameGetTonality>;
+
+/// Adds FunctionGetTonality to the given function factory.
+void registerFunctionsTonalityClassification(FunctionFactory & factory)
+{
+    factory.registerFunction<FunctionGetTonality>();
+}
+
+}
--- a/src/Functions/registerFunctions.cpp
+++ b/src/Functions/registerFunctions.cpp
@ -34,6 +34,7 @@ void registerFunctionsStringSearch(FunctionFactory &);
 void registerFunctionsStringRegexp(FunctionFactory &);
 void registerFunctionsStringSimilarity(FunctionFactory &);
 void registerFunctionsTextClassification(FunctionFactory &);
+void registerFunctionsTonalityClassification(FunctionFactory &);
 void registerFunctionsURL(FunctionFactory &);
 void registerFunctionsVisitParam(FunctionFactory &);
 void registerFunctionsMath(FunctionFactory &);
@ -93,6 +94,7 @@ void registerFunctions()
    registerFunctionsStringRegexp(factory);
    registerFunctionsStringSimilarity(factory);
    registerFunctionsTextClassification(factory);
+    registerFunctionsTonalityClassification(factory);
    registerFunctionsURL(factory);
    registerFunctionsVisitParam(factory);
    registerFunctionsMath(factory);
--- a/src/Functions/ya.make
+++ b/src/Functions/ya.make
@ -56,6 +56,7 @@ SRCS(
    FunctionsStringHash.cpp
    FunctionsStringSimilarity.cpp
    FunctionsTextClassification.cpp
+    FunctionsTonalityClassification.cpp
    GatherUtils/concat.cpp
    GatherUtils/createArraySink.cpp
    GatherUtils/createArraySource.cpp