Fix functions

2024-11-21 15:12:02 +00:00 · 2021-04-18 20:03:56 +03:00 · 2021-04-18 20:03:56 +03:00 · e4a8cd3f06
commit e4a8cd3f06
parent 95980899dc
8 changed files with 297698 additions and 8956 deletions
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@ -48,6 +48,7 @@ add_subdirectory (murmurhash)
 add_subdirectory (replxx-cmake)
 add_subdirectory (unixodbc-cmake)
 add_subdirectory (xz)
+add_subdirectory (cld2-cmake) 

 add_subdirectory (poco-cmake)
 add_subdirectory (croaring-cmake)
--- a/contrib/cld2
+++ b/contrib/cld2
@ -0,0 +1 @@
+Subproject commit b56fa78a2fe44ac2851bae5bf4f4693a0644da7b
--- a/contrib/cld2-cmake/CMakeLists.txt
+++ b/contrib/cld2-cmake/CMakeLists.txt
@ -0,0 +1,67 @@
+# Build rules for the cld2 contrib library.
+# Defines a target named `cld2` in one of two ways: an INTERFACE wrapper around
+# a library found on the system, or a library compiled from contrib/cld2.
+option (USE_INTERNAL_CLD2_LIBRARY "Use internal cld2 library" ${NOT_UNBUNDLED})
+
+# Search the system only when the bundled copy was not requested.
+# The guard must test the cld2 option declared above.
+if (NOT USE_INTERNAL_CLD2_LIBRARY)
+    find_library (LIBRARY_CLD2 cld2)
+    find_path (INCLUDE_CLD2 compact_lang_det.h)
+
+    if (LIBRARY_CLD2 AND INCLUDE_CLD2)
+        set(EXTERNAL_CLD2_LIBRARY_FOUND 1)
+        add_library (cld2 INTERFACE)
+        set_property (TARGET cld2 PROPERTY INTERFACE_LINK_LIBRARIES ${LIBRARY_CLD2})
+        set_property (TARGET cld2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INCLUDE_CLD2})
+    else ()
+        set(EXTERNAL_CLD2_LIBRARY_FOUND 0)
+        message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system cld2")
+    endif()
+endif()
+
+# Fall back to the bundled sources when no system library was located
+# (or when the search above was skipped entirely).
+if (NOT EXTERNAL_CLD2_LIBRARY_FOUND)
+    set (USE_INTERNAL_CLD2_LIBRARY 1)
+    set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")
+
+    set (SRCS
+        ${LIBRARY_DIR}/internal/cldutil.cc
+        ${LIBRARY_DIR}/internal/cldutil_shared.cc
+        ${LIBRARY_DIR}/internal/compact_lang_det.cc
+        ${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc
+        ${LIBRARY_DIR}/internal/compact_lang_det_impl.cc
+        ${LIBRARY_DIR}/internal/debug.cc
+        ${LIBRARY_DIR}/internal/fixunicodevalue.cc
+        ${LIBRARY_DIR}/internal/generated_entities.cc
+        ${LIBRARY_DIR}/internal/generated_language.cc
+        ${LIBRARY_DIR}/internal/generated_ulscript.cc
+        ${LIBRARY_DIR}/internal/getonescriptspan.cc
+        ${LIBRARY_DIR}/internal/lang_script.cc
+        ${LIBRARY_DIR}/internal/offsetmap.cc
+        ${LIBRARY_DIR}/internal/scoreonescriptspan.cc
+        ${LIBRARY_DIR}/internal/tote.cc
+        ${LIBRARY_DIR}/internal/utf8statetable.cc
+        ${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc
+        ${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc
+        ${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc
+        ${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc
+        ${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc
+        ${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc
+        ${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc
+        ${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
+    )
+
+    add_library(cld2 ${SRCS})
+
+    # Warning suppressions apply to the third-party sources only (PRIVATE).
+    target_compile_options (cld2
+        PRIVATE
+            -Wno-old-style-cast
+            -Wno-inconsistent-missing-destructor-override
+            -Wno-deprecated
+            -Wno-unused-parameter
+            -Wno-shadow
+            -Wno-tautological-type-limit-compare
+            -Wno-extra-semi
+            -Wno-narrowing
+    )
+
+    # Consumers include the public headers as SYSTEM headers.
+    target_include_directories (cld2 SYSTEM PUBLIC ${LIBRARY_DIR}/public)
+endif()
+    #target_link_libraries (cld2 PUBLIC ssl)
+
--- a/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt
+++ b/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt
--- a/src/Common/ClassificationDictionaries/emotional_dictionary_rus_part.txt
+++ b/src/Common/ClassificationDictionaries/emotional_dictionary_rus_part.txt
--- a/src/Common/FrequencyHolder.h
+++ b/src/Common/FrequencyHolder.h
@ -34,22 +34,22 @@ public:
    void parseEncodingFrequencies(const String & pt)
    {
+        // Record the caller-supplied path, then load from that same path.
+        // A developer-local absolute path must never be baked in here.
        path_to_enc_freq = pt;
        loadEncodingsFrequency(pt);
-        //loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
    }

    void parseEmotionalDict(const String & pt)
    {
+        // Record the caller-supplied path, then load from that same path.
        path_to_emo_dict = pt;
        loadEmotionalDict(pt);
-        //loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
    }

    void parseProgrammingFrequency(const String & pt) 
    {
+        // Record the caller-supplied path, then load from that same path.
        path_to_prog_freq = pt;
        loadProgrammingFrequency(pt);
-        //loadProgrammingFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/programming_freq.txt");
    }


@ -103,24 +103,29 @@ public:
        Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
        LOG_TRACE(log, "Emotional dictionary loading from {}", path_to_emotional_dict);

-        ReadBufferFromFile in(path_to_emotional_dict);
+        size_t buf_size = 10000000;
+        ReadBufferFromFile in(path_to_emotional_dict, buf_size);
+        size_t count = 0;
        while (!in.eof())
        {
            char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());

-            if (newline >= in.buffer().end()) { break; }
+            //if (newline >= in.buffer().end()) { break; }

            ReadBufferFromMemory buf_line(in.position(), newline - in.position());
            in.position() = newline + 1;

+            if (newline >= in.buffer().end())
+                break;
+
            readStringUntilWhitespace(word, buf_line);
            buf_line.ignore();
            readFloatText(tonality, buf_line);

            emotional_dict[word] = tonality;
-
+            ++count;
        }
-        LOG_TRACE(log, "Emotional dictionary was added");
+        LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
    }


@ -134,7 +139,8 @@ public:

        LOG_TRACE(log, "Programming langugages frequencies loading from {}", path_to_programming_freq);

-        ReadBufferFromFile in(path_to_programming_freq);
+        size_t buf_size = 10000000;
+        ReadBufferFromFile in(path_to_programming_freq, buf_size);
        while (!in.eof())
        {
            char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
--- a/src/Functions/FunctionsProgrammingClassification.cpp
+++ b/src/Functions/FunctionsProgrammingClassification.cpp
@ -31,31 +31,33 @@ struct ProgrammingClassificationImpl
        static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
        std::unordered_map<String, Float64> data_freq;

-        String answer;
+        String prev_command;
+        String command;

-        ReadBufferFromMemory in(data.data(), data.size() + 1);
-        skipWhitespaceIfAny(in);
-
-        String prev = "";
-        String new_word;
-        
-        while (!in.eof())
+        for (size_t i = 0; i < data.size();)
        {
-            if (data.size() - (in.position() - data.data()) <= 3)
+            if (!isspace(data[i]))
            {
-                break;
-            }
-            readStringUntilWhitespace(new_word, in);
-            skipWhitespaceIfAny(in);
+                command.push_back(data[i]);
+                ++i;

-            if (prev == "")
-            {
-                prev = new_word;
-            }
+                while ((i < data.size()) && (!isspace(data[i]))) {
+                    command.push_back(data[i]);
+                    ++i;
+                }
+                if (prev_command == "") {
+                    prev_command = command;
+                }
+                else
+                {
+                    data_freq[prev_command + command] += 1;
+                    prev_command = command;
+                }
+                command = "";
+            } 
            else
            {
-                data_freq[prev + new_word] += 1;
-                prev = new_word;
+                ++i;
            }
        }

@ -97,32 +99,35 @@ struct ProgrammingClassificationImpl
        for (size_t i = 0; i < offsets.size(); ++i)
        {
            const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
-            String str = haystack;
+            String str_data = haystack;

-            String buf;
+            String prev_command;
+            String command;

-            ReadBufferFromMemory in(str.data(), str.size() + 1);
-
-            skipWhitespaceIfAny(in);
-            String new_word;
-            String prev;
-            while (!in.eof())
+            for (size_t ind = 0; ind < str_data.size();)
            {
-                if (str.size() - (in.position() - str.data()) <= 3)
+                if (!isspace(str_data[ind]))
                {
-                    break;
-                }
-                readStringUntilWhitespace(new_word, in);
-                skipWhitespaceIfAny(in);
+                    command.push_back(str_data[ind]);
+                    ++ind;

-                if (prev == "")
-                {
-                    prev = new_word;
+                    while ((ind < str_data.size()) && (!isspace(str_data[ind]))) {
+                        command.push_back(str_data[ind]);
+                        ++ind;
+                    }
+                    if (prev_command == "") {
+                        prev_command = command;
+                    }
+                    else
+                    {
+                        data_freq[prev_command + command] += 1;
+                        prev_command = command;
+                    }
+                    command = "";
                } 
                else
                {
-                    data_freq[prev + new_word] += 1;
-                    prev = new_word;
+                    ++ind;
                }
            }

@ -141,7 +146,7 @@ struct ProgrammingClassificationImpl

            if (most_liked == "")
            {
-            most_liked = "Undefined";
+                most_liked = "Undefined";
            }

            const auto ans = most_liked.c_str();
--- a/src/Functions/FunctionsTonalityClassification.cpp
+++ b/src/Functions/FunctionsTonalityClassification.cpp
@ -1,4 +1,5 @@
 #include <Functions/FunctionsTextClassification.h>
+#include <Common/StringUtils/StringUtils.h>
 #include <Common/FrequencyHolder.h>
 #include <Functions/FunctionFactory.h>
 #include <Common/UTF8Helpers.h>
@ -15,26 +16,11 @@ struct TonalityClassificationImpl

    using ResultType = String;

-    
-    static ALWAYS_INLINE inline void word_processing(String & word)
-    {
-        std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};
-
-        while (to_skip.find(word.back()) != to_skip.end())
-        {
-            word.pop_back();
-        }
-
-        while (to_skip.find(word.front()) != to_skip.end())
-        {
-            word.erase(0, 1);
-        }
-    }

    static String get_tonality(const Float64 & tonality_level)
    {
-        if (tonality_level < 0.5) { return "NEG"; }
-        if (tonality_level > 1) { return "POS"; }
+        if (tonality_level < 0.25) { return "NEG"; }
+        if (tonality_level > 0.5) { return "POS"; }
        return "NEUT";
    } 
    
@ -42,34 +28,38 @@ struct TonalityClassificationImpl
    {
        static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();

-        Float64 freq = 0;
+        Float64 weight = 0;
        Float64 count_words = 0;

        String answer;
+        String word;

-        ReadBufferFromMemory in(data.data(), data.size() + 1);
-        skipWhitespaceIfAny(in);
-
-        String to_check;
-        while (!in.eof())
+        for (size_t i = 0; i < data.size();)
        {
-            if (data.size() - (in.position() - data.data()) <= 3) {
-                break;
-            }
-            readStringUntilWhitespace(to_check, in);
-            skipWhitespaceIfAny(in);
-
-            word_processing(to_check);
-                
-
-            if (emotional_dict.find(to_check) != emotional_dict.cend())
+            if (!isASCII(data[i]))
            {
-                count_words += 1;
-                freq += emotional_dict[to_check];
-            }            
+                word.push_back(data[i]);
+                ++i;
+
+                while ((i < data.size()) && (!isASCII(data[i]))) {
+                    word.push_back(data[i]);
+                    ++i;
+                }
+                if (emotional_dict.find(word) != emotional_dict.cend())
+                {
+                    count_words += 1;
+                    weight += emotional_dict[word];
+                }
+                word = "";
+            } 
+            else
+            {
+                ++i;
+            }
        }
-        Float64 total_tonality = freq / count_words;
-        res = get_tonality(total_tonality);
+
+        Float64 total_tonality = weight / count_words;
+        res += get_tonality(total_tonality);
    }


@ -94,29 +84,38 @@ struct TonalityClassificationImpl

            String buf;

-            Float64 freq = 0;
+            Float64 weight = 0;
            Float64 count_words = 0;


-            ReadBufferFromMemory in(str.data(), str.size() + 1);
+            String answer;
+            String word;

-            skipWhitespaceIfAny(in);
-            String to_check;
-            while (!in.eof())
+            for (size_t ind = 0; ind < str.size();)
            {
-                if (str.size() - (in.position() - str.data()) <= 3) {
-                    break;
-                }
-                readStringUntilWhitespace(to_check, in);
-                skipWhitespaceIfAny(in);
-
-                if (emotional_dict.find(to_check) != emotional_dict.cend())
+                if (!isASCII(str[ind]))
                {
-                    count_words += 1;
-                    freq += emotional_dict[to_check];
+                    word.push_back(str[ind]);
+                    ++ind;
+
+                    while ((ind < str.size()) && (!isASCII(str[ind]))) {
+                        word.push_back(str[ind]);
+                        ++ind;
+                    }
+                    if (emotional_dict.find(word) != emotional_dict.cend())
+                    {
+                        count_words += 1;
+                        weight += emotional_dict[word];
+                    }
+                    word = "";
+                }
+                else
+                {
+                    ++ind;
                }
            }
-            Float64 total_tonality = freq / count_words;
+
+            Float64 total_tonality = weight / count_words;
            buf = get_tonality(total_tonality);

            const auto ans = buf.c_str();
				`@ -0,0 +1 @@`
				`Subproject commit b56fa78a2fe44ac2851bae5bf4f4693a0644da7b`