Add detectLanguage

2024-11-21 23:21:59 +00:00 · 2021-05-07 17:18:06 +03:00 · 2021-05-07 17:18:06 +03:00 · 3ad26a798d
commit 3ad26a798d
parent a5320e8d15
11 changed files with 389450 additions and 102 deletions
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@ -37,6 +37,7 @@ add_subdirectory (abseil-cpp-cmake)
 add_subdirectory (antlr4-runtime-cmake)
 add_subdirectory (boost-cmake)
 add_subdirectory (cctz-cmake)
+add_subdirectory (cld2-cmake)
 add_subdirectory (consistent-hashing)
 add_subdirectory (dragonbox-cmake)
 add_subdirectory (hyperscan-cmake)
--- a/contrib/cld2-cmake/CMakeLists.txt
+++ b/contrib/cld2-cmake/CMakeLists.txt
@ -1,24 +1,6 @@
-option (USE_INTERNAL_CLD2_LIBRARY "Use internal cld2 library" ${NOT_UNBUNDLED})
-
-if (NOT USE_INTERNAL_LZ4_LIBRARY)
-    find_library (LIBRARY_CLD2 cld2)
-    find_path (INCLUDE_CLD2 compact_lang_det.h)
-
-    if (LIBRARY_CLD2 AND INCLUDE_CLD2)
-        set(EXTERNAL_CLD2_LIBRARY_FOUND 1)
-        add_library (cld2 INTERFACE)
-        set_property (TARGET cld2 PROPERTY INTERFACE_LINK_LIBRARIES ${LIBRARY_CLD2})
-        set_property (TARGET cld2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INCLUDE_CLD2})
-    else ()
-        set(EXTERNAL_CLD2_LIBRARY_FOUND 0)
-        message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system cld2")
-    endif()
-endif()
-
-if (NOT EXTERNAL_CLD2_LIBRARY_FOUND)
-    set (USE_INTERNAL_CLD2_LIBRARY 1)
 set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")

+
 set (SRCS
    ${LIBRARY_DIR}/internal/cldutil.cc
    ${LIBRARY_DIR}/internal/cldutil_shared.cc
@ -61,7 +43,4 @@ if (NOT EXTERNAL_CLD2_LIBRARY_FOUND)
        -Wl
 )

-    target_include_directories (cld2 SYSTEM PUBLIC ${LIBRARY_DIR}/public)
-endif()
-    #target_link_libraries (cld2 PUBLIC ssl)
-
+target_include_directories (cld2 SYSTEM PUBLIC ${LIBRARY_DIR}/public) # SYSTEM (as before this change) keeps third-party header warnings out of dependents
--- a/programs/server/charset_freq.txt
+++ b/programs/server/charset_freq.txt
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@ -908,9 +908,9 @@

    <!-- Text classification -->

-    <encoding_frequencies_path>/ClassificationDictionaries/charset_freq.txt</encoding_frequencies_path>
-    <programming_lang_frequencies_path>/ClassificationDictionaries/programming_freq.txt</programming_lang_frequencies_path>
-    <emotional_dict_path>/ClassificationDictionaries/emotional_dictionary_rus.txt</emotional_dict_path>
+    <encoding_frequencies_path>charset_freq.txt</encoding_frequencies_path>
+    <programming_lang_frequencies_path>programming_freq.txt</programming_lang_frequencies_path>
+    <emotional_dict_path>emotional_dictionary_rus.txt</emotional_dict_path>

    <top_level_domains_lists>
        <!--
--- a/programs/server/emotional_dictionary_rus.txt
+++ b/programs/server/emotional_dictionary_rus.txt
--- a/programs/server/emotional_dictionary_rus_part.txt
+++ b/programs/server/emotional_dictionary_rus_part.txt
--- a/programs/server/programming_freq.txt
+++ b/programs/server/programming_freq.txt
--- a/src/Common/t
+++ b/src/Common/t
--- a/src/Functions/FunctionsCharsetClassification.cpp
+++ b/src/Functions/FunctionsCharsetClassification.cpp
@ -19,7 +19,7 @@ namespace DB
 {


-template <size_t N>
+template <size_t N, bool detect_language>
 struct CharsetClassificationImpl
 {

@ -120,17 +120,26 @@ struct CharsetClassificationImpl
        std::unordered_map<UInt16, Float64> model;
        calculateStats(data.data(), data.size(), readCodePoints, model);

-        Float64 max_result = log(zero_frequency) * (model.size() + 1);
-        res = "Undefined";
+        Float64 max_result = 0;
+        String poss_ans;
        for (const auto& item : encodings_freq)
        {
-            const Float64 freq_pr = Naive_bayes(item.second, model);
-            if (max_result > freq_pr)
+            const Float64 score = Naive_bayes(item.second, model);
+            if (max_result == 0 || max_result < score)
            {
-                res = item.first;
-                max_result = freq_pr;
+                poss_ans = item.first;
+                max_result = score;
            }
        }
+        /// With detect_language the part after the first '_' is kept, otherwise the part before it.
+        /// A key without '_' (or no candidate at all) is returned as is instead of letting erase() throw.
+        const size_t sep = poss_ans.find('_');
+        if (sep == String::npos)
+            res = poss_ans;
+        else if (detect_language)
+            res = poss_ans.substr(sep + 1);
+        else
+            res = poss_ans.substr(0, sep);
    }


@ -154,42 +163,40 @@ struct CharsetClassificationImpl
            const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
            String str = haystack;

-            String prom;
+            String poss_ans;

            std::unordered_map<UInt16, Float64> model;
            calculateStats(str.data(), str.size(), readCodePoints, model);
-/*
-            Float64 max_result = log(zero_frequency) * model.size();

-            prom = "Undefined";
+           Float64 max_result = 0;
           for (const auto& item : encodings_freq)
            {
-                const Float64 freq_pr = Naive_bayes(item.second, model);
-                if (max_result > freq_pr)
+                Float64 score = Naive_bayes(item.second, model);
+                if (max_result == 0 || max_result < score)
                {
-                    prom = item.first;
-                    max_result = freq_pr;
+                    max_result = score;
+                    poss_ans = item.first;
                }
            }
-            */
-            std::vector<std::pair<std::string, Float64>> results;
-           for (const auto& item : encodings_freq)
-            {
-                results.push_back(std::make_pair(item.first, Naive_bayes(item.second, model)));
-            }
-            std::sort(results.begin(), results.end(), [](auto &left, auto &right)
-            {
-                return left.second > right.second;
-            });
            
-            prom = results[0].first + " | " + results[1].first + " | " + results[2].first; 
+            /// With detect_language the part after the first '_' is kept, otherwise the part before it.
+            /// A key without '_' (or no candidate at all) is kept as is instead of letting erase() throw.
+            const size_t sep = poss_ans.find('_');
+            String ans_str;
+            if (sep == String::npos)
+                ans_str = poss_ans;
+            else if (detect_language)
+                ans_str = poss_ans.substr(sep + 1);
+            else
+                ans_str = poss_ans.substr(0, sep);

-            const auto ans = prom.c_str();
+            const auto ans = ans_str.c_str();
            size_t cur_offset = offsets[i];

-            res_data.resize(res_offset + strlen(ans) + 1);
-            memcpy(&res_data[res_offset], ans, strlen(ans));
-            res_offset += strlen(ans);
+            size_t ans_size = strlen(ans);
+            res_data.resize(res_offset + ans_size + 1);
+            memcpy(&res_data[res_offset], ans, ans_size);
+            res_offset += ans_size;

            res_data[res_offset] = 0;
            ++res_offset;
@ -205,15 +212,22 @@ struct CharsetClassificationImpl

 struct NameCharsetDetect
 {
-    static constexpr auto name = "charsetDetect";
+    static constexpr auto name = "detectCharset";
+};
+
+struct NameLanguageDetect
+{
+    static constexpr auto name = "detectLanguage";
 };


-using FunctionCharsetDetect = FunctionsTextClassification<CharsetClassificationImpl<2>, NameCharsetDetect>;
+using FunctionCharsetDetect = FunctionsTextClassification<CharsetClassificationImpl<2, false>, NameCharsetDetect>; /// detect_language = false: keep the part of the key before '_'
+using FunctionLanguageDetect = FunctionsTextClassification<CharsetClassificationImpl<2, true>, NameLanguageDetect>; /// detect_language = true: keep the part of the key after '_'

 void registerFunctionsCharsetClassification(FunctionFactory & factory)
 {
    factory.registerFunction<FunctionCharsetDetect>();
+    factory.registerFunction<FunctionLanguageDetect>();
 }

 }
--- a/src/Functions/FunctionsTextClassification.h
+++ b/src/Functions/FunctionsTextClassification.h
@ -12,9 +12,11 @@ namespace DB
 {
 /** Functions for text classification:
  *
-  * charsetDetect(string data) - detect charset of data.
+  * detectCharset(string data) - detect charset of data.
  * Returns string name of most likely charset.
  * .
+  * detectLanguage(string data) - detect the language of data that may be in any of various non-UTF-8 encodings.
+  *
  * getTonality(string data) - defines the emotional coloring of the text.
  * Returns NEG if text is negative, POS if text is postive or NEUT if text is neutral.
  *