Add detectLanguage

This commit is contained in:
s-kat 2021-05-07 17:18:06 +03:00
parent a5320e8d15
commit 3ad26a798d
11 changed files with 389450 additions and 102 deletions

View File

@ -37,6 +37,7 @@ add_subdirectory (abseil-cpp-cmake)
add_subdirectory (antlr4-runtime-cmake)
add_subdirectory (boost-cmake)
add_subdirectory (cctz-cmake)
add_subdirectory (cld2-cmake)
add_subdirectory (consistent-hashing)
add_subdirectory (dragonbox-cmake)
add_subdirectory (hyperscan-cmake)

View File

@ -1,67 +1,46 @@
option (USE_INTERNAL_CLD2_LIBRARY "Use internal cld2 library" ${NOT_UNBUNDLED})
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")
# Look for a system-wide cld2 only when the bundled copy is not requested.
# The guard tests the cld2 option declared above; it previously tested the
# unrelated lz4 option, so the cld2 setting had no effect on this branch.
if (NOT USE_INTERNAL_CLD2_LIBRARY)
    find_library (LIBRARY_CLD2 cld2)
    find_path (INCLUDE_CLD2 compact_lang_det.h)
    if (LIBRARY_CLD2 AND INCLUDE_CLD2)
        # Expose the system library through an INTERFACE target named cld2.
        set(EXTERNAL_CLD2_LIBRARY_FOUND 1)
        add_library (cld2 INTERFACE)
        set_property (TARGET cld2 PROPERTY INTERFACE_LINK_LIBRARIES ${LIBRARY_CLD2})
        set_property (TARGET cld2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INCLUDE_CLD2})
    else ()
        # Record the failure so the later EXTERNAL_CLD2_LIBRARY_FOUND check sees it.
        set(EXTERNAL_CLD2_LIBRARY_FOUND 0)
        message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system cld2")
    endif()
endif()
set (SRCS
${LIBRARY_DIR}/internal/cldutil.cc
${LIBRARY_DIR}/internal/cldutil_shared.cc
${LIBRARY_DIR}/internal/compact_lang_det.cc
${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc
${LIBRARY_DIR}/internal/compact_lang_det_impl.cc
${LIBRARY_DIR}/internal/debug.cc
${LIBRARY_DIR}/internal/fixunicodevalue.cc
${LIBRARY_DIR}/internal/generated_entities.cc
${LIBRARY_DIR}/internal/generated_language.cc
${LIBRARY_DIR}/internal/generated_ulscript.cc
${LIBRARY_DIR}/internal/getonescriptspan.cc
${LIBRARY_DIR}/internal/lang_script.cc
${LIBRARY_DIR}/internal/offsetmap.cc
${LIBRARY_DIR}/internal/scoreonescriptspan.cc
${LIBRARY_DIR}/internal/tote.cc
${LIBRARY_DIR}/internal/utf8statetable.cc
${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc
${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc
${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc
${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc
${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc
${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc
${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc
${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
)
if (NOT EXTERNAL_CLD2_LIBRARY_FOUND)
set (USE_INTERNAL_CLD2_LIBRARY 1)
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")
add_library(cld2 ${SRCS})
set (SRCS
${LIBRARY_DIR}/internal/cldutil.cc
${LIBRARY_DIR}/internal/cldutil_shared.cc
${LIBRARY_DIR}/internal/compact_lang_det.cc
${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc
${LIBRARY_DIR}/internal/compact_lang_det_impl.cc
${LIBRARY_DIR}/internal/debug.cc
${LIBRARY_DIR}/internal/fixunicodevalue.cc
${LIBRARY_DIR}/internal/generated_entities.cc
${LIBRARY_DIR}/internal/generated_language.cc
${LIBRARY_DIR}/internal/generated_ulscript.cc
${LIBRARY_DIR}/internal/getonescriptspan.cc
${LIBRARY_DIR}/internal/lang_script.cc
${LIBRARY_DIR}/internal/offsetmap.cc
${LIBRARY_DIR}/internal/scoreonescriptspan.cc
${LIBRARY_DIR}/internal/tote.cc
${LIBRARY_DIR}/internal/utf8statetable.cc
${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc
${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc
${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc
${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc
${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc
${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc
${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc
${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
)
add_library(cld2 ${SRCS})
target_compile_options (cld2
PRIVATE
-Wno-old-style-cast
-Wno-inconsistent-missing-destructor-override
-Wno-deprecated
-Wno-unused-parameter
-Wno-shadow
-Wno-tautological-type-limit-compare
-Wno-extra-semi
-Wno-narrowing
-Wl
)
target_include_directories (cld2 SYSTEM PUBLIC ${LIBRARY_DIR}/public)
endif()
#target_link_libraries (cld2 PUBLIC ssl)
target_compile_options (cld2
PRIVATE
-Wno-old-style-cast
-Wno-inconsistent-missing-destructor-override
-Wno-deprecated
-Wno-unused-parameter
-Wno-shadow
-Wno-tautological-type-limit-compare
-Wno-extra-semi
-Wno-narrowing
-Wl
)
target_include_directories (cld2 PUBLIC ${LIBRARY_DIR}/public)

File diff suppressed because it is too large Load Diff

View File

@ -908,9 +908,9 @@
<!-- Text classification -->
<encoding_frequencies_path>/ClassificationDictionaries/charset_freq.txt</encoding_frequencies_path>
<programming_lang_frequencies_path>/ClassificationDictionaries/programming_freq.txt</programming_lang_frequencies_path>
<emotional_dict_path>/ClassificationDictionaries/emotional_dictionary_rus.txt</emotional_dict_path>
<encoding_frequencies_path>charset_freq.txt</encoding_frequencies_path>
<programming_lang_frequencies_path>programming_freq.txt</programming_lang_frequencies_path>
<emotional_dict_path>emotional_dictionary_rus.txt</emotional_dict_path>
<top_level_domains_lists>
<!--

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,7 @@ public:
{
path_to_enc_freq = pt;
loadEncodingsFrequency(pt);
// loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
// loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
}
void parseEmotionalDict(const String & pt)

BIN
src/Common/t Executable file

Binary file not shown.

View File

@ -19,7 +19,7 @@ namespace DB
{
template <size_t N>
template <size_t N, bool detect_language>
struct CharsetClassificationImpl
{
@ -120,17 +120,26 @@ struct CharsetClassificationImpl
std::unordered_map<UInt16, Float64> model;
calculateStats(data.data(), data.size(), readCodePoints, model);
Float64 max_result = log(zero_frequency) * (model.size() + 1);
res = "Undefined";
Float64 max_result = 0;
String poss_ans;
for (const auto& item : encodings_freq)
{
const Float64 freq_pr = Naive_bayes(item.second, model);
if (max_result > freq_pr)
const Float64 score = Naive_bayes(item.second, model);
if (max_result == 0 || max_result < score)
{
res = item.first;
max_result = freq_pr;
poss_ans = item.first;
max_result = score;
}
}
size_t sep = poss_ans.find('_');
if (detect_language)
{
res = poss_ans.erase(0, sep + 1);
}
else
{
res = poss_ans.erase(sep, poss_ans.size() - sep);
}
}
@ -154,42 +163,40 @@ struct CharsetClassificationImpl
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
String str = haystack;
String prom;
String poss_ans;
std::unordered_map<UInt16, Float64> model;
calculateStats(str.data(), str.size(), readCodePoints, model);
/*
Float64 max_result = log(zero_frequency) * model.size();
prom = "Undefined";
for (const auto& item : encodings_freq)
{
const Float64 freq_pr = Naive_bayes(item.second, model);
if (max_result > freq_pr)
{
prom = item.first;
max_result = freq_pr;
}
}
*/
std::vector<std::pair<std::string, Float64>> results;
Float64 max_result = 0;
for (const auto& item : encodings_freq)
{
results.push_back(std::make_pair(item.first, Naive_bayes(item.second, model)));
Float64 score = Naive_bayes(item.second, model);
if (max_result == 0 || max_result < score)
{
max_result = score;
poss_ans = item.first;
}
}
std::sort(results.begin(), results.end(), [](auto &left, auto &right)
{
return left.second > right.second;
});
prom = results[0].first + " | " + results[1].first + " | " + results[2].first;
const auto ans = prom.c_str();
size_t sep = poss_ans.find('_');
String ans_str;
if (detect_language)
{
ans_str = poss_ans.erase(0, sep + 1);
}
else
{
ans_str = poss_ans.erase(sep, poss_ans.size() - sep);
}
const auto ans = ans_str.c_str();
size_t cur_offset = offsets[i];
res_data.resize(res_offset + strlen(ans) + 1);
memcpy(&res_data[res_offset], ans, strlen(ans));
res_offset += strlen(ans);
size_t ans_size = strlen(ans);
res_data.resize(res_offset + ans_size + 1);
memcpy(&res_data[res_offset], ans, ans_size);
res_offset += ans_size;
res_data[res_offset] = 0;
++res_offset;
@ -205,15 +212,22 @@ struct CharsetClassificationImpl
struct NameCharsetDetect
{
static constexpr auto name = "charsetDetect";
static constexpr auto name = "detectCharset";
};
/// Holds the function name string "detectLanguage"; used as the name
/// template argument of the FunctionLanguageDetect alias.
struct NameLanguageDetect
{
static constexpr auto name = "detectLanguage";
};
using FunctionCharsetDetect = FunctionsTextClassification<CharsetClassificationImpl<2>, NameCharsetDetect>;
using FunctionCharsetDetect = FunctionsTextClassification<CharsetClassificationImpl<2, true>, NameCharsetDetect>;
using FunctionLanguageDetect = FunctionsTextClassification<CharsetClassificationImpl<2, false>, NameLanguageDetect>;
/// Registers the classification functions of this file with the given factory:
/// FunctionCharsetDetect first, then FunctionLanguageDetect.
void registerFunctionsCharsetClassification(FunctionFactory & factory)
{
factory.registerFunction<FunctionCharsetDetect>();
factory.registerFunction<FunctionLanguageDetect>();
}
}

View File

@ -12,9 +12,11 @@ namespace DB
{
/** Functions for text classification:
*
* charsetDetect(string data) - detect charset of data.
* detectCharset(string data) - detect charset of data.
* Returns string name of most likely charset.
*
* detectLanguage(string data) - detect the language of data given in a non-UTF-8 encoding.
*
* getTonality(string data) - defines the emotional coloring of the text.
* Returns NEG if text is negative, POS if text is positive or NEUT if text is neutral.
*