mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Add detectLanguage
This commit is contained in:
parent
a5320e8d15
commit
3ad26a798d
1
contrib/CMakeLists.txt
vendored
1
contrib/CMakeLists.txt
vendored
@ -37,6 +37,7 @@ add_subdirectory (abseil-cpp-cmake)
|
||||
add_subdirectory (antlr4-runtime-cmake)
|
||||
add_subdirectory (boost-cmake)
|
||||
add_subdirectory (cctz-cmake)
|
||||
add_subdirectory (cld2-cmake)
|
||||
add_subdirectory (consistent-hashing)
|
||||
add_subdirectory (dragonbox-cmake)
|
||||
add_subdirectory (hyperscan-cmake)
|
||||
|
@ -1,67 +1,46 @@
|
||||
option (USE_INTERNAL_CLD2_LIBRARY "Use internal cld2 library" ${NOT_UNBUNDLED})
|
||||
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")
|
||||
|
||||
if (NOT USE_INTERNAL_LZ4_LIBRARY)
|
||||
find_library (LIBRARY_CLD2 cld2)
|
||||
find_path (INCLUDE_CLD2 compact_lang_det.h)
|
||||
|
||||
if (LIBRARY_CLD2 AND INCLUDE_CLD2)
|
||||
set(EXTERNAL_CLD2_LIBRARY_FOUND 1)
|
||||
add_library (cld2 INTERFACE)
|
||||
set_property (TARGET cld2 PROPERTY INTERFACE_LINK_LIBRARIES ${LIBRARY_CLD2})
|
||||
set_property (TARGET cld2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INCLUDE_CLD2})
|
||||
else ()
|
||||
set(EXTERNAL_CLD2_LIBRARY_FOUND 0)
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system cld2")
|
||||
endif()
|
||||
endif()
|
||||
set (SRCS
|
||||
${LIBRARY_DIR}/internal/cldutil.cc
|
||||
${LIBRARY_DIR}/internal/cldutil_shared.cc
|
||||
${LIBRARY_DIR}/internal/compact_lang_det.cc
|
||||
${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc
|
||||
${LIBRARY_DIR}/internal/compact_lang_det_impl.cc
|
||||
${LIBRARY_DIR}/internal/debug.cc
|
||||
${LIBRARY_DIR}/internal/fixunicodevalue.cc
|
||||
${LIBRARY_DIR}/internal/generated_entities.cc
|
||||
${LIBRARY_DIR}/internal/generated_language.cc
|
||||
${LIBRARY_DIR}/internal/generated_ulscript.cc
|
||||
${LIBRARY_DIR}/internal/getonescriptspan.cc
|
||||
${LIBRARY_DIR}/internal/lang_script.cc
|
||||
${LIBRARY_DIR}/internal/offsetmap.cc
|
||||
${LIBRARY_DIR}/internal/scoreonescriptspan.cc
|
||||
${LIBRARY_DIR}/internal/tote.cc
|
||||
${LIBRARY_DIR}/internal/utf8statetable.cc
|
||||
${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc
|
||||
${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc
|
||||
${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc
|
||||
${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc
|
||||
${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc
|
||||
${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc
|
||||
${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc
|
||||
${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
|
||||
)
|
||||
|
||||
if (NOT EXTERNAL_CLD2_LIBRARY_FOUND)
|
||||
set (USE_INTERNAL_CLD2_LIBRARY 1)
|
||||
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")
|
||||
add_library(cld2 ${SRCS})
|
||||
|
||||
set (SRCS
|
||||
${LIBRARY_DIR}/internal/cldutil.cc
|
||||
${LIBRARY_DIR}/internal/cldutil_shared.cc
|
||||
${LIBRARY_DIR}/internal/compact_lang_det.cc
|
||||
${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc
|
||||
${LIBRARY_DIR}/internal/compact_lang_det_impl.cc
|
||||
${LIBRARY_DIR}/internal/debug.cc
|
||||
${LIBRARY_DIR}/internal/fixunicodevalue.cc
|
||||
${LIBRARY_DIR}/internal/generated_entities.cc
|
||||
${LIBRARY_DIR}/internal/generated_language.cc
|
||||
${LIBRARY_DIR}/internal/generated_ulscript.cc
|
||||
${LIBRARY_DIR}/internal/getonescriptspan.cc
|
||||
${LIBRARY_DIR}/internal/lang_script.cc
|
||||
${LIBRARY_DIR}/internal/offsetmap.cc
|
||||
${LIBRARY_DIR}/internal/scoreonescriptspan.cc
|
||||
${LIBRARY_DIR}/internal/tote.cc
|
||||
${LIBRARY_DIR}/internal/utf8statetable.cc
|
||||
${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc
|
||||
${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc
|
||||
${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc
|
||||
${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc
|
||||
${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc
|
||||
${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc
|
||||
${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc
|
||||
${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
|
||||
)
|
||||
|
||||
add_library(cld2 ${SRCS})
|
||||
|
||||
target_compile_options (cld2
|
||||
PRIVATE
|
||||
-Wno-old-style-cast
|
||||
-Wno-inconsistent-missing-destructor-override
|
||||
-Wno-deprecated
|
||||
-Wno-unused-parameter
|
||||
-Wno-shadow
|
||||
-Wno-tautological-type-limit-compare
|
||||
-Wno-extra-semi
|
||||
-Wno-narrowing
|
||||
-Wl
|
||||
)
|
||||
|
||||
target_include_directories (cld2 SYSTEM PUBLIC ${LIBRARY_DIR}/public)
|
||||
endif()
|
||||
#target_link_libraries (cld2 PUBLIC ssl)
|
||||
target_compile_options (cld2
|
||||
PRIVATE
|
||||
-Wno-old-style-cast
|
||||
-Wno-inconsistent-missing-destructor-override
|
||||
-Wno-deprecated
|
||||
-Wno-unused-parameter
|
||||
-Wno-shadow
|
||||
-Wno-tautological-type-limit-compare
|
||||
-Wno-extra-semi
|
||||
-Wno-narrowing
|
||||
-Wl
|
||||
)
|
||||
|
||||
target_include_directories (cld2 PUBLIC ${LIBRARY_DIR}/public)
|
||||
|
35722
programs/server/charset_freq.txt
Normal file
35722
programs/server/charset_freq.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -908,9 +908,9 @@
|
||||
|
||||
<!-- Text classification -->
|
||||
|
||||
<encoding_frequencies_path>/ClassificationDictionaries/charset_freq.txt</encoding_frequencies_path>
|
||||
<programming_lang_frequencies_path>/ClassificationDictionaries/programming_freq.txt</programming_lang_frequencies_path>
|
||||
<emotional_dict_path>/ClassificationDictionaries/emotional_dictionary_rus.txt</emotional_dict_path>
|
||||
<encoding_frequencies_path>charset_freq.txt</encoding_frequencies_path>
|
||||
<programming_lang_frequencies_path>programming_freq.txt</programming_lang_frequencies_path>
|
||||
<emotional_dict_path>emotional_dictionary_rus.txt</emotional_dict_path>
|
||||
|
||||
<top_level_domains_lists>
|
||||
<!--
|
||||
|
288664
programs/server/emotional_dictionary_rus.txt
Normal file
288664
programs/server/emotional_dictionary_rus.txt
Normal file
File diff suppressed because it is too large
Load Diff
55532
programs/server/emotional_dictionary_rus_part.txt
Normal file
55532
programs/server/emotional_dictionary_rus_part.txt
Normal file
File diff suppressed because it is too large
Load Diff
9434
programs/server/programming_freq.txt
Normal file
9434
programs/server/programming_freq.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -35,7 +35,7 @@ public:
|
||||
{
|
||||
path_to_enc_freq = pt;
|
||||
loadEncodingsFrequency(pt);
|
||||
// loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
|
||||
// loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
|
||||
}
|
||||
|
||||
void parseEmotionalDict(const String & pt)
|
||||
|
BIN
src/Common/t
Executable file
BIN
src/Common/t
Executable file
Binary file not shown.
@ -19,7 +19,7 @@ namespace DB
|
||||
{
|
||||
|
||||
|
||||
template <size_t N>
|
||||
template <size_t N, bool detect_language>
|
||||
struct CharsetClassificationImpl
|
||||
{
|
||||
|
||||
@ -120,17 +120,26 @@ struct CharsetClassificationImpl
|
||||
std::unordered_map<UInt16, Float64> model;
|
||||
calculateStats(data.data(), data.size(), readCodePoints, model);
|
||||
|
||||
Float64 max_result = log(zero_frequency) * (model.size() + 1);
|
||||
res = "Undefined";
|
||||
Float64 max_result = 0;
|
||||
String poss_ans;
|
||||
for (const auto& item : encodings_freq)
|
||||
{
|
||||
const Float64 freq_pr = Naive_bayes(item.second, model);
|
||||
if (max_result > freq_pr)
|
||||
const Float64 score = Naive_bayes(item.second, model);
|
||||
if (max_result == 0 || max_result < score)
|
||||
{
|
||||
res = item.first;
|
||||
max_result = freq_pr;
|
||||
poss_ans = item.first;
|
||||
max_result = score;
|
||||
}
|
||||
}
|
||||
size_t sep = poss_ans.find('_');
|
||||
if (detect_language)
|
||||
{
|
||||
res = poss_ans.erase(0, sep + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
res = poss_ans.erase(sep, poss_ans.size() - sep);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -154,42 +163,40 @@ struct CharsetClassificationImpl
|
||||
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
|
||||
String str = haystack;
|
||||
|
||||
String prom;
|
||||
String poss_ans;
|
||||
|
||||
std::unordered_map<UInt16, Float64> model;
|
||||
calculateStats(str.data(), str.size(), readCodePoints, model);
|
||||
/*
|
||||
Float64 max_result = log(zero_frequency) * model.size();
|
||||
|
||||
prom = "Undefined";
|
||||
for (const auto& item : encodings_freq)
|
||||
{
|
||||
const Float64 freq_pr = Naive_bayes(item.second, model);
|
||||
if (max_result > freq_pr)
|
||||
{
|
||||
prom = item.first;
|
||||
max_result = freq_pr;
|
||||
}
|
||||
}
|
||||
*/
|
||||
std::vector<std::pair<std::string, Float64>> results;
|
||||
Float64 max_result = 0;
|
||||
for (const auto& item : encodings_freq)
|
||||
{
|
||||
results.push_back(std::make_pair(item.first, Naive_bayes(item.second, model)));
|
||||
Float64 score = Naive_bayes(item.second, model);
|
||||
if (max_result == 0 || max_result < score)
|
||||
{
|
||||
max_result = score;
|
||||
poss_ans = item.first;
|
||||
}
|
||||
}
|
||||
std::sort(results.begin(), results.end(), [](auto &left, auto &right)
|
||||
{
|
||||
return left.second > right.second;
|
||||
});
|
||||
|
||||
prom = results[0].first + " | " + results[1].first + " | " + results[2].first;
|
||||
|
||||
const auto ans = prom.c_str();
|
||||
size_t sep = poss_ans.find('_');
|
||||
String ans_str;
|
||||
if (detect_language)
|
||||
{
|
||||
ans_str = poss_ans.erase(0, sep + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
ans_str = poss_ans.erase(sep, poss_ans.size() - sep);
|
||||
}
|
||||
|
||||
const auto ans = ans_str.c_str();
|
||||
size_t cur_offset = offsets[i];
|
||||
|
||||
res_data.resize(res_offset + strlen(ans) + 1);
|
||||
memcpy(&res_data[res_offset], ans, strlen(ans));
|
||||
res_offset += strlen(ans);
|
||||
size_t ans_size = strlen(ans);
|
||||
res_data.resize(res_offset + ans_size + 1);
|
||||
memcpy(&res_data[res_offset], ans, ans_size);
|
||||
res_offset += ans_size;
|
||||
|
||||
res_data[res_offset] = 0;
|
||||
++res_offset;
|
||||
@ -205,15 +212,22 @@ struct CharsetClassificationImpl
|
||||
|
||||
struct NameCharsetDetect
|
||||
{
|
||||
static constexpr auto name = "charsetDetect";
|
||||
static constexpr auto name = "detectCharset";
|
||||
};
|
||||
|
||||
struct NameLanguageDetect
|
||||
{
|
||||
static constexpr auto name = "detectLanguage";
|
||||
};
|
||||
|
||||
|
||||
using FunctionCharsetDetect = FunctionsTextClassification<CharsetClassificationImpl<2>, NameCharsetDetect>;
|
||||
using FunctionCharsetDetect = FunctionsTextClassification<CharsetClassificationImpl<2, true>, NameCharsetDetect>;
|
||||
using FunctionLanguageDetect = FunctionsTextClassification<CharsetClassificationImpl<2, false>, NameLanguageDetect>;
|
||||
|
||||
void registerFunctionsCharsetClassification(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionCharsetDetect>();
|
||||
factory.registerFunction<FunctionLanguageDetect>();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -12,9 +12,11 @@ namespace DB
|
||||
{
|
||||
/** Functions for text classification:
|
||||
*
|
||||
* charsetDetect(string data) - detect charset of data.
|
||||
* detectCharset(string data) - detect charset of data.
|
||||
* Returns string name of most likely charset.
|
||||
* .
|
||||
* detectLanguage(string data) - detect language of data in various encodings (not UTF-8)
|
||||
*
|
||||
* getTonality(string data) - defines the emotional coloring of the text.
|
||||
* Returns NEG if text is negative, POS if text is positive or NEUT if text is neutral.
|
||||
*
|
||||
|
Loading…
Reference in New Issue
Block a user