mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Fix functions
This commit is contained in:
parent
95980899dc
commit
e4a8cd3f06
1
contrib/CMakeLists.txt
vendored
1
contrib/CMakeLists.txt
vendored
@ -48,6 +48,7 @@ add_subdirectory (murmurhash)
|
||||
add_subdirectory (replxx-cmake)
|
||||
add_subdirectory (unixodbc-cmake)
|
||||
add_subdirectory (xz)
|
||||
add_subdirectory (cld2-cmake)
|
||||
|
||||
add_subdirectory (poco-cmake)
|
||||
add_subdirectory (croaring-cmake)
|
||||
|
1
contrib/cld2
vendored
Submodule
1
contrib/cld2
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit b56fa78a2fe44ac2851bae5bf4f4693a0644da7b
|
67
contrib/cld2-cmake/CMakeLists.txt
Normal file
67
contrib/cld2-cmake/CMakeLists.txt
Normal file
@ -0,0 +1,67 @@
|
||||
# Build configuration for the cld2 library.
#
# Defines a `cld2` target in one of two ways:
#   * an INTERFACE target wrapping a system-installed cld2, when the bundled copy
#     was not requested and both the library and its public header are found;
#   * otherwise, a library compiled from the sources under contrib/cld2.
option (USE_INTERNAL_CLD2_LIBRARY "Use internal cld2 library" ${NOT_UNBUNDLED})

# Stays 0 unless a usable system cld2 is located below, so the fallback check
# never depends on an undefined variable.
set (EXTERNAL_CLD2_LIBRARY_FOUND 0)

# Probe the system only when the bundled cld2 copy was not requested.
if (NOT USE_INTERNAL_CLD2_LIBRARY)
    find_library (LIBRARY_CLD2 cld2)
    find_path (INCLUDE_CLD2 compact_lang_det.h)

    if (LIBRARY_CLD2 AND INCLUDE_CLD2)
        set (EXTERNAL_CLD2_LIBRARY_FOUND 1)
        # Wrap the system library so that consumers link against `cld2` either way.
        add_library (cld2 INTERFACE)
        target_link_libraries (cld2 INTERFACE "${LIBRARY_CLD2}")
        target_include_directories (cld2 INTERFACE "${INCLUDE_CLD2}")
    else ()
        message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system cld2")
    endif ()
endif ()

# Fall back to the bundled sources when no system library is in use.
if (NOT EXTERNAL_CLD2_LIBRARY_FOUND)
    # Reflect the fallback in the option's value for the rest of this scope.
    set (USE_INTERNAL_CLD2_LIBRARY 1)
    set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")

    set (SRCS
        "${LIBRARY_DIR}/internal/cldutil.cc"
        "${LIBRARY_DIR}/internal/cldutil_shared.cc"
        "${LIBRARY_DIR}/internal/compact_lang_det.cc"
        "${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc"
        "${LIBRARY_DIR}/internal/compact_lang_det_impl.cc"
        "${LIBRARY_DIR}/internal/debug.cc"
        "${LIBRARY_DIR}/internal/fixunicodevalue.cc"
        "${LIBRARY_DIR}/internal/generated_entities.cc"
        "${LIBRARY_DIR}/internal/generated_language.cc"
        "${LIBRARY_DIR}/internal/generated_ulscript.cc"
        "${LIBRARY_DIR}/internal/getonescriptspan.cc"
        "${LIBRARY_DIR}/internal/lang_script.cc"
        "${LIBRARY_DIR}/internal/offsetmap.cc"
        "${LIBRARY_DIR}/internal/scoreonescriptspan.cc"
        "${LIBRARY_DIR}/internal/tote.cc"
        "${LIBRARY_DIR}/internal/utf8statetable.cc"
        "${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc"
        "${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc"
        "${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc"
        "${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc"
        "${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc"
        "${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc"
        "${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc"
        "${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc"
    )

    add_library (cld2 ${SRCS})

    # Silence these warning categories for the third-party sources only;
    # PRIVATE keeps the flags from propagating to targets that link cld2.
    target_compile_options (cld2
        PRIVATE
            -Wno-old-style-cast
            -Wno-inconsistent-missing-destructor-override
            -Wno-deprecated
            -Wno-unused-parameter
            -Wno-shadow
            -Wno-tautological-type-limit-compare
            -Wno-extra-semi
            -Wno-narrowing
    )

    # SYSTEM so that consumers do not get warnings from the cld2 public headers.
    target_include_directories (cld2 SYSTEM PUBLIC "${LIBRARY_DIR}/public")
endif ()
|
||||
|
File diff suppressed because it is too large
Load Diff
55532
src/Common/ClassificationDictionaries/emotional_dictionary_rus_part.txt
Normal file
55532
src/Common/ClassificationDictionaries/emotional_dictionary_rus_part.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -34,22 +34,22 @@ public:
|
||||
void parseEncodingFrequencies(const String & pt)
|
||||
{
|
||||
path_to_enc_freq = pt;
|
||||
loadEncodingsFrequency(pt);
|
||||
//loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
|
||||
//loadEncodingsFrequency(pt);
|
||||
loadEncodingsFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/charset_freq.txt");
|
||||
}
|
||||
|
||||
void parseEmotionalDict(const String & pt)
|
||||
{
|
||||
path_to_emo_dict = pt;
|
||||
loadEmotionalDict(pt);
|
||||
//loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
||||
//loadEmotionalDict(pt);
|
||||
loadEmotionalDict("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/emotional_dictionary_rus.txt");
|
||||
}
|
||||
|
||||
void parseProgrammingFrequency(const String & pt)
|
||||
{
|
||||
path_to_prog_freq = pt;
|
||||
loadProgrammingFrequency(pt);
|
||||
//loadProgrammingFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/programming_freq.txt");
|
||||
//loadProgrammingFrequency(pt);
|
||||
loadProgrammingFrequency("/home/sergey/ClickHouse/src/Common/ClassificationDictionaries/programming_freq.txt");
|
||||
}
|
||||
|
||||
|
||||
@ -103,24 +103,29 @@ public:
|
||||
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
|
||||
LOG_TRACE(log, "Emotional dictionary loading from {}", path_to_emotional_dict);
|
||||
|
||||
ReadBufferFromFile in(path_to_emotional_dict);
|
||||
size_t buf_size = 10000000;
|
||||
ReadBufferFromFile in(path_to_emotional_dict, buf_size);
|
||||
size_t count = 0;
|
||||
while (!in.eof())
|
||||
{
|
||||
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
||||
|
||||
if (newline >= in.buffer().end()) { break; }
|
||||
//if (newline >= in.buffer().end()) { break; }
|
||||
|
||||
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
|
||||
in.position() = newline + 1;
|
||||
|
||||
if (newline >= in.buffer().end())
|
||||
break;
|
||||
|
||||
readStringUntilWhitespace(word, buf_line);
|
||||
buf_line.ignore();
|
||||
readFloatText(tonality, buf_line);
|
||||
|
||||
emotional_dict[word] = tonality;
|
||||
|
||||
++count;
|
||||
}
|
||||
LOG_TRACE(log, "Emotional dictionary was added");
|
||||
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
|
||||
}
|
||||
|
||||
|
||||
@ -134,7 +139,8 @@ public:
|
||||
|
||||
LOG_TRACE(log, "Programming langugages frequencies loading from {}", path_to_programming_freq);
|
||||
|
||||
ReadBufferFromFile in(path_to_programming_freq);
|
||||
size_t buf_size = 10000000;
|
||||
ReadBufferFromFile in(path_to_programming_freq, buf_size);
|
||||
while (!in.eof())
|
||||
{
|
||||
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
|
||||
|
@ -31,31 +31,33 @@ struct ProgrammingClassificationImpl
|
||||
static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
|
||||
std::unordered_map<String, Float64> data_freq;
|
||||
|
||||
String answer;
|
||||
String prev_command;
|
||||
String command;
|
||||
|
||||
ReadBufferFromMemory in(data.data(), data.size() + 1);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
String prev = "";
|
||||
String new_word;
|
||||
|
||||
while (!in.eof())
|
||||
for (size_t i = 0; i < data.size();)
|
||||
{
|
||||
if (data.size() - (in.position() - data.data()) <= 3)
|
||||
if (!isspace(data[i]))
|
||||
{
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(new_word, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
command.push_back(data[i]);
|
||||
++i;
|
||||
|
||||
if (prev == "")
|
||||
{
|
||||
prev = new_word;
|
||||
}
|
||||
while ((i < data.size()) && (!isspace(data[i]))) {
|
||||
command.push_back(data[i]);
|
||||
++i;
|
||||
}
|
||||
if (prev_command == "") {
|
||||
prev_command = command;
|
||||
}
|
||||
else
|
||||
{
|
||||
data_freq[prev_command + command] += 1;
|
||||
prev_command = command;
|
||||
}
|
||||
command = "";
|
||||
}
|
||||
else
|
||||
{
|
||||
data_freq[prev + new_word] += 1;
|
||||
prev = new_word;
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
@ -97,32 +99,35 @@ struct ProgrammingClassificationImpl
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
|
||||
String str = haystack;
|
||||
String str_data = haystack;
|
||||
|
||||
String buf;
|
||||
String prev_command;
|
||||
String command;
|
||||
|
||||
ReadBufferFromMemory in(str.data(), str.size() + 1);
|
||||
|
||||
skipWhitespaceIfAny(in);
|
||||
String new_word;
|
||||
String prev;
|
||||
while (!in.eof())
|
||||
for (size_t ind = 0; ind < str_data.size();)
|
||||
{
|
||||
if (str.size() - (in.position() - str.data()) <= 3)
|
||||
if (!isspace(str_data[ind]))
|
||||
{
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(new_word, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
command.push_back(str_data[ind]);
|
||||
++ind;
|
||||
|
||||
if (prev == "")
|
||||
{
|
||||
prev = new_word;
|
||||
while ((ind < str_data.size()) && (!isspace(str_data[ind]))) {
|
||||
command.push_back(str_data[ind]);
|
||||
++ind;
|
||||
}
|
||||
if (prev_command == "") {
|
||||
prev_command = command;
|
||||
}
|
||||
else
|
||||
{
|
||||
data_freq[prev_command + command] += 1;
|
||||
prev_command = command;
|
||||
}
|
||||
command = "";
|
||||
}
|
||||
else
|
||||
{
|
||||
data_freq[prev + new_word] += 1;
|
||||
prev = new_word;
|
||||
++ind;
|
||||
}
|
||||
}
|
||||
|
||||
@ -141,7 +146,7 @@ struct ProgrammingClassificationImpl
|
||||
|
||||
if (most_liked == "")
|
||||
{
|
||||
most_liked = "Undefined";
|
||||
most_liked = "Undefined";
|
||||
}
|
||||
|
||||
const auto ans = most_liked.c_str();
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Common/UTF8Helpers.h>
|
||||
@ -15,26 +16,11 @@ struct TonalityClassificationImpl
|
||||
|
||||
using ResultType = String;
|
||||
|
||||
|
||||
/// Trims punctuation characters from both ends of `word`, in place.
/// A word that is empty or consists only of such characters ends up empty.
static ALWAYS_INLINE inline void word_processing(String & word)
{
    /// Characters stripped from the edges of a word; built once and reused across calls.
    static const std::set<char> to_skip {',', '.', '!', '?', ')', '(', '\"', '\'', '[', ']', '{', '}', ':', ';'};

    /// The emptiness check matters: calling back() on an empty string is undefined behaviour.
    while (!word.empty() && to_skip.find(word.back()) != to_skip.end())
    {
        word.pop_back();
    }

    /// Count the leading punctuation first so that it is removed with a single erase.
    size_t prefix_len = 0;
    while (prefix_len < word.size() && to_skip.find(word[prefix_len]) != to_skip.end())
    {
        ++prefix_len;
    }
    word.erase(0, prefix_len);
}
|
||||
|
||||
static String get_tonality(const Float64 & tonality_level)
|
||||
{
|
||||
if (tonality_level < 0.5) { return "NEG"; }
|
||||
if (tonality_level > 1) { return "POS"; }
|
||||
if (tonality_level < 0.25) { return "NEG"; }
|
||||
if (tonality_level > 0.5) { return "POS"; }
|
||||
return "NEUT";
|
||||
}
|
||||
|
||||
@ -42,34 +28,38 @@ struct TonalityClassificationImpl
|
||||
{
|
||||
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
||||
|
||||
Float64 freq = 0;
|
||||
Float64 weight = 0;
|
||||
Float64 count_words = 0;
|
||||
|
||||
String answer;
|
||||
String word;
|
||||
|
||||
ReadBufferFromMemory in(data.data(), data.size() + 1);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
String to_check;
|
||||
while (!in.eof())
|
||||
for (size_t i = 0; i < data.size();)
|
||||
{
|
||||
if (data.size() - (in.position() - data.data()) <= 3) {
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(to_check, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
word_processing(to_check);
|
||||
|
||||
|
||||
if (emotional_dict.find(to_check) != emotional_dict.cend())
|
||||
if (!isASCII(data[i]))
|
||||
{
|
||||
count_words += 1;
|
||||
freq += emotional_dict[to_check];
|
||||
}
|
||||
word.push_back(data[i]);
|
||||
++i;
|
||||
|
||||
while ((i < data.size()) && (!isASCII(data[i]))) {
|
||||
word.push_back(data[i]);
|
||||
++i;
|
||||
}
|
||||
if (emotional_dict.find(word) != emotional_dict.cend())
|
||||
{
|
||||
count_words += 1;
|
||||
weight += emotional_dict[word];
|
||||
}
|
||||
word = "";
|
||||
}
|
||||
else
|
||||
{
|
||||
++i;
|
||||
}
|
||||
}
|
||||
Float64 total_tonality = freq / count_words;
|
||||
res = get_tonality(total_tonality);
|
||||
|
||||
Float64 total_tonality = weight / count_words;
|
||||
res += get_tonality(total_tonality);
|
||||
}
|
||||
|
||||
|
||||
@ -94,29 +84,38 @@ struct TonalityClassificationImpl
|
||||
|
||||
String buf;
|
||||
|
||||
Float64 freq = 0;
|
||||
Float64 weight = 0;
|
||||
Float64 count_words = 0;
|
||||
|
||||
|
||||
ReadBufferFromMemory in(str.data(), str.size() + 1);
|
||||
String answer;
|
||||
String word;
|
||||
|
||||
skipWhitespaceIfAny(in);
|
||||
String to_check;
|
||||
while (!in.eof())
|
||||
for (size_t ind = 0; ind < str.size();)
|
||||
{
|
||||
if (str.size() - (in.position() - str.data()) <= 3) {
|
||||
break;
|
||||
}
|
||||
readStringUntilWhitespace(to_check, in);
|
||||
skipWhitespaceIfAny(in);
|
||||
|
||||
if (emotional_dict.find(to_check) != emotional_dict.cend())
|
||||
if (!isASCII(str[ind]))
|
||||
{
|
||||
count_words += 1;
|
||||
freq += emotional_dict[to_check];
|
||||
word.push_back(str[ind]);
|
||||
++ind;
|
||||
|
||||
while ((ind < str.size()) && (!isASCII(str[ind]))) {
|
||||
word.push_back(str[ind]);
|
||||
++ind;
|
||||
}
|
||||
if (emotional_dict.find(word) != emotional_dict.cend())
|
||||
{
|
||||
count_words += 1;
|
||||
weight += emotional_dict[word];
|
||||
}
|
||||
word = "";
|
||||
}
|
||||
else
|
||||
{
|
||||
++ind;
|
||||
}
|
||||
}
|
||||
Float64 total_tonality = freq / count_words;
|
||||
|
||||
Float64 total_tonality = weight / count_words;
|
||||
buf = get_tonality(total_tonality);
|
||||
|
||||
const auto ans = buf.c_str();
|
||||
|
Loading…
Reference in New Issue
Block a user