Fix functions

This commit is contained in:
s-kat 2021-04-18 20:03:56 +03:00
parent 95980899dc
commit e4a8cd3f06
8 changed files with 297698 additions and 8956 deletions

View File

@ -48,6 +48,7 @@ add_subdirectory (murmurhash)
add_subdirectory (replxx-cmake)
add_subdirectory (unixodbc-cmake)
add_subdirectory (xz)
add_subdirectory (cld2-cmake)
add_subdirectory (poco-cmake)
add_subdirectory (croaring-cmake)

1
contrib/cld2 vendored Submodule

@ -0,0 +1 @@
Subproject commit b56fa78a2fe44ac2851bae5bf4f4693a0644da7b

View File

@ -0,0 +1,67 @@
# Build rules for the cld2 contrib library.
#
# Produces a target named `cld2` in one of two ways:
#   * an INTERFACE target wrapping a system-installed cld2, when the bundled
#     copy was not requested and both the library and its header are found;
#   * a regular library built from the sources vendored under contrib/cld2.
option (USE_INTERNAL_CLD2_LIBRARY "Use internal cld2 library" ${NOT_UNBUNDLED})

# Probe the system only when the bundled copy was NOT requested. The guard has
# to test the cld2 option declared above, not the option of another library.
if (NOT USE_INTERNAL_CLD2_LIBRARY)
    find_library (LIBRARY_CLD2 cld2)
    find_path (INCLUDE_CLD2 compact_lang_det.h)

    if (LIBRARY_CLD2 AND INCLUDE_CLD2)
        set (EXTERNAL_CLD2_LIBRARY_FOUND 1)
        add_library (cld2 INTERFACE)
        set_property (TARGET cld2 PROPERTY INTERFACE_LINK_LIBRARIES ${LIBRARY_CLD2})
        set_property (TARGET cld2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INCLUDE_CLD2})
    else ()
        set (EXTERNAL_CLD2_LIBRARY_FOUND 0)
        message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system cld2")
    endif ()
endif ()

# Fall back to the vendored sources when no usable system library was found
# (EXTERNAL_CLD2_LIBRARY_FOUND is also unset when the probe above was skipped).
if (NOT EXTERNAL_CLD2_LIBRARY_FOUND)
    set (USE_INTERNAL_CLD2_LIBRARY 1)
    set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")

    set (SRCS
        ${LIBRARY_DIR}/internal/cldutil.cc
        ${LIBRARY_DIR}/internal/cldutil_shared.cc
        ${LIBRARY_DIR}/internal/compact_lang_det.cc
        ${LIBRARY_DIR}/internal/compact_lang_det_hint_code.cc
        ${LIBRARY_DIR}/internal/compact_lang_det_impl.cc
        ${LIBRARY_DIR}/internal/debug.cc
        ${LIBRARY_DIR}/internal/fixunicodevalue.cc
        ${LIBRARY_DIR}/internal/generated_entities.cc
        ${LIBRARY_DIR}/internal/generated_language.cc
        ${LIBRARY_DIR}/internal/generated_ulscript.cc
        ${LIBRARY_DIR}/internal/getonescriptspan.cc
        ${LIBRARY_DIR}/internal/lang_script.cc
        ${LIBRARY_DIR}/internal/offsetmap.cc
        ${LIBRARY_DIR}/internal/scoreonescriptspan.cc
        ${LIBRARY_DIR}/internal/tote.cc
        ${LIBRARY_DIR}/internal/utf8statetable.cc
        ${LIBRARY_DIR}/internal/cld_generated_cjk_uni_prop_80.cc
        ${LIBRARY_DIR}/internal/cld2_generated_cjk_compatible.cc
        ${LIBRARY_DIR}/internal/cld_generated_cjk_delta_bi_4.cc
        ${LIBRARY_DIR}/internal/generated_distinct_bi_0.cc
        ${LIBRARY_DIR}/internal/cld2_generated_quadchrome_2.cc
        ${LIBRARY_DIR}/internal/cld2_generated_deltaoctachrome.cc
        ${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc
        ${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
    )

    add_library (cld2 ${SRCS})

    # Silence warnings raised by the third-party sources; PRIVATE keeps these
    # flags from propagating to targets that link against cld2.
    target_compile_options (cld2
        PRIVATE
            -Wno-old-style-cast
            -Wno-inconsistent-missing-destructor-override
            -Wno-deprecated
            -Wno-unused-parameter
            -Wno-shadow
            -Wno-tautological-type-limit-compare
            -Wno-extra-semi
            -Wno-narrowing
    )

    # SYSTEM suppresses warnings from cld2's public headers in dependents.
    target_include_directories (cld2 SYSTEM PUBLIC ${LIBRARY_DIR}/public)
endif ()
#target_link_libraries (cld2 PUBLIC ssl)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -34,22 +34,22 @@ public:
/// Records the location of the encodings-frequency file and loads it.
/// @param pt Path stored in path_to_enc_freq and handed to loadEncodingsFrequency().
void parseEncodingFrequencies(const String & pt)
{
    path_to_enc_freq = pt;
    // Load only from the caller-supplied path: an absolute path into one
    // developer's home directory does not exist on other machines and would
    // silently override whatever the caller asked for.
    loadEncodingsFrequency(pt);
}
/// Records the location of the emotional dictionary file and loads it.
/// @param pt Path stored in path_to_emo_dict and handed to loadEmotionalDict().
void parseEmotionalDict(const String & pt)
{
    path_to_emo_dict = pt;
    // Load only from the caller-supplied path: an absolute path into one
    // developer's home directory does not exist on other machines and would
    // silently override whatever the caller asked for.
    loadEmotionalDict(pt);
}
/// Records the location of the programming-frequency file and loads it.
/// @param pt Path stored in path_to_prog_freq and handed to loadProgrammingFrequency().
void parseProgrammingFrequency(const String & pt)
{
    path_to_prog_freq = pt;
    // Load only from the caller-supplied path: an absolute path into one
    // developer's home directory does not exist on other machines and would
    // silently override whatever the caller asked for.
    loadProgrammingFrequency(pt);
}
@ -103,24 +103,29 @@ public:
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
LOG_TRACE(log, "Emotional dictionary loading from {}", path_to_emotional_dict);
ReadBufferFromFile in(path_to_emotional_dict);
size_t buf_size = 10000000;
ReadBufferFromFile in(path_to_emotional_dict, buf_size);
size_t count = 0;
while (!in.eof())
{
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());
if (newline >= in.buffer().end()) { break; }
//if (newline >= in.buffer().end()) { break; }
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
in.position() = newline + 1;
if (newline >= in.buffer().end())
break;
readStringUntilWhitespace(word, buf_line);
buf_line.ignore();
readFloatText(tonality, buf_line);
emotional_dict[word] = tonality;
++count;
}
LOG_TRACE(log, "Emotional dictionary was added");
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
}
@ -134,7 +139,8 @@ public:
LOG_TRACE(log, "Programming langugages frequencies loading from {}", path_to_programming_freq);
ReadBufferFromFile in(path_to_programming_freq);
size_t buf_size = 10000000;
ReadBufferFromFile in(path_to_programming_freq, buf_size);
while (!in.eof())
{
char * newline = find_first_symbols<'\n'>(in.position(), in.buffer().end());

View File

@ -31,31 +31,33 @@ struct ProgrammingClassificationImpl
static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
std::unordered_map<String, Float64> data_freq;
String answer;
String prev_command;
String command;
ReadBufferFromMemory in(data.data(), data.size() + 1);
skipWhitespaceIfAny(in);
String prev = "";
String new_word;
while (!in.eof())
for (size_t i = 0; i < data.size();)
{
if (data.size() - (in.position() - data.data()) <= 3)
if (!isspace(data[i]))
{
break;
}
readStringUntilWhitespace(new_word, in);
skipWhitespaceIfAny(in);
command.push_back(data[i]);
++i;
if (prev == "")
{
prev = new_word;
}
while ((i < data.size()) && (!isspace(data[i]))) {
command.push_back(data[i]);
++i;
}
if (prev_command == "") {
prev_command = command;
}
else
{
data_freq[prev_command + command] += 1;
prev_command = command;
}
command = "";
}
else
{
data_freq[prev + new_word] += 1;
prev = new_word;
++i;
}
}
@ -97,32 +99,35 @@ struct ProgrammingClassificationImpl
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * haystack = reinterpret_cast<const char *>(&data[prev_offset]);
String str = haystack;
String str_data = haystack;
String buf;
String prev_command;
String command;
ReadBufferFromMemory in(str.data(), str.size() + 1);
skipWhitespaceIfAny(in);
String new_word;
String prev;
while (!in.eof())
for (size_t ind = 0; ind < str_data.size();)
{
if (str.size() - (in.position() - str.data()) <= 3)
if (!isspace(str_data[ind]))
{
break;
}
readStringUntilWhitespace(new_word, in);
skipWhitespaceIfAny(in);
command.push_back(str_data[ind]);
++ind;
if (prev == "")
{
prev = new_word;
while ((ind < str_data.size()) && (!isspace(str_data[ind]))) {
command.push_back(str_data[ind]);
++ind;
}
if (prev_command == "") {
prev_command = command;
}
else
{
data_freq[prev_command + command] += 1;
prev_command = command;
}
command = "";
}
else
{
data_freq[prev + new_word] += 1;
prev = new_word;
++ind;
}
}
@ -141,7 +146,7 @@ struct ProgrammingClassificationImpl
if (most_liked == "")
{
most_liked = "Undefined";
most_liked = "Undefined";
}
const auto ans = most_liked.c_str();

View File

@ -1,4 +1,5 @@
#include <Functions/FunctionsTextClassification.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Common/UTF8Helpers.h>
@ -15,26 +16,11 @@ struct TonalityClassificationImpl
using ResultType = String;
/// Strips leading and trailing punctuation from `word` in place.
///
/// The characters removed are: , . ! ? ) ( " ' [ ] { } : ;
/// An empty word, or one consisting only of those characters, ends up empty.
/// NOTE(review): relies on String offering the std::string search API
/// (find_first_not_of / find_last_not_of) - confirm against the String alias.
static ALWAYS_INLINE inline void word_processing(String & word)
{
    static constexpr const char * to_skip = ",.!?)(\"'[]{}:;";

    const auto first_kept = word.find_first_not_of(to_skip);
    if (first_kept == String::npos)
    {
        // Nothing but punctuation (or nothing at all): calling back()/front()
        // on an emptied string would be undefined behaviour, so stop here.
        word.clear();
        return;
    }

    // Trim the tail first so that the index of the first kept character stays valid.
    const auto last_kept = word.find_last_not_of(to_skip);
    word.erase(last_kept + 1);
    word.erase(0, first_kept);
}
/// Maps a numeric tonality level to a text label.
///
/// @param tonality_level Level to classify.
/// @return "NEG" when the level is below 0.25, "POS" when it is above 0.5,
///         and "NEUT" otherwise. A NaN level compares false against both
///         thresholds and therefore also yields "NEUT".
static String get_tonality(const Float64 & tonality_level)
{
    // A single pair of thresholds: stacking a second, wider pair in front of
    // these would make the 0.25 check unreachable.
    if (tonality_level < 0.25)
        return "NEG";
    if (tonality_level > 0.5)
        return "POS";
    return "NEUT";
}
@ -42,34 +28,38 @@ struct TonalityClassificationImpl
{
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
Float64 freq = 0;
Float64 weight = 0;
Float64 count_words = 0;
String answer;
String word;
ReadBufferFromMemory in(data.data(), data.size() + 1);
skipWhitespaceIfAny(in);
String to_check;
while (!in.eof())
for (size_t i = 0; i < data.size();)
{
if (data.size() - (in.position() - data.data()) <= 3) {
break;
}
readStringUntilWhitespace(to_check, in);
skipWhitespaceIfAny(in);
word_processing(to_check);
if (emotional_dict.find(to_check) != emotional_dict.cend())
if (!isASCII(data[i]))
{
count_words += 1;
freq += emotional_dict[to_check];
}
word.push_back(data[i]);
++i;
while ((i < data.size()) && (!isASCII(data[i]))) {
word.push_back(data[i]);
++i;
}
if (emotional_dict.find(word) != emotional_dict.cend())
{
count_words += 1;
weight += emotional_dict[word];
}
word = "";
}
else
{
++i;
}
}
Float64 total_tonality = freq / count_words;
res = get_tonality(total_tonality);
Float64 total_tonality = weight / count_words;
res += get_tonality(total_tonality);
}
@ -94,29 +84,38 @@ struct TonalityClassificationImpl
String buf;
Float64 freq = 0;
Float64 weight = 0;
Float64 count_words = 0;
ReadBufferFromMemory in(str.data(), str.size() + 1);
String answer;
String word;
skipWhitespaceIfAny(in);
String to_check;
while (!in.eof())
for (size_t ind = 0; ind < str.size();)
{
if (str.size() - (in.position() - str.data()) <= 3) {
break;
}
readStringUntilWhitespace(to_check, in);
skipWhitespaceIfAny(in);
if (emotional_dict.find(to_check) != emotional_dict.cend())
if (!isASCII(str[ind]))
{
count_words += 1;
freq += emotional_dict[to_check];
word.push_back(str[ind]);
++ind;
while ((ind < str.size()) && (!isASCII(str[ind]))) {
word.push_back(str[ind]);
++ind;
}
if (emotional_dict.find(word) != emotional_dict.cend())
{
count_words += 1;
weight += emotional_dict[word];
}
word = "";
}
else
{
++ind;
}
}
Float64 total_tonality = freq / count_words;
Float64 total_tonality = weight / count_words;
buf = get_tonality(total_tonality);
const auto ans = buf.c_str();