mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
New features draft
This commit is contained in:
parent
3997bfe8de
commit
62f8b8f0c8
1
contrib/CMakeLists.txt
vendored
1
contrib/CMakeLists.txt
vendored
@ -48,6 +48,7 @@ add_subdirectory (murmurhash)
|
||||
add_subdirectory (replxx-cmake)
|
||||
add_subdirectory (unixodbc-cmake)
|
||||
add_subdirectory (nanodbc-cmake)
|
||||
add_subdirectory (cld2-cmake)
|
||||
|
||||
if (USE_INTERNAL_XZ_LIBRARY)
|
||||
add_subdirectory (xz)
|
||||
|
@ -1,6 +1,5 @@
|
||||
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")
|
||||
|
||||
|
||||
set (SRCS
|
||||
${LIBRARY_DIR}/internal/cldutil.cc
|
||||
${LIBRARY_DIR}/internal/cldutil_shared.cc
|
||||
@ -27,10 +26,7 @@ set (SRCS
|
||||
${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc
|
||||
${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
|
||||
)
|
||||
|
||||
add_library(cld2 SHARED ${SRCS})
|
||||
target_compile_options (cld2 PRIVATE -Wl -Wno-narrowing)
|
||||
|
||||
|
||||
target_link_libraries (cld2 PRIVATE ${LIBRARY_DIR}/internal/libcld2.so)
|
||||
add_library(cld2 ${SRCS})
|
||||
set_property(TARGET cld2 PROPERTY POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_options (cld2 PRIVATE -Wno-reserved-id-macro -Wno-c++11-narrowing)
|
||||
target_include_directories(cld2 PUBLIC "${LIBRARY_DIR}/public")
|
||||
|
@ -701,20 +701,23 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
||||
|
||||
/// encoding frequencies
|
||||
{
|
||||
const std::string & encode_frequency_path = config().getString("encoding_frequencies_path", path);
|
||||
FrequencyHolder::getInstance().parseEncodingFrequencies(encode_frequency_path);
|
||||
///const std::string & encode_frequency_path = config().getString("encoding_frequencies_path", path);
|
||||
///FrequencyHolder::getInstance().parseEncodingFrequencies(encode_frequency_path);
|
||||
FrequencyHolder::getInstance().parseEncodingFrequencies(path + "/charset_freq.txt");
|
||||
}
|
||||
|
||||
/// programming languages frequencies
|
||||
{
|
||||
const std::string & programming_frequency_path = config().getString("programming_lang_frequencies_path", path);
|
||||
FrequencyHolder::getInstance().parseProgrammingFrequency(programming_frequency_path);
|
||||
///const std::string & programming_frequency_path = config().getString("programming_lang_frequencies_path", path);
|
||||
///FrequencyHolder::getInstance().parseProgrammingFrequency(programming_frequency_path);
|
||||
FrequencyHolder::getInstance().parseEncodingFrequencies(path + "/prog_freq.txt");
|
||||
}
|
||||
|
||||
/// emotional dictionary
|
||||
{
|
||||
const std::string & emotional_dict_path = config().getString("emotional_dict_path", path);
|
||||
FrequencyHolder::getInstance().parseEmotionalDict(emotional_dict_path);
|
||||
///const std::string & emotional_dict_path = config().getString("emotional_dict_path", path);
|
||||
///FrequencyHolder::getInstance().parseEmotionalDict(emotional_dict_path);
|
||||
FrequencyHolder::getInstance().parseEncodingFrequencies(path + "/emotional_dictionary_rus.txt");
|
||||
}
|
||||
|
||||
|
||||
|
@ -314,7 +314,6 @@ target_link_libraries(clickhouse_common_io
|
||||
)
|
||||
|
||||
|
||||
|
||||
if (USE_RDKAFKA)
|
||||
dbms_target_link_libraries(PRIVATE ${CPPKAFKA_LIBRARY} ${RDKAFKA_LIBRARY})
|
||||
if(NOT USE_INTERNAL_RDKAFKA_LIBRARY)
|
||||
@ -348,6 +347,7 @@ dbms_target_link_libraries (
|
||||
clickhouse_dictionaries_embedded
|
||||
clickhouse_parsers
|
||||
lz4
|
||||
cld2
|
||||
Poco::JSON
|
||||
Poco::MongoDB
|
||||
string_utils
|
||||
@ -516,3 +516,4 @@ if (ENABLE_TESTS AND USE_GTEST)
|
||||
|
||||
add_check(unit_tests_dbms)
|
||||
endif ()
|
||||
|
||||
|
@ -44,7 +44,7 @@ public:
|
||||
//loadEmotionalDict("/home/sergey/ClickHouse/programs/server/emotional_dictionary_rus.txt");
|
||||
}
|
||||
|
||||
void parseProgrammingFrequency(const String & pt)
|
||||
void parseProgrammingFrequency(const String & pt)
|
||||
{
|
||||
path_to_prog_freq = pt;
|
||||
loadProgrammingFrequency(pt);
|
||||
@ -154,15 +154,14 @@ public:
|
||||
{
|
||||
ReadBufferFromMemory bufline(in.position() + 3, newline - in.position());
|
||||
readString(programming_language, bufline);
|
||||
LOG_TRACE(log, "Loading {}", programming_language);
|
||||
} else
|
||||
}
|
||||
else
|
||||
{
|
||||
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
|
||||
readStringUntilWhitespace(bigram, buf_line);
|
||||
buf_line.ignore();
|
||||
readFloatText(frequency, buf_line);
|
||||
programming_freq[programming_language][bigram] = frequency;
|
||||
LOG_TRACE(log, "Word {}", bigram);
|
||||
}
|
||||
in.position() = newline + 1;
|
||||
}
|
||||
|
@ -4,7 +4,8 @@
|
||||
#include <Common/UTF8Helpers.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
|
||||
#include "/home/sergey/ClickHouse/contrib/cld2/public/compact_lang_det.h"
|
||||
//#include <cld2/compact_lang_det.h>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
@ -116,7 +117,7 @@ struct CharsetClassificationImpl
|
||||
return len;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void constant(String data, String & res)
|
||||
{
|
||||
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
||||
@ -142,7 +143,7 @@ struct CharsetClassificationImpl
|
||||
* If we need to find language of data, we return <Language>
|
||||
* If we need to find charset of data, we return <Charset>.
|
||||
*/
|
||||
|
||||
|
||||
size_t sep = poss_ans.find('_');
|
||||
if (detect_language)
|
||||
{
|
||||
@ -190,10 +191,10 @@ struct CharsetClassificationImpl
|
||||
poss_ans = item.first;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
size_t sep = poss_ans.find('_');
|
||||
String ans_str;
|
||||
|
||||
|
||||
if (detect_language)
|
||||
{
|
||||
ans_str = poss_ans.erase(0, sep + 1);
|
||||
|
118
src/Functions/FunctionsLanguageClassification.cpp
Normal file
118
src/Functions/FunctionsLanguageClassification.cpp
Normal file
@ -0,0 +1,118 @@
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include "/home/sergey/ClickHouse/contrib/cld2/public/compact_lang_det.h"
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
/* Determine language of Unicode UTF-8 text.
|
||||
* Uses the cld2 library https://github.com/CLD2Owners/cld2
|
||||
*/
|
||||
template <bool mixed>
|
||||
struct LanguageClassificationImpl
|
||||
{
|
||||
|
||||
using ResultType = String;
|
||||
|
||||
|
||||
static void constant(String data, String & res)
|
||||
{
|
||||
bool is_UTF8 = true;
|
||||
const char * str = data.c_str();
|
||||
if (!mixed)
|
||||
{
|
||||
String ans(LanguageName(CLD2::DetectLanguage(str, strlen(str), true, &is_UTF8)));
|
||||
res = ans;
|
||||
} else {
|
||||
CLD2::Language result_lang_top3[3];
|
||||
int pc[3];
|
||||
int bytes[3];
|
||||
CLD2::DetectLanguageSummary(str, strlen(str), true, result_lang_top3, pc, bytes, &is_UTF8);
|
||||
|
||||
String lang1(LanguageName(result_lang_top3[0]));
|
||||
String lang2(LanguageName(result_lang_top3[1]));
|
||||
String lang3(LanguageName(result_lang_top3[2]));
|
||||
res = lang1 + " " + std::to_string(pc[0]) + "% | ";
|
||||
res += lang2 + " " + std::to_string(pc[1]) + "% | ";
|
||||
res += lang3 + " " + std::to_string(pc[2]) + "%";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
ColumnString::Chars & res_data,
|
||||
ColumnString::Offsets & res_offsets)
|
||||
{
|
||||
res_data.reserve(1024);
|
||||
res_offsets.resize(offsets.size());
|
||||
|
||||
size_t prev_offset = 0;
|
||||
size_t res_offset = 0;
|
||||
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
const char * str = reinterpret_cast<const char *>(&data[prev_offset]);
|
||||
const char * ans;
|
||||
bool is_UTF8 = true;
|
||||
if (!mixed)
|
||||
{
|
||||
ans = LanguageName(CLD2::DetectLanguage(str, strlen(str), true, &is_UTF8));
|
||||
}
|
||||
else
|
||||
{
|
||||
String top3;
|
||||
CLD2::Language result_lang_top3[3];
|
||||
int pc[3];
|
||||
int bytes[3];
|
||||
CLD2::DetectLanguageSummary(str, strlen(str), true, result_lang_top3, pc, bytes, &is_UTF8);
|
||||
|
||||
String lang1(LanguageName(result_lang_top3[0]));
|
||||
String lang2(LanguageName(result_lang_top3[1]));
|
||||
String lang3(LanguageName(result_lang_top3[2]));
|
||||
top3 = lang1 + " " + std::to_string(pc[0]) + "% | ";
|
||||
top3 += lang2 + " " + std::to_string(pc[1]) + "% | ";
|
||||
top3 += lang3 + " " + std::to_string(pc[2]) + "%";
|
||||
ans = top3.c_str();
|
||||
}
|
||||
size_t cur_offset = offsets[i];
|
||||
|
||||
size_t ans_size = strlen(ans);
|
||||
res_data.resize(res_offset + ans_size + 1);
|
||||
memcpy(&res_data[res_offset], ans, ans_size);
|
||||
res_offset += ans_size;
|
||||
|
||||
res_data[res_offset] = 0;
|
||||
++res_offset;
|
||||
|
||||
res_offsets[i] = res_offset;
|
||||
prev_offset = cur_offset;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
/// Name tag for the function that reports a single detected language.
struct NameLanguageUTF8Detect
{
    static constexpr const char * name = "detectLanguageUTF8";
};
|
||||
|
||||
/// Name tag for the function that reports the top three detected languages.
struct NameLanguageMixedUTF8Detect
{
    static constexpr const char * name = "detectLanguageMixedUTF8";
};
|
||||
|
||||
|
||||
using FunctionLanguageUTF8Detect = FunctionsTextClassification<LanguageClassificationImpl<false>, NameLanguageUTF8Detect>;
|
||||
using FunctionLanguageMixedUTF8Detect = FunctionsTextClassification<LanguageClassificationImpl<true>, NameLanguageMixedUTF8Detect>;
|
||||
|
||||
void registerFunctionLanguageDetectUTF8(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionLanguageUTF8Detect>();
|
||||
factory.registerFunction<FunctionLanguageMixedUTF8Detect>();
|
||||
}
|
||||
|
||||
}
|
@ -33,7 +33,7 @@ struct ProgrammingClassificationImpl
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static void constant(String data, String & res)
|
||||
{
|
||||
static std::unordered_map<String, std::unordered_map<String, Float64>> programming_freq = FrequencyHolder::getInstance().getProgrammingFrequency();
|
||||
@ -66,7 +66,7 @@ struct ProgrammingClassificationImpl
|
||||
prev_command = command;
|
||||
}
|
||||
command = "";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
++i;
|
||||
@ -82,7 +82,7 @@ struct ProgrammingClassificationImpl
|
||||
if (result > max_result)
|
||||
{
|
||||
max_result = result;
|
||||
most_liked = item.first;
|
||||
most_liked = item.first;
|
||||
}
|
||||
}
|
||||
/// If all weights are zero, then we assume that the language is undefined
|
||||
@ -141,7 +141,7 @@ struct ProgrammingClassificationImpl
|
||||
prev_command = command;
|
||||
}
|
||||
command = "";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
++ind;
|
||||
@ -157,7 +157,7 @@ struct ProgrammingClassificationImpl
|
||||
if (result > max_result)
|
||||
{
|
||||
max_result = result;
|
||||
most_liked = item.first;
|
||||
most_liked = item.first;
|
||||
}
|
||||
}
|
||||
/// If all weights are zero, then we assume that the language is undefined
|
||||
|
@ -26,8 +26,8 @@ struct TonalityClassificationImpl
|
||||
if (tonality_level < 0.25) { return "NEG"; }
|
||||
if (tonality_level > 0.5) { return "POS"; }
|
||||
return "NEUT";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void constant(String data, String & res)
|
||||
{
|
||||
static std::unordered_map<String, Float64> emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
||||
@ -58,7 +58,7 @@ struct TonalityClassificationImpl
|
||||
weight += emotional_dict[word];
|
||||
}
|
||||
word = "";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
++i;
|
||||
|
@ -36,6 +36,7 @@ void registerFunctionsStringSimilarity(FunctionFactory &);
|
||||
void registerFunctionsCharsetClassification(FunctionFactory &);
|
||||
void registerFunctionsTonalityClassification(FunctionFactory &);
|
||||
void registerFunctionsProgrammingClassification(FunctionFactory &);
|
||||
void registerFunctionLanguageDetectUTF8(FunctionFactory &);
|
||||
void registerFunctionsURL(FunctionFactory &);
|
||||
void registerFunctionsVisitParam(FunctionFactory &);
|
||||
void registerFunctionsMath(FunctionFactory &);
|
||||
@ -98,6 +99,7 @@ void registerFunctions()
|
||||
registerFunctionsCharsetClassification(factory);
|
||||
registerFunctionsTonalityClassification(factory);
|
||||
registerFunctionsProgrammingClassification(factory);
|
||||
registerFunctionLanguageDetectUTF8(factory);
|
||||
registerFunctionsURL(factory);
|
||||
registerFunctionsVisitParam(factory);
|
||||
registerFunctionsMath(factory);
|
||||
|
Loading…
Reference in New Issue
Block a user