New features draft

This commit is contained in:
s-kat 2021-05-23 19:39:40 +03:00
parent 3997bfe8de
commit 62f8b8f0c8
10 changed files with 152 additions and 31 deletions

View File

@ -48,6 +48,7 @@ add_subdirectory (murmurhash)
add_subdirectory (replxx-cmake)
add_subdirectory (unixodbc-cmake)
add_subdirectory (nanodbc-cmake)
add_subdirectory (cld2-cmake)
if (USE_INTERNAL_XZ_LIBRARY)
add_subdirectory (xz)

View File

@ -1,6 +1,5 @@
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/cld2")
set (SRCS
${LIBRARY_DIR}/internal/cldutil.cc
${LIBRARY_DIR}/internal/cldutil_shared.cc
@ -27,10 +26,7 @@ set (SRCS
${LIBRARY_DIR}/internal/cld2_generated_distinctoctachrome.cc
${LIBRARY_DIR}/internal/cld_generated_score_quad_octa_2.cc
)
add_library(cld2 SHARED ${SRCS})
target_compile_options (cld2 PRIVATE -Wl -Wno-narrowing)
target_link_libraries (cld2 PRIVATE ${LIBRARY_DIR}/internal/libcld2.so)
add_library(cld2 ${SRCS})
set_property(TARGET cld2 PROPERTY POSITION_INDEPENDENT_CODE ON)
target_compile_options (cld2 PRIVATE -Wno-reserved-id-macro -Wno-c++11-narrowing)
target_include_directories(cld2 PUBLIC "${LIBRARY_DIR}/public")

View File

@ -701,20 +701,23 @@ int Server::main(const std::vector<std::string> & /*args*/)
/// encoding frequencies
{
const std::string & encode_frequency_path = config().getString("encoding_frequencies_path", path);
FrequencyHolder::getInstance().parseEncodingFrequencies(encode_frequency_path);
///const std::string & encode_frequency_path = config().getString("encoding_frequencies_path", path);
///FrequencyHolder::getInstance().parseEncodingFrequencies(encode_frequency_path);
FrequencyHolder::getInstance().parseEncodingFrequencies(path + "/charset_freq.txt");
}
/// programming languages frequencies
{
const std::string & programming_frequency_path = config().getString("programming_lang_frequencies_path", path);
FrequencyHolder::getInstance().parseProgrammingFrequency(programming_frequency_path);
///const std::string & programming_frequency_path = config().getString("programming_lang_frequencies_path", path);
///FrequencyHolder::getInstance().parseProgrammingFrequency(programming_frequency_path);
/// The programming-language table has its own parser; parseEncodingFrequencies is only for the charset table.
FrequencyHolder::getInstance().parseProgrammingFrequency(path + "/prog_freq.txt");
}
/// emotional dictionary
{
const std::string & emotional_dict_path = config().getString("emotional_dict_path", path);
FrequencyHolder::getInstance().parseEmotionalDict(emotional_dict_path);
///const std::string & emotional_dict_path = config().getString("emotional_dict_path", path);
///FrequencyHolder::getInstance().parseEmotionalDict(emotional_dict_path);
/// The emotional dictionary has its own parser; parseEncodingFrequencies is only for the charset table.
FrequencyHolder::getInstance().parseEmotionalDict(path + "/emotional_dictionary_rus.txt");
}

View File

@ -314,7 +314,6 @@ target_link_libraries(clickhouse_common_io
)
if (USE_RDKAFKA)
dbms_target_link_libraries(PRIVATE ${CPPKAFKA_LIBRARY} ${RDKAFKA_LIBRARY})
if(NOT USE_INTERNAL_RDKAFKA_LIBRARY)
@ -348,6 +347,7 @@ dbms_target_link_libraries (
clickhouse_dictionaries_embedded
clickhouse_parsers
lz4
cld2
Poco::JSON
Poco::MongoDB
string_utils
@ -516,3 +516,4 @@ if (ENABLE_TESTS AND USE_GTEST)
add_check(unit_tests_dbms)
endif ()

View File

@ -154,15 +154,14 @@ public:
{
ReadBufferFromMemory bufline(in.position() + 3, newline - in.position());
readString(programming_language, bufline);
LOG_TRACE(log, "Loading {}", programming_language);
} else
}
else
{
ReadBufferFromMemory buf_line(in.position(), newline - in.position());
readStringUntilWhitespace(bigram, buf_line);
buf_line.ignore();
readFloatText(frequency, buf_line);
programming_freq[programming_language][bigram] = frequency;
LOG_TRACE(log, "Word {}", bigram);
}
in.position() = newline + 1;
}

View File

@ -4,7 +4,8 @@
#include <Common/UTF8Helpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include "/home/sergey/ClickHouse/contrib/cld2/public/compact_lang_det.h"
//#include <cld2/compact_lang_det.h>
#include <cstring>
#include <cmath>
#include <unordered_map>

View File

@ -0,0 +1,118 @@
#include <Functions/FunctionsTextClassification.h>
#include <Functions/FunctionFactory.h>
#include "/home/sergey/ClickHouse/contrib/cld2/public/compact_lang_det.h"
namespace DB
{
/* Determine language of Unicode UTF-8 text.
* Uses the cld2 library https://github.com/CLD2Owners/cld2
*/
template <bool mixed>
struct LanguageClassificationImpl
{
using ResultType = String;
static void constant(String data, String & res)
{
bool is_UTF8 = true;
const char * str = data.c_str();
if (!mixed)
{
String ans(LanguageName(CLD2::DetectLanguage(str, strlen(str), true, &is_UTF8)));
res = ans;
} else {
CLD2::Language result_lang_top3[3];
int pc[3];
int bytes[3];
CLD2::DetectLanguageSummary(str, strlen(str), true, result_lang_top3, pc, bytes, &is_UTF8);
String lang1(LanguageName(result_lang_top3[0]));
String lang2(LanguageName(result_lang_top3[1]));
String lang3(LanguageName(result_lang_top3[2]));
res = lang1 + " " + std::to_string(pc[0]) + "% | ";
res += lang2 + " " + std::to_string(pc[1]) + "% | ";
res += lang3 + " " + std::to_string(pc[2]) + "%";
}
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
res_data.reserve(1024);
res_offsets.resize(offsets.size());
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
const char * str = reinterpret_cast<const char *>(&data[prev_offset]);
const char * ans;
bool is_UTF8 = true;
if (!mixed)
{
ans = LanguageName(CLD2::DetectLanguage(str, strlen(str), true, &is_UTF8));
}
else
{
String top3;
CLD2::Language result_lang_top3[3];
int pc[3];
int bytes[3];
CLD2::DetectLanguageSummary(str, strlen(str), true, result_lang_top3, pc, bytes, &is_UTF8);
String lang1(LanguageName(result_lang_top3[0]));
String lang2(LanguageName(result_lang_top3[1]));
String lang3(LanguageName(result_lang_top3[2]));
top3 = lang1 + " " + std::to_string(pc[0]) + "% | ";
top3 += lang2 + " " + std::to_string(pc[1]) + "% | ";
top3 += lang3 + " " + std::to_string(pc[2]) + "%";
ans = top3.c_str();
}
size_t cur_offset = offsets[i];
size_t ans_size = strlen(ans);
res_data.resize(res_offset + ans_size + 1);
memcpy(&res_data[res_offset], ans, ans_size);
res_offset += ans_size;
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
/// Name tag for the single-language variant of the detection function.
struct NameLanguageUTF8Detect
{
    static constexpr const char * name = "detectLanguageUTF8";
};
/// Name tag for the variant of the detection function that reports the top three languages.
struct NameLanguageMixedUTF8Detect
{
    static constexpr const char * name = "detectLanguageMixedUTF8";
};
/// Function type built from the non-mixed implementation (LanguageClassificationImpl<false>) and its name tag.
using FunctionLanguageUTF8Detect = FunctionsTextClassification<LanguageClassificationImpl<false>, NameLanguageUTF8Detect>;
/// Function type built from the mixed implementation (LanguageClassificationImpl<true>) and its name tag.
using FunctionLanguageMixedUTF8Detect = FunctionsTextClassification<LanguageClassificationImpl<true>, NameLanguageMixedUTF8Detect>;
/// Registers both language detection function types (single-language and mixed) in the given factory.
void registerFunctionLanguageDetectUTF8(FunctionFactory & factory)
{
factory.registerFunction<FunctionLanguageUTF8Detect>();
factory.registerFunction<FunctionLanguageMixedUTF8Detect>();
}
}

View File

@ -36,6 +36,7 @@ void registerFunctionsStringSimilarity(FunctionFactory &);
void registerFunctionsCharsetClassification(FunctionFactory &);
void registerFunctionsTonalityClassification(FunctionFactory &);
void registerFunctionsProgrammingClassification(FunctionFactory &);
void registerFunctionLanguageDetectUTF8(FunctionFactory &);
void registerFunctionsURL(FunctionFactory &);
void registerFunctionsVisitParam(FunctionFactory &);
void registerFunctionsMath(FunctionFactory &);
@ -98,6 +99,7 @@ void registerFunctions()
registerFunctionsCharsetClassification(factory);
registerFunctionsTonalityClassification(factory);
registerFunctionsProgrammingClassification(factory);
registerFunctionLanguageDetectUTF8(factory);
registerFunctionsURL(factory);
registerFunctionsVisitParam(factory);
registerFunctionsMath(factory);