This commit is contained in:
Nikolay Degterinsky 2022-01-12 16:32:17 +00:00
parent 85b8985df2
commit 1c24667f20
10 changed files with 219 additions and 169 deletions

View File

@ -80,9 +80,6 @@
#include <Server/PostgreSQLHandlerFactory.h>
#include <Server/ProtocolServerAdapter.h>
#include <Server/HTTP/HTTPServer.h>
#include <Common/FrequencyHolder.h>
#include <Interpreters/AsynchronousInsertQueue.h>
#include <Compression/CompressionCodecEncrypted.h>
#include <filesystem>
@ -703,7 +700,6 @@ if (ThreadFuzzer::instance().isEffective())
global_context->setRemoteHostFilter(config());
std::string path_str = getCanonicalPath(config().getString("path", DBMS_DEFAULT_PATH));
fs::path path = path_str;
std::string default_database = config().getString("default_database", "default");

View File

@ -83,7 +83,6 @@ target_link_libraries(clickhouse_functions PRIVATE lz4)
if (USE_NLP)
target_link_libraries(clickhouse_functions PRIVATE cld2)
target_include_directories(clickhouse_functions SYSTEM PRIVATE "${ClickHouse_SOURCE_DIR}/contrib/cld2/public")
endif()
if (USE_H3)

View File

@ -1,6 +1,6 @@
#include <Common/FrequencyHolder.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Functions/FunctionsTextClassification.h>
#include <memory>
#include <unordered_map>
@ -8,17 +8,12 @@
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
}
/* Determine the language and charset of text data. For each text, we build the distribution of byte bigrams.
 * Then we use marked-up dictionaries with the distributions of byte bigrams for various languages and charsets.
 * Using a naive Bayes classifier, we find the most likely charset and language and return it.
 */
template <size_t N, bool detect_language>
template <bool detect_language>
struct CharsetClassificationImpl
{
/* We need to solve zero-frequency problem for Naive Bayes Classifier
@ -121,32 +116,27 @@ struct CharsetClassificationImpl
res_offsets[i] = res_offset;
}
}
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
{
throw Exception("Cannot apply function detectProgrammingLanguage to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
}
};
struct NameCharsetDetect
struct NameDetectCharset
{
static constexpr auto name = "detectCharset";
};
struct NameLanguageDetect
struct NameDetectLanguageUnknown
{
static constexpr auto name = "detectLanguageUnknown";
};
using FunctionCharsetDetect = FunctionStringToString<CharsetClassificationImpl<2, false>, NameCharsetDetect, false>;
using FunctionLanguageDetect = FunctionStringToString<CharsetClassificationImpl<2, true>, NameLanguageDetect, false>;
using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;
void registerFunctionsCharsetClassification(FunctionFactory & factory)
void registerFunctionDetectCharset(FunctionFactory & factory)
{
factory.registerFunction<FunctionCharsetDetect>();
factory.registerFunction<FunctionLanguageDetect>();
factory.registerFunction<FunctionDetectCharset>();
factory.registerFunction<FunctionDetectLanguageUnknown>();
}
}

View File

@ -1,24 +1,21 @@
#if !defined(ARCADIA_BUILD)
# include "config_functions.h"
#endif
#include "config_functions.h"
#if USE_NLP
#include <Functions/FunctionStringToString.h>
#include <Functions/FunctionFactory.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnMap.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsTextClassification.h>
#include <Interpreters/Context.h>
#include "compact_lang_det.h"
#include <compact_lang_det.h>
namespace DB
{
@ -30,11 +27,12 @@ namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int SUPPORT_IS_DISABLED;
}
struct LanguageClassificationImpl
struct FunctionDetectLanguageImpl
{
static std::string_view codeISO(std::string_view code_string)
static ALWAYS_INLINE inline std::string_view codeISO(std::string_view code_string)
{
if (code_string.ends_with("-Latn"))
code_string.remove_suffix(code_string.size() - 5);
@ -92,14 +90,9 @@ struct LanguageClassificationImpl
res_offsets[i] = res_offset;
}
}
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
{
throw Exception("Cannot apply function detectProgrammingLanguage to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
}
};
class LanguageClassificationMixedDetect : public IFunction
class FunctionDetectLanguageMixed : public IFunction
{
public:
static constexpr auto name = "detectLanguageMixed";
@ -107,7 +100,14 @@ public:
/// Number of top results
static constexpr auto top_N = 3;
static FunctionPtr create(ContextPtr) { return std::make_shared<LanguageClassificationMixedDetect>(); }
static FunctionPtr create(ContextPtr context)
{
if (!context->getSettingsRef().allow_experimental_nlp_functions)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
return std::make_shared<FunctionDetectLanguageMixed>();
}
String getName() const override { return name; }
@ -120,8 +120,9 @@ public:
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of function {}. Must be String.",
arguments[0]->getName(), getName());
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeFloat32>());
}
@ -169,7 +170,7 @@ public:
for (size_t j = 0; j < top_N; ++j)
{
auto res_str = LanguageClassificationImpl::codeISO(LanguageCode(result_lang_top3[j]));
auto res_str = FunctionDetectLanguageImpl::codeISO(LanguageCode(result_lang_top3[j]));
Float32 res_float = static_cast<Float32>(pc[j]) / 100;
keys_data->insertData(res_str.data(), res_str.size());
@ -188,18 +189,18 @@ public:
}
};
struct NameLanguageUTF8Detect
struct NameDetectLanguage
{
static constexpr auto name = "detectLanguage";
};
using FunctionLanguageUTF8Detect = FunctionStringToString<LanguageClassificationImpl, NameLanguageUTF8Detect, false>;
using FunctionDetectLanguage = FunctionTextClassificationString<FunctionDetectLanguageImpl, NameDetectLanguage>;
void registerFunctionLanguageDetectUTF8(FunctionFactory & factory)
void registerFunctionsDetectLanguage(FunctionFactory & factory)
{
factory.registerFunction<FunctionLanguageUTF8Detect>();
factory.registerFunction<LanguageClassificationMixedDetect>();
factory.registerFunction<FunctionDetectLanguage>();
factory.registerFunction<FunctionDetectLanguageMixed>();
}
}

View File

@ -1,6 +1,7 @@
#include <Common/FrequencyHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Functions/FunctionsTextClassification.h>
#include <unordered_map>
#include <string_view>
@ -8,18 +9,13 @@
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
}
/**
* Determine the programming language from the source code.
* We calculate all the unigrams and bigrams of commands in the source code.
* Then using a marked-up dictionary with weights of unigrams and bigrams of commands for various programming languages
* Find the biggest weight of the programming language and return it
*/
struct ProgrammingClassificationImpl
struct FunctionDetectProgrammingLanguageImpl
{
/// Calculate total weight
static ALWAYS_INLINE inline Float64 stateMachine(
@ -32,9 +28,7 @@ struct ProgrammingClassificationImpl
/// Try to find each n-gram in dictionary
const auto * it = standard.find(el.first);
if (it != standard.end())
{
res += el.second * it->getMapped();
}
}
return res;
}
@ -59,42 +53,32 @@ struct ProgrammingClassificationImpl
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
std::unordered_map<String, Float64> data_freq;
String prev_command;
String command;
/// Select all commands from the string
for (size_t ind = 0; ind < str_len;)
for (size_t ind = 0; ind < str_len; ++ind)
{
/// Assume that all commands are split by spaces
if (!isspace(str[ind]))
if (isWhitespaceASCII(str[ind]))
continue;
while (ind < str_len && !isWhitespaceASCII(str[ind]))
{
command.push_back(str[ind]);
++ind;
}
while ((ind < str_len) && (!isspace(str[ind])))
{
command.push_back(str[ind]);
++ind;
}
if (prev_command.empty())
{
prev_command = command;
}
else
{
data_freq[prev_command + command] += 1;
data_freq[prev_command] += 1;
prev_command = command;
}
command = "";
}
else
{
++ind;
}
/// We add both unigrams and bigrams to later search for them in the dictionary
if (!prev_command.empty())
data_freq[prev_command + command] += 1;
data_freq[command] += 1;
command.swap(prev_command);
command.clear();
}
String res;
std::string_view res;
Float64 max_result = 0;
/// Iterate over all programming languages and find the language with the highest weight
for (const auto & item : programming_freq)
@ -119,24 +103,19 @@ struct ProgrammingClassificationImpl
res_offsets[i] = res_offset;
}
}
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
{
throw Exception("Cannot apply function detectProgrammingLanguage to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
}
};
struct NameGetProgramming
struct NameDetectProgrammingLanguage
{
static constexpr auto name = "detectProgrammingLanguage";
};
using FunctionGetProgramming = FunctionStringToString<ProgrammingClassificationImpl, NameGetProgramming, false>;
using FunctionDetectProgrammingLanguage = FunctionTextClassificationString<FunctionDetectProgrammingLanguageImpl, NameDetectProgrammingLanguage>;
void registerFunctionsProgrammingClassification(FunctionFactory & factory)
void registerFunctionDetectProgrammingLanguage(FunctionFactory & factory)
{
factory.registerFunction<FunctionGetProgramming>();
factory.registerFunction<FunctionDetectProgrammingLanguage>();
}
}

View File

@ -0,0 +1,122 @@
#pragma once
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/Context_fwd.h>
#include <Functions/FunctionFactory.h>
#include <Interpreters/Context.h>
namespace DB
{
/// Functions for text classification with different result types
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int SUPPORT_IS_DISABLED;
}
/** Wraps a text classifier as a single-argument function: String -> String.
  *
  * Impl must provide a static `vector` callable as
  *     Impl::vector(input_chars, input_offsets, result_chars, result_offsets)
  * where the first two arguments come from the input ColumnString and the last two
  * belong to the freshly created result ColumnString.
  * Name must provide a static `name` member holding the function name.
  */
template <typename Impl, typename Name>
class FunctionTextClassificationString : public IFunction
{
public:
    static constexpr auto name = Name::name;

    /// Throws SUPPORT_IS_DISABLED unless the `allow_experimental_nlp_functions` setting is enabled.
    static FunctionPtr create(ContextPtr context)
    {
        if (!context->getSettingsRef().allow_experimental_nlp_functions)
            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
                "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);

        return std::make_shared<FunctionTextClassificationString>();
    }

    String getName() const override { return name; }

    size_t getNumberOfArguments() const override { return 1; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    bool useDefaultImplementationForConstants() const override { return true; }

    /// Accepts only a String argument; the result has the same type as the argument.
    /// Throws ILLEGAL_TYPE_OF_ARGUMENT otherwise.
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        if (!isString(arguments[0]))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "Illegal type {} of argument of function {}. Must be String.",
                arguments[0]->getName(), getName());

        return arguments[0];
    }

    /// Runs Impl::vector over the whole input column and returns the filled result column.
    /// Throws ILLEGAL_COLUMN if the argument column is not a ColumnString.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
    {
        const ColumnPtr & column = arguments[0].column;
        const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());

        /// Same code-first, format-string exception style as the other checks in this class.
        if (!col)
            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
                "Illegal column {} of argument of function {}",
                arguments[0].column->getName(), getName());

        auto col_res = ColumnString::create();
        Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
        return col_res;
    }
};
/** Wraps a text classifier as a single-argument function: String -> Float32.
  *
  * Impl must provide a static `vector` callable as
  *     Impl::vector(input_chars, input_offsets, result_values)
  * where the first two arguments come from the input ColumnString and `result_values`
  * is the result container, already resized to the number of input rows.
  * Name must provide a static `name` member holding the function name.
  */
template <typename Impl, typename Name>
class FunctionTextClassificationFloat : public IFunction
{
public:
    static constexpr auto name = Name::name;

    /// Throws SUPPORT_IS_DISABLED unless the `allow_experimental_nlp_functions` setting is enabled.
    static FunctionPtr create(ContextPtr context)
    {
        if (!context->getSettingsRef().allow_experimental_nlp_functions)
            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
                "Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);

        return std::make_shared<FunctionTextClassificationFloat>();
    }

    String getName() const override { return name; }

    size_t getNumberOfArguments() const override { return 1; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    bool useDefaultImplementationForConstants() const override { return true; }

    /// Accepts only a String argument and always returns Float32.
    /// Throws ILLEGAL_TYPE_OF_ARGUMENT otherwise.
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        if (!isString(arguments[0]))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "Illegal type {} of argument of function {}. Must be String.",
                arguments[0]->getName(), getName());

        return std::make_shared<DataTypeFloat32>();
    }

    /// Runs Impl::vector over the whole input column and returns the filled result column.
    /// Throws ILLEGAL_COLUMN if the argument column is not a ColumnString.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
    {
        const ColumnPtr & column = arguments[0].column;
        const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());

        /// Same code-first, format-string exception style as the other checks in this class.
        if (!col)
            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
                "Illegal column {} of argument of function {}",
                arguments[0].column->getName(), getName());

        auto col_res = ColumnVector<Float32>::create();
        ColumnVector<Float32>::Container & vec_res = col_res->getData();

        /// One result value per input row; Impl::vector receives the container already sized.
        vec_res.resize(col->size());

        Impl::vector(col->getChars(), col->getOffsets(), vec_res);
        return col_res;
    }
};
}

View File

@ -1,67 +1,59 @@
#include <Common/FrequencyHolder.h>
#include <Common/StringUtils/StringUtils.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringOrArrayToT.h>
#include <Functions/FunctionsTextClassification.h>
#include <unordered_map>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
/**
* Determines the sentiment of text data.
* Uses a marked-up sentiment dictionary, each word has a tonality ranging from -12 to 6.
* For each text, calculate the average sentiment value of its words and return NEG, POS or NEUT
* For each text, calculate the average sentiment value of its words and return it in range [-1,1]
*/
struct TonalityClassificationImpl
struct FunctionDetectTonalityImpl
{
static Float32 detectTonality(const UInt8 * str, const size_t str_len, const FrequencyHolder::Map & emotional_dict)
static ALWAYS_INLINE inline Float32 detectTonality(
const UInt8 * str,
const size_t str_len,
const FrequencyHolder::Map & emotional_dict)
{
Float64 weight = 0;
UInt64 count_words = 0;
String word;
/// Select all Russian words from the string
for (size_t ind = 0; ind < str_len;)
for (size_t ind = 0; ind < str_len; ++ind)
{
/// Assume that all non-ASCII characters are Russian letters
if (!isASCII(str[ind]))
/// Split words by whitespaces and punctuation signs
if (isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind]))
continue;
while (ind < str_len && !(isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind])))
{
word.push_back(str[ind]);
++ind;
while ((ind < str_len) && (!isASCII(str[ind])))
{
word.push_back(str[ind]);
++ind;
}
/// Try to find a russian word in the tonality dictionary
const auto * it = emotional_dict.find(word);
if (it != emotional_dict.end())
{
count_words += 1;
weight += it->getMapped();
}
word.clear();
}
else
/// Try to find a russian word in the tonality dictionary
const auto * it = emotional_dict.find(word);
if (it != emotional_dict.end())
{
++ind;
count_words += 1;
weight += it->getMapped();
}
word.clear();
}
if (!count_words)
return 0;
/// Calculate average value of tonality.
/// Convert values -12..6 to -1..1
return std::max(weight / count_words / 6, -1.0);
}
/// If the function will return constant value for FixedString data type.
static constexpr auto is_fixed_to_constant = false;
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
@ -77,27 +69,6 @@ struct TonalityClassificationImpl
prev_offset = offsets[i];
}
}
static void vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, Float32 & /*res*/) {}
static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<Float32> & res)
{
const auto & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
size_t size = data.size() / n;
for (size_t i = 0; i < size; ++i)
res[i] = detectTonality(data.data() + i * n, n, emotional_dict);
}
[[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<Float32> &)
{
throw Exception("Cannot apply function detectTonality to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
[[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<Float32> &)
{
throw Exception("Cannot apply function detectTonality to UUID argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
};
struct NameDetectTonality
@ -105,9 +76,9 @@ struct NameDetectTonality
static constexpr auto name = "detectTonality";
};
using FunctionDetectTonality = FunctionStringOrArrayToT<TonalityClassificationImpl, NameDetectTonality, Float32>;
using FunctionDetectTonality = FunctionTextClassificationFloat<FunctionDetectTonalityImpl, NameDetectTonality>;
void registerFunctionsTonalityClassification(FunctionFactory & factory)
void registerFunctionDetectTonality(FunctionFactory & factory)
{
factory.registerFunction<FunctionDetectTonality>();
}

View File

@ -1,9 +1,5 @@
#include "config_core.h"
#if !defined(ARCADIA_BUILD)
# include "config_functions.h"
#endif
#include <Functions/FunctionFactory.h>
@ -39,9 +35,6 @@ void registerFunctionsStringArray(FunctionFactory &);
void registerFunctionsStringSearch(FunctionFactory &);
void registerFunctionsStringRegexp(FunctionFactory &);
void registerFunctionsStringSimilarity(FunctionFactory &);
void registerFunctionsCharsetClassification(FunctionFactory &);
void registerFunctionsTonalityClassification(FunctionFactory &);
void registerFunctionsProgrammingClassification(FunctionFactory &);
void registerFunctionsStringTokenExtractor(FunctionFactory &);
void registerFunctionsURL(FunctionFactory &);
void registerFunctionsVisitParam(FunctionFactory &);
@ -63,10 +56,6 @@ void registerFunctionTid(FunctionFactory & factory);
void registerFunctionLogTrace(FunctionFactory & factory);
void registerFunctionsTimeWindow(FunctionFactory &);
#if USE_NLP
void registerFunctionLanguageDetectUTF8(FunctionFactory &);
#endif
#if USE_SSL
void registerFunctionEncrypt(FunctionFactory & factory);
void registerFunctionDecrypt(FunctionFactory & factory);
@ -109,9 +98,6 @@ void registerFunctions()
registerFunctionsStringSearch(factory);
registerFunctionsStringRegexp(factory);
registerFunctionsStringSimilarity(factory);
registerFunctionsCharsetClassification(factory);
registerFunctionsTonalityClassification(factory);
registerFunctionsProgrammingClassification(factory);
registerFunctionsStringTokenExtractor(factory);
registerFunctionsURL(factory);
registerFunctionsVisitParam(factory);
@ -131,10 +117,6 @@ void registerFunctions()
registerFunctionsSnowflake(factory);
registerFunctionsTimeWindow(factory);
#if USE_NLP
registerFunctionLanguageDetectUTF8(factory);
#endif
#if USE_SSL
registerFunctionEncrypt(factory);
registerFunctionDecrypt(factory);

View File

@ -37,6 +37,9 @@ void registerFunctionEncodeXMLComponent(FunctionFactory &);
void registerFunctionDecodeXMLComponent(FunctionFactory &);
void registerFunctionExtractTextFromHTML(FunctionFactory &);
void registerFunctionToStringCutToZero(FunctionFactory &);
void registerFunctionDetectCharset(FunctionFactory &);
void registerFunctionDetectTonality(FunctionFactory &);
void registerFunctionDetectProgrammingLanguage(FunctionFactory &);
#if USE_BASE64
void registerFunctionBase64Encode(FunctionFactory &);
@ -48,6 +51,7 @@ void registerFunctionTryBase64Decode(FunctionFactory &);
void registerFunctionStem(FunctionFactory &);
void registerFunctionSynonyms(FunctionFactory &);
void registerFunctionLemmatize(FunctionFactory &);
void registerFunctionsDetectLanguage(FunctionFactory &);
#endif
#if USE_ICU
@ -87,6 +91,9 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionDecodeXMLComponent(factory);
registerFunctionExtractTextFromHTML(factory);
registerFunctionToStringCutToZero(factory);
registerFunctionDetectCharset(factory);
registerFunctionDetectTonality(factory);
registerFunctionDetectProgrammingLanguage(factory);
#if USE_BASE64
registerFunctionBase64Encode(factory);
@ -98,6 +105,7 @@ void registerFunctionsString(FunctionFactory & factory)
registerFunctionStem(factory);
registerFunctionSynonyms(factory);
registerFunctionLemmatize(factory);
registerFunctionsDetectLanguage(factory);
#endif
#if USE_ICU

View File

@ -1,6 +1,8 @@
-- Tags: no-fasttest
-- Tag no-fasttest: depends on cld2 and nlp_data
SET allow_experimental_nlp_functions = 1;
SELECT detectLanguage('Они сошлись. Волна и камень, Стихи и проза, лед и пламень, Не столь различны меж собой.');
SELECT detectLanguage('Sweet are the uses of adversity which, like the toad, ugly and venomous, wears yet a precious jewel in his head.');
SELECT detectLanguage('A vaincre sans peril, on triomphe sans gloire.');