mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
Refactor
This commit is contained in:
parent
85b8985df2
commit
1c24667f20
@ -80,9 +80,6 @@
|
||||
#include <Server/PostgreSQLHandlerFactory.h>
|
||||
#include <Server/ProtocolServerAdapter.h>
|
||||
#include <Server/HTTP/HTTPServer.h>
|
||||
|
||||
#include <Common/FrequencyHolder.h>
|
||||
|
||||
#include <Interpreters/AsynchronousInsertQueue.h>
|
||||
#include <Compression/CompressionCodecEncrypted.h>
|
||||
#include <filesystem>
|
||||
@ -703,7 +700,6 @@ if (ThreadFuzzer::instance().isEffective())
|
||||
global_context->setRemoteHostFilter(config());
|
||||
|
||||
std::string path_str = getCanonicalPath(config().getString("path", DBMS_DEFAULT_PATH));
|
||||
|
||||
fs::path path = path_str;
|
||||
std::string default_database = config().getString("default_database", "default");
|
||||
|
||||
|
@ -83,7 +83,6 @@ target_link_libraries(clickhouse_functions PRIVATE lz4)
|
||||
|
||||
if (USE_NLP)
|
||||
target_link_libraries(clickhouse_functions PRIVATE cld2)
|
||||
target_include_directories(clickhouse_functions SYSTEM PRIVATE "${ClickHouse_SOURCE_DIR}/contrib/cld2/public")
|
||||
endif()
|
||||
|
||||
if (USE_H3)
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionStringToString.h>
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
@ -8,17 +8,12 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
}
|
||||
|
||||
/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes.
|
||||
* Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
|
||||
* Using a naive Bayesian classifier, find the most likely charset and language and return it
|
||||
*/
|
||||
|
||||
template <size_t N, bool detect_language>
|
||||
template <bool detect_language>
|
||||
struct CharsetClassificationImpl
|
||||
{
|
||||
/* We need to solve zero-frequency problem for Naive Bayes Classifier
|
||||
@ -121,32 +116,27 @@ struct CharsetClassificationImpl
|
||||
res_offsets[i] = res_offset;
|
||||
}
|
||||
}
|
||||
|
||||
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
||||
{
|
||||
throw Exception("Cannot apply function detectProgrammingLanguage to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct NameCharsetDetect
|
||||
struct NameDetectCharset
|
||||
{
|
||||
static constexpr auto name = "detectCharset";
|
||||
};
|
||||
|
||||
struct NameLanguageDetect
|
||||
struct NameDetectLanguageUnknown
|
||||
{
|
||||
static constexpr auto name = "detectLanguageUnknown";
|
||||
};
|
||||
|
||||
|
||||
using FunctionCharsetDetect = FunctionStringToString<CharsetClassificationImpl<2, false>, NameCharsetDetect, false>;
|
||||
using FunctionLanguageDetect = FunctionStringToString<CharsetClassificationImpl<2, true>, NameLanguageDetect, false>;
|
||||
using FunctionDetectCharset = FunctionTextClassificationString<CharsetClassificationImpl<false>, NameDetectCharset>;
|
||||
using FunctionDetectLanguageUnknown = FunctionTextClassificationString<CharsetClassificationImpl<true>, NameDetectLanguageUnknown>;
|
||||
|
||||
void registerFunctionsCharsetClassification(FunctionFactory & factory)
|
||||
void registerFunctionDetectCharset(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionCharsetDetect>();
|
||||
factory.registerFunction<FunctionLanguageDetect>();
|
||||
factory.registerFunction<FunctionDetectCharset>();
|
||||
factory.registerFunction<FunctionDetectLanguageUnknown>();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,24 +1,21 @@
|
||||
#if !defined(ARCADIA_BUILD)
|
||||
#include "config_functions.h"
|
||||
#endif
|
||||
|
||||
#if USE_NLP
|
||||
|
||||
#include <Functions/FunctionStringToString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
|
||||
#include <DataTypes/DataTypeMap.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
|
||||
#include <Columns/ColumnMap.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <DataTypes/DataTypeMap.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
#include <Interpreters/Context.h>
|
||||
|
||||
#include "compact_lang_det.h"
|
||||
#include <compact_lang_det.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -30,11 +27,12 @@ namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int SUPPORT_IS_DISABLED;
|
||||
}
|
||||
|
||||
struct LanguageClassificationImpl
|
||||
struct FunctionDetectLanguageImpl
|
||||
{
|
||||
static std::string_view codeISO(std::string_view code_string)
|
||||
static ALWAYS_INLINE inline std::string_view codeISO(std::string_view code_string)
|
||||
{
|
||||
if (code_string.ends_with("-Latn"))
|
||||
code_string.remove_suffix(code_string.size() - 5);
|
||||
@ -92,14 +90,9 @@ struct LanguageClassificationImpl
|
||||
res_offsets[i] = res_offset;
|
||||
}
|
||||
}
|
||||
|
||||
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
||||
{
|
||||
throw Exception("Cannot apply function detectProgrammingLanguage to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
};
|
||||
|
||||
class LanguageClassificationMixedDetect : public IFunction
|
||||
class FunctionDetectLanguageMixed : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "detectLanguageMixed";
|
||||
@ -107,7 +100,14 @@ public:
|
||||
/// Number of top results
|
||||
static constexpr auto top_N = 3;
|
||||
|
||||
static FunctionPtr create(ContextPtr) { return std::make_shared<LanguageClassificationMixedDetect>(); }
|
||||
static FunctionPtr create(ContextPtr context)
|
||||
{
|
||||
if (!context->getSettingsRef().allow_experimental_nlp_functions)
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
|
||||
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
|
||||
|
||||
return std::make_shared<FunctionDetectLanguageMixed>();
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
@ -120,8 +120,9 @@ public:
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception(
|
||||
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of argument of function {}. Must be String.",
|
||||
arguments[0]->getName(), getName());
|
||||
|
||||
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeFloat32>());
|
||||
}
|
||||
@ -169,7 +170,7 @@ public:
|
||||
|
||||
for (size_t j = 0; j < top_N; ++j)
|
||||
{
|
||||
auto res_str = LanguageClassificationImpl::codeISO(LanguageCode(result_lang_top3[j]));
|
||||
auto res_str = FunctionDetectLanguageImpl::codeISO(LanguageCode(result_lang_top3[j]));
|
||||
Float32 res_float = static_cast<Float32>(pc[j]) / 100;
|
||||
|
||||
keys_data->insertData(res_str.data(), res_str.size());
|
||||
@ -188,18 +189,18 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct NameLanguageUTF8Detect
|
||||
struct NameDetectLanguage
|
||||
{
|
||||
static constexpr auto name = "detectLanguage";
|
||||
};
|
||||
|
||||
|
||||
using FunctionLanguageUTF8Detect = FunctionStringToString<LanguageClassificationImpl, NameLanguageUTF8Detect, false>;
|
||||
using FunctionDetectLanguage = FunctionTextClassificationString<FunctionDetectLanguageImpl, NameDetectLanguage>;
|
||||
|
||||
void registerFunctionLanguageDetectUTF8(FunctionFactory & factory)
|
||||
void registerFunctionsDetectLanguage(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionLanguageUTF8Detect>();
|
||||
factory.registerFunction<LanguageClassificationMixedDetect>();
|
||||
factory.registerFunction<FunctionDetectLanguage>();
|
||||
factory.registerFunction<FunctionDetectLanguageMixed>();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionStringToString.h>
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
|
||||
#include <unordered_map>
|
||||
#include <string_view>
|
||||
@ -8,18 +9,13 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine the programming language from the source code.
|
||||
* We calculate all the unigrams and bigrams of commands in the source code.
|
||||
* Then using a marked-up dictionary with weights of unigrams and bigrams of commands for various programming languages
|
||||
* Find the biggest weight of the programming language and return it
|
||||
*/
|
||||
struct ProgrammingClassificationImpl
|
||||
struct FunctionDetectProgrammingLanguageImpl
|
||||
{
|
||||
/// Calculate total weight
|
||||
static ALWAYS_INLINE inline Float64 stateMachine(
|
||||
@ -32,10 +28,8 @@ struct ProgrammingClassificationImpl
|
||||
/// Try to find each n-gram in dictionary
|
||||
const auto * it = standard.find(el.first);
|
||||
if (it != standard.end())
|
||||
{
|
||||
res += el.second * it->getMapped();
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -59,42 +53,32 @@ struct ProgrammingClassificationImpl
|
||||
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
|
||||
|
||||
std::unordered_map<String, Float64> data_freq;
|
||||
|
||||
String prev_command;
|
||||
String command;
|
||||
|
||||
/// Select all commands from the string
|
||||
for (size_t ind = 0; ind < str_len;)
|
||||
for (size_t ind = 0; ind < str_len; ++ind)
|
||||
{
|
||||
/// Assume that all commands are split by spaces
|
||||
if (!isspace(str[ind]))
|
||||
{
|
||||
command.push_back(str[ind]);
|
||||
++ind;
|
||||
if (isWhitespaceASCII(str[ind]))
|
||||
continue;
|
||||
|
||||
while ((ind < str_len) && (!isspace(str[ind])))
|
||||
while (ind < str_len && !isWhitespaceASCII(str[ind]))
|
||||
{
|
||||
command.push_back(str[ind]);
|
||||
++ind;
|
||||
}
|
||||
if (prev_command.empty())
|
||||
{
|
||||
prev_command = command;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
/// We add both unigrams and bigrams to later search for them in the dictionary
|
||||
if (!prev_command.empty())
|
||||
data_freq[prev_command + command] += 1;
|
||||
data_freq[prev_command] += 1;
|
||||
prev_command = command;
|
||||
}
|
||||
command = "";
|
||||
}
|
||||
else
|
||||
{
|
||||
++ind;
|
||||
}
|
||||
|
||||
data_freq[command] += 1;
|
||||
command.swap(prev_command);
|
||||
command.clear();
|
||||
}
|
||||
|
||||
String res;
|
||||
std::string_view res;
|
||||
Float64 max_result = 0;
|
||||
/// Iterate over all programming languages and find the language with the highest weight
|
||||
for (const auto & item : programming_freq)
|
||||
@ -119,24 +103,19 @@ struct ProgrammingClassificationImpl
|
||||
res_offsets[i] = res_offset;
|
||||
}
|
||||
}
|
||||
|
||||
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
|
||||
{
|
||||
throw Exception("Cannot apply function detectProgrammingLanguage to fixed string.", ErrorCodes::ILLEGAL_COLUMN);
|
||||
}
|
||||
};
|
||||
|
||||
struct NameGetProgramming
|
||||
struct NameDetectProgrammingLanguage
|
||||
{
|
||||
static constexpr auto name = "detectProgrammingLanguage";
|
||||
};
|
||||
|
||||
|
||||
using FunctionGetProgramming = FunctionStringToString<ProgrammingClassificationImpl, NameGetProgramming, false>;
|
||||
using FunctionDetectProgrammingLanguage = FunctionTextClassificationString<FunctionDetectProgrammingLanguageImpl, NameDetectProgrammingLanguage>;
|
||||
|
||||
void registerFunctionsProgrammingClassification(FunctionFactory & factory)
|
||||
void registerFunctionDetectProgrammingLanguage(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionGetProgramming>();
|
||||
factory.registerFunction<FunctionDetectProgrammingLanguage>();
|
||||
}
|
||||
|
||||
}
|
||||
|
122
src/Functions/FunctionsTextClassification.h
Normal file
122
src/Functions/FunctionsTextClassification.h
Normal file
@ -0,0 +1,122 @@
|
||||
#pragma once
|
||||
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnVector.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/IFunction.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Interpreters/Context.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
/// Functions for text classification with different result types
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int SUPPORT_IS_DISABLED;
|
||||
}
|
||||
|
||||
template <typename Impl, typename Name>
|
||||
class FunctionTextClassificationString : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = Name::name;
|
||||
|
||||
static FunctionPtr create(ContextPtr context)
|
||||
{
|
||||
if (!context->getSettingsRef().allow_experimental_nlp_functions)
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
|
||||
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
|
||||
|
||||
return std::make_shared<FunctionTextClassificationString>();
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 1; }
|
||||
|
||||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of argument of function {}. Must be String.",
|
||||
arguments[0]->getName(), getName());
|
||||
|
||||
return arguments[0];
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
|
||||
{
|
||||
const ColumnPtr & column = arguments[0].column;
|
||||
const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());
|
||||
|
||||
if (!col)
|
||||
throw Exception(
|
||||
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
|
||||
|
||||
auto col_res = ColumnString::create();
|
||||
Impl::vector(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets());
|
||||
return col_res;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Impl, typename Name>
|
||||
class FunctionTextClassificationFloat : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = Name::name;
|
||||
|
||||
static FunctionPtr create(ContextPtr context)
|
||||
{
|
||||
if (!context->getSettingsRef().allow_experimental_nlp_functions)
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
|
||||
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
|
||||
|
||||
return std::make_shared<FunctionTextClassificationFloat>();
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 1; }
|
||||
|
||||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!isString(arguments[0]))
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of argument of function {}. Must be String.",
|
||||
arguments[0]->getName(), getName());
|
||||
|
||||
return std::make_shared<DataTypeFloat32>();
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
|
||||
{
|
||||
const ColumnPtr & column = arguments[0].column;
|
||||
const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());
|
||||
|
||||
if (!col)
|
||||
throw Exception(
|
||||
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN);
|
||||
|
||||
auto col_res = ColumnVector<Float32>::create();
|
||||
ColumnVector<Float32>::Container & vec_res = col_res->getData();
|
||||
vec_res.resize(col->size());
|
||||
|
||||
Impl::vector(col->getChars(), col->getOffsets(), vec_res);
|
||||
return col_res;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
@ -1,41 +1,37 @@
|
||||
#include <Common/FrequencyHolder.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionStringOrArrayToT.h>
|
||||
#include <Functions/FunctionsTextClassification.h>
|
||||
|
||||
#include <unordered_map>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines the sentiment of text data.
|
||||
* Uses a marked-up sentiment dictionary, each word has a tonality ranging from -12 to 6.
|
||||
* For each text, calculate the average sentiment value of its words and return NEG, POS or NEUT
|
||||
* For each text, calculate the average sentiment value of its words and return it in range [-1,1]
|
||||
*/
|
||||
struct TonalityClassificationImpl
|
||||
struct FunctionDetectTonalityImpl
|
||||
{
|
||||
static Float32 detectTonality(const UInt8 * str, const size_t str_len, const FrequencyHolder::Map & emotional_dict)
|
||||
static ALWAYS_INLINE inline Float32 detectTonality(
|
||||
const UInt8 * str,
|
||||
const size_t str_len,
|
||||
const FrequencyHolder::Map & emotional_dict)
|
||||
{
|
||||
Float64 weight = 0;
|
||||
UInt64 count_words = 0;
|
||||
|
||||
String word;
|
||||
/// Select all Russian words from the string
|
||||
for (size_t ind = 0; ind < str_len;)
|
||||
for (size_t ind = 0; ind < str_len; ++ind)
|
||||
{
|
||||
/// Assume that all non-ASCII characters are Russian letters
|
||||
if (!isASCII(str[ind]))
|
||||
{
|
||||
word.push_back(str[ind]);
|
||||
++ind;
|
||||
/// Split words by whitespaces and punctuation signs
|
||||
if (isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind]))
|
||||
continue;
|
||||
|
||||
while ((ind < str_len) && (!isASCII(str[ind])))
|
||||
while (ind < str_len && !(isWhitespaceASCII(str[ind]) || isPunctuationASCII(str[ind])))
|
||||
{
|
||||
word.push_back(str[ind]);
|
||||
++ind;
|
||||
@ -49,19 +45,15 @@ struct TonalityClassificationImpl
|
||||
}
|
||||
word.clear();
|
||||
}
|
||||
else
|
||||
{
|
||||
++ind;
|
||||
}
|
||||
}
|
||||
|
||||
if (!count_words)
|
||||
return 0;
|
||||
|
||||
/// Calculate average value of tonality.
|
||||
/// Convert values -12..6 to -1..1
|
||||
return std::max(weight / count_words / 6, -1.0);
|
||||
}
|
||||
|
||||
/// If the function will return constant value for FixedString data type.
|
||||
static constexpr auto is_fixed_to_constant = false;
|
||||
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
@ -77,27 +69,6 @@ struct TonalityClassificationImpl
|
||||
prev_offset = offsets[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, Float32 & /*res*/) {}
|
||||
|
||||
static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<Float32> & res)
|
||||
{
|
||||
const auto & emotional_dict = FrequencyHolder::getInstance().getEmotionalDict();
|
||||
|
||||
size_t size = data.size() / n;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
res[i] = detectTonality(data.data() + i * n, n, emotional_dict);
|
||||
}
|
||||
|
||||
[[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<Float32> &)
|
||||
{
|
||||
throw Exception("Cannot apply function detectTonality to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
}
|
||||
|
||||
[[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<Float32> &)
|
||||
{
|
||||
throw Exception("Cannot apply function detectTonality to UUID argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
}
|
||||
};
|
||||
|
||||
struct NameDetectTonality
|
||||
@ -105,9 +76,9 @@ struct NameDetectTonality
|
||||
static constexpr auto name = "detectTonality";
|
||||
};
|
||||
|
||||
using FunctionDetectTonality = FunctionStringOrArrayToT<TonalityClassificationImpl, NameDetectTonality, Float32>;
|
||||
using FunctionDetectTonality = FunctionTextClassificationFloat<FunctionDetectTonalityImpl, NameDetectTonality>;
|
||||
|
||||
void registerFunctionsTonalityClassification(FunctionFactory & factory)
|
||||
void registerFunctionDetectTonality(FunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<FunctionDetectTonality>();
|
||||
}
|
||||
|
@ -1,9 +1,5 @@
|
||||
#include "config_core.h"
|
||||
|
||||
#if !defined(ARCADIA_BUILD)
|
||||
# include "config_functions.h"
|
||||
#endif
|
||||
|
||||
#include <Functions/FunctionFactory.h>
|
||||
|
||||
|
||||
@ -39,9 +35,6 @@ void registerFunctionsStringArray(FunctionFactory &);
|
||||
void registerFunctionsStringSearch(FunctionFactory &);
|
||||
void registerFunctionsStringRegexp(FunctionFactory &);
|
||||
void registerFunctionsStringSimilarity(FunctionFactory &);
|
||||
void registerFunctionsCharsetClassification(FunctionFactory &);
|
||||
void registerFunctionsTonalityClassification(FunctionFactory &);
|
||||
void registerFunctionsProgrammingClassification(FunctionFactory &);
|
||||
void registerFunctionsStringTokenExtractor(FunctionFactory &);
|
||||
void registerFunctionsURL(FunctionFactory &);
|
||||
void registerFunctionsVisitParam(FunctionFactory &);
|
||||
@ -63,10 +56,6 @@ void registerFunctionTid(FunctionFactory & factory);
|
||||
void registerFunctionLogTrace(FunctionFactory & factory);
|
||||
void registerFunctionsTimeWindow(FunctionFactory &);
|
||||
|
||||
#if USE_NLP
|
||||
void registerFunctionLanguageDetectUTF8(FunctionFactory &);
|
||||
#endif
|
||||
|
||||
#if USE_SSL
|
||||
void registerFunctionEncrypt(FunctionFactory & factory);
|
||||
void registerFunctionDecrypt(FunctionFactory & factory);
|
||||
@ -109,9 +98,6 @@ void registerFunctions()
|
||||
registerFunctionsStringSearch(factory);
|
||||
registerFunctionsStringRegexp(factory);
|
||||
registerFunctionsStringSimilarity(factory);
|
||||
registerFunctionsCharsetClassification(factory);
|
||||
registerFunctionsTonalityClassification(factory);
|
||||
registerFunctionsProgrammingClassification(factory);
|
||||
registerFunctionsStringTokenExtractor(factory);
|
||||
registerFunctionsURL(factory);
|
||||
registerFunctionsVisitParam(factory);
|
||||
@ -131,10 +117,6 @@ void registerFunctions()
|
||||
registerFunctionsSnowflake(factory);
|
||||
registerFunctionsTimeWindow(factory);
|
||||
|
||||
#if USE_NLP
|
||||
registerFunctionLanguageDetectUTF8(factory);
|
||||
#endif
|
||||
|
||||
#if USE_SSL
|
||||
registerFunctionEncrypt(factory);
|
||||
registerFunctionDecrypt(factory);
|
||||
|
@ -37,6 +37,9 @@ void registerFunctionEncodeXMLComponent(FunctionFactory &);
|
||||
void registerFunctionDecodeXMLComponent(FunctionFactory &);
|
||||
void registerFunctionExtractTextFromHTML(FunctionFactory &);
|
||||
void registerFunctionToStringCutToZero(FunctionFactory &);
|
||||
void registerFunctionDetectCharset(FunctionFactory &);
|
||||
void registerFunctionDetectTonality(FunctionFactory &);
|
||||
void registerFunctionDetectProgrammingLanguage(FunctionFactory &);
|
||||
|
||||
#if USE_BASE64
|
||||
void registerFunctionBase64Encode(FunctionFactory &);
|
||||
@ -48,6 +51,7 @@ void registerFunctionTryBase64Decode(FunctionFactory &);
|
||||
void registerFunctionStem(FunctionFactory &);
|
||||
void registerFunctionSynonyms(FunctionFactory &);
|
||||
void registerFunctionLemmatize(FunctionFactory &);
|
||||
void registerFunctionsDetectLanguage(FunctionFactory &);
|
||||
#endif
|
||||
|
||||
#if USE_ICU
|
||||
@ -87,6 +91,9 @@ void registerFunctionsString(FunctionFactory & factory)
|
||||
registerFunctionDecodeXMLComponent(factory);
|
||||
registerFunctionExtractTextFromHTML(factory);
|
||||
registerFunctionToStringCutToZero(factory);
|
||||
registerFunctionDetectCharset(factory);
|
||||
registerFunctionDetectTonality(factory);
|
||||
registerFunctionDetectProgrammingLanguage(factory);
|
||||
|
||||
#if USE_BASE64
|
||||
registerFunctionBase64Encode(factory);
|
||||
@ -98,6 +105,7 @@ void registerFunctionsString(FunctionFactory & factory)
|
||||
registerFunctionStem(factory);
|
||||
registerFunctionSynonyms(factory);
|
||||
registerFunctionLemmatize(factory);
|
||||
registerFunctionsDetectLanguage(factory);
|
||||
#endif
|
||||
|
||||
#if USE_ICU
|
||||
|
@ -1,6 +1,8 @@
|
||||
-- Tags: no-fasttest
|
||||
-- Tag no-fasttest: depends on cld2 and nlp_data
|
||||
|
||||
SET allow_experimental_nlp_functions = 1;
|
||||
|
||||
SELECT detectLanguage('Они сошлись. Волна и камень, Стихи и проза, лед и пламень, Не столь различны меж собой.');
|
||||
SELECT detectLanguage('Sweet are the uses of adversity which, like the toad, ugly and venomous, wears yet a precious jewel in his head.');
|
||||
SELECT detectLanguage('A vaincre sans peril, on triomphe sans gloire.');
|
||||
|
Loading…
Reference in New Issue
Block a user