2022-01-12 16:32:17 +00:00
|
|
|
#include "config_functions.h"
|
2021-05-31 16:01:28 +00:00
|
|
|
|
2021-12-22 21:03:42 +00:00
|
|
|
#if USE_NLP
|
2021-05-31 16:01:28 +00:00
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
#include <Columns/ColumnMap.h>
|
|
|
|
#include <Columns/ColumnArray.h>
|
|
|
|
#include <Columns/ColumnString.h>
|
|
|
|
#include <Columns/ColumnsNumber.h>
|
2022-01-18 21:32:32 +00:00
|
|
|
#include <Common/isValidUTF8.h>
|
2021-12-30 02:14:57 +00:00
|
|
|
#include <DataTypes/DataTypeMap.h>
|
|
|
|
#include <DataTypes/DataTypeString.h>
|
|
|
|
#include <DataTypes/DataTypeTuple.h>
|
|
|
|
#include <DataTypes/DataTypesNumber.h>
|
|
|
|
#include <Functions/FunctionHelpers.h>
|
2022-01-12 16:32:17 +00:00
|
|
|
#include <Functions/FunctionFactory.h>
|
|
|
|
#include <Functions/FunctionsTextClassification.h>
|
|
|
|
#include <Interpreters/Context.h>
|
2021-12-30 02:14:57 +00:00
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
#include <compact_lang_det.h>
|
2021-05-23 16:39:40 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
/* Determine language of Unicode UTF-8 text.
|
|
|
|
* Uses the cld2 library https://github.com/CLD2Owners/cld2
|
|
|
|
*/
|
2021-12-30 02:14:57 +00:00
|
|
|
|
2021-12-30 03:35:37 +00:00
|
|
|
/// Error codes used by the functions in this file. They are only declared here (extern).
namespace ErrorCodes
{
    /// Thrown by getReturnTypeImpl when the argument type is not String.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    /// Thrown by executeImpl when the argument column is not a ColumnString.
    extern const int ILLEGAL_COLUMN;
    /// Thrown by create when `allow_experimental_nlp_functions` is not enabled.
    extern const int SUPPORT_IS_DISABLED;
}
|
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
struct FunctionDetectLanguageImpl
|
2021-05-23 16:39:40 +00:00
|
|
|
{
|
2022-01-12 16:32:17 +00:00
|
|
|
static ALWAYS_INLINE inline std::string_view codeISO(std::string_view code_string)
|
2021-12-30 02:14:57 +00:00
|
|
|
{
|
|
|
|
if (code_string.ends_with("-Latn"))
|
|
|
|
code_string.remove_suffix(code_string.size() - 5);
|
|
|
|
|
|
|
|
if (code_string.ends_with("-Hant"))
|
|
|
|
code_string.remove_suffix(code_string.size() - 5);
|
|
|
|
|
|
|
|
// Old deprecated codes
|
|
|
|
if (code_string == "iw")
|
|
|
|
return "he";
|
|
|
|
|
|
|
|
if (code_string == "jw")
|
|
|
|
return "jv";
|
2021-05-23 16:39:40 +00:00
|
|
|
|
2021-12-30 02:14:57 +00:00
|
|
|
if (code_string == "in")
|
|
|
|
return "id";
|
|
|
|
|
|
|
|
if (code_string == "mo")
|
|
|
|
return "ro";
|
|
|
|
|
|
|
|
// Some languages do not have 2 letter codes, for example code for Cebuano is ceb
|
|
|
|
if (code_string.size() != 2)
|
|
|
|
return "other";
|
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
return code_string;
|
2021-05-23 16:39:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void vector(
|
|
|
|
const ColumnString::Chars & data,
|
|
|
|
const ColumnString::Offsets & offsets,
|
|
|
|
ColumnString::Chars & res_data,
|
|
|
|
ColumnString::Offsets & res_offsets)
|
|
|
|
{
|
2022-01-10 15:36:32 +00:00
|
|
|
/// Constant 3 is based on the fact that in general we need 2 characters for ISO code + 1 zero byte
|
|
|
|
res_data.reserve(offsets.size() * 3);
|
2021-05-23 16:39:40 +00:00
|
|
|
res_offsets.resize(offsets.size());
|
|
|
|
|
2022-01-17 10:01:06 +00:00
|
|
|
bool is_reliable;
|
2021-05-23 16:39:40 +00:00
|
|
|
size_t res_offset = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < offsets.size(); ++i)
|
|
|
|
{
|
2022-01-18 21:32:32 +00:00
|
|
|
const UInt8 * str = data.data() + offsets[i - 1];
|
2022-01-10 15:36:32 +00:00
|
|
|
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
|
2021-12-30 02:14:57 +00:00
|
|
|
|
2022-01-18 21:32:32 +00:00
|
|
|
std::string_view res;
|
|
|
|
|
|
|
|
if (UTF8::isValidUTF8(str, str_len))
|
|
|
|
{
|
|
|
|
auto lang = CLD2::DetectLanguage(reinterpret_cast<const char *>(str), str_len, true, &is_reliable);
|
|
|
|
res = codeISO(LanguageCode(lang));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
res = "un";
|
|
|
|
}
|
2021-05-23 16:39:40 +00:00
|
|
|
|
2021-12-30 03:35:37 +00:00
|
|
|
res_data.resize(res_offset + res.size() + 1);
|
|
|
|
memcpy(&res_data[res_offset], res.data(), res.size());
|
2021-05-23 16:39:40 +00:00
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
res_data[res_offset + res.size()] = 0;
|
|
|
|
res_offset += res.size() + 1;
|
2021-05-23 16:39:40 +00:00
|
|
|
|
|
|
|
res_offsets[i] = res_offset;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
class FunctionDetectLanguageMixed : public IFunction
|
2021-05-23 16:39:40 +00:00
|
|
|
{
|
2021-12-30 02:14:57 +00:00
|
|
|
public:
|
|
|
|
static constexpr auto name = "detectLanguageMixed";
|
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
/// Number of top results
|
|
|
|
static constexpr auto top_N = 3;
|
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
static FunctionPtr create(ContextPtr context)
|
|
|
|
{
|
|
|
|
if (!context->getSettingsRef().allow_experimental_nlp_functions)
|
|
|
|
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
|
|
|
|
"Natural language processing function '{}' is experimental. Set `allow_experimental_nlp_functions` setting to enable it", name);
|
|
|
|
|
|
|
|
return std::make_shared<FunctionDetectLanguageMixed>();
|
|
|
|
}
|
2021-12-30 02:14:57 +00:00
|
|
|
|
|
|
|
String getName() const override { return name; }
|
|
|
|
|
|
|
|
size_t getNumberOfArguments() const override { return 1; }
|
|
|
|
|
|
|
|
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
|
|
|
|
|
|
|
bool useDefaultImplementationForConstants() const override { return true; }
|
|
|
|
|
|
|
|
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
|
|
|
{
|
|
|
|
if (!isString(arguments[0]))
|
2022-01-12 16:32:17 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
|
|
|
"Illegal type {} of argument of function {}. Must be String.",
|
|
|
|
arguments[0]->getName(), getName());
|
2021-12-30 02:14:57 +00:00
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeFloat32>());
|
2021-12-30 02:14:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
|
|
|
|
{
|
|
|
|
const auto & column = arguments[0].column;
|
|
|
|
const ColumnString * col = checkAndGetColumn<ColumnString>(column.get());
|
|
|
|
|
|
|
|
if (!col)
|
|
|
|
throw Exception(
|
|
|
|
"Illegal columns " + arguments[0].column->getName() + " of arguments of function " + getName(),
|
|
|
|
ErrorCodes::ILLEGAL_COLUMN);
|
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
const auto & input_data = col->getChars();
|
|
|
|
const auto & input_offsets = col->getOffsets();
|
2021-12-30 02:14:57 +00:00
|
|
|
|
|
|
|
/// Create and fill the result map.
|
|
|
|
|
|
|
|
const auto & result_type_map = static_cast<const DataTypeMap &>(*result_type);
|
|
|
|
const DataTypePtr & key_type = result_type_map.getKeyType();
|
|
|
|
const DataTypePtr & value_type = result_type_map.getValueType();
|
|
|
|
|
|
|
|
MutableColumnPtr keys_data = key_type->createColumn();
|
|
|
|
MutableColumnPtr values_data = value_type->createColumn();
|
|
|
|
MutableColumnPtr offsets = DataTypeNumber<IColumn::Offset>().createColumn();
|
|
|
|
|
2022-01-10 15:36:32 +00:00
|
|
|
size_t total_elements = input_rows_count * top_N;
|
2021-12-30 02:14:57 +00:00
|
|
|
keys_data->reserve(total_elements);
|
|
|
|
values_data->reserve(total_elements);
|
|
|
|
offsets->reserve(input_rows_count);
|
|
|
|
|
2022-01-17 10:01:06 +00:00
|
|
|
bool is_reliable;
|
2022-01-10 15:36:32 +00:00
|
|
|
CLD2::Language result_lang_top3[top_N];
|
|
|
|
int32_t pc[top_N];
|
|
|
|
int bytes[top_N];
|
2021-12-30 02:14:57 +00:00
|
|
|
|
|
|
|
IColumn::Offset current_offset = 0;
|
|
|
|
for (size_t i = 0; i < input_rows_count; ++i)
|
|
|
|
{
|
2022-01-18 21:32:32 +00:00
|
|
|
const UInt8 * str = input_data.data() + input_offsets[i - 1];
|
2021-12-30 02:14:57 +00:00
|
|
|
const size_t str_len = input_offsets[i] - input_offsets[i - 1] - 1;
|
|
|
|
|
2022-01-18 21:32:32 +00:00
|
|
|
if (UTF8::isValidUTF8(str, str_len))
|
|
|
|
{
|
|
|
|
CLD2::DetectLanguageSummary(reinterpret_cast<const char *>(str), str_len, true, result_lang_top3, pc, bytes, &is_reliable);
|
|
|
|
|
|
|
|
for (size_t j = 0; j < top_N; ++j)
|
|
|
|
{
|
2022-01-19 10:31:04 +00:00
|
|
|
if (pc[j] == 0)
|
2022-01-18 21:32:32 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
auto res_str = FunctionDetectLanguageImpl::codeISO(LanguageCode(result_lang_top3[j]));
|
|
|
|
Float32 res_float = static_cast<Float32>(pc[j]) / 100;
|
2021-12-30 02:14:57 +00:00
|
|
|
|
2022-01-18 21:32:32 +00:00
|
|
|
keys_data->insertData(res_str.data(), res_str.size());
|
|
|
|
values_data->insertData(reinterpret_cast<const char *>(&res_float), sizeof(res_float));
|
|
|
|
++current_offset;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
2021-12-30 02:14:57 +00:00
|
|
|
{
|
2022-01-18 21:32:32 +00:00
|
|
|
std::string_view res_str = "un";
|
|
|
|
Float32 res_float = 0;
|
2021-12-30 02:14:57 +00:00
|
|
|
|
|
|
|
keys_data->insertData(res_str.data(), res_str.size());
|
2022-01-10 15:36:32 +00:00
|
|
|
values_data->insertData(reinterpret_cast<const char *>(&res_float), sizeof(res_float));
|
2022-01-18 21:32:32 +00:00
|
|
|
++current_offset;
|
2021-12-30 02:14:57 +00:00
|
|
|
}
|
|
|
|
offsets->insert(current_offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
auto nested_column = ColumnArray::create(
|
|
|
|
ColumnTuple::create(Columns{std::move(keys_data), std::move(values_data)}),
|
|
|
|
std::move(offsets));
|
|
|
|
|
|
|
|
return ColumnMap::create(nested_column);
|
|
|
|
}
|
2021-05-23 16:39:40 +00:00
|
|
|
};
|
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
/// Name tag for the `detectLanguage` function; passed as a template argument to FunctionTextClassificationString.
struct NameDetectLanguage
{
    static constexpr auto name = "detectLanguage";
};
|
|
|
|
|
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
/// The `detectLanguage` function: the generic text-classification wrapper instantiated
/// with the CLD2-based implementation and the function name tag.
using FunctionDetectLanguage = FunctionTextClassificationString<FunctionDetectLanguageImpl, NameDetectLanguage>;
|
2021-05-23 16:39:40 +00:00
|
|
|
|
2022-01-12 16:32:17 +00:00
|
|
|
/// Registers the two language-detection functions in `factory`:
/// FunctionDetectLanguage first, then FunctionDetectLanguageMixed.
void registerFunctionsDetectLanguage(FunctionFactory & factory)
{
    factory.registerFunction<FunctionDetectLanguage>();
    factory.registerFunction<FunctionDetectLanguageMixed>();
}
|
|
|
|
|
|
|
|
}
|
2021-05-31 15:30:30 +00:00
|
|
|
#endif
|