2018-09-09 23:36:06 +00:00
|
|
|
#include <DataTypes/DataTypeString.h>
|
|
|
|
#include <Functions/FunctionFactory.h>
|
|
|
|
#include <Functions/FunctionStringOrArrayToT.h>
|
|
|
|
#include <Common/UTF8Helpers.h>
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
|
|
|
}
|
|
|
|
|
2020-09-07 18:00:37 +00:00
|
|
|
namespace
|
|
|
|
{
|
2018-09-09 23:36:06 +00:00
|
|
|
|
|
|
|
/** If the string is UTF-8 encoded text, it returns the length of the text in code points.
|
|
|
|
* (not in characters: the length of the text "ё" can be either 1 or 2, depending on the normalization)
|
|
|
|
* Otherwise, the behavior is undefined.
|
|
|
|
*/
|
|
|
|
struct LengthUTF8Impl
|
|
|
|
{
|
|
|
|
static constexpr auto is_fixed_to_constant = false;
|
|
|
|
|
2018-11-25 00:08:50 +00:00
|
|
|
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt64> & res)
|
2018-09-09 23:36:06 +00:00
|
|
|
{
|
|
|
|
size_t size = offsets.size();
|
|
|
|
|
|
|
|
ColumnString::Offset prev_offset = 0;
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
res[i] = UTF8::countCodePoints(&data[prev_offset], offsets[i] - prev_offset - 1);
|
|
|
|
prev_offset = offsets[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-23 02:12:31 +00:00
|
|
|
static void vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt64 & /*res*/)
|
2018-09-09 23:36:06 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2020-03-23 02:12:31 +00:00
|
|
|
static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt64> & res)
|
2018-09-09 23:36:06 +00:00
|
|
|
{
|
|
|
|
size_t size = data.size() / n;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
res[i] = UTF8::countCodePoints(&data[i * n], n);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-07-08 00:16:39 +00:00
|
|
|
[[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt64> &)
|
2018-09-09 23:36:06 +00:00
|
|
|
{
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function lengthUTF8 to Array argument");
|
2018-09-09 23:36:06 +00:00
|
|
|
}
|
2021-07-04 14:26:09 +00:00
|
|
|
|
2021-07-04 15:55:22 +00:00
|
|
|
[[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<UInt64> &)
|
2021-07-04 14:26:09 +00:00
|
|
|
{
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function lengthUTF8 to UUID argument");
|
2021-07-04 14:26:09 +00:00
|
|
|
}
|
2023-01-30 20:37:08 +00:00
|
|
|
|
|
|
|
[[noreturn]] static void ipv6(const ColumnIPv6::Container &, size_t &, PaddedPODArray<UInt64> &)
|
|
|
|
{
|
|
|
|
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function lengthUTF8 to IPv6 argument");
|
|
|
|
}
|
|
|
|
|
|
|
|
[[noreturn]] static void ipv4(const ColumnIPv4::Container &, size_t &, PaddedPODArray<UInt64> &)
|
|
|
|
{
|
|
|
|
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function lengthUTF8 to IPv4 argument");
|
|
|
|
}
|
2018-09-09 23:36:06 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/// Name tag: carries the function's name as a compile-time string constant.
struct NameLengthUTF8
{
    static constexpr const char * name = "lengthUTF8";
};
|
|
|
|
/// The lengthUTF8 function: the generic FunctionStringOrArrayToT template instantiated with
/// the code point counting implementation, its name tag, and UInt64 (the element type the
/// implementation writes its results as).
using FunctionLengthUTF8 = FunctionStringOrArrayToT<LengthUTF8Impl, NameLengthUTF8, UInt64>;
|
|
|
|
|
2020-09-07 18:00:37 +00:00
|
|
|
}
|
|
|
|
|
2022-07-04 07:01:39 +00:00
|
|
|
REGISTER_FUNCTION(LengthUTF8)
{
    /// The function itself is registered first, the aliases that point at it afterwards.
    factory.registerFunction<FunctionLengthUTF8>();

    /// Compatibility aliases, registered case-insensitively in the order listed.
    static constexpr const char * compatibility_aliases[] = {"CHAR_LENGTH", "CHARACTER_LENGTH"};

    for (const char * alias : compatibility_aliases)
        factory.registerAlias(alias, "lengthUTF8", FunctionFactory::CaseInsensitive);
}
|
|
|
|
|
|
|
|
}
|