ClickHouse/src/Functions/normalizeString.cpp

183 lines
5.4 KiB
C++

#include "config.h"
#if USE_ICU
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <unicode/rep.h>
#include <unicode/unistr.h>
#include <unicode/unorm2.h>
#include <unicode/ustring.h>
#include <unicode/utypes.h>
#include <Common/logger_useful.h>
#include <Columns/ColumnString.h>
#include <Parsers/IAST_fwd.h>
namespace DB
{
/// Error codes used in this file. They are only declared here (extern);
/// their definitions live elsewhere in the project.
namespace ErrorCodes
{
/// Thrown by vectorFixed(): the functions do not accept fixed strings.
extern const int ILLEGAL_COLUMN;
/// Thrown when obtaining the ICU normalizer or any UTF-8 <-> UTF-16 / normalization step fails.
extern const int CANNOT_NORMALIZE_STRING;
}
namespace
{
// Maximum expansion factors for the different normalization forms, see
// https://unicode.org/faq/normalization.html#12
// The factors are counted in UTF-16 code units, because ICU normalizes UTF-16 (UChar) buffers;
// the FAQ lists the same values for UTF-16 and UTF-32.
struct NormalizeNFCImpl
{
static constexpr auto name = "normalizeUTF8NFC";
static constexpr auto expansionFactor = 3;
static const UNormalizer2 *getNormalizer(UErrorCode *err)
{
return unorm2_getNFCInstance(err);
}
};
struct NormalizeNFDImpl
{
static constexpr auto name = "normalizeUTF8NFD";
static constexpr auto expansionFactor = 4;
static const UNormalizer2 *getNormalizer(UErrorCode *err)
{
return unorm2_getNFDInstance(err);
}
};
struct NormalizeNFKCImpl
{
static constexpr auto name = "normalizeUTF8NFKC";
static constexpr auto expansionFactor = 18;
static const UNormalizer2 *getNormalizer(UErrorCode *err)
{
return unorm2_getNFKCInstance(err);
}
};
struct NormalizeNFKDImpl
{
static constexpr auto name = "normalizeUTF8NFKD";
static constexpr auto expansionFactor = 18;
static const UNormalizer2 *getNormalizer(UErrorCode *err)
{
return unorm2_getNFKDInstance(err);
}
};
/// Row-wise Unicode normalization of a string column.
///
/// NormalizeImpl supplies the normalization form: `getNormalizer()` returns the ICU
/// normalizer and `expansionFactor` bounds how much a string may grow when normalized.
/// Each row is converted UTF-8 -> UTF-16, normalized by ICU, and converted back to UTF-8.
template<typename NormalizeImpl>
struct NormalizeUTF8Impl
{
    /// Normalizes every row of the input column.
    ///
    /// @param data              concatenated bytes of all input rows
    /// @param offsets           offsets[i] is the end of row i in `data`
    /// @param res_data          receives the concatenated normalized rows
    /// @param res_offsets       receives the end offset of every output row
    /// @param input_rows_count  number of rows to process
    ///
    /// @throws Exception(CANNOT_NORMALIZE_STRING) if the normalizer cannot be obtained,
    ///         if any ICU conversion/normalization step fails, or if a row is too long
    ///         for ICU's 32-bit length parameters.
    static void vector(const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets,
        size_t input_rows_count)
    {
        UErrorCode err = U_ZERO_ERROR;

        const UNormalizer2 * normalizer = NormalizeImpl::getNormalizer(&err);
        if (U_FAILURE(err))
            throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (getNormalizer): {}", u_errorName(err));

        /// ICU's C API takes every length and capacity as int32_t. All buffer sizes below are
        /// therefore computed in size_t and checked against this limit before being narrowed,
        /// instead of silently overflowing in 32-bit signed arithmetic.
        constexpr size_t max_icu_size = static_cast<size_t>(INT32_MAX);

        res_offsets.resize(input_rows_count);

        /// Initial guess only; the buffer is grown per row below when needed.
        res_data.reserve(data.size() * 2);

        ColumnString::Offset current_from_offset = 0;
        ColumnString::Offset current_to_offset = 0;

        /// UTF-16 scratch buffers, reused across rows to avoid reallocations.
        PODArray<UChar> from_uchars;
        PODArray<UChar> to_uchars;

        for (size_t i = 0; i < input_rows_count; ++i)
        {
            /// The `- 1` excludes the row's terminating zero byte
            /// (the same terminator is written to the output at the end of the iteration).
            const size_t from_size = offsets[i] - current_from_offset - 1;

            /// The UTF-16 buffer needs from_size + 1 units, which must still fit into int32_t.
            if (from_size >= max_icu_size)
                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING,
                    "Normalization failed (strFromUTF8): string of {} bytes is too long", from_size);

            /// A UTF-16 string never has more code units than its UTF-8 form has bytes; +1 for the terminator.
            from_uchars.resize(from_size + 1);

            /// Despite the name, this counts UTF-16 code units (what ICU reports as the string length).
            int32_t from_code_points = 0;
            u_strFromUTF8(
                from_uchars.data(),
                static_cast<int32_t>(from_uchars.size()),
                &from_code_points,
                reinterpret_cast<const char *>(&data[current_from_offset]),
                static_cast<int32_t>(from_size),
                &err);
            if (U_FAILURE(err))
                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strFromUTF8): {}", u_errorName(err));

            /// Worst-case size of the normalized string, computed without 32-bit overflow.
            const size_t to_uchars_size
                = static_cast<size_t>(from_code_points) * static_cast<size_t>(NormalizeImpl::expansionFactor) + 1;
            if (to_uchars_size > max_icu_size)
                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING,
                    "Normalization failed (normalize): string of {} UTF-16 code units is too long", from_code_points);

            to_uchars.resize(to_uchars_size);

            const int32_t to_code_points = unorm2_normalize(
                normalizer,
                from_uchars.data(),
                from_code_points,
                to_uchars.data(),
                static_cast<int32_t>(to_uchars.size()),
                &err);
            if (U_FAILURE(err))
                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (normalize): {}", u_errorName(err));

            /// Upper bound for the UTF-8 form of this row: 4 bytes per UTF-16 code unit plus the terminator.
            const size_t to_capacity = 4 * static_cast<size_t>(to_code_points) + 1;
            if (to_capacity > max_icu_size)
                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING,
                    "Normalization failed (strToUTF8): normalized string of {} UTF-16 code units is too long", to_code_points);

            const size_t max_to_size = current_to_offset + to_capacity;
            if (res_data.size() < max_to_size)
                res_data.resize(max_to_size);

            /// Write the row directly into the result column. The capacity passed to ICU is this
            /// row's own bound, which is always available after the resize above.
            int32_t to_size = 0;
            u_strToUTF8(
                reinterpret_cast<char *>(&res_data[current_to_offset]),
                static_cast<int32_t>(to_capacity),
                &to_size,
                to_uchars.data(),
                to_code_points,
                &err);
            if (U_FAILURE(err))
                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strToUTF8): {}", u_errorName(err));

            /// Terminate the output row with a zero byte and record its end offset.
            current_to_offset += to_size;
            res_data[current_to_offset] = 0;
            ++current_to_offset;
            res_offsets[i] = current_to_offset;

            current_from_offset = offsets[i];
        }

        /// Drop the unused tail left over from the worst-case size estimates.
        res_data.resize(current_to_offset);
    }

    /// Fixed strings are not supported: always throws Exception(ILLEGAL_COLUMN).
    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &, size_t)
    {
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot apply function normalizeUTF8 to fixed string.");
    }
};
using FunctionNormalizeUTF8NFC = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFCImpl>, NormalizeNFCImpl>;
using FunctionNormalizeUTF8NFD = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFDImpl>, NormalizeNFDImpl>;
using FunctionNormalizeUTF8NFKC = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFKCImpl>, NormalizeNFKCImpl>;
using FunctionNormalizeUTF8NFKD = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFKDImpl>, NormalizeNFKDImpl>;
}
/// Registers the four normalizeUTF8* functions (NFC, NFD, NFKC, NFKD) in the function factory.
/// NOTE(review): `factory` is not declared in this file; presumably it is provided by the
/// REGISTER_FUNCTION macro — confirm against its definition.
REGISTER_FUNCTION(NormalizeUTF8)
{
factory.registerFunction<FunctionNormalizeUTF8NFC>();
factory.registerFunction<FunctionNormalizeUTF8NFD>();
factory.registerFunction<FunctionNormalizeUTF8NFKC>();
factory.registerFunction<FunctionNormalizeUTF8NFKD>();
}
}
#endif