Add nfd and perf test

This commit is contained in:
Viachaslau Boben 2021-09-27 18:45:04 +03:00
parent b3325772f7
commit 762904adbd
4 changed files with 148 additions and 44 deletions

View File

@ -1,6 +1,10 @@
#if !defined(ARCADIA_BUILD)
# include "config_core.h"
#endif
#if USE_ICU
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <unicode/normalizer2.h>
#include <unicode/rep.h>
#include <unicode/unistr.h>
#include <unicode/unorm2.h>
@ -15,12 +19,67 @@ namespace DB
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int CANNOT_NORMALIZE_STRING;
}
namespace
{
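// Maximum expansion factors for the different normalization forms, expressed in UTF-16/UTF-32 code units
// (the FAQ below lists the same factors for both encodings). ICU normalizes UTF-16 (UChar) buffers,
// so these factors bound the size of the intermediate buffer.
// https://unicode.org/faq/normalization.html#12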
/// Traits for the NFC normalization form, consumed by NormalizeUTF8Impl.
struct NormalizeNFCImpl
{
    /// SQL-visible name of the function.
    static constexpr const char * name = "normalizeUTF8NFC";

    /// Multiplier applied to the input length when sizing the normalization output buffer.
    static constexpr int expansionFactor = 3;

    /// Returns ICU's NFC normalizer; failures are reported through `status`,
    /// which the caller is expected to check with U_FAILURE.
    static const UNormalizer2 * getNormalizer(UErrorCode * status)
    {
        const UNormalizer2 * instance = unorm2_getNFCInstance(status);
        return instance;
    }
};
/// Traits for the NFD normalization form, consumed by NormalizeUTF8Impl.
struct NormalizeNFDImpl
{
    /// SQL-visible name of the function.
    static constexpr const char * name = "normalizeUTF8NFD";

    /// Multiplier applied to the input length when sizing the normalization output buffer.
    static constexpr int expansionFactor = 4;

    /// Returns ICU's NFD normalizer; failures are reported through `status`,
    /// which the caller is expected to check with U_FAILURE.
    static const UNormalizer2 * getNormalizer(UErrorCode * status)
    {
        const UNormalizer2 * instance = unorm2_getNFDInstance(status);
        return instance;
    }
};
/// Traits for the NFKC normalization form, consumed by NormalizeUTF8Impl.
struct NormalizeNFKCImpl
{
    /// SQL-visible name of the function.
    static constexpr const char * name = "normalizeUTF8NFKC";

    /// Multiplier applied to the input length when sizing the normalization output buffer.
    static constexpr int expansionFactor = 18;

    /// Returns ICU's NFKC normalizer; failures are reported through `status`,
    /// which the caller is expected to check with U_FAILURE.
    static const UNormalizer2 * getNormalizer(UErrorCode * status)
    {
        const UNormalizer2 * instance = unorm2_getNFKCInstance(status);
        return instance;
    }
};
/// Traits for the NFKD normalization form, consumed by NormalizeUTF8Impl.
struct NormalizeNFKDImpl
{
    /// SQL-visible name of the function.
    static constexpr const char * name = "normalizeUTF8NFKD";

    /// Multiplier applied to the input length when sizing the normalization output buffer.
    static constexpr int expansionFactor = 18;

    /// Returns ICU's NFKD normalizer; failures are reported through `status`,
    /// which the caller is expected to check with U_FAILURE.
    static const UNormalizer2 * getNormalizer(UErrorCode * status)
    {
        const UNormalizer2 * instance = unorm2_getNFKDInstance(status);
        return instance;
    }
};
template<typename NormalizeImpl>
struct NormalizeUTF8Impl
{
@ -31,10 +90,9 @@ struct NormalizeUTF8Impl
{
UErrorCode err = U_ZERO_ERROR;
const UNormalizer2 *normalizer = unorm2_getNFCInstance(&err);
if (U_FAILURE(err)) {
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
}
const UNormalizer2 *normalizer = NormalizeImpl::getNormalizer(&err);
if (U_FAILURE(err))
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (getNormalizer): {}", u_errorName(err));
size_t size = offsets.size();
res_offsets.resize(size);
@ -60,13 +118,10 @@ struct NormalizeUTF8Impl
reinterpret_cast<const char*>(&data[current_from_offset]),
from_size,
&err);
if (U_FAILURE(err)) {
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
}
if (U_FAILURE(err))
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strFromUTF8): {}", u_errorName(err));
// NFC should produce no more than 3x code points
// https://unicode.org/faq/normalization.html#12
to_uchars.resize(from_code_points * 3 + 1);
to_uchars.resize(from_code_points * NormalizeImpl::expansionFactor + 1);
int32_t to_code_points = unorm2_normalize(
normalizer,
@ -75,14 +130,12 @@ struct NormalizeUTF8Impl
to_uchars.data(),
to_uchars.size(),
&err);
if (U_FAILURE(err)) {
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
}
if (U_FAILURE(err))
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (normalize): {}", u_errorName(err));
size_t max_to_size = current_to_offset + 2 * to_code_points + 1;
if (res_data.size() < max_to_size) {
size_t max_to_size = current_to_offset + 4 * to_code_points + 1;
if (res_data.size() < max_to_size)
res_data.resize(max_to_size);
}
int32_t to_size;
u_strToUTF8(
@ -92,9 +145,8 @@ struct NormalizeUTF8Impl
to_uchars.data(),
to_code_points,
&err);
if (U_FAILURE(err)) {
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
}
if (U_FAILURE(err))
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strToUTF8): {}", u_errorName(err));
current_to_offset += to_size;
res_data[current_to_offset] = 0;
@ -111,16 +163,20 @@ struct NormalizeUTF8Impl
}
};
struct NameNormalizeUTF8
/// One string-to-string function type per normalization form. Each alias passes the same
/// form-specific struct twice: once to select the normalizer inside NormalizeUTF8Impl, and once
/// as the second FunctionStringToString argument (presumably the source of the function name,
/// since each struct declares a static `name`).
using FunctionNormalizeUTF8NFC = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFCImpl>, NormalizeNFCImpl>;
using FunctionNormalizeUTF8NFD = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFDImpl>, NormalizeNFDImpl>;
using FunctionNormalizeUTF8NFKC = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFKCImpl>, NormalizeNFKCImpl>;
using FunctionNormalizeUTF8NFKD = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFKDImpl>, NormalizeNFKDImpl>;
}
void registerFunctionNormalizeUTF8(FunctionFactory & factory)
{
static constexpr auto name = "normalizeUTF8";
};
using FunctionNormalizeUTF8 = FunctionStringToString<NormalizeUTF8Impl, NameNormalizeUTF8>;
}
void registerFunctionNormalizeUTF8(FunctionFactory & factory) {
factory.registerFunction<FunctionNormalizeUTF8>();
factory.registerFunction<FunctionNormalizeUTF8NFC>();
factory.registerFunction<FunctionNormalizeUTF8NFD>();
factory.registerFunction<FunctionNormalizeUTF8NFKC>();
factory.registerFunction<FunctionNormalizeUTF8NFKD>();
}
}
#endif

View File

@ -0,0 +1,15 @@
<test>
<preconditions>
<table_exists>hits_10m_single</table_exists>
</preconditions>
<create_query>CREATE TABLE strings (words String) ENGINE Memory</create_query>
<fill_query>INSERT INTO strings SELECT SearchPhrase FROM hits_10m_single WHERE length(SearchPhrase) > 0</fill_query>
<query>SELECT normalizeUTF8NFC(words) FROM strings FORMAT Null</query>
<query>SELECT normalizeUTF8NFD(words) FROM strings FORMAT Null</query>
<query>SELECT normalizeUTF8NFKC(words) FROM strings FORMAT Null</query>
<query>SELECT normalizeUTF8NFKD(words) FROM strings FORMAT Null</query>
<drop_query>DROP TABLE IF EXISTS strings</drop_query>
</test>

View File

@ -1,3 +1,11 @@
ё ё 2 4 ё ё 2 2
ё 4 ё 2
ё 2 ё 2
1 ё 4 ё 2 ё 4 ё 2 ё 4
2 ё 2 ё 2 ё 4 ё 2 ё 4
3 జ్ఞ‌ా 15 జ్ఞ‌ా 15 జ్ఞ‌ా 15 జ్ఞ‌ా 15 జ్ఞ‌ా 15
4 本気ですか 15 本気ですか 15 本気ですか 18 本気ですか 15 本気ですか 18
5 ﷺ 3 ﷺ 3 ﷺ 3 صلى الله عليه وسلم 33 صلى الله عليه وسلم 33
6 ᾂ 3 ᾂ 3 ᾂ 8 ᾂ 3 ᾂ 8
7 ΐ 2 ΐ 2 ΐ 6 ΐ 2 ΐ 6
8 שּׁ 6 שּׁ 6 שּׁ 6 שּׁ 6 שּׁ 6
9 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12
10 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 281 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 281 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 282 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 281 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 282

View File

@ -1,19 +1,44 @@
-- Tags: no-fasttest
DROP TABLE IF EXISTS normalize_test;
CREATE TABLE normalize_test (value String) ENGINE = MergeTree ORDER BY value;
CREATE TABLE normalize_test (id int, value String) ENGINE = MergeTree ORDER BY value;
SELECT
'ё' AS norm,
'ё' AS denorm,
length(norm),
length(denorm),
normalizeUTF8(norm),
normalizeUTF8(denorm),
length(normalizeUTF8(norm)),
length(normalizeUTF8(denorm));
'ё' AS norm, 'ё' AS denorm,
length(norm), length(denorm),
normalizeUTF8NFC(norm) AS norm_nfc,
normalizeUTF8NFC(denorm) AS denorm_nfc,
length(norm_nfc),
length(denorm_nfc);
INSERT INTO normalize_test (value) VALUES ('ё');
INSERT INTO normalize_test (value) VALUES ('ё');
SELECT value, length(value), normalizeUTF8(value) AS normalized, length(normalized) FROM normalize_test;
INSERT INTO normalize_test (id, value) VALUES (1, 'ё');
INSERT INTO normalize_test (id, value) VALUES (2, 'ё');
INSERT INTO normalize_test (id, value) VALUES (3, 'జ్ఞ‌ా');
INSERT INTO normalize_test (id, value) VALUES (4, '本気ですか');
INSERT INTO normalize_test (id, value) VALUES (5, '');
INSERT INTO normalize_test (id, value) VALUES (6, '');
INSERT INTO normalize_test (id, value) VALUES (7, 'ΐ');
INSERT INTO normalize_test (id, value) VALUES (8, 'שּׁ');
INSERT INTO normalize_test (id, value) VALUES (9, '𝅘𝅥𝅮');
SELECT char(228) AS value, normalizeUTF8(value); -- { serverError 619 }
INSERT INTO normalize_test (id, value) VALUES (10, 'Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒');
SELECT
id, value, length(value),
normalizeUTF8NFC(value) AS nfc, length(nfc) AS nfc_len,
normalizeUTF8NFD(value) AS nfd, length(nfd) AS nfd_len,
normalizeUTF8NFKC(value) AS nfkc, length(nfkc) AS nfkc_len,
normalizeUTF8NFKD(value) AS nfkd, length(nfkd) AS nfkd_len
FROM normalize_test
ORDER BY id;
SELECT char(228) AS value, normalizeUTF8NFC(value); -- { serverError 621 }
SELECT char(228) AS value, normalizeUTF8NFD(value); -- { serverError 621 }
SELECT char(228) AS value, normalizeUTF8NFKC(value); -- { serverError 621 }
SELECT char(228) AS value, normalizeUTF8NFKD(value); -- { serverError 621 }