From 762904adbda95dc24b771250b1f32ccd404db739 Mon Sep 17 00:00:00 2001 From: Viachaslau Boben Date: Mon, 27 Sep 2021 18:45:04 +0300 Subject: [PATCH] Add nfd and perf test --- src/Functions/normalizeString.cpp | 114 +++++++++++++----- tests/performance/normalize_utf8.xml | 15 +++ .../02011_normalize_utf8.reference | 12 +- .../0_stateless/02011_normalize_utf8.sql | 51 ++++++-- 4 files changed, 148 insertions(+), 44 deletions(-) create mode 100644 tests/performance/normalize_utf8.xml diff --git a/src/Functions/normalizeString.cpp b/src/Functions/normalizeString.cpp index 178c2dc2cf1..5beca566cd1 100644 --- a/src/Functions/normalizeString.cpp +++ b/src/Functions/normalizeString.cpp @@ -1,6 +1,10 @@ +#if !defined(ARCADIA_BUILD) +# include "config_core.h" +#endif + +#if USE_ICU #include #include -#include #include #include #include @@ -15,12 +19,67 @@ namespace DB namespace ErrorCodes { + extern const int ILLEGAL_COLUMN; extern const int CANNOT_NORMALIZE_STRING; } namespace { +// Expansion factors are specified for UTF-32, since icu uses UTF-32 for normalization +// Maximum expansion factors for different normalization forms +// https://unicode.org/faq/normalization.html#12 + +struct NormalizeNFCImpl +{ + static constexpr auto name = "normalizeUTF8NFC"; + + static constexpr auto expansionFactor = 3; + + static const UNormalizer2 *getNormalizer(UErrorCode *err) + { + return unorm2_getNFCInstance(err); + } +}; + +struct NormalizeNFDImpl +{ + static constexpr auto name = "normalizeUTF8NFD"; + + static constexpr auto expansionFactor = 4; + + static const UNormalizer2 *getNormalizer(UErrorCode *err) + { + return unorm2_getNFDInstance(err); + } +}; + +struct NormalizeNFKCImpl +{ + static constexpr auto name = "normalizeUTF8NFKC"; + + static constexpr auto expansionFactor = 18; + + static const UNormalizer2 *getNormalizer(UErrorCode *err) + { + return unorm2_getNFKCInstance(err); + } +}; + + +struct NormalizeNFKDImpl +{ + static constexpr auto name = "normalizeUTF8NFKD"; + + static constexpr auto expansionFactor = 18; + + static const UNormalizer2 *getNormalizer(UErrorCode *err) + { + return unorm2_getNFKDInstance(err); + } +}; + +template struct NormalizeUTF8Impl { @@ -31,10 +90,9 @@ struct NormalizeUTF8Impl { UErrorCode err = U_ZERO_ERROR; - const UNormalizer2 *normalizer = unorm2_getNFCInstance(&err); - if (U_FAILURE(err)) { - throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err)); - } + const UNormalizer2 *normalizer = NormalizeImpl::getNormalizer(&err); + if (U_FAILURE(err)) + throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (getNormalizer): {}", u_errorName(err)); size_t size = offsets.size(); res_offsets.resize(size); @@ -60,13 +118,10 @@ struct NormalizeUTF8Impl reinterpret_cast(&data[current_from_offset]), from_size, &err); - if (U_FAILURE(err)) { - throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err)); - } + if (U_FAILURE(err)) + throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strFromUTF8): {}", u_errorName(err)); - // NFC should produce no more than 3x code points - // https://unicode.org/faq/normalization.html#12 - to_uchars.resize(from_code_points * 3 + 1); + to_uchars.resize(from_code_points * NormalizeImpl::expansionFactor + 1); int32_t to_code_points = unorm2_normalize( normalizer, @@ -75,14 +130,12 @@ struct NormalizeUTF8Impl to_uchars.data(), to_uchars.size(), &err); - if (U_FAILURE(err)) { - throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err)); - } + if (U_FAILURE(err)) + throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (normalize): {}", u_errorName(err)); - size_t max_to_size = current_to_offset + 2 * to_code_points + 1; - if (res_data.size() < max_to_size) { + size_t max_to_size = current_to_offset + 4 * to_code_points + 1; + if (res_data.size() < max_to_size) res_data.resize(max_to_size); - } int32_t to_size; u_strToUTF8( @@ -92,9 +145,8 @@ struct NormalizeUTF8Impl to_uchars.data(), to_code_points, &err); - if (U_FAILURE(err)) { - throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err)); - } + if (U_FAILURE(err)) + throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strToUTF8): {}", u_errorName(err)); current_to_offset += to_size; res_data[current_to_offset] = 0; @@ -111,16 +163,20 @@ struct NormalizeUTF8Impl } }; -struct NameNormalizeUTF8 +using FunctionNormalizeUTF8NFC = FunctionStringToString, NormalizeNFCImpl>; +using FunctionNormalizeUTF8NFD = FunctionStringToString, NormalizeNFDImpl>; +using FunctionNormalizeUTF8NFKC = FunctionStringToString, NormalizeNFKCImpl>; +using FunctionNormalizeUTF8NFKD = FunctionStringToString, NormalizeNFKDImpl>; +} + +void registerFunctionNormalizeUTF8(FunctionFactory & factory) { - static constexpr auto name = "normalizeUTF8"; -}; - -using FunctionNormalizeUTF8 = FunctionStringToString; -} - -void registerFunctionNormalizeUTF8(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); } } + +#endif diff --git a/tests/performance/normalize_utf8.xml b/tests/performance/normalize_utf8.xml new file mode 100644 index 00000000000..de9bd87fdf8 --- /dev/null +++ b/tests/performance/normalize_utf8.xml @@ -0,0 +1,15 @@ + + + hits_10m_single + + + CREATE TABLE strings (words String) ENGINE Memory + INSERT INTO strings SELECT SearchPhrase FROM hits_10m_single WHERE length(SearchPhrase) > 0 + + SELECT normalizeUTF8NFC(words) FROM strings FORMAT Null + SELECT normalizeUTF8NFD(words) FROM strings FORMAT Null + SELECT normalizeUTF8NFKC(words) FROM strings FORMAT Null + SELECT normalizeUTF8NFKD(words) FROM strings FORMAT Null + + DROP TABLE IF EXISTS strings + diff --git a/tests/queries/0_stateless/02011_normalize_utf8.reference b/tests/queries/0_stateless/02011_normalize_utf8.reference index 6878a38ca0d..b97f0ee5a01 100644 --- a/tests/queries/0_stateless/02011_normalize_utf8.reference +++ b/tests/queries/0_stateless/02011_normalize_utf8.reference @@ -1,3 +1,11 @@ ё ё 2 4 ё ё 2 2 -ё 4 ё 2 -ё 2 ё 2 +1 ё 4 ё 2 ё 4 ё 2 ё 4 +2 ё 2 ё 2 ё 4 ё 2 ё 4 +3 జ్ఞ‌ా 15 జ్ఞ‌ా 15 జ్ఞ‌ా 15 జ్ఞ‌ా 15 జ్ఞ‌ా 15 +4 本気ですか 15 本気ですか 15 本気ですか 18 本気ですか 15 本気ですか 18 +5 ﷺ 3 ﷺ 3 ﷺ 3 صلى الله عليه وسلم 33 صلى الله عليه وسلم 33 +6 ᾂ 3 ᾂ 3 ᾂ 8 ᾂ 3 ᾂ 8 +7 ΐ 2 ΐ 2 ΐ 6 ΐ 2 ΐ 6 +8 שּׁ 6 שּׁ 6 שּׁ 6 שּׁ 6 שּׁ 6 +9 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 +10 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 281 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 281 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 282 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 281 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 282 diff --git a/tests/queries/0_stateless/02011_normalize_utf8.sql b/tests/queries/0_stateless/02011_normalize_utf8.sql index c28a0c0a794..5abb6b4d8fb 100644 --- a/tests/queries/0_stateless/02011_normalize_utf8.sql +++ b/tests/queries/0_stateless/02011_normalize_utf8.sql @@ -1,19 +1,44 @@ +-- Tags: no-fasttest + DROP TABLE IF EXISTS normalize_test; -CREATE TABLE normalize_test (value String) ENGINE = MergeTree ORDER BY value; +CREATE TABLE normalize_test (id int, value String) ENGINE = MergeTree ORDER BY value; + SELECT - 'ё' AS norm, - 'ё' AS denorm, - length(norm), - length(denorm), - normalizeUTF8(norm), - normalizeUTF8(denorm), - length(normalizeUTF8(norm)), - length(normalizeUTF8(denorm)); + 'ё' AS norm, 'ё' AS denorm, + length(norm), length(denorm), + normalizeUTF8NFC(norm) AS norm_nfc, + normalizeUTF8NFC(denorm) AS denorm_nfc, + length(norm_nfc), + length(denorm_nfc); -INSERT INTO normalize_test (value) VALUES ('ё'); -INSERT INTO normalize_test (value) VALUES ('ё'); -SELECT value, length(value), normalizeUTF8(value) AS normalized, length(normalized) FROM normalize_test; +INSERT INTO normalize_test (id, value) VALUES (1, 'ё'); +INSERT INTO normalize_test (id, value) VALUES (2, 'ё'); +INSERT INTO normalize_test (id, value) VALUES (3, 'జ్ఞ‌ా'); +INSERT INTO normalize_test (id, value) VALUES (4, '本気ですか'); +INSERT INTO normalize_test (id, value) VALUES (5, 'ﷺ'); +INSERT INTO normalize_test (id, value) VALUES (6, 'ᾂ'); +INSERT INTO normalize_test (id, value) VALUES (7, 'ΐ'); +INSERT INTO normalize_test (id, value) VALUES (8, 'שּׁ'); +INSERT INTO normalize_test (id, value) VALUES (9, '𝅘𝅥𝅮'); -SELECT char(228) AS value, normalizeUTF8(value); -- { serverError 619 } + +INSERT INTO normalize_test (id, value) VALUES (10, 'Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒'); + + + +SELECT + id, value, length(value), + normalizeUTF8NFC(value) AS nfc, length(nfc) AS nfc_len, + normalizeUTF8NFD(value) AS nfd, length(nfd) AS nfd_len, + normalizeUTF8NFKC(value) AS nfkc, length(nfkc) AS nfkc_len, + normalizeUTF8NFKD(value) AS nfkd, length(nfkd) AS nfkd_len +FROM normalize_test +ORDER BY id; + + +SELECT char(228) AS value, normalizeUTF8NFC(value); -- { serverError 621 } +SELECT char(228) AS value, normalizeUTF8NFD(value); -- { serverError 621 } +SELECT char(228) AS value, normalizeUTF8NFKC(value); -- { serverError 621 } +SELECT char(228) AS value, normalizeUTF8NFKD(value); -- { serverError 621 }