From 762904adbda95dc24b771250b1f32ccd404db739 Mon Sep 17 00:00:00 2001
From: Viachaslau Boben <darkkeks@rambler.ru>
Date: Mon, 27 Sep 2021 18:45:04 +0300
Subject: [PATCH] Add nfd and perf test

---
 src/Functions/normalizeString.cpp             | 114 +++++++++++++-----
 tests/performance/normalize_utf8.xml          |  15 +++
 .../02011_normalize_utf8.reference            |  12 +-
 .../0_stateless/02011_normalize_utf8.sql      |  51 ++++++--
 4 files changed, 148 insertions(+), 44 deletions(-)
 create mode 100644 tests/performance/normalize_utf8.xml
diff --git a/src/Functions/normalizeString.cpp b/src/Functions/normalizeString.cpp
index 178c2dc2cf1..5beca566cd1 100644
--- a/src/Functions/normalizeString.cpp
+++ b/src/Functions/normalizeString.cpp
@@ -1,6 +1,10 @@
+#if !defined(ARCADIA_BUILD)
+#    include "config_core.h"
+#endif
+
+#if USE_ICU
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionStringToString.h>
-#include <unicode/normalizer2.h>
 #include <unicode/rep.h>
 #include <unicode/unistr.h>
 #include <unicode/unorm2.h>
@@ -15,12 +19,67 @@ namespace DB
 
 namespace ErrorCodes
 {
+    extern const int ILLEGAL_COLUMN;
     extern const int CANNOT_NORMALIZE_STRING;
 }
 
 namespace
 {
 
+// Expansion factors are given in UTF-16 code units, since ICU normalizes UTF-16 (UChar) strings.
+// They are the maximum expansion factors for the different normalization forms (identical for UTF-32):
+// https://unicode.org/faq/normalization.html#12
+
+struct NormalizeNFCImpl  // Traits for the normalizeUTF8NFC function: its name, buffer bound and ICU normalizer.
+{
+    static constexpr auto name = "normalizeUTF8NFC";
+    // Multiplied by the input length to size the unorm2_normalize output buffer (bound from the FAQ linked above).
+    static constexpr auto expansionFactor = 3;
+    // Returns ICU's NFC normalizer; a failure is reported through *err, which the caller checks.
+    static const UNormalizer2 *getNormalizer(UErrorCode *err)
+    {
+        return unorm2_getNFCInstance(err);
+    }
+};
+
+struct NormalizeNFDImpl  // Traits for the normalizeUTF8NFD function: its name, buffer bound and ICU normalizer.
+{
+    static constexpr auto name = "normalizeUTF8NFD";
+    // Multiplied by the input length to size the unorm2_normalize output buffer (bound from the FAQ linked above).
+    static constexpr auto expansionFactor = 4;
+    // Returns ICU's NFD normalizer; a failure is reported through *err, which the caller checks.
+    static const UNormalizer2 *getNormalizer(UErrorCode *err)
+    {
+        return unorm2_getNFDInstance(err);
+    }
+};
+
+struct NormalizeNFKCImpl  // Traits for the normalizeUTF8NFKC function: its name, buffer bound and ICU normalizer.
+{
+    static constexpr auto name = "normalizeUTF8NFKC";
+    // Multiplied by the input length to size the unorm2_normalize output buffer (bound from the FAQ linked above).
+    static constexpr auto expansionFactor = 18;
+    // Returns ICU's NFKC normalizer; a failure is reported through *err, which the caller checks.
+    static const UNormalizer2 *getNormalizer(UErrorCode *err)
+    {
+        return unorm2_getNFKCInstance(err);
+    }
+};
+
+
+struct NormalizeNFKDImpl  // Traits for the normalizeUTF8NFKD function: its name, buffer bound and ICU normalizer.
+{
+    static constexpr auto name = "normalizeUTF8NFKD";
+    // Multiplied by the input length to size the unorm2_normalize output buffer (bound from the FAQ linked above).
+    static constexpr auto expansionFactor = 18;
+    // Returns ICU's NFKD normalizer; a failure is reported through *err, which the caller checks.
+    static const UNormalizer2 *getNormalizer(UErrorCode *err)
+    {
+        return unorm2_getNFKDInstance(err);
+    }
+};
+
+template<typename NormalizeImpl>
 struct NormalizeUTF8Impl
 {
 
@@ -31,10 +90,9 @@ struct NormalizeUTF8Impl
     {
         UErrorCode err = U_ZERO_ERROR;
 
-        const UNormalizer2 *normalizer = unorm2_getNFCInstance(&err);
-        if (U_FAILURE(err)) {
-            throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
-        }
+        const UNormalizer2 *normalizer = NormalizeImpl::getNormalizer(&err);
+        if (U_FAILURE(err))
+            throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (getNormalizer): {}", u_errorName(err));
 
         size_t size = offsets.size();
         res_offsets.resize(size);
@@ -60,13 +118,10 @@ struct NormalizeUTF8Impl
                 reinterpret_cast<const char*>(&data[current_from_offset]),
                 from_size,
                 &err);
-            if (U_FAILURE(err)) {
-                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
-            }
+            if (U_FAILURE(err))
+                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strFromUTF8): {}", u_errorName(err));
 
-            // NFC should produce no more than 3x code points
-            // https://unicode.org/faq/normalization.html#12
-            to_uchars.resize(from_code_points * 3 + 1);
+            to_uchars.resize(from_code_points * NormalizeImpl::expansionFactor + 1);
 
             int32_t to_code_points = unorm2_normalize(
                 normalizer,
@@ -75,14 +130,12 @@ struct NormalizeUTF8Impl
                 to_uchars.data(),
                 to_uchars.size(),
                 &err);
-            if (U_FAILURE(err)) {
-                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
-            }
+            if (U_FAILURE(err))
+                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (normalize): {}", u_errorName(err));
 
-            size_t max_to_size = current_to_offset + 2 * to_code_points + 1;
-            if (res_data.size() < max_to_size) {
+            size_t max_to_size = current_to_offset + 4 * to_code_points + 1;
+            if (res_data.size() < max_to_size)
                 res_data.resize(max_to_size);
-            }
 
             int32_t to_size;
             u_strToUTF8(
@@ -92,9 +145,8 @@ struct NormalizeUTF8Impl
                 to_uchars.data(),
                 to_code_points,
                 &err);
-            if (U_FAILURE(err)) {
-                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
-            }
+            if (U_FAILURE(err))
+                throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strToUTF8): {}", u_errorName(err));
 
             current_to_offset += to_size;
             res_data[current_to_offset] = 0;
@@ -111,16 +163,20 @@ struct NormalizeUTF8Impl
     }
 };
 
-struct NameNormalizeUTF8
+using FunctionNormalizeUTF8NFC = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFCImpl>, NormalizeNFCImpl>;
+using FunctionNormalizeUTF8NFD = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFDImpl>, NormalizeNFDImpl>;
+using FunctionNormalizeUTF8NFKC = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFKCImpl>, NormalizeNFKCImpl>;
+using FunctionNormalizeUTF8NFKD = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFKDImpl>, NormalizeNFKDImpl>;
+}
+
+void registerFunctionNormalizeUTF8(FunctionFactory & factory)
 {
-    static constexpr auto name = "normalizeUTF8";
-};
-
-using FunctionNormalizeUTF8 = FunctionStringToString<NormalizeUTF8Impl, NameNormalizeUTF8>;
-}
-
-void registerFunctionNormalizeUTF8(FunctionFactory & factory) {
-    factory.registerFunction<FunctionNormalizeUTF8>();
+    factory.registerFunction<FunctionNormalizeUTF8NFC>();
+    factory.registerFunction<FunctionNormalizeUTF8NFD>();
+    factory.registerFunction<FunctionNormalizeUTF8NFKC>();
+    factory.registerFunction<FunctionNormalizeUTF8NFKD>();
 }
 
 }
+
+#endif
diff --git a/tests/performance/normalize_utf8.xml b/tests/performance/normalize_utf8.xml
new file mode 100644
index 00000000000..de9bd87fdf8
--- /dev/null
+++ b/tests/performance/normalize_utf8.xml
@@ -0,0 +1,15 @@
+<test>
+    <preconditions>
+        <table_exists>hits_10m_single</table_exists>
+    </preconditions>
+
+    <create_query>CREATE TABLE strings (words String) ENGINE Memory</create_query>
+    <fill_query>INSERT INTO strings SELECT SearchPhrase FROM hits_10m_single WHERE length(SearchPhrase) > 0</fill_query>
+
+    <query>SELECT normalizeUTF8NFC(words) FROM strings FORMAT Null</query>
+    <query>SELECT normalizeUTF8NFD(words) FROM strings FORMAT Null</query>
+    <query>SELECT normalizeUTF8NFKC(words) FROM strings FORMAT Null</query>
+    <query>SELECT normalizeUTF8NFKD(words) FROM strings FORMAT Null</query>
+
+    <drop_query>DROP TABLE IF EXISTS strings</drop_query>
+</test>
diff --git a/tests/queries/0_stateless/02011_normalize_utf8.reference b/tests/queries/0_stateless/02011_normalize_utf8.reference
index 6878a38ca0d..b97f0ee5a01 100644
--- a/tests/queries/0_stateless/02011_normalize_utf8.reference
+++ b/tests/queries/0_stateless/02011_normalize_utf8.reference
@@ -1,3 +1,11 @@
 ё	ё	2	4	ё	ё	2	2
-ё	4	ё	2
-ё	2	ё	2
+1	ё	4	ё	2	ё	4	ё	2	ё	4
+2	ё	2	ё	2	ё	4	ё	2	ё	4
+3	జ్ఞ‌ా	15	జ్ఞ‌ా	15	జ్ఞ‌ా	15	జ్ఞ‌ా	15	జ్ఞ‌ా	15
+4	本気ですか	15	本気ですか	15	本気ですか	18	本気ですか	15	本気ですか	18
+5	ﷺ	3	ﷺ	3	ﷺ	3	صلى الله عليه وسلم	33	صلى الله عليه وسلم	33
+6	ᾂ	3	ᾂ	3	ᾂ	8	ᾂ	3	ᾂ	8
+7	ΐ	2	ΐ	2	ΐ	6	ΐ	2	ΐ	6
+8	שּׁ	6	שּׁ	6	שּׁ	6	שּׁ	6	שּׁ	6
+9	𝅘𝅥𝅮	12	𝅘𝅥𝅮	12	𝅘𝅥𝅮	12	𝅘𝅥𝅮	12	𝅘𝅥𝅮	12
+10	Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒	281	Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒	281	Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒	282	Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒	281	Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒	282
diff --git a/tests/queries/0_stateless/02011_normalize_utf8.sql b/tests/queries/0_stateless/02011_normalize_utf8.sql
index c28a0c0a794..5abb6b4d8fb 100644
--- a/tests/queries/0_stateless/02011_normalize_utf8.sql
+++ b/tests/queries/0_stateless/02011_normalize_utf8.sql
@@ -1,19 +1,44 @@
+-- Tags: no-fasttest
+
 DROP TABLE IF EXISTS normalize_test;
-CREATE TABLE normalize_test (value String) ENGINE = MergeTree ORDER BY value;
+CREATE TABLE normalize_test (id int, value String) ENGINE = MergeTree ORDER BY value;
+
 
 SELECT
-    'ё' AS norm,
-    'ё' AS denorm,
-    length(norm),
-    length(denorm),
-    normalizeUTF8(norm),
-    normalizeUTF8(denorm),
-    length(normalizeUTF8(norm)),
-    length(normalizeUTF8(denorm));
+    'ё' AS norm, 'ё' AS denorm,
+    length(norm), length(denorm),
+    normalizeUTF8NFC(norm) AS norm_nfc,
+    normalizeUTF8NFC(denorm) AS denorm_nfc,
+    length(norm_nfc),
+    length(denorm_nfc);
 
-INSERT INTO normalize_test (value) VALUES ('ё');
-INSERT INTO normalize_test (value) VALUES ('ё');
 
-SELECT value, length(value), normalizeUTF8(value) AS normalized, length(normalized) FROM normalize_test;
+INSERT INTO normalize_test (id, value) VALUES (1, 'ё');
+INSERT INTO normalize_test (id, value) VALUES (2, 'ё');
+INSERT INTO normalize_test (id, value) VALUES (3, 'జ్ఞ‌ా');
+INSERT INTO normalize_test (id, value) VALUES (4, '本気ですか');
+INSERT INTO normalize_test (id, value) VALUES (5, 'ﷺ');
+INSERT INTO normalize_test (id, value) VALUES (6, 'ᾂ');
+INSERT INTO normalize_test (id, value) VALUES (7, 'ΐ');
+INSERT INTO normalize_test (id, value) VALUES (8, 'שּׁ');
+INSERT INTO normalize_test (id, value) VALUES (9, '𝅘𝅥𝅮');
 
-SELECT char(228) AS value, normalizeUTF8(value); -- { serverError 619 }
+
+INSERT INTO normalize_test (id, value) VALUES (10, 'Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒');
+
+
+
+SELECT
+    id, value, length(value),
+    normalizeUTF8NFC(value) AS nfc, length(nfc) AS nfc_len,
+    normalizeUTF8NFD(value) AS nfd, length(nfd) AS nfd_len,
+    normalizeUTF8NFKC(value) AS nfkc, length(nfkc) AS nfkc_len,
+    normalizeUTF8NFKD(value) AS nfkd, length(nfkd) AS nfkd_len
+FROM normalize_test
+ORDER BY id;
+
+
+SELECT char(228) AS value, normalizeUTF8NFC(value); -- { serverError 621 }
+SELECT char(228) AS value, normalizeUTF8NFD(value); -- { serverError 621 }
+SELECT char(228) AS value, normalizeUTF8NFKC(value); -- { serverError 621 }
+SELECT char(228) AS value, normalizeUTF8NFKD(value); -- { serverError 621 }