mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 00:22:29 +00:00
Add nfd and perf test
This commit is contained in:
parent
b3325772f7
commit
762904adbd
@ -1,6 +1,10 @@
|
||||
#if !defined(ARCADIA_BUILD)
|
||||
# include "config_core.h"
|
||||
#endif
|
||||
|
||||
#if USE_ICU
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionStringToString.h>
|
||||
#include <unicode/normalizer2.h>
|
||||
#include <unicode/rep.h>
|
||||
#include <unicode/unistr.h>
|
||||
#include <unicode/unorm2.h>
|
||||
@ -15,12 +19,67 @@ namespace DB
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int CANNOT_NORMALIZE_STRING;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// Expansion factors are specified for UTF-16 (they are the same as for UTF-32), since ICU normalizes UTF-16 (UChar) strings
|
||||
// Maximum expansion factors for different normalization forms
|
||||
// https://unicode.org/faq/normalization.html#12
|
||||
|
||||
struct NormalizeNFCImpl
|
||||
{
|
||||
static constexpr auto name = "normalizeUTF8NFC";
|
||||
|
||||
static constexpr auto expansionFactor = 3;
|
||||
|
||||
static const UNormalizer2 *getNormalizer(UErrorCode *err)
|
||||
{
|
||||
return unorm2_getNFCInstance(err);
|
||||
}
|
||||
};
|
||||
|
||||
struct NormalizeNFDImpl
|
||||
{
|
||||
static constexpr auto name = "normalizeUTF8NFD";
|
||||
|
||||
static constexpr auto expansionFactor = 4;
|
||||
|
||||
static const UNormalizer2 *getNormalizer(UErrorCode *err)
|
||||
{
|
||||
return unorm2_getNFDInstance(err);
|
||||
}
|
||||
};
|
||||
|
||||
struct NormalizeNFKCImpl
|
||||
{
|
||||
static constexpr auto name = "normalizeUTF8NFKC";
|
||||
|
||||
static constexpr auto expansionFactor = 18;
|
||||
|
||||
static const UNormalizer2 *getNormalizer(UErrorCode *err)
|
||||
{
|
||||
return unorm2_getNFKCInstance(err);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct NormalizeNFKDImpl
|
||||
{
|
||||
static constexpr auto name = "normalizeUTF8NFKD";
|
||||
|
||||
static constexpr auto expansionFactor = 18;
|
||||
|
||||
static const UNormalizer2 *getNormalizer(UErrorCode *err)
|
||||
{
|
||||
return unorm2_getNFKDInstance(err);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename NormalizeImpl>
|
||||
struct NormalizeUTF8Impl
|
||||
{
|
||||
|
||||
@ -31,10 +90,9 @@ struct NormalizeUTF8Impl
|
||||
{
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
|
||||
const UNormalizer2 *normalizer = unorm2_getNFCInstance(&err);
|
||||
if (U_FAILURE(err)) {
|
||||
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
|
||||
}
|
||||
const UNormalizer2 *normalizer = NormalizeImpl::getNormalizer(&err);
|
||||
if (U_FAILURE(err))
|
||||
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (getNormalizer): {}", u_errorName(err));
|
||||
|
||||
size_t size = offsets.size();
|
||||
res_offsets.resize(size);
|
||||
@ -60,13 +118,10 @@ struct NormalizeUTF8Impl
|
||||
reinterpret_cast<const char*>(&data[current_from_offset]),
|
||||
from_size,
|
||||
&err);
|
||||
if (U_FAILURE(err)) {
|
||||
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
|
||||
}
|
||||
if (U_FAILURE(err))
|
||||
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strFromUTF8): {}", u_errorName(err));
|
||||
|
||||
// NFC should produce no more than 3x code points
|
||||
// https://unicode.org/faq/normalization.html#12
|
||||
to_uchars.resize(from_code_points * 3 + 1);
|
||||
to_uchars.resize(from_code_points * NormalizeImpl::expansionFactor + 1);
|
||||
|
||||
int32_t to_code_points = unorm2_normalize(
|
||||
normalizer,
|
||||
@ -75,14 +130,12 @@ struct NormalizeUTF8Impl
|
||||
to_uchars.data(),
|
||||
to_uchars.size(),
|
||||
&err);
|
||||
if (U_FAILURE(err)) {
|
||||
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
|
||||
}
|
||||
if (U_FAILURE(err))
|
||||
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (normalize): {}", u_errorName(err));
|
||||
|
||||
size_t max_to_size = current_to_offset + 2 * to_code_points + 1;
|
||||
if (res_data.size() < max_to_size) {
|
||||
size_t max_to_size = current_to_offset + 4 * to_code_points + 1;
|
||||
if (res_data.size() < max_to_size)
|
||||
res_data.resize(max_to_size);
|
||||
}
|
||||
|
||||
int32_t to_size;
|
||||
u_strToUTF8(
|
||||
@ -92,9 +145,8 @@ struct NormalizeUTF8Impl
|
||||
to_uchars.data(),
|
||||
to_code_points,
|
||||
&err);
|
||||
if (U_FAILURE(err)) {
|
||||
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed: {}", u_errorName(err));
|
||||
}
|
||||
if (U_FAILURE(err))
|
||||
throw Exception(ErrorCodes::CANNOT_NORMALIZE_STRING, "Normalization failed (strToUTF8): {}", u_errorName(err));
|
||||
|
||||
current_to_offset += to_size;
|
||||
res_data[current_to_offset] = 0;
|
||||
@ -111,16 +163,20 @@ struct NormalizeUTF8Impl
|
||||
}
|
||||
};
|
||||
|
||||
struct NameNormalizeUTF8
|
||||
using FunctionNormalizeUTF8NFC = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFCImpl>, NormalizeNFCImpl>;
|
||||
using FunctionNormalizeUTF8NFD = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFDImpl>, NormalizeNFDImpl>;
|
||||
using FunctionNormalizeUTF8NFKC = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFKCImpl>, NormalizeNFKCImpl>;
|
||||
using FunctionNormalizeUTF8NFKD = FunctionStringToString<NormalizeUTF8Impl<NormalizeNFKDImpl>, NormalizeNFKDImpl>;
|
||||
}
|
||||
|
||||
void registerFunctionNormalizeUTF8(FunctionFactory & factory)
|
||||
{
|
||||
static constexpr auto name = "normalizeUTF8";
|
||||
};
|
||||
|
||||
using FunctionNormalizeUTF8 = FunctionStringToString<NormalizeUTF8Impl, NameNormalizeUTF8>;
|
||||
}
|
||||
|
||||
void registerFunctionNormalizeUTF8(FunctionFactory & factory) {
|
||||
factory.registerFunction<FunctionNormalizeUTF8>();
|
||||
factory.registerFunction<FunctionNormalizeUTF8NFC>();
|
||||
factory.registerFunction<FunctionNormalizeUTF8NFD>();
|
||||
factory.registerFunction<FunctionNormalizeUTF8NFKC>();
|
||||
factory.registerFunction<FunctionNormalizeUTF8NFKD>();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
15
tests/performance/normalize_utf8.xml
Normal file
15
tests/performance/normalize_utf8.xml
Normal file
@ -0,0 +1,15 @@
|
||||
<test>
|
||||
<preconditions>
|
||||
<table_exists>hits_10m_single</table_exists>
|
||||
</preconditions>
|
||||
|
||||
<create_query>CREATE TABLE strings (words String) ENGINE Memory</create_query>
|
||||
<fill_query>INSERT INTO strings SELECT SearchPhrase FROM hits_10m_single WHERE length(SearchPhrase) > 0</fill_query>
|
||||
|
||||
<query>SELECT normalizeUTF8NFC(words) FROM strings FORMAT Null</query>
|
||||
<query>SELECT normalizeUTF8NFD(words) FROM strings FORMAT Null</query>
|
||||
<query>SELECT normalizeUTF8NFKC(words) FROM strings FORMAT Null</query>
|
||||
<query>SELECT normalizeUTF8NFKD(words) FROM strings FORMAT Null</query>
|
||||
|
||||
<drop_query>DROP TABLE IF EXISTS strings</drop_query>
|
||||
</test>
|
@ -1,3 +1,11 @@
|
||||
ё ё 2 4 ё ё 2 2
|
||||
ё 4 ё 2
|
||||
ё 2 ё 2
|
||||
1 ё 4 ё 2 ё 4 ё 2 ё 4
|
||||
2 ё 2 ё 2 ё 4 ё 2 ё 4
|
||||
3 జ్ఞా 15 జ్ఞా 15 జ్ఞా 15 జ్ఞా 15 జ్ఞా 15
|
||||
4 本気ですか 15 本気ですか 15 本気ですか 18 本気ですか 15 本気ですか 18
|
||||
5 ﷺ 3 ﷺ 3 ﷺ 3 صلى الله عليه وسلم 33 صلى الله عليه وسلم 33
|
||||
6 ᾂ 3 ᾂ 3 ᾂ 8 ᾂ 3 ᾂ 8
|
||||
7 ΐ 2 ΐ 2 ΐ 6 ΐ 2 ΐ 6
|
||||
8 שּׁ 6 שּׁ 6 שּׁ 6 שּׁ 6 שּׁ 6
|
||||
9 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12 𝅘𝅥𝅮 12
|
||||
10 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 281 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 281 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 282 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 281 Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒ 282
|
||||
|
@ -1,19 +1,44 @@
|
||||
-- Tags: no-fasttest
|
||||
|
||||
DROP TABLE IF EXISTS normalize_test;
|
||||
CREATE TABLE normalize_test (value String) ENGINE = MergeTree ORDER BY value;
|
||||
CREATE TABLE normalize_test (id int, value String) ENGINE = MergeTree ORDER BY value;
|
||||
|
||||
|
||||
SELECT
|
||||
'ё' AS norm,
|
||||
'ё' AS denorm,
|
||||
length(norm),
|
||||
length(denorm),
|
||||
normalizeUTF8(norm),
|
||||
normalizeUTF8(denorm),
|
||||
length(normalizeUTF8(norm)),
|
||||
length(normalizeUTF8(denorm));
|
||||
'ё' AS norm, 'ё' AS denorm,
|
||||
length(norm), length(denorm),
|
||||
normalizeUTF8NFC(norm) AS norm_nfc,
|
||||
normalizeUTF8NFC(denorm) AS denorm_nfc,
|
||||
length(norm_nfc),
|
||||
length(denorm_nfc);
|
||||
|
||||
INSERT INTO normalize_test (value) VALUES ('ё');
|
||||
INSERT INTO normalize_test (value) VALUES ('ё');
|
||||
|
||||
SELECT value, length(value), normalizeUTF8(value) AS normalized, length(normalized) FROM normalize_test;
|
||||
INSERT INTO normalize_test (id, value) VALUES (1, 'ё');
|
||||
INSERT INTO normalize_test (id, value) VALUES (2, 'ё');
|
||||
INSERT INTO normalize_test (id, value) VALUES (3, 'జ్ఞా');
|
||||
INSERT INTO normalize_test (id, value) VALUES (4, '本気ですか');
|
||||
INSERT INTO normalize_test (id, value) VALUES (5, 'ﷺ');
|
||||
INSERT INTO normalize_test (id, value) VALUES (6, 'ᾂ');
|
||||
INSERT INTO normalize_test (id, value) VALUES (7, 'ΐ');
|
||||
INSERT INTO normalize_test (id, value) VALUES (8, 'שּׁ');
|
||||
INSERT INTO normalize_test (id, value) VALUES (9, '𝅘𝅥𝅮');
|
||||
|
||||
SELECT char(228) AS value, normalizeUTF8(value); -- { serverError 619 }
|
||||
|
||||
INSERT INTO normalize_test (id, value) VALUES (10, 'Q̹̣̩̭̰̰̹̄ͬ̿͋̃ṷ̬̰ͥe̘͚͈̰̺̍͐s͎̜̖t͔̣̯̲̜̠ͣ̑ͨ̉̈̈o̲͙̺͊ͯͣ̐̋̂̔ ̳͉͍̒̂è̗ͥͯͨ̍ͮ͛ ̦̹̣̰̐̅̑͑̅̂t͙̭̻̖͛̾e̺͙ͣ͒̚ṣ̠͉͓͔̲̦̎t̖͖̝͓̣ͭ͑̈́̂ỏ̥͕͈͛̓ ̀ͦ̽ͅZͯ̑̎a͆l̻ͨ̋ͧͣͨͬg͉̙̟̾̅̾ͬo̠ͮ͒');
|
||||
|
||||
|
||||
|
||||
SELECT
|
||||
id, value, length(value),
|
||||
normalizeUTF8NFC(value) AS nfc, length(nfc) AS nfc_len,
|
||||
normalizeUTF8NFD(value) AS nfd, length(nfd) AS nfd_len,
|
||||
normalizeUTF8NFKC(value) AS nfkc, length(nfkc) AS nfkc_len,
|
||||
normalizeUTF8NFKD(value) AS nfkd, length(nfkd) AS nfkd_len
|
||||
FROM normalize_test
|
||||
ORDER BY id;
|
||||
|
||||
|
||||
SELECT char(228) AS value, normalizeUTF8NFC(value); -- { serverError 621 }
|
||||
SELECT char(228) AS value, normalizeUTF8NFD(value); -- { serverError 621 }
|
||||
SELECT char(228) AS value, normalizeUTF8NFKC(value); -- { serverError 621 }
|
||||
SELECT char(228) AS value, normalizeUTF8NFKD(value); -- { serverError 621 }
|
||||
|
Loading…
Reference in New Issue
Block a user