Merge pull request #61749 from ClickHouse/pufit/volnitsky-assert-fix

Fix crash in `multiSearchAllPositionsCaseInsensitiveUTF8` for incorrect UTF-8
This commit is contained in:
Alexey Milovidov 2024-03-25 04:47:03 +03:00 committed by GitHub
commit cd3edf3f5b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 13 additions and 8 deletions

View File

@ -191,7 +191,8 @@ namespace VolnitskyTraits
if (length_l != length_r)
return false;
assert(length_l >= 2 && length_r >= 2);
if (length_l < 2 || length_r < 2)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.
chars.c0 = seq_l[seq_ngram_offset];
chars.c1 = seq_l[seq_ngram_offset + 1];
@ -253,7 +254,9 @@ namespace VolnitskyTraits
if (size_l != size_u)
return false;
assert(size_l >= 1 && size_u >= 1);
if (size_l == 0 || size_u == 0)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.
chars.c1 = seq_l[0];
putNGramBase(n, offset);
@ -276,7 +279,8 @@ namespace VolnitskyTraits
if (size_l != size_u)
return false;
assert(size_l > seq_ngram_offset && size_u > seq_ngram_offset);
if (size_l <= seq_ngram_offset || size_u <= seq_ngram_offset)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.
chars.c0 = seq_l[seq_ngram_offset];
putNGramBase(n, offset);
@ -302,10 +306,8 @@ namespace VolnitskyTraits
if (size_first_l != size_first_u || size_second_l != size_second_u)
return false;
assert(size_first_l > seq_ngram_offset);
assert(size_first_u > seq_ngram_offset);
assert(size_second_l > 0);
assert(size_second_u > 0);
if (size_first_l <= seq_ngram_offset || size_first_u <= seq_ngram_offset || size_second_l == 0 || size_second_u == 0)
return false;
auto c0l = first_l_seq[seq_ngram_offset];
auto c0u = first_u_seq[seq_ngram_offset];
@ -399,7 +401,7 @@ public:
if (fallback || fallback_searcher.force_fallback)
return;
hash = std::unique_ptr<VolnitskyTraits::Offset[]>(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{});
hash = std::make_unique<VolnitskyTraits::Offset[]>(VolnitskyTraits::hash_size);
auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
/// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0

View File

@ -12872,3 +12872,4 @@
1
1
1
1

View File

@ -223,6 +223,8 @@ select [2] = multiSearchAllPositions(materialize('abab'), materialize(['ba']));
select [1] = multiSearchAllPositionsCaseInsensitive(materialize('aBaB'), materialize(['abab']));
select [3] = multiSearchAllPositionsUTF8(materialize('ab€ab'), materialize(['']));
select [3] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize('ab€AB'), materialize(['€ab']));
-- Checks that a broken UTF-8 sequence is handled correctly.
select [0] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize(''), materialize(['a\x90\x90\x90\x90\x90\x90']));
select 1 = multiSearchAny(materialize('abcdefgh'), ['b']);
select 1 = multiSearchAny(materialize('abcdefgh'), ['bc']);