mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-12 18:45:20 +00:00
Merge pull request #61749 from ClickHouse/pufit/volnitsky-assert-fix
Fix crash in `multiSearchAllPositionsCaseInsensitiveUTF8` for incorrect UTF-8
This commit is contained in:
commit
cd3edf3f5b
@ -191,7 +191,8 @@ namespace VolnitskyTraits
|
|||||||
if (length_l != length_r)
|
if (length_l != length_r)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
assert(length_l >= 2 && length_r >= 2);
|
if (length_l < 2 || length_r < 2)
|
||||||
|
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.
|
||||||
|
|
||||||
chars.c0 = seq_l[seq_ngram_offset];
|
chars.c0 = seq_l[seq_ngram_offset];
|
||||||
chars.c1 = seq_l[seq_ngram_offset + 1];
|
chars.c1 = seq_l[seq_ngram_offset + 1];
|
||||||
@ -253,7 +254,9 @@ namespace VolnitskyTraits
|
|||||||
if (size_l != size_u)
|
if (size_l != size_u)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
assert(size_l >= 1 && size_u >= 1);
|
if (size_l == 0 || size_u == 0)
|
||||||
|
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.
|
||||||
|
|
||||||
chars.c1 = seq_l[0];
|
chars.c1 = seq_l[0];
|
||||||
putNGramBase(n, offset);
|
putNGramBase(n, offset);
|
||||||
|
|
||||||
@ -276,7 +279,8 @@ namespace VolnitskyTraits
|
|||||||
if (size_l != size_u)
|
if (size_l != size_u)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
assert(size_l > seq_ngram_offset && size_u > seq_ngram_offset);
|
if (size_l <= seq_ngram_offset || size_u <= seq_ngram_offset)
|
||||||
|
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.
|
||||||
|
|
||||||
chars.c0 = seq_l[seq_ngram_offset];
|
chars.c0 = seq_l[seq_ngram_offset];
|
||||||
putNGramBase(n, offset);
|
putNGramBase(n, offset);
|
||||||
@ -302,10 +306,8 @@ namespace VolnitskyTraits
|
|||||||
if (size_first_l != size_first_u || size_second_l != size_second_u)
|
if (size_first_l != size_first_u || size_second_l != size_second_u)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
assert(size_first_l > seq_ngram_offset);
|
if (size_first_l <= seq_ngram_offset || size_first_u <= seq_ngram_offset || size_second_l == 0 || size_second_u == 0)
|
||||||
assert(size_first_u > seq_ngram_offset);
|
return false;
|
||||||
assert(size_second_l > 0);
|
|
||||||
assert(size_second_u > 0);
|
|
||||||
|
|
||||||
auto c0l = first_l_seq[seq_ngram_offset];
|
auto c0l = first_l_seq[seq_ngram_offset];
|
||||||
auto c0u = first_u_seq[seq_ngram_offset];
|
auto c0u = first_u_seq[seq_ngram_offset];
|
||||||
@ -399,7 +401,7 @@ public:
|
|||||||
if (fallback || fallback_searcher.force_fallback)
|
if (fallback || fallback_searcher.force_fallback)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
hash = std::unique_ptr<VolnitskyTraits::Offset[]>(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{});
|
hash = std::make_unique<VolnitskyTraits::Offset[]>(VolnitskyTraits::hash_size);
|
||||||
|
|
||||||
auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
|
auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
|
||||||
/// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
|
/// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
|
||||||
|
@ -12872,3 +12872,4 @@
|
|||||||
1
|
1
|
||||||
1
|
1
|
||||||
1
|
1
|
||||||
|
1
|
||||||
|
@ -223,6 +223,8 @@ select [2] = multiSearchAllPositions(materialize('abab'), materialize(['ba']));
|
|||||||
select [1] = multiSearchAllPositionsCaseInsensitive(materialize('aBaB'), materialize(['abab']));
|
select [1] = multiSearchAllPositionsCaseInsensitive(materialize('aBaB'), materialize(['abab']));
|
||||||
select [3] = multiSearchAllPositionsUTF8(materialize('ab€ab'), materialize(['€']));
|
select [3] = multiSearchAllPositionsUTF8(materialize('ab€ab'), materialize(['€']));
|
||||||
select [3] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize('ab€AB'), materialize(['€ab']));
|
select [3] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize('ab€AB'), materialize(['€ab']));
|
||||||
|
-- checks the correct handling of a broken UTF-8 sequence
|
||||||
|
select [0] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize(''), materialize(['a\x90\x90\x90\x90\x90\x90']));
|
||||||
|
|
||||||
select 1 = multiSearchAny(materialize('abcdefgh'), ['b']);
|
select 1 = multiSearchAny(materialize('abcdefgh'), ['b']);
|
||||||
select 1 = multiSearchAny(materialize('abcdefgh'), ['bc']);
|
select 1 = multiSearchAny(materialize('abcdefgh'), ['bc']);
|
||||||
|
Loading…
Reference in New Issue
Block a user