From 81bb2242fdd2c98af7ad0dcd66d3b2275f0aadba Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 29 Jun 2022 15:08:16 +0000 Subject: [PATCH] Fix countSubstrings() & position() on patterns with 0-bytes SQL functions countSubstrings(), countSubstringsCaseInsensitive(), countSubstringsUTF8(), position(), positionCaseInsensitive(), positionUTF8() with non-const pattern argument use fallback sorters LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher which call ::strstr(), resp. ::strcasestr(). These functions assume that the haystack is 0-terminated and they even document that. However, the callers did not check if the haystack contains 0-byte (perhaps because its sort of expensive). As a consequence, if the haystack contained a zero byte in it's payload, matches behind this zero byte were ignored. create table t (id UInt32, pattern String) engine = MergeTree() order by id; insert into t values (1, 'x'); select countSubstrings('aaaxxxaa\0xxx', pattern) from t; We returned 3 before this commit, now we return 6 --- src/Common/StringSearcher.h | 69 +++++++------------ src/Functions/PositionImpl.h | 6 +- ...sition_countsubstrings_zero_byte.reference | 12 ++++ ...346_position_countsubstrings_zero_byte.sql | 24 +++++++ 4 files changed, 62 insertions(+), 49 deletions(-) create mode 100644 tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.reference create mode 100644 tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.sql diff --git a/src/Common/StringSearcher.h b/src/Common/StringSearcher.h index a82115a9923..7d669ddd369 100644 --- a/src/Common/StringSearcher.h +++ b/src/Common/StringSearcher.h @@ -826,66 +826,43 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher; using ASCIICaseSensitiveTokenSearcher = TokenSearcher; using ASCIICaseInsensitiveTokenSearcher = TokenSearcher; - -/** Uses functions from libc. - * It makes sense to use only with short haystacks when cheap initialization is required. - * There is no option for case-insensitive search for UTF-8 strings. - * It is required that strings are zero-terminated. - */ - -struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase +/// Use only with short haystacks where cheap initialization is required. +template +struct StdLibASCIIStringSearcher : public StringSearcherBase { - const char * const needle; + const char * const needle_start; + const char * const needle_end; template requires (sizeof(CharT) == 1) - LibCASCIICaseSensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */) - : needle(reinterpret_cast(needle_)) {} + StdLibASCIIStringSearcher(const CharT * const needle_start_, const size_t needle_size_) + : needle_start{reinterpret_cast(needle_start_)} + , needle_end{reinterpret_cast(needle_start) + needle_size_} + {} template requires (sizeof(CharT) == 1) - const CharT * search(const CharT * haystack, const CharT * const haystack_end) const + const CharT * search(const CharT * haystack_start, const CharT * const haystack_end) const { - const auto * res = strstr(reinterpret_cast(haystack), reinterpret_cast(needle)); - if (!res) - return haystack_end; - return reinterpret_cast(res); + if constexpr (CaseInsensitive) + { + return std::search( + haystack_start, haystack_end, needle_start, needle_end, + [](char c1, char c2) {return std::toupper(c1) == std::toupper(c2);}); + } + else + { + return std::search( + haystack_start, haystack_end, needle_start, needle_end); + } } template requires (sizeof(CharT) == 1) - const CharT * search(const CharT * haystack, const size_t haystack_size) const + const CharT * search(const CharT * haystack_start, const size_t haystack_length) const { - return search(haystack, haystack + haystack_size); + return search(haystack_start, haystack_start + haystack_length); } }; -struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase -{ - const char * const needle; - - template - requires (sizeof(CharT) == 1) - LibCASCIICaseInsensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */) - : needle(reinterpret_cast(needle_)) {} - - template - requires (sizeof(CharT) == 1) - const CharT * search(const CharT * haystack, const CharT * const haystack_end) const - { - const auto * res = strcasestr(reinterpret_cast(haystack), reinterpret_cast(needle)); - if (!res) - return haystack_end; - return reinterpret_cast(res); - } - - template - requires (sizeof(CharT) == 1) - const CharT * search(const CharT * haystack, const size_t haystack_size) const - { - return search(haystack, haystack + haystack_size); - } -}; - - } diff --git a/src/Functions/PositionImpl.h b/src/Functions/PositionImpl.h index 5380fcc36d9..76f10373a58 100644 --- a/src/Functions/PositionImpl.h +++ b/src/Functions/PositionImpl.h @@ -26,7 +26,7 @@ struct PositionCaseSensitiveASCII using MultiSearcherInBigHaystack = MultiVolnitsky; /// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization. - using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher; + using SearcherInSmallHaystack = StdLibASCIIStringSearcher; static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint) { @@ -62,7 +62,7 @@ struct PositionCaseInsensitiveASCII /// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it. using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher; using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive; - using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher; + using SearcherInSmallHaystack = StdLibASCIIStringSearcher; static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/) { @@ -94,7 +94,7 @@ struct PositionCaseSensitiveUTF8 { using SearcherInBigHaystack = VolnitskyUTF8; using MultiSearcherInBigHaystack = MultiVolnitskyUTF8; - using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher; + using SearcherInSmallHaystack = StdLibASCIIStringSearcher; static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint) { diff --git a/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.reference b/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.reference new file mode 100644 index 00000000000..2b70bdc272e --- /dev/null +++ b/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.reference @@ -0,0 +1,12 @@ +6 +6 +6 +6 +6 +6 +7 +7 +7 +7 +7 +7 diff --git a/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.sql b/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.sql new file mode 100644 index 00000000000..6208baf41c4 --- /dev/null +++ b/tests/queries/0_stateless/02346_position_countsubstrings_zero_byte.sql @@ -0,0 +1,24 @@ +drop table if exists tab; + +create table tab (id UInt32, haystack String, pattern String) engine = MergeTree() order by id; +insert into tab values (1, 'aaaxxxaa\0xxx', 'x'); + +select countSubstrings('aaaxxxaa\0xxx', pattern) from tab where id = 1; +select countSubstringsCaseInsensitive('aaaxxxaa\0xxx', pattern) from tab where id = 1; +select countSubstringsCaseInsensitiveUTF8('aaaxxxaa\0xxx', pattern) from tab where id = 1; + +select countSubstrings(haystack, pattern) from tab where id = 1; +select countSubstringsCaseInsensitive(haystack, pattern) from tab where id = 1; +select countSubstringsCaseInsensitiveUTF8(haystack, pattern) from tab where id = 1; + +insert into tab values (2, 'aaaaa\0x', 'x'); + +select position('aaaaa\0x', pattern) from tab where id = 2; +select positionCaseInsensitive('aaaaa\0x', pattern) from tab where id = 2; +select positionCaseInsensitiveUTF8('aaaaa\0x', pattern) from tab where id = 2; + +select position(haystack, pattern) from tab where id = 2; +select positionCaseInsensitive(haystack, pattern) from tab where id = 2; +select positionCaseInsensitiveUTF8(haystack, pattern) from tab where id = 2; + +drop table if exists tab;