Fix countSubstrings() & position() on patterns with 0-bytes

SQL functions countSubstrings(), countSubstringsCaseInsensitive(),
countSubstringsUTF8(), position(), positionCaseInsensitive(),
positionUTF8() with non-const pattern argument use fallback searchers
LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher
which call ::strstr(), resp. ::strcasestr(). These functions assume that
the haystack is 0-terminated and they even document that. However, the
callers did not check if the haystack contains a 0-byte (perhaps because
it's sort of expensive). As a consequence, if the haystack contained a
zero byte in its payload, matches behind this zero byte were ignored.

    create table t (id UInt32, pattern String) engine = MergeTree() order by id;
    insert into t values (1, 'x');
    select countSubstrings('aaaxxxaa\0xxx', pattern) from t;

We returned 3 before this commit; now we return 6.
This commit is contained in:
Robert Schulze 2022-06-29 15:08:16 +00:00
parent baeb1811e1
commit 81bb2242fd
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
4 changed files with 62 additions and 49 deletions

View File

@ -826,66 +826,43 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
/** Uses functions from libc.
* It makes sense to use only with short haystacks when cheap initialization is required.
* There is no option for case-insensitive search for UTF-8 strings.
* It is required that strings are zero-terminated.
*/
struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
/// Use only with short haystacks where cheap initialization is required.
template <bool CaseInsensitive>
struct StdLibASCIIStringSearcher : public StringSearcherBase
{
const char * const needle;
const char * const needle_start;
const char * const needle_end;
template <typename CharT>
requires (sizeof(CharT) == 1)
LibCASCIICaseSensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
: needle(reinterpret_cast<const char *>(needle_)) {}
StdLibASCIIStringSearcher(const CharT * const needle_start_, const size_t needle_size_)
: needle_start{reinterpret_cast<const char *>(needle_start_)}
, needle_end{reinterpret_cast<const char *>(needle_start) + needle_size_}
{}
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
const CharT * search(const CharT * haystack_start, const CharT * const haystack_end) const
{
const auto * res = strstr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
if (!res)
return haystack_end;
return reinterpret_cast<const CharT *>(res);
if constexpr (CaseInsensitive)
{
return std::search(
haystack_start, haystack_end, needle_start, needle_end,
[](char c1, char c2) {return std::toupper(c1) == std::toupper(c2);});
}
else
{
return std::search(
haystack_start, haystack_end, needle_start, needle_end);
}
}
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
const CharT * search(const CharT * haystack_start, const size_t haystack_length) const
{
return search(haystack, haystack + haystack_size);
return search(haystack_start, haystack_start + haystack_length);
}
};
struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
{
const char * const needle;
template <typename CharT>
requires (sizeof(CharT) == 1)
LibCASCIICaseInsensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
: needle(reinterpret_cast<const char *>(needle_)) {}
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
const auto * res = strcasestr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
if (!res)
return haystack_end;
return reinterpret_cast<const CharT *>(res);
}
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
}
};
}

View File

@ -26,7 +26,7 @@ struct PositionCaseSensitiveASCII
using MultiSearcherInBigHaystack = MultiVolnitsky;
/// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
{
@ -62,7 +62,7 @@ struct PositionCaseInsensitiveASCII
/// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ true>;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
{
@ -94,7 +94,7 @@ struct PositionCaseSensitiveUTF8
{
using SearcherInBigHaystack = VolnitskyUTF8;
using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
{

View File

@ -0,0 +1,12 @@
6
6
6
6
6
6
7
7
7
7
7
7

View File

@ -0,0 +1,24 @@
-- Regression test: countSubstrings*() and position*() must see matches located
-- behind a zero byte embedded in the haystack.
-- The pattern is always read from a table column, so it is a non-constant argument.
drop table if exists tab;
create table tab (id UInt32, haystack String, pattern String) engine = MergeTree() order by id;

-- Row 1: the haystack holds three 'x' before the '\0' and three 'x' after it.
insert into tab values (1, 'aaaxxxaa\0xxx', 'x');
-- Constant haystack, pattern taken from the column.
select countSubstrings('aaaxxxaa\0xxx', pattern) from tab where id = 1;
select countSubstringsCaseInsensitive('aaaxxxaa\0xxx', pattern) from tab where id = 1;
select countSubstringsCaseInsensitiveUTF8('aaaxxxaa\0xxx', pattern) from tab where id = 1;
-- Haystack and pattern both taken from columns.
select countSubstrings(haystack, pattern) from tab where id = 1;
select countSubstringsCaseInsensitive(haystack, pattern) from tab where id = 1;
select countSubstringsCaseInsensitiveUTF8(haystack, pattern) from tab where id = 1;

-- Row 2: the only 'x' in the haystack sits directly behind the '\0'.
insert into tab values (2, 'aaaaa\0x', 'x');
-- Constant haystack, pattern taken from the column.
select position('aaaaa\0x', pattern) from tab where id = 2;
select positionCaseInsensitive('aaaaa\0x', pattern) from tab where id = 2;
select positionCaseInsensitiveUTF8('aaaaa\0x', pattern) from tab where id = 2;
-- Haystack and pattern both taken from columns.
select position(haystack, pattern) from tab where id = 2;
select positionCaseInsensitive(haystack, pattern) from tab where id = 2;
select positionCaseInsensitiveUTF8(haystack, pattern) from tab where id = 2;

drop table if exists tab;