Merge pull request #38589 from ClickHouse/fix-zero-bytes-in-haystack

Fix countSubstrings() & position() on patterns with 0-bytes
This commit is contained in:
Robert Schulze 2022-07-01 16:15:43 +02:00 committed by GitHub
commit 2a1ede0f5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 63 additions and 49 deletions

View File

@ -826,66 +826,44 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
/** Uses functions from libc.
* It makes sense to use only with short haystacks when cheap initialization is required.
* There is no option for case-insensitive search for UTF-8 strings.
* It is required that strings are zero-terminated.
*/
struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
/// Use only with short haystacks where cheap initialization is required.
/// Finds the byte range [needle_start, needle_end) inside [haystack_start, haystack_end) with std::search.
/// Both ranges are delimited by explicit end pointers, not by a terminating zero byte.
/// If CaseInsensitive is true, bytes are compared after std::toupper(); otherwise they are compared with ==.
template <bool CaseInsensitive>
struct StdLibASCIIStringSearcher : public StringSearcherBase
{
const char * const needle;
/// First byte of the needle and one-past-its-last byte. Only pointers are stored, the needle is not copied.
const char * const needle_start;
const char * const needle_end;
template <typename CharT>
requires (sizeof(CharT) == 1)
LibCASCIICaseSensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
: needle(reinterpret_cast<const char *>(needle_)) {}
/// needle_end is computed from the member needle_start, which is declared (and therefore initialized) before it.
StdLibASCIIStringSearcher(const CharT * const needle_start_, const size_t needle_size_)
: needle_start{reinterpret_cast<const char *>(needle_start_)}
, needle_end{reinterpret_cast<const char *>(needle_start) + needle_size_}
{}
/// Returns a pointer to the first occurrence of the needle in the haystack.
/// std::search returns its `last` argument (here: haystack_end) when there is no match.
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
const CharT * search(const CharT * haystack_start, const CharT * const haystack_end) const
{
// strstr-based variant: both arguments are read as zero-terminated strings,
// haystack_end is only used as the "not found" result.
const auto * res = strstr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
if (!res)
return haystack_end;
return reinterpret_cast<const CharT *>(res);
if constexpr (CaseInsensitive)
{
// NOTE(review): std::toupper() takes an int that must be representable as unsigned char (or be EOF).
// c1/c2 are plain char, so bytes >= 0x80 may arrive as negative values where char is signed - confirm
// this is acceptable or convert to unsigned char first.
return std::search(
haystack_start, haystack_end, needle_start, needle_end,
[](char c1, char c2) {return std::toupper(c1) == std::toupper(c2);});
}
else
{
return std::search(
haystack_start, haystack_end, needle_start, needle_end,
[](char c1, char c2) {return c1 == c2;});
}
}
/// Convenience overload: the haystack is given as start pointer + length instead of start/end pointers.
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
const CharT * search(const CharT * haystack_start, const size_t haystack_length) const
{
return search(haystack, haystack + haystack_size);
return search(haystack_start, haystack_start + haystack_length);
}
};
/// Case-insensitive substring search that delegates to strcasestr().
/// The needle size passed to the constructor is ignored and the search is not bounded by haystack_end,
/// so needle and haystack are both read as zero-terminated strings.
struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
{
/// Start of the needle; only the pointer is stored.
const char * const needle;
template <typename CharT>
requires (sizeof(CharT) == 1)
LibCASCIICaseInsensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
: needle(reinterpret_cast<const char *>(needle_)) {}
/// Returns a pointer to the match found by strcasestr(), or haystack_end if strcasestr() returns null.
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
const auto * res = strcasestr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
if (!res)
return haystack_end;
return reinterpret_cast<const CharT *>(res);
}
/// Convenience overload: the haystack is given as start pointer + size instead of start/end pointers.
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
}
};
}

View File

@ -26,7 +26,7 @@ struct PositionCaseSensitiveASCII
using MultiSearcherInBigHaystack = MultiVolnitsky;
/// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
{
@ -62,7 +62,7 @@ struct PositionCaseInsensitiveASCII
/// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ true>;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
{
@ -94,7 +94,7 @@ struct PositionCaseSensitiveUTF8
{
using SearcherInBigHaystack = VolnitskyUTF8;
using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
{

View File

@ -0,0 +1,12 @@
6
6
6
6
6
6
7
7
7
7
7
7

View File

@ -0,0 +1,24 @@
-- Regression test: countSubstrings*() and position*() with a pattern read from a table column
-- and a haystack that contains a zero byte. Every function is run once with a constant haystack
-- and once with the haystack taken from the table.
drop table if exists tab;
create table tab (id UInt32, haystack String, pattern String) engine = MergeTree() order by id;
-- Row 1: 'x' occurs six times in the haystack, three of them after the \0 byte.
insert into tab values (1, 'aaaxxxaa\0xxx', 'x');
-- Constant haystack, pattern from the table.
select countSubstrings('aaaxxxaa\0xxx', pattern) from tab where id = 1;
select countSubstringsCaseInsensitive('aaaxxxaa\0xxx', pattern) from tab where id = 1;
select countSubstringsCaseInsensitiveUTF8('aaaxxxaa\0xxx', pattern) from tab where id = 1;
-- Haystack and pattern both from the table.
select countSubstrings(haystack, pattern) from tab where id = 1;
select countSubstringsCaseInsensitive(haystack, pattern) from tab where id = 1;
select countSubstringsCaseInsensitiveUTF8(haystack, pattern) from tab where id = 1;
-- Row 2: the only 'x' is the 7th byte of the haystack, directly after the \0 byte.
insert into tab values (2, 'aaaaa\0x', 'x');
-- Constant haystack, pattern from the table.
select position('aaaaa\0x', pattern) from tab where id = 2;
select positionCaseInsensitive('aaaaa\0x', pattern) from tab where id = 2;
select positionCaseInsensitiveUTF8('aaaaa\0x', pattern) from tab where id = 2;
-- Haystack and pattern both from the table.
select position(haystack, pattern) from tab where id = 2;
select positionCaseInsensitive(haystack, pattern) from tab where id = 2;
select positionCaseInsensitiveUTF8(haystack, pattern) from tab where id = 2;
drop table if exists tab;