mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 00:30:49 +00:00
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(), countSubstringsUTF8(), position(), positionCaseInsensitive(), positionUTF8() with a non-const pattern argument use the fallback searchers LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher, which call ::strstr() and ::strcasestr(), respectively. These libc functions assume that the haystack is 0-terminated, and they even document that. However, the callers did not check whether the haystack contains a 0-byte (perhaps because it's sort of expensive). As a consequence, if the haystack contained a zero byte in its payload, matches behind this zero byte were ignored. Example: create table t (id UInt32, pattern String) engine = MergeTree() order by id; insert into t values (1, 'x'); select countSubstrings('aaaxxxaa\0xxx', pattern) from t; We returned 3 before this commit; now we return 6.
This commit is contained in:
parent
baeb1811e1
commit
81bb2242fd
@ -826,66 +826,43 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
|
|||||||
using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
|
using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
|
||||||
using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
|
using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
|
||||||
|
|
||||||
|
/// Use only with short haystacks where cheap initialization is required.
|
||||||
/** Uses functions from libc.
|
template <bool CaseInsensitive>
|
||||||
* It makes sense to use only with short haystacks when cheap initialization is required.
|
struct StdLibASCIIStringSearcher : public StringSearcherBase
|
||||||
* There is no option for case-insensitive search for UTF-8 strings.
|
|
||||||
* It is required that strings are zero-terminated.
|
|
||||||
*/
|
|
||||||
|
|
||||||
struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
|
|
||||||
{
|
{
|
||||||
const char * const needle;
|
const char * const needle_start;
|
||||||
|
const char * const needle_end;
|
||||||
|
|
||||||
template <typename CharT>
|
template <typename CharT>
|
||||||
requires (sizeof(CharT) == 1)
|
requires (sizeof(CharT) == 1)
|
||||||
LibCASCIICaseSensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
|
StdLibASCIIStringSearcher(const CharT * const needle_start_, const size_t needle_size_)
|
||||||
: needle(reinterpret_cast<const char *>(needle_)) {}
|
: needle_start{reinterpret_cast<const char *>(needle_start_)}
|
||||||
|
, needle_end{reinterpret_cast<const char *>(needle_start) + needle_size_}
|
||||||
|
{}
|
||||||
|
|
||||||
template <typename CharT>
|
template <typename CharT>
|
||||||
requires (sizeof(CharT) == 1)
|
requires (sizeof(CharT) == 1)
|
||||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
const CharT * search(const CharT * haystack_start, const CharT * const haystack_end) const
|
||||||
{
|
{
|
||||||
const auto * res = strstr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
|
if constexpr (CaseInsensitive)
|
||||||
if (!res)
|
{
|
||||||
return haystack_end;
|
return std::search(
|
||||||
return reinterpret_cast<const CharT *>(res);
|
haystack_start, haystack_end, needle_start, needle_end,
|
||||||
|
[](char c1, char c2) {return std::toupper(c1) == std::toupper(c2);});
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return std::search(
|
||||||
|
haystack_start, haystack_end, needle_start, needle_end);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename CharT>
|
template <typename CharT>
|
||||||
requires (sizeof(CharT) == 1)
|
requires (sizeof(CharT) == 1)
|
||||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
const CharT * search(const CharT * haystack_start, const size_t haystack_length) const
|
||||||
{
|
{
|
||||||
return search(haystack, haystack + haystack_size);
|
return search(haystack_start, haystack_start + haystack_length);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
|
|
||||||
{
|
|
||||||
const char * const needle;
|
|
||||||
|
|
||||||
template <typename CharT>
|
|
||||||
requires (sizeof(CharT) == 1)
|
|
||||||
LibCASCIICaseInsensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
|
|
||||||
: needle(reinterpret_cast<const char *>(needle_)) {}
|
|
||||||
|
|
||||||
template <typename CharT>
|
|
||||||
requires (sizeof(CharT) == 1)
|
|
||||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
|
||||||
{
|
|
||||||
const auto * res = strcasestr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
|
|
||||||
if (!res)
|
|
||||||
return haystack_end;
|
|
||||||
return reinterpret_cast<const CharT *>(res);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename CharT>
|
|
||||||
requires (sizeof(CharT) == 1)
|
|
||||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
|
||||||
{
|
|
||||||
return search(haystack, haystack + haystack_size);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -26,7 +26,7 @@ struct PositionCaseSensitiveASCII
|
|||||||
using MultiSearcherInBigHaystack = MultiVolnitsky;
|
using MultiSearcherInBigHaystack = MultiVolnitsky;
|
||||||
|
|
||||||
/// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
|
/// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
|
||||||
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
|
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
|
||||||
|
|
||||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
|
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
|
||||||
{
|
{
|
||||||
@ -62,7 +62,7 @@ struct PositionCaseInsensitiveASCII
|
|||||||
/// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
|
/// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
|
||||||
using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
|
using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
|
||||||
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
|
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
|
||||||
using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;
|
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ true>;
|
||||||
|
|
||||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
|
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
|
||||||
{
|
{
|
||||||
@ -94,7 +94,7 @@ struct PositionCaseSensitiveUTF8
|
|||||||
{
|
{
|
||||||
using SearcherInBigHaystack = VolnitskyUTF8;
|
using SearcherInBigHaystack = VolnitskyUTF8;
|
||||||
using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
|
using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
|
||||||
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
|
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
|
||||||
|
|
||||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
|
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
|
||||||
{
|
{
|
||||||
|
@ -0,0 +1,12 @@
|
|||||||
|
6
|
||||||
|
6
|
||||||
|
6
|
||||||
|
6
|
||||||
|
6
|
||||||
|
6
|
||||||
|
7
|
||||||
|
7
|
||||||
|
7
|
||||||
|
7
|
||||||
|
7
|
||||||
|
7
|
@ -0,0 +1,24 @@
|
|||||||
|
drop table if exists tab;
|
||||||
|
|
||||||
|
create table tab (id UInt32, haystack String, pattern String) engine = MergeTree() order by id;
|
||||||
|
insert into tab values (1, 'aaaxxxaa\0xxx', 'x');
|
||||||
|
|
||||||
|
select countSubstrings('aaaxxxaa\0xxx', pattern) from tab where id = 1;
|
||||||
|
select countSubstringsCaseInsensitive('aaaxxxaa\0xxx', pattern) from tab where id = 1;
|
||||||
|
select countSubstringsCaseInsensitiveUTF8('aaaxxxaa\0xxx', pattern) from tab where id = 1;
|
||||||
|
|
||||||
|
select countSubstrings(haystack, pattern) from tab where id = 1;
|
||||||
|
select countSubstringsCaseInsensitive(haystack, pattern) from tab where id = 1;
|
||||||
|
select countSubstringsCaseInsensitiveUTF8(haystack, pattern) from tab where id = 1;
|
||||||
|
|
||||||
|
insert into tab values (2, 'aaaaa\0x', 'x');
|
||||||
|
|
||||||
|
select position('aaaaa\0x', pattern) from tab where id = 2;
|
||||||
|
select positionCaseInsensitive('aaaaa\0x', pattern) from tab where id = 2;
|
||||||
|
select positionCaseInsensitiveUTF8('aaaaa\0x', pattern) from tab where id = 2;
|
||||||
|
|
||||||
|
select position(haystack, pattern) from tab where id = 2;
|
||||||
|
select positionCaseInsensitive(haystack, pattern) from tab where id = 2;
|
||||||
|
select positionCaseInsensitiveUTF8(haystack, pattern) from tab where id = 2;
|
||||||
|
|
||||||
|
drop table if exists tab;
|
Loading…
Reference in New Issue
Block a user