mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
Fix countSubstrings() & position() on patterns with 0-bytes
SQL functions countSubstrings(), countSubstringsCaseInsensitive(), countSubstringsUTF8(), position(), positionCaseInsensitive(), positionUTF8() with non-const pattern argument use fallback searchers LibCASCIICaseSensitiveStringSearcher and LibCASCIICaseInsensitiveStringSearcher which call ::strstr(), resp. ::strcasestr(). These functions assume that the haystack is 0-terminated, and they even document that. However, the callers did not check if the haystack contains a 0-byte (perhaps because it's sort of expensive). As a consequence, if the haystack contained a zero byte in its payload, matches behind this zero byte were ignored. Example: create table t (id UInt32, pattern String) engine = MergeTree() order by id; insert into t values (1, 'x'); select countSubstrings('aaaxxxaa\0xxx', pattern) from t; We returned 3 before this commit, now we return 6.
This commit is contained in:
parent
baeb1811e1
commit
81bb2242fd
@ -826,66 +826,43 @@ using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
|
||||
using ASCIICaseSensitiveTokenSearcher = TokenSearcher<ASCIICaseSensitiveStringSearcher>;
|
||||
using ASCIICaseInsensitiveTokenSearcher = TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
|
||||
|
||||
|
||||
/** Uses functions from libc.
|
||||
* It makes sense to use only with short haystacks when cheap initialization is required.
|
||||
* There is no option for case-insensitive search for UTF-8 strings.
|
||||
* It is required that strings are zero-terminated.
|
||||
*/
|
||||
|
||||
struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
|
||||
/// Use only with short haystacks where cheap initialization is required.
|
||||
template <bool CaseInsensitive>
|
||||
struct StdLibASCIIStringSearcher : public StringSearcherBase
|
||||
{
|
||||
const char * const needle;
|
||||
const char * const needle_start;
|
||||
const char * const needle_end;
|
||||
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
LibCASCIICaseSensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
|
||||
: needle(reinterpret_cast<const char *>(needle_)) {}
|
||||
StdLibASCIIStringSearcher(const CharT * const needle_start_, const size_t needle_size_)
|
||||
: needle_start{reinterpret_cast<const char *>(needle_start_)}
|
||||
, needle_end{reinterpret_cast<const char *>(needle_start) + needle_size_}
|
||||
{}
|
||||
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
||||
const CharT * search(const CharT * haystack_start, const CharT * const haystack_end) const
|
||||
{
|
||||
const auto * res = strstr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
|
||||
if (!res)
|
||||
return haystack_end;
|
||||
return reinterpret_cast<const CharT *>(res);
|
||||
if constexpr (CaseInsensitive)
|
||||
{
|
||||
return std::search(
|
||||
haystack_start, haystack_end, needle_start, needle_end,
|
||||
[](char c1, char c2) {return std::toupper(c1) == std::toupper(c2);});
|
||||
}
|
||||
else
|
||||
{
|
||||
return std::search(
|
||||
haystack_start, haystack_end, needle_start, needle_end);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
||||
const CharT * search(const CharT * haystack_start, const size_t haystack_length) const
|
||||
{
|
||||
return search(haystack, haystack + haystack_size);
|
||||
return search(haystack_start, haystack_start + haystack_length);
|
||||
}
|
||||
};
|
||||
|
||||
struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
|
||||
{
|
||||
const char * const needle;
|
||||
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
LibCASCIICaseInsensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
|
||||
: needle(reinterpret_cast<const char *>(needle_)) {}
|
||||
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
||||
{
|
||||
const auto * res = strcasestr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
|
||||
if (!res)
|
||||
return haystack_end;
|
||||
return reinterpret_cast<const CharT *>(res);
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
||||
{
|
||||
return search(haystack, haystack + haystack_size);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ struct PositionCaseSensitiveASCII
|
||||
using MultiSearcherInBigHaystack = MultiVolnitsky;
|
||||
|
||||
/// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
|
||||
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
|
||||
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
|
||||
|
||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
|
||||
{
|
||||
@ -62,7 +62,7 @@ struct PositionCaseInsensitiveASCII
|
||||
/// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
|
||||
using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
|
||||
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
|
||||
using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;
|
||||
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ true>;
|
||||
|
||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
|
||||
{
|
||||
@ -94,7 +94,7 @@ struct PositionCaseSensitiveUTF8
|
||||
{
|
||||
using SearcherInBigHaystack = VolnitskyUTF8;
|
||||
using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
|
||||
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
|
||||
using SearcherInSmallHaystack = StdLibASCIIStringSearcher</*CaseInsensitive*/ false>;
|
||||
|
||||
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
|
||||
{
|
||||
|
@ -0,0 +1,12 @@
|
||||
6
|
||||
6
|
||||
6
|
||||
6
|
||||
6
|
||||
6
|
||||
7
|
||||
7
|
||||
7
|
||||
7
|
||||
7
|
||||
7
|
@ -0,0 +1,24 @@
|
||||
drop table if exists tab;
|
||||
|
||||
create table tab (id UInt32, haystack String, pattern String) engine = MergeTree() order by id;
|
||||
insert into tab values (1, 'aaaxxxaa\0xxx', 'x');
|
||||
|
||||
select countSubstrings('aaaxxxaa\0xxx', pattern) from tab where id = 1;
|
||||
select countSubstringsCaseInsensitive('aaaxxxaa\0xxx', pattern) from tab where id = 1;
|
||||
select countSubstringsCaseInsensitiveUTF8('aaaxxxaa\0xxx', pattern) from tab where id = 1;
|
||||
|
||||
select countSubstrings(haystack, pattern) from tab where id = 1;
|
||||
select countSubstringsCaseInsensitive(haystack, pattern) from tab where id = 1;
|
||||
select countSubstringsCaseInsensitiveUTF8(haystack, pattern) from tab where id = 1;
|
||||
|
||||
insert into tab values (2, 'aaaaa\0x', 'x');
|
||||
|
||||
select position('aaaaa\0x', pattern) from tab where id = 2;
|
||||
select positionCaseInsensitive('aaaaa\0x', pattern) from tab where id = 2;
|
||||
select positionCaseInsensitiveUTF8('aaaaa\0x', pattern) from tab where id = 2;
|
||||
|
||||
select position(haystack, pattern) from tab where id = 2;
|
||||
select positionCaseInsensitive(haystack, pattern) from tab where id = 2;
|
||||
select positionCaseInsensitiveUTF8(haystack, pattern) from tab where id = 2;
|
||||
|
||||
drop table if exists tab;
|
Loading…
Reference in New Issue
Block a user