2020-05-06 23:21:13 +00:00
|
|
|
#pragma once
|
|
|
|
|
2020-07-05 15:57:59 +00:00
|
|
|
#include <type_traits>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/types.h>
|
2020-05-06 23:21:13 +00:00
|
|
|
#include <Common/Volnitsky.h>
|
|
|
|
#include <Columns/ColumnString.h>
|
2022-05-16 20:23:51 +00:00
|
|
|
#include <Core/ColumnNumbers.h>
|
2020-05-06 23:21:13 +00:00
|
|
|
#include "Regexps.h"
|
|
|
|
|
2022-09-28 12:35:02 +00:00
|
|
|
#include "config.h"
|
2022-01-17 18:45:54 +00:00
|
|
|
#include <re2_st/re2.h>
|
2020-05-06 23:21:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int ILLEGAL_COLUMN;
|
|
|
|
}
|
|
|
|
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
namespace impl
|
|
|
|
{
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2020-07-05 15:57:59 +00:00
|
|
|
/// Is the [I]LIKE expression reduced to finding a substring in a string?
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
/// Checks whether a [I]LIKE pattern has the shape %literal%, in which case matching it is
/// the same as searching for `literal` inside the haystack.
///
/// Returns true and stores the literal (escape backslashes removed) in `res` on success.
/// Returns false if the pattern is not enclosed in '%', if an unescaped '%' or '_' occurs
/// between the enclosing '%', or if the closing '%' is itself escaped by a backslash.
/// Patterns shorter than two characters leave `res` untouched; otherwise `res` is cleared
/// first and may be partially filled even when false is returned.
inline bool likePatternIsSubstring(std::string_view pattern, String & res)
{
    /// At the very least an opening and a closing '%' are needed.
    if (pattern.size() < 2 || pattern.front() != '%' || pattern.back() != '%')
        return false;

    res.clear();
    res.reserve(pattern.size() - 2);

    /// Visit the characters strictly between the two enclosing '%'.
    const size_t last = pattern.size() - 1;
    for (size_t idx = 1; idx < last; ++idx)
    {
        char ch = pattern[idx];

        /// An inner wildcard means that a plain substring search is not sufficient.
        if (ch == '%' || ch == '_')
            return false;

        if (ch == '\\')
        {
            /// The backslash makes the following character a literal one.
            ++idx;

            /// ... unless that character is the closing '%', then the pattern does not end with a wildcard.
            if (idx == last)
                return false;

            ch = pattern[idx];
        }

        res += ch;
    }

    return true;
}
|
|
|
|
|
2022-05-13 08:52:25 +00:00
|
|
|
}
|
|
|
|
|
2022-05-24 12:03:14 +00:00
|
|
|
/// Enumerations used as template arguments of MatchImpl<>, so that its instantiations are easier to read.
struct MatchTraits
{
    /// How the needle is written: as a LIKE pattern or as a re2 regular expression.
    enum class Syntax { Like, Re2 };

    /// Case sensitivity of the match.
    enum class Case { Sensitive, Insensitive };

    /// Whether the match result is returned as is or inverted.
    enum class Result { DontNegate, Negate };
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* NOTE: We want to run regexp search for whole columns by one call (as implemented in function 'position')
|
|
|
|
* but for that, regexp engine must support \0 bytes and their interpretation as string boundaries.
|
|
|
|
*/
|
|
|
|
template <typename Name, MatchTraits::Syntax syntax_, MatchTraits::Case case_, MatchTraits::Result result_>
|
2020-05-06 23:21:13 +00:00
|
|
|
struct MatchImpl
|
|
|
|
{
|
|
|
|
static constexpr bool use_default_implementation_for_constants = true;
|
2020-08-01 21:14:23 +00:00
|
|
|
static constexpr bool supports_start_pos = false;
|
2021-09-21 16:43:46 +00:00
|
|
|
static constexpr auto name = Name::name;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {2};}
|
2022-05-16 20:23:51 +00:00
|
|
|
|
2020-05-06 23:21:13 +00:00
|
|
|
using ResultType = UInt8;
|
|
|
|
|
2022-05-24 12:03:14 +00:00
|
|
|
static constexpr bool is_like = (syntax_ == MatchTraits::Syntax::Like);
|
|
|
|
static constexpr bool case_insensitive = (case_ == MatchTraits::Case::Insensitive);
|
|
|
|
static constexpr bool negate = (result_ == MatchTraits::Result::Negate);
|
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
using Searcher = std::conditional_t<case_insensitive, VolnitskyCaseInsensitiveUTF8, VolnitskyUTF8>;
|
2020-07-05 15:57:59 +00:00
|
|
|
|
2020-05-06 23:21:13 +00:00
|
|
|
static void vectorConstant(
|
2022-05-13 08:52:25 +00:00
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
|
|
|
const String & needle,
|
2022-05-31 07:15:59 +00:00
|
|
|
[[maybe_unused]] const ColumnPtr & start_pos_,
|
2020-08-01 21:14:23 +00:00
|
|
|
PaddedPODArray<UInt8> & res)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-05-24 19:16:47 +00:00
|
|
|
const size_t haystack_size = haystack_offsets.size();
|
|
|
|
|
2022-05-31 07:15:59 +00:00
|
|
|
assert(haystack_size == res.size());
|
|
|
|
assert(start_pos_ == nullptr);
|
2020-08-01 21:14:23 +00:00
|
|
|
|
2022-05-13 08:52:25 +00:00
|
|
|
if (haystack_offsets.empty())
|
2020-05-06 23:21:13 +00:00
|
|
|
return;
|
|
|
|
|
2023-01-13 06:55:54 +00:00
|
|
|
/// Fast path for [I]LIKE, because the result is always true or false
|
2023-01-13 04:20:03 +00:00
|
|
|
/// col [i]like '%%'
|
|
|
|
/// col not [i]like '%%'
|
|
|
|
/// col like '%'
|
|
|
|
/// col not [i]like '%'
|
2023-01-13 06:55:54 +00:00
|
|
|
/// match(like, '^$')
|
|
|
|
if ((is_like && (needle == "%%" or needle == "%")) || (!is_like && needle == ".*"))
|
2023-01-13 04:20:03 +00:00
|
|
|
{
|
|
|
|
for (auto & re : res)
|
|
|
|
re = !negate;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
/// Special case that the [I]LIKE expression reduces to finding a substring in a string
|
2022-05-11 11:54:26 +00:00
|
|
|
String strstr_pattern;
|
2022-05-24 12:03:14 +00:00
|
|
|
if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern))
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-05-13 08:52:25 +00:00
|
|
|
const UInt8 * const begin = haystack_data.data();
|
|
|
|
const UInt8 * const end = haystack_data.data() + haystack_data.size();
|
2020-05-06 23:21:13 +00:00
|
|
|
const UInt8 * pos = begin;
|
|
|
|
|
|
|
|
/// The current index in the array of strings.
|
|
|
|
size_t i = 0;
|
|
|
|
|
|
|
|
/// TODO You need to make that `searcher` is common to all the calls of the function.
|
2020-07-05 15:57:59 +00:00
|
|
|
Searcher searcher(strstr_pattern.data(), strstr_pattern.size(), end - pos);
|
2020-05-06 23:21:13 +00:00
|
|
|
|
|
|
|
/// We will search for the next occurrence in all rows at once.
|
|
|
|
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
|
|
|
|
{
|
|
|
|
/// Let's determine which index it refers to.
|
2022-05-13 08:52:25 +00:00
|
|
|
while (begin + haystack_offsets[i] <= pos)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-05-13 08:52:25 +00:00
|
|
|
res[i] = negate;
|
2020-05-06 23:21:13 +00:00
|
|
|
++i;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// We check that the entry does not pass through the boundaries of strings.
|
2022-05-13 08:52:25 +00:00
|
|
|
if (pos + strstr_pattern.size() < begin + haystack_offsets[i])
|
|
|
|
res[i] = !negate;
|
2020-05-06 23:21:13 +00:00
|
|
|
else
|
2022-05-13 08:52:25 +00:00
|
|
|
res[i] = negate;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-05-13 08:52:25 +00:00
|
|
|
pos = begin + haystack_offsets[i];
|
2020-05-06 23:21:13 +00:00
|
|
|
++i;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Tail, in which there can be no substring.
|
|
|
|
if (i < res.size())
|
2022-05-13 08:52:25 +00:00
|
|
|
memset(&res[i], negate, (res.size() - i) * sizeof(res[0]));
|
2022-11-04 09:09:48 +00:00
|
|
|
|
|
|
|
return;
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
const auto & regexp = Regexps::Regexp(Regexps::createRegexp<is_like, /*no_capture*/ true, case_insensitive>(needle));
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
String required_substring;
|
|
|
|
bool is_trivial;
|
|
|
|
bool required_substring_is_prefix; /// for `anchored` execution of the regexp.
|
|
|
|
|
|
|
|
regexp.getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix);
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
if (required_substring.empty())
|
|
|
|
{
|
|
|
|
if (!regexp.getRE2()) /// An empty regexp. Always matches.
|
|
|
|
memset(res.data(), !negate, haystack_size * sizeof(res[0]));
|
|
|
|
else
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-11-04 09:09:48 +00:00
|
|
|
size_t prev_offset = 0;
|
|
|
|
for (size_t i = 0; i < haystack_size; ++i)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-11-04 09:09:48 +00:00
|
|
|
const bool match = regexp.getRE2()->Match(
|
|
|
|
{reinterpret_cast<const char *>(&haystack_data[prev_offset]), haystack_offsets[i] - prev_offset - 1},
|
|
|
|
0,
|
|
|
|
haystack_offsets[i] - prev_offset - 1,
|
|
|
|
re2_st::RE2::UNANCHORED,
|
|
|
|
nullptr,
|
|
|
|
0);
|
|
|
|
res[i] = negate ^ match;
|
|
|
|
|
|
|
|
prev_offset = haystack_offsets[i];
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
}
|
2022-11-04 09:09:48 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/// NOTE This almost matches with the case of impl::likePatternIsSubstring.
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
const UInt8 * const begin = haystack_data.data();
|
|
|
|
const UInt8 * const end = haystack_data.begin() + haystack_data.size();
|
|
|
|
const UInt8 * pos = begin;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
/// The current index in the array of strings.
|
|
|
|
size_t i = 0;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
Searcher searcher(required_substring.data(), required_substring.size(), end - pos);
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
/// We will search for the next occurrence in all rows at once.
|
|
|
|
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
|
|
|
|
{
|
|
|
|
/// Determine which index it refers to.
|
|
|
|
while (begin + haystack_offsets[i] <= pos)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-11-04 09:09:48 +00:00
|
|
|
res[i] = negate;
|
|
|
|
++i;
|
|
|
|
}
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
/// We check that the entry does not pass through the boundaries of strings.
|
|
|
|
if (pos + required_substring.size() < begin + haystack_offsets[i])
|
|
|
|
{
|
|
|
|
/// And if it does not, if necessary, we check the regexp.
|
|
|
|
if (is_trivial)
|
|
|
|
res[i] = !negate;
|
|
|
|
else
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-11-04 09:09:48 +00:00
|
|
|
const char * str_data = reinterpret_cast<const char *>(&haystack_data[haystack_offsets[i - 1]]);
|
|
|
|
size_t str_size = haystack_offsets[i] - haystack_offsets[i - 1] - 1;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
/** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
|
|
|
|
* so that it can match when `required_substring` occurs into the string several times,
|
|
|
|
* and at the first occurrence, the regexp is not a match.
|
|
|
|
*/
|
|
|
|
const size_t start_pos = (required_substring_is_prefix) ? (reinterpret_cast<const char *>(pos) - str_data) : 0;
|
|
|
|
const size_t end_pos = str_size;
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
const bool match = regexp.getRE2()->Match(
|
|
|
|
{str_data, str_size},
|
|
|
|
start_pos,
|
|
|
|
end_pos,
|
|
|
|
re2_st::RE2::UNANCHORED,
|
|
|
|
nullptr,
|
|
|
|
0);
|
|
|
|
res[i] = negate ^ match;
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
}
|
2022-11-04 09:09:48 +00:00
|
|
|
else
|
|
|
|
res[i] = negate;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
pos = begin + haystack_offsets[i];
|
|
|
|
++i;
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
2022-11-04 09:09:48 +00:00
|
|
|
|
|
|
|
/// Tail, in which there can be no substring.
|
|
|
|
if (i < res.size())
|
|
|
|
memset(&res[i], negate, (res.size() - i) * sizeof(res[0]));
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Very carefully crafted copy-paste.
|
|
|
|
static void vectorFixedConstant(
|
2022-05-13 08:52:25 +00:00
|
|
|
const ColumnString::Chars & haystack,
|
|
|
|
size_t N,
|
|
|
|
const String & needle,
|
2020-08-01 21:14:23 +00:00
|
|
|
PaddedPODArray<UInt8> & res)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-05-24 19:16:47 +00:00
|
|
|
const size_t haystack_size = haystack.size() / N;
|
|
|
|
|
2022-05-31 07:15:59 +00:00
|
|
|
assert(haystack_size == res.size());
|
2022-05-24 19:16:47 +00:00
|
|
|
|
2022-05-13 08:52:25 +00:00
|
|
|
if (haystack.empty())
|
2020-05-06 23:21:13 +00:00
|
|
|
return;
|
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
/// Special case that the [I]LIKE expression reduces to finding a substring in a string
|
2022-05-11 11:54:26 +00:00
|
|
|
String strstr_pattern;
|
2022-05-24 12:03:14 +00:00
|
|
|
if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern))
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-05-13 08:52:25 +00:00
|
|
|
const UInt8 * const begin = haystack.data();
|
|
|
|
const UInt8 * const end = haystack.data() + haystack.size();
|
2020-05-06 23:21:13 +00:00
|
|
|
const UInt8 * pos = begin;
|
|
|
|
|
|
|
|
size_t i = 0;
|
|
|
|
const UInt8 * next_pos = begin;
|
|
|
|
|
2022-05-13 08:52:25 +00:00
|
|
|
/// If needle is larger than string size - it cannot be found.
|
|
|
|
if (strstr_pattern.size() <= N)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2020-07-05 15:57:59 +00:00
|
|
|
Searcher searcher(strstr_pattern.data(), strstr_pattern.size(), end - pos);
|
2020-05-06 23:21:13 +00:00
|
|
|
|
|
|
|
/// We will search for the next occurrence in all rows at once.
|
|
|
|
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
|
|
|
|
{
|
|
|
|
/// Let's determine which index it refers to.
|
2022-05-13 08:52:25 +00:00
|
|
|
while (next_pos + N <= pos)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-05-13 08:52:25 +00:00
|
|
|
res[i] = negate;
|
|
|
|
next_pos += N;
|
2020-05-06 23:21:13 +00:00
|
|
|
++i;
|
|
|
|
}
|
2022-05-13 08:52:25 +00:00
|
|
|
next_pos += N;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
|
|
|
/// We check that the entry does not pass through the boundaries of strings.
|
|
|
|
if (pos + strstr_pattern.size() <= next_pos)
|
2022-05-13 08:52:25 +00:00
|
|
|
res[i] = !negate;
|
2020-05-06 23:21:13 +00:00
|
|
|
else
|
2022-05-13 08:52:25 +00:00
|
|
|
res[i] = negate;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
|
|
|
pos = next_pos;
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Tail, in which there can be no substring.
|
|
|
|
if (i < res.size())
|
2022-05-13 08:52:25 +00:00
|
|
|
memset(&res[i], negate, (res.size() - i) * sizeof(res[0]));
|
2022-11-04 09:09:48 +00:00
|
|
|
|
|
|
|
return;
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
const auto & regexp = Regexps::Regexp(Regexps::createRegexp<is_like, /*no_capture*/ true, case_insensitive>(needle));
|
|
|
|
|
|
|
|
String required_substring;
|
|
|
|
bool is_trivial;
|
|
|
|
bool required_substring_is_prefix; /// for `anchored` execution of the regexp.
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
regexp.getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix);
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
if (required_substring.empty())
|
|
|
|
{
|
|
|
|
if (!regexp.getRE2()) /// An empty regexp. Always matches.
|
|
|
|
memset(res.data(), !negate, haystack_size * sizeof(res[0]));
|
|
|
|
else
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-11-04 09:09:48 +00:00
|
|
|
size_t offset = 0;
|
|
|
|
for (size_t i = 0; i < haystack_size; ++i)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-11-04 09:09:48 +00:00
|
|
|
const bool match = regexp.getRE2()->Match(
|
|
|
|
{reinterpret_cast<const char *>(&haystack[offset]), N},
|
|
|
|
0,
|
|
|
|
N,
|
|
|
|
re2_st::RE2::UNANCHORED,
|
|
|
|
nullptr,
|
|
|
|
0);
|
|
|
|
res[i] = negate ^ match;
|
|
|
|
|
|
|
|
offset += N;
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
}
|
2022-11-04 09:09:48 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/// NOTE This almost matches with the case of likePatternIsSubstring.
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
const UInt8 * const begin = haystack.data();
|
|
|
|
const UInt8 * const end = haystack.data() + haystack.size();
|
|
|
|
const UInt8 * pos = begin;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
size_t i = 0;
|
|
|
|
const UInt8 * next_pos = begin;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
/// If required substring is larger than string size - it cannot be found.
|
|
|
|
if (required_substring.size() <= N)
|
|
|
|
{
|
|
|
|
Searcher searcher(required_substring.data(), required_substring.size(), end - pos);
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
/// We will search for the next occurrence in all rows at once.
|
|
|
|
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
|
|
|
|
{
|
|
|
|
/// Let's determine which index it refers to.
|
|
|
|
while (next_pos + N <= pos)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-11-04 09:09:48 +00:00
|
|
|
res[i] = negate;
|
2022-05-13 08:52:25 +00:00
|
|
|
next_pos += N;
|
2022-11-04 09:09:48 +00:00
|
|
|
++i;
|
|
|
|
}
|
|
|
|
next_pos += N;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
if (pos + required_substring.size() <= next_pos)
|
|
|
|
{
|
|
|
|
/// And if it does not, if necessary, we check the regexp.
|
|
|
|
if (is_trivial)
|
|
|
|
res[i] = !negate;
|
2020-05-06 23:21:13 +00:00
|
|
|
else
|
2022-11-04 09:09:48 +00:00
|
|
|
{
|
|
|
|
const char * str_data = reinterpret_cast<const char *>(next_pos - N);
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
/** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
|
|
|
|
* so that it can match when `required_substring` occurs into the string several times,
|
|
|
|
* and at the first occurrence, the regexp is not a match.
|
|
|
|
*/
|
|
|
|
const size_t start_pos = (required_substring_is_prefix) ? (reinterpret_cast<const char *>(pos) - str_data) : 0;
|
|
|
|
const size_t end_pos = N;
|
|
|
|
|
|
|
|
const bool match = regexp.getRE2()->Match(
|
|
|
|
{str_data, N},
|
|
|
|
start_pos,
|
|
|
|
end_pos,
|
|
|
|
re2_st::RE2::UNANCHORED,
|
|
|
|
nullptr,
|
|
|
|
0);
|
|
|
|
res[i] = negate ^ match;
|
|
|
|
}
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
2022-11-04 09:09:48 +00:00
|
|
|
else
|
|
|
|
res[i] = negate;
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-11-04 09:09:48 +00:00
|
|
|
pos = next_pos;
|
|
|
|
++i;
|
|
|
|
}
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
2022-11-04 09:09:48 +00:00
|
|
|
|
|
|
|
/// Tail, in which there can be no substring.
|
|
|
|
if (i < res.size())
|
|
|
|
memset(&res[i], negate, (res.size() - i) * sizeof(res[0]));
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
static void vectorVector(
|
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
|
|
|
const ColumnString::Chars & needle_data,
|
|
|
|
const ColumnString::Offsets & needle_offset,
|
2022-05-31 07:15:59 +00:00
|
|
|
[[maybe_unused]] const ColumnPtr & start_pos_,
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
PaddedPODArray<UInt8> & res)
|
|
|
|
{
|
|
|
|
const size_t haystack_size = haystack_offsets.size();
|
|
|
|
|
2022-05-31 07:15:59 +00:00
|
|
|
assert(haystack_size == needle_offset.size());
|
|
|
|
assert(haystack_size == res.size());
|
|
|
|
assert(start_pos_ == nullptr);
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
|
|
|
|
if (haystack_offsets.empty())
|
|
|
|
return;
|
|
|
|
|
|
|
|
String required_substr;
|
|
|
|
bool is_trivial;
|
|
|
|
bool required_substring_is_prefix; /// for `anchored` execution of the regexp.
|
|
|
|
|
|
|
|
size_t prev_haystack_offset = 0;
|
|
|
|
size_t prev_needle_offset = 0;
|
|
|
|
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help, but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in seconds, averaged over multiple measurements:
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. I was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
In conclusion, this commit implements two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now uses method 5.
2022-05-27 10:40:53 +00:00
|
|
|
Regexps::LocalCacheTable cache;
|
|
|
|
Regexps::RegexpPtr regexp;
|
|
|
|
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
for (size_t i = 0; i < haystack_size; ++i)
|
|
|
|
{
|
|
|
|
const auto * const cur_haystack_data = &haystack_data[prev_haystack_offset];
|
|
|
|
const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset - 1;
|
|
|
|
|
|
|
|
const auto * const cur_needle_data = &needle_data[prev_needle_offset];
|
|
|
|
const size_t cur_needle_length = needle_offset[i] - prev_needle_offset - 1;
|
|
|
|
|
|
|
|
const auto & needle = String(
|
|
|
|
reinterpret_cast<const char *>(cur_needle_data),
|
|
|
|
cur_needle_length);
|
|
|
|
|
2022-05-24 12:03:14 +00:00
|
|
|
if (is_like && impl::likePatternIsSubstring(needle, required_substr))
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
{
|
|
|
|
if (required_substr.size() > cur_haystack_length)
|
|
|
|
res[i] = negate;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length);
|
|
|
|
const auto * match = searcher.search(cur_haystack_data, cur_haystack_length);
|
2022-05-24 18:55:04 +00:00
|
|
|
res[i] = negate ^ (match != cur_haystack_data + cur_haystack_length);
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2022-08-15 18:58:46 +00:00
|
|
|
regexp = cache.getOrSet<is_like, /*no_capture*/ true, case_insensitive>(needle);
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle already occurred as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
across parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to periodic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix);
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
|
|
|
|
if (required_substr.empty())
|
|
|
|
{
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle already occurred as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
across parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to periodic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
if (!regexp->getRE2()) /// An empty regexp. Always matches.
|
2022-05-24 20:59:48 +00:00
|
|
|
res[i] = !negate;
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
else
|
|
|
|
{
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle already occurred as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
across parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to periodic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
const bool match = regexp->getRE2()->Match(
|
2022-05-24 18:55:04 +00:00
|
|
|
{reinterpret_cast<const char *>(cur_haystack_data), cur_haystack_length},
|
|
|
|
0,
|
|
|
|
cur_haystack_length,
|
|
|
|
re2_st::RE2::UNANCHORED,
|
|
|
|
nullptr,
|
|
|
|
0);
|
|
|
|
res[i] = negate ^ match;
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length);
|
|
|
|
const auto * match = searcher.search(cur_haystack_data, cur_haystack_length);
|
|
|
|
|
|
|
|
if (match == cur_haystack_data + cur_haystack_length)
|
|
|
|
res[i] = negate; // no match
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (is_trivial)
|
|
|
|
res[i] = !negate; // no wildcards in pattern
|
|
|
|
else
|
|
|
|
{
|
|
|
|
const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0;
|
|
|
|
const size_t end_pos = cur_haystack_length;
|
|
|
|
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle already occurred as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
across parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to periodic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
const bool match2 = regexp->getRE2()->Match(
|
2022-05-24 18:55:04 +00:00
|
|
|
{reinterpret_cast<const char *>(cur_haystack_data), cur_haystack_length},
|
|
|
|
start_pos,
|
|
|
|
end_pos,
|
|
|
|
re2_st::RE2::UNANCHORED,
|
|
|
|
nullptr,
|
|
|
|
0);
|
|
|
|
res[i] = negate ^ match2;
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
prev_haystack_offset = haystack_offsets[i];
|
|
|
|
prev_needle_offset = needle_offset[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vectorFixedVector(
|
|
|
|
const ColumnString::Chars & haystack,
|
|
|
|
size_t N,
|
|
|
|
const ColumnString::Chars & needle_data,
|
|
|
|
const ColumnString::Offsets & needle_offset,
|
2022-05-31 07:15:59 +00:00
|
|
|
[[maybe_unused]] const ColumnPtr & start_pos_,
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
PaddedPODArray<UInt8> & res)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
const size_t haystack_size = haystack.size()/N;
|
|
|
|
|
2022-05-31 07:15:59 +00:00
|
|
|
assert(haystack_size == needle_offset.size());
|
|
|
|
assert(haystack_size == res.size());
|
|
|
|
assert(start_pos_ == nullptr);
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
|
|
|
|
if (haystack.empty())
|
|
|
|
return;
|
|
|
|
|
|
|
|
String required_substr;
|
|
|
|
bool is_trivial;
|
|
|
|
bool required_substring_is_prefix; // for `anchored` execution of the regexp.
|
|
|
|
|
|
|
|
size_t prev_haystack_offset = 0;
|
|
|
|
size_t prev_needle_offset = 0;
|
|
|
|
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help, but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over multiple measurements:
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. I was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
well. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implements two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now uses method 5.
2022-05-27 10:40:53 +00:00
|
|
|
Regexps::LocalCacheTable cache;
|
|
|
|
Regexps::RegexpPtr regexp;
|
|
|
|
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
for (size_t i = 0; i < haystack_size; ++i)
|
|
|
|
{
|
|
|
|
const auto * const cur_haystack_data = &haystack[prev_haystack_offset];
|
|
|
|
const size_t cur_haystack_length = N;
|
|
|
|
|
|
|
|
const auto * const cur_needle_data = &needle_data[prev_needle_offset];
|
|
|
|
const size_t cur_needle_length = needle_offset[i] - prev_needle_offset - 1;
|
|
|
|
|
|
|
|
const auto & needle = String(
|
|
|
|
reinterpret_cast<const char *>(cur_needle_data),
|
|
|
|
cur_needle_length);
|
|
|
|
|
2022-05-24 12:03:14 +00:00
|
|
|
if (is_like && impl::likePatternIsSubstring(needle, required_substr))
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
{
|
|
|
|
if (required_substr.size() > cur_haystack_length)
|
|
|
|
res[i] = negate;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length);
|
|
|
|
const auto * match = searcher.search(cur_haystack_data, cur_haystack_length);
|
2022-05-24 18:55:04 +00:00
|
|
|
res[i] = negate ^ (match != cur_haystack_data + cur_haystack_length);
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2022-08-15 18:58:46 +00:00
|
|
|
regexp = cache.getOrSet<is_like, /*no_capture*/ true, case_insensitive>(needle);
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle already occurred as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
across parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to periodic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix);
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
|
|
|
|
if (required_substr.empty())
|
|
|
|
{
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle already occurred as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
across parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to periodic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
if (!regexp->getRE2()) /// An empty regexp. Always matches.
|
2022-05-24 20:59:48 +00:00
|
|
|
res[i] = !negate;
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
else
|
|
|
|
{
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat, and this commit allows
skipping compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle already occurred as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows reusing compiled patterns
across parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to periodic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
const bool match = regexp->getRE2()->Match(
|
2022-05-24 18:55:04 +00:00
|
|
|
{reinterpret_cast<const char *>(cur_haystack_data), cur_haystack_length},
|
|
|
|
0,
|
|
|
|
cur_haystack_length,
|
|
|
|
re2_st::RE2::UNANCHORED,
|
|
|
|
nullptr,
|
|
|
|
0);
|
|
|
|
res[i] = negate ^ match;
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length);
|
|
|
|
const auto * match = searcher.search(cur_haystack_data, cur_haystack_length);
|
|
|
|
|
|
|
|
if (match == cur_haystack_data + cur_haystack_length)
|
|
|
|
res[i] = negate; // no match
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (is_trivial)
|
|
|
|
res[i] = !negate; // no wildcards in pattern
|
|
|
|
else
|
|
|
|
{
|
|
|
|
const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0;
|
|
|
|
const size_t end_pos = cur_haystack_length;
|
|
|
|
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat, and this commit allows
skipping compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle already occurred as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows reusing compiled patterns
across parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to periodic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
const bool match2 = regexp->getRE2()->Match(
|
2022-05-24 18:55:04 +00:00
|
|
|
{reinterpret_cast<const char *>(cur_haystack_data), cur_haystack_length},
|
|
|
|
start_pos,
|
|
|
|
end_pos,
|
|
|
|
re2_st::RE2::UNANCHORED,
|
|
|
|
nullptr,
|
|
|
|
0);
|
|
|
|
res[i] = negate ^ match2;
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
prev_haystack_offset += N;
|
|
|
|
prev_needle_offset = needle_offset[i];
|
|
|
|
}
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Entry point for the "constant haystack, non-constant needle" argument combination.
/// Accepts any number of arguments of any type, ignores all of them, and unconditionally
/// throws Exception with ErrorCodes::ILLEGAL_COLUMN; it never returns normally.
/// NOTE(review): `name` (substituted into the error message) is not declared in this block —
/// presumably it is the SQL function name supplied by the enclosing struct; confirm there.
template <typename... Args>
|
|
|
|
static void constantVector(Args &&...)
|
|
|
|
{
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support search with non-constant needles in constant haystack", name);
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
}
|