Move check for regexp array size into implementations

- This is not needed for non-const regexp array arguments (the
  cardinality of arrays is fixed per column) but it cleans up the code
  and runs the check only in functions which have restrictions on the
  number of patterns.

- For functions using hyperscan, it was checked that the number of
regexes is < 2^32. Removed the check because I don't think anyone will
ever specify 4 billion patterns.
This commit is contained in:
Robert Schulze 2022-06-24 16:42:39 +02:00
parent 7913edc172
commit 3478db9fb6
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
12 changed files with 32 additions and 43 deletions

View File

@ -21,16 +21,13 @@ namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int FUNCTION_NOT_ALLOWED;
}
template <typename Impl, size_t LimitArgs>
template <typename Impl>
class FunctionsMultiStringFuzzySearch : public IFunction
{
static_assert(LimitArgs > 0);
public:
static constexpr auto name = Impl::name;
static FunctionPtr create(ContextPtr context)
@ -96,11 +93,6 @@ public:
Array src_arr = col_const_arr->getValue<Array>();
if (src_arr.size() > LimitArgs)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be at most {}",
getName(), std::to_string(src_arr.size()), std::to_string(LimitArgs));
std::vector<std::string_view> refs;
refs.reserve(src_arr.size());

View File

@ -36,18 +36,13 @@ namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int FUNCTION_NOT_ALLOWED;
}
/// The argument limiting raises from Volnitsky searcher -- it is performance crucial to save only one byte for pattern number.
/// But some other searchers use this function, for example, multiMatchAny -- hyperscan does not have such restrictions
template <typename Impl, size_t LimitArgs = std::numeric_limits<UInt8>::max()>
template <typename Impl>
class FunctionsMultiStringSearch : public IFunction
{
static_assert(LimitArgs > 0);
public:
static constexpr auto name = Impl::name;
static FunctionPtr create(ContextPtr context)
@ -97,12 +92,6 @@ public:
Array src_arr = col_const_arr->getValue<Array>();
if (src_arr.size() > LimitArgs)
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be at most {}",
getName(), std::to_string(src_arr.size()), std::to_string(LimitArgs));
std::vector<std::string_view> refs;
refs.reserve(src_arr.size());

View File

@ -7,6 +7,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
template <typename Name, typename Impl>
struct MultiSearchFirstIndexImpl
{
@ -28,6 +33,12 @@ struct MultiSearchFirstIndexImpl
size_t /*max_hyperscan_regexp_length*/,
size_t /*max_hyperscan_regexp_total_length*/)
{
// For performance of Volnitsky search, it is crucial to save only one byte for pattern number.
if (needles.size() > std::numeric_limits<UInt8>::max())
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be at most {}",
name, std::to_string(needles.size()), std::to_string(std::numeric_limits<UInt8>::max()));
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
const size_t haystack_string_size = haystack_offsets.size();
res.resize(haystack_string_size);

View File

@ -7,6 +7,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
template <typename Name, typename Impl>
struct MultiSearchImpl
{
@ -28,6 +33,12 @@ struct MultiSearchImpl
size_t /*max_hyperscan_regexp_length*/,
size_t /*max_hyperscan_regexp_total_length*/)
{
// For performance of Volnitsky search, it is crucial to save only one byte for pattern number.
if (needles.size() > std::numeric_limits<UInt8>::max())
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be at most {}",
name, std::to_string(needles.size()), std::to_string(std::numeric_limits<UInt8>::max()));
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
const size_t haystack_string_size = haystack_offsets.size();
res.resize(haystack_string_size);

View File

@ -13,9 +13,7 @@ struct NameMultiFuzzyMatchAllIndices
static constexpr auto name = "multiFuzzyMatchAllIndices";
};
using FunctionMultiFuzzyMatchAllIndices = FunctionsMultiStringFuzzySearch<
MultiMatchAllIndicesImpl<NameMultiFuzzyMatchAllIndices, UInt64, true>,
std::numeric_limits<UInt32>::max()>;
using FunctionMultiFuzzyMatchAllIndices = FunctionsMultiStringFuzzySearch<MultiMatchAllIndicesImpl<NameMultiFuzzyMatchAllIndices, UInt64, true>>;
}

View File

@ -13,9 +13,7 @@ struct NameMultiFuzzyMatchAny
static constexpr auto name = "multiFuzzyMatchAny";
};
using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch<
MultiMatchAnyImpl<NameMultiFuzzyMatchAny, UInt8, true, false, true>,
std::numeric_limits<UInt32>::max()>;
using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch<MultiMatchAnyImpl<NameMultiFuzzyMatchAny, UInt8, true, false, true>>;
}

View File

@ -13,9 +13,7 @@ struct NameMultiFuzzyMatchAnyIndex
static constexpr auto name = "multiFuzzyMatchAnyIndex";
};
using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch<
MultiMatchAnyImpl<NameMultiFuzzyMatchAnyIndex, UInt64, false, true, true>,
std::numeric_limits<UInt32>::max()>;
using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch<MultiMatchAnyImpl<NameMultiFuzzyMatchAnyIndex, UInt64, false, true, true>>;
}

View File

@ -13,9 +13,7 @@ struct NameMultiMatchAllIndices
static constexpr auto name = "multiMatchAllIndices";
};
using FunctionMultiMatchAllIndices = FunctionsMultiStringSearch<
MultiMatchAllIndicesImpl<NameMultiMatchAllIndices, UInt64, false>,
std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAllIndices = FunctionsMultiStringSearch<MultiMatchAllIndicesImpl<NameMultiMatchAllIndices, UInt64, false>>;
}

View File

@ -13,9 +13,7 @@ struct NameMultiMatchAny
static constexpr auto name = "multiMatchAny";
};
using FunctionMultiMatchAny = FunctionsMultiStringSearch<
MultiMatchAnyImpl<NameMultiMatchAny, UInt8, true, false, false>,
std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAny = FunctionsMultiStringSearch<MultiMatchAnyImpl<NameMultiMatchAny, UInt8, true, false, false>>;
}

View File

@ -13,9 +13,7 @@ struct NameMultiMatchAnyIndex
static constexpr auto name = "multiMatchAnyIndex";
};
using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch<
MultiMatchAnyImpl<NameMultiMatchAnyIndex, UInt64, false, true, false>,
std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch<MultiMatchAnyImpl<NameMultiMatchAnyIndex, UInt64, false, true, false>>;
}

View File

@ -13,8 +13,7 @@ struct NameMultiSearchAnyCaseInsensitive
{
static constexpr auto name = "multiSearchAnyCaseInsensitive";
};
using FunctionMultiSearchCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchImpl<NameMultiSearchAnyCaseInsensitive, PositionCaseInsensitiveASCII>>;
using FunctionMultiSearchCaseInsensitive = FunctionsMultiStringSearch<MultiSearchImpl<NameMultiSearchAnyCaseInsensitive, PositionCaseInsensitiveASCII>>;
}

View File

@ -14,8 +14,7 @@ struct NameMultiSearchFirstIndex
static constexpr auto name = "multiSearchFirstIndex";
};
using FunctionMultiSearchFirstIndex
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<NameMultiSearchFirstIndex, PositionCaseSensitiveASCII>>;
using FunctionMultiSearchFirstIndex = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<NameMultiSearchFirstIndex, PositionCaseSensitiveASCII>>;
}