mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 17:12:03 +00:00
All multi{Fuzzy}MatchAllIndices functions
This commit is contained in:
parent
22dfc611c9
commit
57f20ba17e
@ -268,14 +268,12 @@ struct MultiMatchAnyImpl
|
|||||||
static_assert(static_cast<int>(FindAny) + static_cast<int>(FindAnyIndex) == 1);
|
static_assert(static_cast<int>(FindAny) + static_cast<int>(FindAnyIndex) == 1);
|
||||||
using ResultType = Type;
|
using ResultType = Type;
|
||||||
static constexpr bool is_using_hyperscan = true;
|
static constexpr bool is_using_hyperscan = true;
|
||||||
|
/// Variable for understanding, if we used offsets for the output, most
|
||||||
static void vector_constant(
|
/// likely to determine whether the function returns ColumnVector or ColumnArray.
|
||||||
const ColumnString::Chars & haystack_data,
|
static constexpr bool is_column_array = false;
|
||||||
const ColumnString::Offsets & haystack_offsets,
|
static auto ReturnType()
|
||||||
const std::vector<StringRef> & needles,
|
|
||||||
PaddedPODArray<Type> & res)
|
|
||||||
{
|
{
|
||||||
vector_constant(haystack_data, haystack_offsets, needles, res, std::nullopt);
|
return std::make_shared<DataTypeNumber<ResultType>>();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vector_constant(
|
static void vector_constant(
|
||||||
@ -283,10 +281,22 @@ struct MultiMatchAnyImpl
|
|||||||
const ColumnString::Offsets & haystack_offsets,
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
const std::vector<StringRef> & needles,
|
const std::vector<StringRef> & needles,
|
||||||
PaddedPODArray<Type> & res,
|
PaddedPODArray<Type> & res,
|
||||||
|
PaddedPODArray<UInt64> & offsets)
|
||||||
|
{
|
||||||
|
vector_constant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void vector_constant(
|
||||||
|
const ColumnString::Chars & haystack_data,
|
||||||
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
|
const std::vector<StringRef> & needles,
|
||||||
|
PaddedPODArray<Type> & res,
|
||||||
|
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
|
||||||
[[maybe_unused]] std::optional<UInt32> edit_distance)
|
[[maybe_unused]] std::optional<UInt32> edit_distance)
|
||||||
{
|
{
|
||||||
(void)FindAny;
|
(void)FindAny;
|
||||||
(void)FindAnyIndex;
|
(void)FindAnyIndex;
|
||||||
|
res.resize(haystack_offsets.size());
|
||||||
#if USE_HYPERSCAN
|
#if USE_HYPERSCAN
|
||||||
const auto & hyperscan_regex = MultiRegexps::get<FindAnyIndex, MultiSearchDistance>(needles, edit_distance);
|
const auto & hyperscan_regex = MultiRegexps::get<FindAnyIndex, MultiSearchDistance>(needles, edit_distance);
|
||||||
hs_scratch_t * scratch = nullptr;
|
hs_scratch_t * scratch = nullptr;
|
||||||
@ -307,15 +317,18 @@ struct MultiMatchAnyImpl
|
|||||||
*reinterpret_cast<Type *>(context) = id;
|
*reinterpret_cast<Type *>(context) = id;
|
||||||
else if constexpr (FindAny)
|
else if constexpr (FindAny)
|
||||||
*reinterpret_cast<Type *>(context) = 1;
|
*reinterpret_cast<Type *>(context) = 1;
|
||||||
return 0;
|
/// Once we hit the callback, there is no need to search for others.
|
||||||
|
return 1;
|
||||||
};
|
};
|
||||||
const size_t haystack_offsets_size = haystack_offsets.size();
|
const size_t haystack_offsets_size = haystack_offsets.size();
|
||||||
UInt64 offset = 0;
|
UInt64 offset = 0;
|
||||||
for (size_t i = 0; i < haystack_offsets_size; ++i)
|
for (size_t i = 0; i < haystack_offsets_size; ++i)
|
||||||
{
|
{
|
||||||
UInt64 length = haystack_offsets[i] - offset - 1;
|
UInt64 length = haystack_offsets[i] - offset - 1;
|
||||||
|
/// Hyperscan restriction.
|
||||||
if (length > std::numeric_limits<UInt32>::max())
|
if (length > std::numeric_limits<UInt32>::max())
|
||||||
throw Exception("Too long string to search", ErrorCodes::TOO_MANY_BYTES);
|
throw Exception("Too long string to search", ErrorCodes::TOO_MANY_BYTES);
|
||||||
|
/// Zero the result, scan, check, update the offset.
|
||||||
res[i] = 0;
|
res[i] = 0;
|
||||||
err = hs_scan(
|
err = hs_scan(
|
||||||
hyperscan_regex->getDB(),
|
hyperscan_regex->getDB(),
|
||||||
@ -325,7 +338,7 @@ struct MultiMatchAnyImpl
|
|||||||
smart_scratch.get(),
|
smart_scratch.get(),
|
||||||
on_match,
|
on_match,
|
||||||
&res[i]);
|
&res[i]);
|
||||||
if (err != HS_SUCCESS)
|
if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED)
|
||||||
throw Exception("Failed to scan with hyperscan", ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT);
|
throw Exception("Failed to scan with hyperscan", ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT);
|
||||||
offset = haystack_offsets[i];
|
offset = haystack_offsets[i];
|
||||||
}
|
}
|
||||||
@ -353,6 +366,87 @@ struct MultiMatchAnyImpl
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename Type, bool MultiSearchDistance>
|
||||||
|
struct MultiMatchAllIndicesImpl
|
||||||
|
{
|
||||||
|
using ResultType = Type;
|
||||||
|
static constexpr bool is_using_hyperscan = true;
|
||||||
|
/// Variable for understanding, if we used offsets for the output, most
|
||||||
|
/// likely to determine whether the function returns ColumnVector of ColumnArray.
|
||||||
|
static constexpr bool is_column_array = true;
|
||||||
|
static auto ReturnType()
|
||||||
|
{
|
||||||
|
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
|
||||||
|
}
|
||||||
|
|
||||||
|
static void vector_constant(
|
||||||
|
const ColumnString::Chars & haystack_data,
|
||||||
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
|
const std::vector<StringRef> & needles,
|
||||||
|
PaddedPODArray<Type> & res,
|
||||||
|
PaddedPODArray<UInt64> & offsets)
|
||||||
|
{
|
||||||
|
vector_constant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void vector_constant(
|
||||||
|
const ColumnString::Chars & haystack_data,
|
||||||
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
|
const std::vector<StringRef> & needles,
|
||||||
|
PaddedPODArray<Type> & res,
|
||||||
|
PaddedPODArray<UInt64> & offsets,
|
||||||
|
[[maybe_unused]] std::optional<UInt32> edit_distance)
|
||||||
|
{
|
||||||
|
offsets.resize(haystack_offsets.size());
|
||||||
|
#if USE_HYPERSCAN
|
||||||
|
const auto & hyperscan_regex = MultiRegexps::get</*SaveIndices=*/true, MultiSearchDistance>(needles, edit_distance);
|
||||||
|
hs_scratch_t * scratch = nullptr;
|
||||||
|
hs_error_t err = hs_clone_scratch(hyperscan_regex->getScratch(), &scratch);
|
||||||
|
|
||||||
|
if (err != HS_SUCCESS)
|
||||||
|
throw Exception("Could not clone scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||||
|
|
||||||
|
MultiRegexps::ScratchPtr smart_scratch(scratch);
|
||||||
|
|
||||||
|
auto on_match = [](unsigned int id,
|
||||||
|
unsigned long long /* from */,
|
||||||
|
unsigned long long /* to */,
|
||||||
|
unsigned int /* flags */,
|
||||||
|
void * context) -> int
|
||||||
|
{
|
||||||
|
static_cast<PaddedPODArray<Type>*>(context)->push_back(id);
|
||||||
|
return 0;
|
||||||
|
};
|
||||||
|
const size_t haystack_offsets_size = haystack_offsets.size();
|
||||||
|
UInt64 offset = 0;
|
||||||
|
for (size_t i = 0; i < haystack_offsets_size; ++i)
|
||||||
|
{
|
||||||
|
UInt64 length = haystack_offsets[i] - offset - 1;
|
||||||
|
/// Hyperscan restriction.
|
||||||
|
if (length > std::numeric_limits<UInt32>::max())
|
||||||
|
throw Exception("Too long string to search", ErrorCodes::TOO_MANY_BYTES);
|
||||||
|
/// Scan, check, update the offsets array and the offset of haystack.
|
||||||
|
err = hs_scan(
|
||||||
|
hyperscan_regex->getDB(),
|
||||||
|
reinterpret_cast<const char *>(haystack_data.data()) + offset,
|
||||||
|
length,
|
||||||
|
0,
|
||||||
|
smart_scratch.get(),
|
||||||
|
on_match,
|
||||||
|
&res);
|
||||||
|
if (err != HS_SUCCESS)
|
||||||
|
throw Exception("Failed to scan with hyperscan", ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT);
|
||||||
|
offsets[i] = res.size();
|
||||||
|
offset = haystack_offsets[i];
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
throw Exception(
|
||||||
|
"multi-search all indices is not implemented when hyperscan is off (is it Intel processor?)",
|
||||||
|
ErrorCodes::NOT_IMPLEMENTED);
|
||||||
|
#endif // USE_HYPERSCAN
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
struct ExtractImpl
|
struct ExtractImpl
|
||||||
{
|
{
|
||||||
@ -866,6 +960,10 @@ struct NameMultiMatchAnyIndex
|
|||||||
{
|
{
|
||||||
static constexpr auto name = "multiMatchAnyIndex";
|
static constexpr auto name = "multiMatchAnyIndex";
|
||||||
};
|
};
|
||||||
|
/// Name tag carrying the function name "multiMatchAllIndices".
struct NameMultiMatchAllIndices
|
||||||
|
{
|
||||||
|
static constexpr auto name = "multiMatchAllIndices";
|
||||||
|
};
|
||||||
struct NameMultiFuzzyMatchAny
|
struct NameMultiFuzzyMatchAny
|
||||||
{
|
{
|
||||||
static constexpr auto name = "multiFuzzyMatchAny";
|
static constexpr auto name = "multiFuzzyMatchAny";
|
||||||
@ -874,6 +972,10 @@ struct NameMultiFuzzyMatchAnyIndex
|
|||||||
{
|
{
|
||||||
static constexpr auto name = "multiFuzzyMatchAnyIndex";
|
static constexpr auto name = "multiFuzzyMatchAnyIndex";
|
||||||
};
|
};
|
||||||
|
/// Name tag carrying the function name "multiFuzzyMatchAllIndices".
struct NameMultiFuzzyMatchAllIndices
|
||||||
|
{
|
||||||
|
static constexpr auto name = "multiFuzzyMatchAllIndices";
|
||||||
|
};
|
||||||
struct NameExtract
|
struct NameExtract
|
||||||
{
|
{
|
||||||
static constexpr auto name = "extract";
|
static constexpr auto name = "extract";
|
||||||
@ -908,6 +1010,11 @@ using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch<
|
|||||||
NameMultiMatchAnyIndex,
|
NameMultiMatchAnyIndex,
|
||||||
std::numeric_limits<UInt32>::max()>;
|
std::numeric_limits<UInt32>::max()>;
|
||||||
|
|
||||||
|
/// multiMatchAllIndices: the all-indices implementation over UInt64 results with
/// MultiSearchDistance = false, wrapped in the non-fuzzy multi-string search function template.
using FunctionMultiMatchAllIndices = FunctionsMultiStringSearch<
|
||||||
|
MultiMatchAllIndicesImpl<UInt64, false>,
|
||||||
|
NameMultiMatchAllIndices,
|
||||||
|
std::numeric_limits<UInt32>::max()>;
|
||||||
|
|
||||||
using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch<
|
using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch<
|
||||||
MultiMatchAnyImpl<UInt8, true, false, true>,
|
MultiMatchAnyImpl<UInt8, true, false, true>,
|
||||||
NameMultiFuzzyMatchAny,
|
NameMultiFuzzyMatchAny,
|
||||||
@ -918,6 +1025,11 @@ using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch<
|
|||||||
NameMultiFuzzyMatchAnyIndex,
|
NameMultiFuzzyMatchAnyIndex,
|
||||||
std::numeric_limits<UInt32>::max()>;
|
std::numeric_limits<UInt32>::max()>;
|
||||||
|
|
||||||
|
/// multiFuzzyMatchAllIndices: the all-indices implementation over UInt64 results with
/// MultiSearchDistance = true, wrapped in the fuzzy multi-string search function template.
using FunctionMultiFuzzyMatchAllIndices = FunctionsMultiStringFuzzySearch<
|
||||||
|
MultiMatchAllIndicesImpl<UInt64, true>,
|
||||||
|
NameMultiFuzzyMatchAllIndices,
|
||||||
|
std::numeric_limits<UInt32>::max()>;
|
||||||
|
|
||||||
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
|
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
|
||||||
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
|
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
|
||||||
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
|
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
|
||||||
@ -940,8 +1052,10 @@ void registerFunctionsStringRegex(FunctionFactory & factory)
|
|||||||
|
|
||||||
factory.registerFunction<FunctionMultiMatchAny>();
|
factory.registerFunction<FunctionMultiMatchAny>();
|
||||||
factory.registerFunction<FunctionMultiMatchAnyIndex>();
|
factory.registerFunction<FunctionMultiMatchAnyIndex>();
|
||||||
|
factory.registerFunction<FunctionMultiMatchAllIndices>();
|
||||||
factory.registerFunction<FunctionMultiFuzzyMatchAny>();
|
factory.registerFunction<FunctionMultiFuzzyMatchAny>();
|
||||||
factory.registerFunction<FunctionMultiFuzzyMatchAnyIndex>();
|
factory.registerFunction<FunctionMultiFuzzyMatchAnyIndex>();
|
||||||
|
factory.registerFunction<FunctionMultiFuzzyMatchAllIndices>();
|
||||||
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
|
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -63,9 +63,7 @@ public:
|
|||||||
if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
|
if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
|
||||||
throw Exception(
|
throw Exception(
|
||||||
"Illegal type " + arguments[2]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
"Illegal type " + arguments[2]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||||
|
return Impl::ReturnType();
|
||||||
|
|
||||||
return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
|
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
|
||||||
@ -115,20 +113,23 @@ public:
|
|||||||
for (const auto & el : src_arr)
|
for (const auto & el : src_arr)
|
||||||
refs.emplace_back(el.get<String>());
|
refs.emplace_back(el.get<String>());
|
||||||
|
|
||||||
const size_t column_haystack_size = column_haystack->size();
|
|
||||||
|
|
||||||
auto col_res = ColumnVector<ResultType>::create();
|
auto col_res = ColumnVector<ResultType>::create();
|
||||||
|
auto col_offsets = ColumnArray::ColumnOffsets::create();
|
||||||
|
|
||||||
auto & vec_res = col_res->getData();
|
auto & vec_res = col_res->getData();
|
||||||
|
auto & offsets_res = col_offsets->getData();
|
||||||
|
|
||||||
vec_res.resize(column_haystack_size);
|
/// The callee is responsible for resizing the output.
|
||||||
|
|
||||||
if (col_haystack_vector)
|
if (col_haystack_vector)
|
||||||
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, edit_distance);
|
Impl::vector_constant(
|
||||||
|
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance);
|
||||||
else
|
else
|
||||||
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
|
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
|
||||||
|
|
||||||
block.getByPosition(result).column = std::move(col_res);
|
if constexpr (Impl::is_column_array)
|
||||||
|
block.getByPosition(result).column = ColumnArray::create(std::move(col_res), std::move(col_offsets));
|
||||||
|
else
|
||||||
|
block.getByPosition(result).column = std::move(col_res);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ struct PositionCaseSensitiveASCII
|
|||||||
|
|
||||||
/// Convert string to lowercase. Only for case-insensitive search.
|
/// Convert string to lowercase. Only for case-insensitive search.
|
||||||
/// Implementation is permitted to be inefficient because it is called for single string.
|
/// Implementation is permitted to be inefficient because it is called for single string.
|
||||||
static void toLowerIfNeed(std::string &) {}
|
static void toLowerIfNeed(std::string &) { }
|
||||||
};
|
};
|
||||||
|
|
||||||
struct PositionCaseInsensitiveASCII
|
struct PositionCaseInsensitiveASCII
|
||||||
@ -107,7 +107,7 @@ struct PositionCaseSensitiveUTF8
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void toLowerIfNeed(std::string &) {}
|
static void toLowerIfNeed(std::string &) { }
|
||||||
};
|
};
|
||||||
|
|
||||||
struct PositionCaseInsensitiveUTF8
|
struct PositionCaseInsensitiveUTF8
|
||||||
@ -335,15 +335,21 @@ struct MultiSearchImpl
|
|||||||
{
|
{
|
||||||
using ResultType = UInt8;
|
using ResultType = UInt8;
|
||||||
static constexpr bool is_using_hyperscan = false;
|
static constexpr bool is_using_hyperscan = false;
|
||||||
|
/// Variable for understanding, if we used offsets for the output, most
|
||||||
|
/// likely to determine whether the function returns ColumnVector or ColumnArray.
|
||||||
|
static constexpr bool is_column_array = false;
|
||||||
|
static auto ReturnType() { return std::make_shared<DataTypeNumber<ResultType>>(); }
|
||||||
|
|
||||||
static void vector_constant(
|
static void vector_constant(
|
||||||
const ColumnString::Chars & haystack_data,
|
const ColumnString::Chars & haystack_data,
|
||||||
const ColumnString::Offsets & haystack_offsets,
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
const std::vector<StringRef> & needles,
|
const std::vector<StringRef> & needles,
|
||||||
PaddedPODArray<UInt8> & res)
|
PaddedPODArray<UInt8> & res,
|
||||||
|
[[maybe_unused]] PaddedPODArray<UInt64> & offsets)
|
||||||
{
|
{
|
||||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||||
const size_t haystack_string_size = haystack_offsets.size();
|
const size_t haystack_string_size = haystack_offsets.size();
|
||||||
|
res.resize(haystack_string_size);
|
||||||
size_t iteration = 0;
|
size_t iteration = 0;
|
||||||
while (searcher.hasMoreToSearch())
|
while (searcher.hasMoreToSearch())
|
||||||
{
|
{
|
||||||
@ -366,12 +372,17 @@ struct MultiSearchFirstPositionImpl
|
|||||||
{
|
{
|
||||||
using ResultType = UInt64;
|
using ResultType = UInt64;
|
||||||
static constexpr bool is_using_hyperscan = false;
|
static constexpr bool is_using_hyperscan = false;
|
||||||
|
/// Variable for understanding, if we used offsets for the output, most
|
||||||
|
/// likely to determine whether the function returns ColumnVector or ColumnArray.
|
||||||
|
static constexpr bool is_column_array = false;
|
||||||
|
static auto ReturnType() { return std::make_shared<DataTypeNumber<ResultType>>(); }
|
||||||
|
|
||||||
static void vector_constant(
|
static void vector_constant(
|
||||||
const ColumnString::Chars & haystack_data,
|
const ColumnString::Chars & haystack_data,
|
||||||
const ColumnString::Offsets & haystack_offsets,
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
const std::vector<StringRef> & needles,
|
const std::vector<StringRef> & needles,
|
||||||
PaddedPODArray<UInt64> & res)
|
PaddedPODArray<UInt64> & res,
|
||||||
|
[[maybe_unused]] PaddedPODArray<UInt64> & offsets)
|
||||||
{
|
{
|
||||||
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
|
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
|
||||||
{
|
{
|
||||||
@ -379,6 +390,7 @@ struct MultiSearchFirstPositionImpl
|
|||||||
};
|
};
|
||||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||||
const size_t haystack_string_size = haystack_offsets.size();
|
const size_t haystack_string_size = haystack_offsets.size();
|
||||||
|
res.resize(haystack_string_size);
|
||||||
size_t iteration = 0;
|
size_t iteration = 0;
|
||||||
while (searcher.hasMoreToSearch())
|
while (searcher.hasMoreToSearch())
|
||||||
{
|
{
|
||||||
@ -407,15 +419,21 @@ struct MultiSearchFirstIndexImpl
|
|||||||
{
|
{
|
||||||
using ResultType = UInt64;
|
using ResultType = UInt64;
|
||||||
static constexpr bool is_using_hyperscan = false;
|
static constexpr bool is_using_hyperscan = false;
|
||||||
|
/// Variable for understanding, if we used offsets for the output, most
|
||||||
|
/// likely to determine whether the function returns ColumnVector or ColumnArray.
|
||||||
|
static constexpr bool is_column_array = false;
|
||||||
|
static auto ReturnType() { return std::make_shared<DataTypeNumber<ResultType>>(); }
|
||||||
|
|
||||||
static void vector_constant(
|
static void vector_constant(
|
||||||
const ColumnString::Chars & haystack_data,
|
const ColumnString::Chars & haystack_data,
|
||||||
const ColumnString::Offsets & haystack_offsets,
|
const ColumnString::Offsets & haystack_offsets,
|
||||||
const std::vector<StringRef> & needles,
|
const std::vector<StringRef> & needles,
|
||||||
PaddedPODArray<UInt64> & res)
|
PaddedPODArray<UInt64> & res,
|
||||||
|
[[maybe_unused]] PaddedPODArray<UInt64> & offsets)
|
||||||
{
|
{
|
||||||
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
||||||
const size_t haystack_string_size = haystack_offsets.size();
|
const size_t haystack_string_size = haystack_offsets.size();
|
||||||
|
res.resize(haystack_string_size);
|
||||||
size_t iteration = 0;
|
size_t iteration = 0;
|
||||||
while (searcher.hasMoreToSearch())
|
while (searcher.hasMoreToSearch())
|
||||||
{
|
{
|
||||||
@ -598,30 +616,48 @@ struct NameHasTokenCaseInsensitive
|
|||||||
using FunctionPosition = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveASCII>, NamePosition>;
|
using FunctionPosition = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveASCII>, NamePosition>;
|
||||||
using FunctionPositionUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveUTF8>, NamePositionUTF8>;
|
using FunctionPositionUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveUTF8>, NamePositionUTF8>;
|
||||||
using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveASCII>, NamePositionCaseInsensitive>;
|
using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveASCII>, NamePositionCaseInsensitive>;
|
||||||
using FunctionPositionCaseInsensitiveUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
|
using FunctionPositionCaseInsensitiveUTF8
|
||||||
|
= FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
|
||||||
|
|
||||||
using FunctionMultiSearchAllPositions = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
|
using FunctionMultiSearchAllPositions
|
||||||
using FunctionMultiSearchAllPositionsUTF8 = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
|
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
|
||||||
using FunctionMultiSearchAllPositionsCaseInsensitive = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAllPositionsCaseInsensitive>;
|
using FunctionMultiSearchAllPositionsUTF8
|
||||||
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAllPositionsCaseInsensitiveUTF8>;
|
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
|
||||||
|
using FunctionMultiSearchAllPositionsCaseInsensitive
|
||||||
|
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAllPositionsCaseInsensitive>;
|
||||||
|
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition<
|
||||||
|
MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>,
|
||||||
|
NameMultiSearchAllPositionsCaseInsensitiveUTF8>;
|
||||||
|
|
||||||
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearchAny>;
|
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearchAny>;
|
||||||
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAnyUTF8>;
|
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAnyUTF8>;
|
||||||
using FunctionMultiSearchCaseInsensitive = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAnyCaseInsensitive>;
|
using FunctionMultiSearchCaseInsensitive
|
||||||
using FunctionMultiSearchCaseInsensitiveUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAnyCaseInsensitiveUTF8>;
|
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAnyCaseInsensitive>;
|
||||||
|
using FunctionMultiSearchCaseInsensitiveUTF8
|
||||||
|
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAnyCaseInsensitiveUTF8>;
|
||||||
|
|
||||||
using FunctionMultiSearchFirstIndex = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
|
using FunctionMultiSearchFirstIndex
|
||||||
using FunctionMultiSearchFirstIndexUTF8 = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
|
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
|
||||||
using FunctionMultiSearchFirstIndexCaseInsensitive = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstIndexCaseInsensitive>;
|
using FunctionMultiSearchFirstIndexUTF8
|
||||||
using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8 = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstIndexCaseInsensitiveUTF8>;
|
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
|
||||||
|
using FunctionMultiSearchFirstIndexCaseInsensitive
|
||||||
|
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstIndexCaseInsensitive>;
|
||||||
|
using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8
|
||||||
|
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstIndexCaseInsensitiveUTF8>;
|
||||||
|
|
||||||
using FunctionMultiSearchFirstPosition = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
|
using FunctionMultiSearchFirstPosition
|
||||||
using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
|
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
|
||||||
using FunctionMultiSearchFirstPositionCaseInsensitive = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
|
using FunctionMultiSearchFirstPositionUTF8
|
||||||
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
|
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
|
||||||
|
using FunctionMultiSearchFirstPositionCaseInsensitive
|
||||||
|
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
|
||||||
|
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch<
|
||||||
|
MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>,
|
||||||
|
NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
|
||||||
|
|
||||||
using FunctionHasToken = FunctionsStringSearch<HasTokenImpl<VolnitskyCaseSensitiveToken, false>, NameHasToken>;
|
using FunctionHasToken = FunctionsStringSearch<HasTokenImpl<VolnitskyCaseSensitiveToken, false>, NameHasToken>;
|
||||||
using FunctionHasTokenCaseInsensitive = FunctionsStringSearch<HasTokenImpl<VolnitskyCaseInsensitiveToken, false>, NameHasTokenCaseInsensitive>;
|
using FunctionHasTokenCaseInsensitive
|
||||||
|
= FunctionsStringSearch<HasTokenImpl<VolnitskyCaseInsensitiveToken, false>, NameHasTokenCaseInsensitive>;
|
||||||
|
|
||||||
void registerFunctionsStringSearch(FunctionFactory & factory)
|
void registerFunctionsStringSearch(FunctionFactory & factory)
|
||||||
{
|
{
|
||||||
|
@ -28,6 +28,7 @@ namespace DB
|
|||||||
* match(haystack, pattern) - search by regular expression re2; Returns 0 or 1.
|
* match(haystack, pattern) - search by regular expression re2; Returns 0 or 1.
|
||||||
* multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns 0 or 1 if any pattern_i matches.
|
* multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns 0 or 1 if any pattern_i matches.
|
||||||
* multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns index of any match or zero if none;
|
* multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns index of any match or zero if none;
|
||||||
|
* multiMatchAllIndices(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns an array of matched indices in any order;
|
||||||
*
|
*
|
||||||
* Applies regexp re2 and pulls:
|
* Applies regexp re2 and pulls:
|
||||||
* - the first subpattern, if the regexp has a subpattern;
|
* - the first subpattern, if the regexp has a subpattern;
|
||||||
@ -312,9 +313,7 @@ public:
|
|||||||
if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
|
if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
|
||||||
throw Exception(
|
throw Exception(
|
||||||
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||||
|
return Impl::ReturnType();
|
||||||
|
|
||||||
return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
|
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
|
||||||
@ -347,20 +346,22 @@ public:
|
|||||||
for (const auto & el : src_arr)
|
for (const auto & el : src_arr)
|
||||||
refs.emplace_back(el.get<String>());
|
refs.emplace_back(el.get<String>());
|
||||||
|
|
||||||
const size_t column_haystack_size = column_haystack->size();
|
|
||||||
|
|
||||||
auto col_res = ColumnVector<ResultType>::create();
|
auto col_res = ColumnVector<ResultType>::create();
|
||||||
|
auto col_offsets = ColumnArray::ColumnOffsets::create();
|
||||||
|
|
||||||
auto & vec_res = col_res->getData();
|
auto & vec_res = col_res->getData();
|
||||||
|
auto & offsets_res = col_offsets->getData();
|
||||||
|
|
||||||
vec_res.resize(column_haystack_size);
|
/// The callee is responsible for resizing the output.
|
||||||
|
|
||||||
if (col_haystack_vector)
|
if (col_haystack_vector)
|
||||||
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
|
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res);
|
||||||
else
|
else
|
||||||
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
|
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
|
||||||
|
|
||||||
block.getByPosition(result).column = std::move(col_res);
|
if constexpr (Impl::is_column_array)
|
||||||
|
block.getByPosition(result).column = ColumnArray::create(std::move(col_res), std::move(col_offsets));
|
||||||
|
else
|
||||||
|
block.getByPosition(result).column = std::move(col_res);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -8,10 +8,10 @@
|
|||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <Functions/likePatternToRegexp.h>
|
#include <Functions/likePatternToRegexp.h>
|
||||||
|
#include <Common/Exception.h>
|
||||||
#include <Common/ObjectPool.h>
|
#include <Common/ObjectPool.h>
|
||||||
#include <Common/OptimizedRegularExpression.h>
|
#include <Common/OptimizedRegularExpression.h>
|
||||||
#include <Common/ProfileEvents.h>
|
#include <Common/ProfileEvents.h>
|
||||||
#include <Common/Exception.h>
|
|
||||||
#include <common/StringRef.h>
|
#include <common/StringRef.h>
|
||||||
|
|
||||||
|
|
||||||
@ -87,18 +87,20 @@ namespace MultiRegexps
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception.
|
||||||
using CompilerError = std::unique_ptr<hs_compile_error_t, HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
|
using CompilerError = std::unique_ptr<hs_compile_error_t, HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
|
||||||
using ScratchPtr = std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
|
using ScratchPtr = std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
|
||||||
using DataBasePtr = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;
|
using DataBasePtr = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;
|
||||||
|
|
||||||
/// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher
|
/// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher.
|
||||||
class Regexps
|
class Regexps
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} {}
|
Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} { }
|
||||||
|
|
||||||
hs_database_t * getDB() const { return db.get(); }
|
hs_database_t * getDB() const { return db.get(); }
|
||||||
hs_scratch_t * getScratch() const { return scratch.get(); }
|
hs_scratch_t * getScratch() const { return scratch.get(); }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DataBasePtr db;
|
DataBasePtr db;
|
||||||
ScratchPtr scratch;
|
ScratchPtr scratch;
|
||||||
@ -106,25 +108,25 @@ namespace MultiRegexps
|
|||||||
|
|
||||||
struct Pool
|
struct Pool
|
||||||
{
|
{
|
||||||
/// Mutex for finding in map
|
/// Mutex for finding in map.
|
||||||
std::mutex mutex;
|
std::mutex mutex;
|
||||||
/// Patterns + possible edit_distance to database and scratch
|
/// Patterns + possible edit_distance to database and scratch.
|
||||||
std::map<std::pair<std::vector<String>, std::optional<UInt32>>, Regexps> storage;
|
std::map<std::pair<std::vector<String>, std::optional<UInt32>>, Regexps> storage;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <bool FindAnyIndex, bool CompileForEditDistance>
|
template <bool SaveIndices, bool CompileForEditDistance>
|
||||||
inline Regexps constructRegexps(const std::vector<String> & str_patterns, std::optional<UInt32> edit_distance)
|
inline Regexps constructRegexps(const std::vector<String> & str_patterns, std::optional<UInt32> edit_distance)
|
||||||
{
|
{
|
||||||
(void)edit_distance;
|
(void)edit_distance;
|
||||||
/// Common pointers
|
/// Common pointers
|
||||||
std::vector<const char *> ptrns;
|
std::vector<const char *> patterns;
|
||||||
std::vector<unsigned int> flags;
|
std::vector<unsigned int> flags;
|
||||||
|
|
||||||
/// Pointer for external edit distance compilation
|
/// Pointer for external edit distance compilation
|
||||||
std::vector<hs_expr_ext> ext_exprs;
|
std::vector<hs_expr_ext> ext_exprs;
|
||||||
std::vector<const hs_expr_ext *> ext_exprs_ptrs;
|
std::vector<const hs_expr_ext *> ext_exprs_ptrs;
|
||||||
|
|
||||||
ptrns.reserve(str_patterns.size());
|
patterns.reserve(str_patterns.size());
|
||||||
flags.reserve(str_patterns.size());
|
flags.reserve(str_patterns.size());
|
||||||
|
|
||||||
if constexpr (CompileForEditDistance)
|
if constexpr (CompileForEditDistance)
|
||||||
@ -135,12 +137,22 @@ namespace MultiRegexps
|
|||||||
|
|
||||||
for (const StringRef ref : str_patterns)
|
for (const StringRef ref : str_patterns)
|
||||||
{
|
{
|
||||||
ptrns.push_back(ref.data);
|
patterns.push_back(ref.data);
|
||||||
flags.push_back(HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8);
|
/* Flags below are the pattern matching flags.
|
||||||
|
* HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good
|
||||||
|
* performance practice according to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode
|
||||||
|
* HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match.
|
||||||
|
* HS_FLAG_UTF8 is a flag where UTF8 literals are matched.
|
||||||
|
* HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. It is a good performance practice
|
||||||
|
* as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag
|
||||||
|
*/
|
||||||
|
flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8);
|
||||||
if constexpr (CompileForEditDistance)
|
if constexpr (CompileForEditDistance)
|
||||||
{
|
{
|
||||||
|
/// Hyperscan currently does not support UTF8 matching with edit distance.
|
||||||
flags.back() &= ~HS_FLAG_UTF8;
|
flags.back() &= ~HS_FLAG_UTF8;
|
||||||
ext_exprs.emplace_back();
|
ext_exprs.emplace_back();
|
||||||
|
/// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenshtein distance.
|
||||||
ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE;
|
ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE;
|
||||||
ext_exprs.back().edit_distance = edit_distance.value();
|
ext_exprs.back().edit_distance = edit_distance.value();
|
||||||
ext_exprs_ptrs.push_back(&ext_exprs.back());
|
ext_exprs_ptrs.push_back(&ext_exprs.back());
|
||||||
@ -152,31 +164,32 @@ namespace MultiRegexps
|
|||||||
|
|
||||||
std::unique_ptr<unsigned int[]> ids;
|
std::unique_ptr<unsigned int[]> ids;
|
||||||
|
|
||||||
if constexpr (FindAnyIndex)
|
/// We mark the patterns to provide the callback results.
|
||||||
|
if constexpr (SaveIndices)
|
||||||
{
|
{
|
||||||
ids.reset(new unsigned int[ptrns.size()]);
|
ids.reset(new unsigned int[patterns.size()]);
|
||||||
for (size_t i = 0; i < ptrns.size(); ++i)
|
for (size_t i = 0; i < patterns.size(); ++i)
|
||||||
ids[i] = i + 1;
|
ids[i] = i + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
hs_error_t err;
|
hs_error_t err;
|
||||||
if constexpr (!CompileForEditDistance)
|
if constexpr (!CompileForEditDistance)
|
||||||
err = hs_compile_multi(
|
err = hs_compile_multi(
|
||||||
ptrns.data(),
|
patterns.data(),
|
||||||
flags.data(),
|
flags.data(),
|
||||||
ids.get(),
|
ids.get(),
|
||||||
ptrns.size(),
|
patterns.size(),
|
||||||
HS_MODE_BLOCK,
|
HS_MODE_BLOCK,
|
||||||
nullptr,
|
nullptr,
|
||||||
&db,
|
&db,
|
||||||
&compile_error);
|
&compile_error);
|
||||||
else
|
else
|
||||||
err = hs_compile_ext_multi(
|
err = hs_compile_ext_multi(
|
||||||
ptrns.data(),
|
patterns.data(),
|
||||||
flags.data(),
|
flags.data(),
|
||||||
ids.get(),
|
ids.get(),
|
||||||
ext_exprs_ptrs.data(),
|
ext_exprs_ptrs.data(),
|
||||||
ptrns.size(),
|
patterns.size(),
|
||||||
HS_MODE_BLOCK,
|
HS_MODE_BLOCK,
|
||||||
nullptr,
|
nullptr,
|
||||||
&db,
|
&db,
|
||||||
@ -184,6 +197,7 @@ namespace MultiRegexps
|
|||||||
|
|
||||||
if (err != HS_SUCCESS)
|
if (err != HS_SUCCESS)
|
||||||
{
|
{
|
||||||
|
/// CompilerError is a unique_ptr, so the memory is freed correctly after the exception is thrown.
|
||||||
CompilerError error(compile_error);
|
CompilerError error(compile_error);
|
||||||
|
|
||||||
if (error->expression < 0)
|
if (error->expression < 0)
|
||||||
@ -196,9 +210,12 @@ namespace MultiRegexps
|
|||||||
|
|
||||||
ProfileEvents::increment(ProfileEvents::RegexpCreated);
|
ProfileEvents::increment(ProfileEvents::RegexpCreated);
|
||||||
|
|
||||||
|
/// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
|
||||||
|
/// function which is faster than allocating scratch space each time in each thread.
|
||||||
hs_scratch_t * scratch = nullptr;
|
hs_scratch_t * scratch = nullptr;
|
||||||
err = hs_alloc_scratch(db, &scratch);
|
err = hs_alloc_scratch(db, &scratch);
|
||||||
|
|
||||||
|
/// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
|
||||||
if (err != HS_SUCCESS)
|
if (err != HS_SUCCESS)
|
||||||
throw Exception("Could not allocate scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
throw Exception("Could not allocate scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||||
|
|
||||||
@ -206,7 +223,10 @@ namespace MultiRegexps
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// If CompileForEditDistance is False, edit_distance must be nullopt
|
/// If CompileForEditDistance is False, edit_distance must be nullopt
|
||||||
template <bool FindAnyIndex, bool CompileForEditDistance>
|
/// Also, we use templates here because each instantiation of function
|
||||||
|
/// template has its own copy of local static variables which must not be the same
|
||||||
|
/// for different hyperscan compilations.
|
||||||
|
template <bool SaveIndices, bool CompileForEditDistance>
|
||||||
inline Regexps * get(const std::vector<StringRef> & patterns, std::optional<UInt32> edit_distance)
|
inline Regexps * get(const std::vector<StringRef> & patterns, std::optional<UInt32> edit_distance)
|
||||||
{
|
{
|
||||||
/// C++11 has thread-safe function-local statics on most modern compilers.
|
/// C++11 has thread-safe function-local statics on most modern compilers.
|
||||||
@ -217,15 +237,19 @@ namespace MultiRegexps
|
|||||||
for (const StringRef & ref : patterns)
|
for (const StringRef & ref : patterns)
|
||||||
str_patterns.push_back(ref.toString());
|
str_patterns.push_back(ref.toString());
|
||||||
|
|
||||||
|
/// Get the lock for finding database.
|
||||||
std::unique_lock lock(known_regexps.mutex);
|
std::unique_lock lock(known_regexps.mutex);
|
||||||
|
|
||||||
auto it = known_regexps.storage.find({str_patterns, edit_distance});
|
auto it = known_regexps.storage.find({str_patterns, edit_distance});
|
||||||
|
|
||||||
|
/// If not found, compile and let other threads wait.
|
||||||
if (known_regexps.storage.end() == it)
|
if (known_regexps.storage.end() == it)
|
||||||
it = known_regexps.storage.emplace(
|
it = known_regexps.storage
|
||||||
std::pair{str_patterns, edit_distance},
|
.emplace(
|
||||||
constructRegexps<FindAnyIndex, CompileForEditDistance>(str_patterns, edit_distance)).first;
|
std::pair{str_patterns, edit_distance},
|
||||||
|
constructRegexps<SaveIndices, CompileForEditDistance>(str_patterns, edit_distance))
|
||||||
|
.first;
|
||||||
|
/// If found, unlock and return the database.
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
|
|
||||||
return &it->second;
|
return &it->second;
|
||||||
|
@ -600,3 +600,26 @@
|
|||||||
1
|
1
|
||||||
1
|
1
|
||||||
1
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
All tests above must return 1, all tests below return something.
|
||||||
|
[]
|
||||||
|
[1,3]
|
||||||
|
[]
|
||||||
|
[1,2,3]
|
||||||
|
@ -73,10 +73,20 @@ select 1 = multiMatchAny(materialize('abcdef'), ['a......', 'a.....']) from syst
|
|||||||
select 0 = multiMatchAny(materialize('aaaa'), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']) from system.numbers limit 10;
|
select 0 = multiMatchAny(materialize('aaaa'), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']) from system.numbers limit 10;
|
||||||
select 1 = multiMatchAny(materialize('abc'), ['a\0d']) from system.numbers limit 10;
|
select 1 = multiMatchAny(materialize('abc'), ['a\0d']) from system.numbers limit 10;
|
||||||
|
|
||||||
select 1 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google', 'yandex1']) from system.numbers limit 10;;
|
select 1 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google', 'yandex1']) from system.numbers limit 10;
|
||||||
select 2 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google1', 'yandex']) from system.numbers limit 10;;
|
select 2 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google1', 'yandex']) from system.numbers limit 10;
|
||||||
select 0 != multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['.*goo.*', '.*yan.*']) from system.numbers limit 10;;
|
select 0 != multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['.*goo.*', '.*yan.*']) from system.numbers limit 10;
|
||||||
select 5 = multiMatchAnyIndex(materialize('vladizlvav dabe don\'t heart me no more'), ['what', 'is', 'love', 'baby', 'no mo??', 'dont', 'h.rt me']) from system.numbers limit 10;;
|
select 5 = multiMatchAnyIndex(materialize('vladizlvav dabe don\'t heart me no more'), ['what', 'is', 'love', 'baby', 'no mo??', 'dont', 'h.rt me']) from system.numbers limit 10;
|
||||||
|
|
||||||
SELECT multiMatchAny(materialize('/odezhda-dlya-bega/'), ['/odezhda-dlya-bega/', 'kurtki-i-vetrovki-dlya-bega', 'futbolki-i-mayki-dlya-bega']);
|
SELECT multiMatchAny(materialize('/odezhda-dlya-bega/'), ['/odezhda-dlya-bega/', 'kurtki-i-vetrovki-dlya-bega', 'futbolki-i-mayki-dlya-bega']);
|
||||||
SELECT 1 = multiMatchAny('фабрикант', ['f[ae]b[ei]rl', 'ф[иаэе]б[еэи][рпл]', 'афиукд', 'a[ft],th', '^ф[аиеэ]?б?[еэи]?$', 'берлик', 'fab', 'фа[беьв]+е?[рлко]']);
|
SELECT 1 = multiMatchAny('фабрикант', ['f[ae]b[ei]rl', 'ф[иаэе]б[еэи][рпл]', 'афиукд', 'a[ft],th', '^ф[аиеэ]?б?[еэи]?$', 'берлик', 'fab', 'фа[беьв]+е?[рлко]']);
|
||||||
|
|
||||||
|
-- All indices tests
|
||||||
|
SELECT [1, 2] = arraySort(multiMatchAllIndices(materialize('gogleuedeyandexgoogle'), ['.*goo.*', '.*yan.*'])) from system.numbers limit 5;
|
||||||
|
SELECT [1, 3] = arraySort(multiMatchAllIndices(materialize('gogleuedeyandexgoogle'), ['.*goo.*', 'neverexisted', '.*yan.*'])) from system.numbers limit 5;
|
||||||
|
SELECT [] = multiMatchAllIndices(materialize('gogleuedeyandexgoogle'), ['neverexisted', 'anotherone', 'andanotherone']) from system.numbers limit 5;
|
||||||
|
SELECT [1, 2, 3, 11] = arraySort(multiMatchAllIndices('фабрикант', ['', 'рикан', 'а', 'f[ae]b[ei]rl', 'ф[иаэе]б[еэи][рпл]', 'афиукд', 'a[ft],th', '^ф[аиеэ]?б?[еэи]?$', 'берлик', 'fab', 'фа[беьв]+е?[рлко]']));
|
||||||
|
SELECT [1] = multiMatchAllIndices(materialize('/odezhda-dlya-bega/'), ['/odezhda-dlya-bega/', 'kurtki-i-vetrovki-dlya-bega', 'futbolki-i-mayki-dlya-bega']);
|
||||||
|
SELECT [] = multiMatchAllIndices(materialize('aaaa'), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']);
|
||||||
|
SELECT 'All tests above must return 1, all tests below return something.';
|
||||||
|
SELECT arraySort(multiMatchAllIndices(arrayJoin(['aaaa', 'aaaaaa', 'bbbb', 'aaaaaaaaaaaaaa']), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']));
|
||||||
|
@ -2,5 +2,6 @@ SET allow_hyperscan = 1;
|
|||||||
SELECT multiMatchAny(arrayJoin(['hello', 'world', 'hellllllllo', 'wororld', 'abc']), ['hel+o', 'w(or)*ld']);
|
SELECT multiMatchAny(arrayJoin(['hello', 'world', 'hellllllllo', 'wororld', 'abc']), ['hel+o', 'w(or)*ld']);
|
||||||
SET allow_hyperscan = 0;
|
SET allow_hyperscan = 0;
|
||||||
SELECT multiMatchAny(arrayJoin(['hello', 'world', 'hellllllllo', 'wororld', 'abc']), ['hel+o', 'w(or)*ld']); -- { serverError 446 }
|
SELECT multiMatchAny(arrayJoin(['hello', 'world', 'hellllllllo', 'wororld', 'abc']), ['hel+o', 'w(or)*ld']); -- { serverError 446 }
|
||||||
|
SELECT multiMatchAllIndices(arrayJoin(['hello', 'world', 'hellllllllo', 'wororld', 'abc']), ['hel+o', 'w(or)*ld']); -- { serverError 446 }
|
||||||
|
|
||||||
SELECT multiSearchAny(arrayJoin(['hello', 'world', 'hello, world', 'abc']), ['hello', 'world']);
|
SELECT multiSearchAny(arrayJoin(['hello', 'world', 'hello, world', 'abc']), ['hello', 'world']);
|
||||||
|
@ -30,3 +30,5 @@
|
|||||||
1
|
1
|
||||||
1
|
1
|
||||||
1
|
1
|
||||||
|
1
|
||||||
|
1
|
||||||
|
@ -24,3 +24,6 @@ select 1 = multiFuzzyMatchAny('string', 1, ['zorro$', '^tring', 'ip$', 'how.*',
|
|||||||
select 2 = multiFuzzyMatchAnyIndex('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
|
select 2 = multiFuzzyMatchAnyIndex('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
|
||||||
select 2 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
|
select 2 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
|
||||||
select 1 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);
|
select 1 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);
|
||||||
|
|
||||||
|
select [2, 3, 4] = arraySort(multiFuzzyMatchAllIndices('halo some wrld', 2, ['some random string', '^halo.*world$', '^halo.*world$', '^halo.*world$', '^hallllo.*world$']));
|
||||||
|
select [] = multiFuzzyMatchAllIndices('halo some wrld', 2, ['^halllllo.*world$', 'some random string']);
|
||||||
|
@ -64,6 +64,10 @@ The same as `match`, but returns 0 if none of the regular expressions are matche
|
|||||||
|
|
||||||
The same as `multiMatchAny`, but returns any index that matches the haystack.
|
The same as `multiMatchAny`, but returns any index that matches the haystack.
|
||||||
|
|
||||||
|
## multiMatchAllIndices(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||||
|
|
||||||
|
The same as `multiMatchAny`, but returns the array of all indices that match the haystack in any order.
|
||||||
|
|
||||||
## multiFuzzyMatchAny(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
## multiFuzzyMatchAny(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||||
|
|
||||||
The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within a constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in an experimental mode and can be extremely slow. For more information see [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
|
The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within a constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in an experimental mode and can be extremely slow. For more information see [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
|
||||||
@ -72,6 +76,10 @@ The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack w
|
|||||||
|
|
||||||
The same as `multiFuzzyMatchAny`, but returns any index that matches the haystack within a constant edit distance.
|
The same as `multiFuzzyMatchAny`, but returns any index that matches the haystack within a constant edit distance.
|
||||||
|
|
||||||
|
## multiFuzzyMatchAllIndices(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||||
|
|
||||||
|
The same as `multiFuzzyMatchAny`, but returns the array of all indices that match the haystack within a constant edit distance.
|
||||||
|
|
||||||
!!! note "Note"
|
!!! note "Note"
|
||||||
`multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction.
|
`multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction.
|
||||||
|
|
||||||
|
@ -57,6 +57,10 @@
|
|||||||
|
|
||||||
То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения.
|
То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения.
|
||||||
|
|
||||||
|
## multiMatchAllIndices(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||||
|
|
||||||
|
То же, что и `multiMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке.
|
||||||
|
|
||||||
## multiFuzzyMatchAny(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
## multiFuzzyMatchAny(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||||
|
|
||||||
То же, что и `multiMatchAny`, но возвращает 1 если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
|
То же, что и `multiMatchAny`, но возвращает 1 если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
|
||||||
@ -65,6 +69,10 @@
|
|||||||
|
|
||||||
То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния.
|
То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния.
|
||||||
|
|
||||||
|
## multiFuzzyMatchAllIndices(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||||
|
|
||||||
|
То же, что и `multiFuzzyMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке в пределах константного редакционного расстояния.
|
||||||
|
|
||||||
!!! note "Примечание"
|
!!! note "Примечание"
|
||||||
`multiFuzzyMatch*` функции не поддерживают UTF-8 закодированные регулярные выражения, и такие выражения рассматриваются как байтовые из-за ограничения hyperscan.
|
`multiFuzzyMatch*` функции не поддерживают UTF-8 закодированные регулярные выражения, и такие выражения рассматриваются как байтовые из-за ограничения hyperscan.
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user