#pragma once
#include
#include
#include
#include
#include "Regexps.h"
#include "config_functions.h"
#include
#if USE_VECTORSCAN
# include
#endif
namespace DB
{
/// Error codes thrown by the code below. They are only declared here (extern);
/// their definitions live in another translation unit.
namespace ErrorCodes
{
/// Thrown when the scan call reports a failure.
extern const int HYPERSCAN_CANNOT_SCAN_TEXT;
/// Thrown when the scratch space cannot be cloned.
extern const int CANNOT_ALLOCATE_MEMORY;
/// Thrown when the build has USE_VECTORSCAN disabled.
extern const int NOT_IMPLEMENTED;
/// Thrown when a single haystack string is too long to be scanned.
extern const int TOO_MANY_BYTES;
}
/// Implementation of a multi-pattern match which, for every haystack row, collects
/// the ids that the scanner reports to the match callback (one id per match event).
/// The output is a flat array of ids (`res`) together with the cumulative end
/// position of each row inside that array (`offsets`).
template
struct MultiMatchAllIndicesImpl
{
using ResultType = ResultType_;
/// Marks this implementation as one that relies on Hyperscan/Vectorscan.
static constexpr bool is_using_hyperscan = true;
/// Indicates that offsets are used for the output; most likely used to
/// determine whether the function returns ColumnVector or ColumnArray.
static constexpr bool is_column_array = true;
/// Function name, taken from the Name template parameter.
static constexpr auto name = Name::name;
/// Creates the data type object that describes the result of the function.
static auto getReturnType()
{
return std::make_shared(std::make_shared());
}
/// Convenience overload without an edit distance: forwards to the main
/// overload passing std::nullopt as `edit_distance`.
static void vectorConstant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const std::vector & needles,
PaddedPODArray & res,
PaddedPODArray & offsets)
{
vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt);
}
/// Scans every haystack string against the set of needles and appends the id of
/// each reported match to `res`.
///
/// haystack_data    - concatenated bytes of all haystack strings.
/// haystack_offsets - end position of each haystack string inside haystack_data.
/// needles          - patterns to compile / look up as a multi-regexp database.
/// res              - output: ids of matches, appended row after row.
/// offsets          - output: resized to the number of rows; offsets[i] is the
///                    size of `res` after row i has been scanned.
/// edit_distance    - optional value passed through to MultiRegexps::get.
///
/// Throws CANNOT_ALLOCATE_MEMORY if the scratch space cannot be cloned,
/// TOO_MANY_BYTES if a haystack string is too long, HYPERSCAN_CANNOT_SCAN_TEXT
/// if scanning fails, and NOT_IMPLEMENTED when built without USE_VECTORSCAN.
///
/// The parameters are marked [[maybe_unused]] because most of them are not
/// referenced when USE_VECTORSCAN is off.
static void vectorConstant(
[[maybe_unused]] const ColumnString::Chars & haystack_data,
[[maybe_unused]] const ColumnString::Offsets & haystack_offsets,
[[maybe_unused]] const std::vector & needles,
[[maybe_unused]] PaddedPODArray & res,
[[maybe_unused]] PaddedPODArray & offsets,
[[maybe_unused]] std::optional edit_distance)
{
/// One offset entry per haystack row. Note that this happens in both build
/// configurations, before the NOT_IMPLEMENTED exception below.
offsets.resize(haystack_offsets.size());
#if USE_VECTORSCAN
const auto & hyperscan_regex = MultiRegexps::get*SaveIndices=*/true, MultiSearchDistance>(needles, edit_distance);
/// Work on a private clone of the scratch space attached to the regex object
/// rather than on that scratch space itself.
hs_scratch_t * scratch = nullptr;
hs_error_t err = hs_clone_scratch(hyperscan_regex->getScratch(), &scratch);
if (err != HS_SUCCESS)
throw Exception("Could not clone scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
/// Hand the raw pointer to ScratchPtr — presumably an owning smart pointer that
/// releases the scratch on scope exit; verify against Regexps.h.
MultiRegexps::ScratchPtr smart_scratch(scratch);
/// Match callback: `context` is the `res` array passed to hs_scan below; every
/// match event appends its id. Returning 0 lets the scan continue.
auto on_match = [](unsigned int id,
unsigned long long /* from */, // NOLINT
unsigned long long /* to */, // NOLINT
unsigned int /* flags */,
void * context) -> int
{
static_cast*>(context)->push_back(id);
return 0;
};
const size_t haystack_offsets_size = haystack_offsets.size();
/// Start position of the current haystack string inside haystack_data.
UInt64 offset = 0;
for (size_t i = 0; i < haystack_offsets_size; ++i)
{
/// Length of the i-th string. The extra -1 presumably excludes a trailing
/// terminator byte stored after each string — TODO confirm against ColumnString.
UInt64 length = haystack_offsets[i] - offset - 1;
/// Hyperscan restriction: the scan length must fit into the limit checked here.
if (length > std::numeric_limits::max())
throw Exception("Too long string to search", ErrorCodes::TOO_MANY_BYTES);
/// Scan, check, update the offsets array and the offset of haystack.
/// The literal 0 is the flags argument of hs_scan.
err = hs_scan(
hyperscan_regex->getDB(),
reinterpret_cast(haystack_data.data()) + offset,
length,
0,
smart_scratch.get(),
on_match,
&res);
if (err != HS_SUCCESS)
throw Exception("Failed to scan with vectorscan", ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT);
/// `res` is shared across rows, so its current size is the cumulative end
/// position of row i.
offsets[i] = res.size();
offset = haystack_offsets[i];
}
#else
throw Exception(
"multi-search all indices is not implemented when vectorscan is off",
ErrorCodes::NOT_IMPLEMENTED);
#endif // USE_VECTORSCAN
}
};
}