2020-05-06 23:21:13 +00:00
|
|
|
#pragma once
|
|
|
|
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/types.h>
|
2020-05-06 23:21:13 +00:00
|
|
|
#include <Columns/ColumnString.h>
|
|
|
|
#include <DataTypes/DataTypesNumber.h>
|
2022-03-12 18:05:50 +00:00
|
|
|
#include <DataTypes/DataTypeArray.h>
|
2022-06-24 14:12:38 +00:00
|
|
|
#include <Functions/checkHyperscanRegexp.h>
|
2020-05-06 23:21:13 +00:00
|
|
|
#include "Regexps.h"
|
|
|
|
|
2021-10-27 23:10:39 +00:00
|
|
|
#include "config_functions.h"
|
|
|
|
#include <Common/config.h>
|
2020-05-06 23:21:13 +00:00
|
|
|
|
2022-06-17 10:15:19 +00:00
|
|
|
#if USE_VECTORSCAN
|
2020-05-06 23:21:13 +00:00
|
|
|
# include <hs.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Error codes used by the code in this header. They are only declared here (extern);
/// their definitions live elsewhere in the project.
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
/// Thrown when hs_scan() returns anything other than HS_SUCCESS.
extern const int HYPERSCAN_CANNOT_SCAN_TEXT;
|
|
|
|
/// Thrown when hs_clone_scratch() fails to clone the scratch space.
extern const int CANNOT_ALLOCATE_MEMORY;
|
2020-05-06 23:51:41 +00:00
|
|
|
/// Thrown when the code is built with USE_VECTORSCAN disabled.
extern const int NOT_IMPLEMENTED;
|
2020-05-06 23:21:13 +00:00
|
|
|
/// Thrown when a single haystack string is longer than the maximum value of UInt32.
extern const int TOO_MANY_BYTES;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2022-06-24 13:34:40 +00:00
|
|
|
template <typename Name, typename ResultType_, bool MultiSearchDistance>
|
2020-05-06 23:21:13 +00:00
|
|
|
struct MultiMatchAllIndicesImpl
|
|
|
|
{
|
2022-06-24 13:34:40 +00:00
|
|
|
using ResultType = ResultType_;
|
|
|
|
|
2020-05-06 23:21:13 +00:00
|
|
|
static constexpr bool is_using_hyperscan = true;
|
|
|
|
/// Variable for understanding, if we used offsets for the output, most
|
|
|
|
/// likely to determine whether the function returns ColumnVector of ColumnArray.
|
|
|
|
static constexpr bool is_column_array = true;
|
2021-09-21 16:43:46 +00:00
|
|
|
static constexpr auto name = Name::name;
|
|
|
|
|
2020-05-06 23:21:13 +00:00
|
|
|
static auto getReturnType()
|
|
|
|
{
|
|
|
|
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vectorConstant(
|
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
2022-06-24 13:16:57 +00:00
|
|
|
const std::vector<std::string_view> & needles,
|
2022-06-24 13:34:40 +00:00
|
|
|
PaddedPODArray<ResultType> & res,
|
2022-06-24 14:12:38 +00:00
|
|
|
PaddedPODArray<UInt64> & offsets,
|
|
|
|
size_t max_hyperscan_regexp_length,
|
|
|
|
size_t max_hyperscan_regexp_total_length)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-06-24 14:12:38 +00:00
|
|
|
vectorConstant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void vectorConstant(
|
2022-06-24 13:34:40 +00:00
|
|
|
[[maybe_unused]] const ColumnString::Chars & haystack_data,
|
|
|
|
[[maybe_unused]] const ColumnString::Offsets & haystack_offsets,
|
|
|
|
[[maybe_unused]] const std::vector<std::string_view> & needles,
|
|
|
|
[[maybe_unused]] PaddedPODArray<ResultType> & res,
|
|
|
|
[[maybe_unused]] PaddedPODArray<UInt64> & offsets,
|
2022-06-24 14:12:38 +00:00
|
|
|
[[maybe_unused]] std::optional<UInt32> edit_distance,
|
|
|
|
[[maybe_unused]] size_t max_hyperscan_regexp_length,
|
|
|
|
[[maybe_unused]] size_t max_hyperscan_regexp_total_length)
|
2020-05-06 23:21:13 +00:00
|
|
|
{
|
2022-06-17 10:15:19 +00:00
|
|
|
#if USE_VECTORSCAN
|
2022-06-24 14:12:38 +00:00
|
|
|
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
|
|
|
|
|
|
|
offsets.resize(haystack_offsets.size());
|
|
|
|
|
2020-05-06 23:21:13 +00:00
|
|
|
const auto & hyperscan_regex = MultiRegexps::get</*SaveIndices=*/true, MultiSearchDistance>(needles, edit_distance);
|
|
|
|
hs_scratch_t * scratch = nullptr;
|
|
|
|
hs_error_t err = hs_clone_scratch(hyperscan_regex->getScratch(), &scratch);
|
|
|
|
|
|
|
|
if (err != HS_SUCCESS)
|
|
|
|
throw Exception("Could not clone scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
|
|
|
|
|
|
|
MultiRegexps::ScratchPtr smart_scratch(scratch);
|
|
|
|
|
|
|
|
auto on_match = [](unsigned int id,
|
|
|
|
unsigned long long /* from */, // NOLINT
|
|
|
|
unsigned long long /* to */, // NOLINT
|
|
|
|
unsigned int /* flags */,
|
|
|
|
void * context) -> int
|
|
|
|
{
|
2022-06-24 13:34:40 +00:00
|
|
|
static_cast<PaddedPODArray<ResultType>*>(context)->push_back(id);
|
2020-05-06 23:21:13 +00:00
|
|
|
return 0;
|
|
|
|
};
|
|
|
|
const size_t haystack_offsets_size = haystack_offsets.size();
|
|
|
|
UInt64 offset = 0;
|
|
|
|
for (size_t i = 0; i < haystack_offsets_size; ++i)
|
|
|
|
{
|
|
|
|
UInt64 length = haystack_offsets[i] - offset - 1;
|
|
|
|
/// Hyperscan restriction.
|
|
|
|
if (length > std::numeric_limits<UInt32>::max())
|
|
|
|
throw Exception("Too long string to search", ErrorCodes::TOO_MANY_BYTES);
|
|
|
|
/// Scan, check, update the offsets array and the offset of haystack.
|
|
|
|
err = hs_scan(
|
|
|
|
hyperscan_regex->getDB(),
|
|
|
|
reinterpret_cast<const char *>(haystack_data.data()) + offset,
|
|
|
|
length,
|
|
|
|
0,
|
|
|
|
smart_scratch.get(),
|
|
|
|
on_match,
|
|
|
|
&res);
|
|
|
|
if (err != HS_SUCCESS)
|
2022-06-17 10:15:19 +00:00
|
|
|
throw Exception("Failed to scan with vectorscan", ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT);
|
2020-05-06 23:21:13 +00:00
|
|
|
offsets[i] = res.size();
|
|
|
|
offset = haystack_offsets[i];
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
throw Exception(
|
2022-06-17 10:15:19 +00:00
|
|
|
"multi-search all indices is not implemented when vectorscan is off",
|
2020-05-06 23:21:13 +00:00
|
|
|
ErrorCodes::NOT_IMPLEMENTED);
|
2022-06-17 10:15:19 +00:00
|
|
|
#endif // USE_VECTORSCAN
|
2020-05-06 23:21:13 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
}
|