2020-03-29 17:04:16 +00:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <vector>
|
2022-06-26 16:12:17 +00:00
|
|
|
#include <Columns/ColumnArray.h>
|
2020-03-29 17:04:16 +00:00
|
|
|
#include <Columns/ColumnString.h>
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2022-06-24 14:42:39 +00:00
|
|
|
namespace ErrorCodes
{
    /// Only declared here (the definition lives in another translation unit).
    /// Used in this file when the constant needle list is too long.
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
|
|
|
|
|
2021-09-21 16:43:46 +00:00
|
|
|
template <typename Name, typename Impl>
|
2020-03-29 17:04:16 +00:00
|
|
|
struct MultiSearchFirstIndexImpl
|
|
|
|
{
|
|
|
|
using ResultType = UInt64;
|
|
|
|
/// Variable for understanding, if we used offsets for the output, most
|
|
|
|
/// likely to determine whether the function returns ColumnVector of ColumnArray.
|
|
|
|
static constexpr bool is_column_array = false;
|
2021-09-21 16:43:46 +00:00
|
|
|
static constexpr auto name = Name::name;
|
|
|
|
|
2020-03-29 17:04:16 +00:00
|
|
|
static auto getReturnType() { return std::make_shared<DataTypeNumber<ResultType>>(); }
|
|
|
|
|
|
|
|
static void vectorConstant(
|
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
2022-06-26 16:12:17 +00:00
|
|
|
const Array & needles_arr,
|
2020-03-29 17:04:16 +00:00
|
|
|
PaddedPODArray<UInt64> & res,
|
2022-06-29 10:37:42 +00:00
|
|
|
PaddedPODArray<UInt64> & /*offsets*/,
|
2022-06-25 15:53:11 +00:00
|
|
|
bool /*allow_hyperscan*/,
|
2022-06-24 14:12:38 +00:00
|
|
|
size_t /*max_hyperscan_regexp_length*/,
|
2023-02-08 13:07:27 +00:00
|
|
|
size_t /*max_hyperscan_regexp_total_length*/,
|
|
|
|
bool /*reject_expensive_hyperscan_regexps*/)
|
2020-03-29 17:04:16 +00:00
|
|
|
{
|
2022-06-24 14:42:39 +00:00
|
|
|
// For performance of Volnitsky search, it is crucial to save only one byte for pattern number.
|
2022-06-26 16:12:17 +00:00
|
|
|
if (needles_arr.size() > std::numeric_limits<UInt8>::max())
|
2022-06-24 14:42:39 +00:00
|
|
|
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
|
|
|
|
"Number of arguments for function {} doesn't match: passed {}, should be at most {}",
|
2022-06-26 16:12:17 +00:00
|
|
|
name, std::to_string(needles_arr.size()), std::to_string(std::numeric_limits<UInt8>::max()));
|
|
|
|
|
|
|
|
std::vector<std::string_view> needles;
|
|
|
|
needles.reserve(needles_arr.size());
|
|
|
|
for (const auto & needle : needles_arr)
|
|
|
|
needles.emplace_back(needle.get<String>());
|
2022-06-24 14:42:39 +00:00
|
|
|
|
2020-03-29 17:04:16 +00:00
|
|
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
|
2022-06-29 10:37:42 +00:00
|
|
|
|
|
|
|
const size_t haystack_size = haystack_offsets.size();
|
|
|
|
res.resize(haystack_size);
|
|
|
|
|
2020-03-29 17:04:16 +00:00
|
|
|
size_t iteration = 0;
|
|
|
|
while (searcher.hasMoreToSearch())
|
|
|
|
{
|
2022-07-07 20:25:26 +00:00
|
|
|
size_t prev_haystack_offset = 0;
|
2022-06-29 10:37:42 +00:00
|
|
|
for (size_t j = 0; j < haystack_size; ++j)
|
2020-03-29 17:04:16 +00:00
|
|
|
{
|
2022-07-07 20:25:26 +00:00
|
|
|
const auto * haystack = &haystack_data[prev_haystack_offset];
|
|
|
|
const auto * haystack_end = haystack + haystack_offsets[j] - prev_haystack_offset - 1;
|
2020-03-29 17:04:16 +00:00
|
|
|
/// hasMoreToSearch traverse needles in increasing order
|
|
|
|
if (iteration == 0 || res[j] == 0)
|
|
|
|
res[j] = searcher.searchOneFirstIndex(haystack, haystack_end);
|
2022-07-07 20:25:26 +00:00
|
|
|
prev_haystack_offset = haystack_offsets[j];
|
2020-03-29 17:04:16 +00:00
|
|
|
}
|
|
|
|
++iteration;
|
|
|
|
}
|
2021-08-05 08:36:35 +00:00
|
|
|
if (iteration == 0)
|
|
|
|
std::fill(res.begin(), res.end(), 0);
|
2020-03-29 17:04:16 +00:00
|
|
|
}
|
2022-06-26 16:45:16 +00:00
|
|
|
|
2022-06-29 10:37:42 +00:00
|
|
|
static void vectorVector(
|
|
|
|
const ColumnString::Chars & haystack_data,
|
|
|
|
const ColumnString::Offsets & haystack_offsets,
|
2022-07-06 21:36:14 +00:00
|
|
|
const IColumn & needles_data,
|
|
|
|
const ColumnArray::Offsets & needles_offsets,
|
2022-06-29 10:37:42 +00:00
|
|
|
PaddedPODArray<ResultType> & res,
|
|
|
|
PaddedPODArray<UInt64> & /*offsets*/,
|
|
|
|
bool /*allow_hyperscan*/,
|
|
|
|
size_t /*max_hyperscan_regexp_length*/,
|
2023-02-08 13:07:27 +00:00
|
|
|
size_t /*max_hyperscan_regexp_total_length*/,
|
|
|
|
bool /*reject_expensive_hyperscan_regexps*/)
|
2022-06-26 16:45:16 +00:00
|
|
|
{
|
2022-06-29 10:37:42 +00:00
|
|
|
const size_t haystack_size = haystack_offsets.size();
|
|
|
|
res.resize(haystack_size);
|
|
|
|
|
2022-07-07 20:25:26 +00:00
|
|
|
size_t prev_haystack_offset = 0;
|
|
|
|
size_t prev_needles_offset = 0;
|
2022-06-29 10:37:42 +00:00
|
|
|
|
2022-07-06 21:36:14 +00:00
|
|
|
const ColumnString * needles_data_string = checkAndGetColumn<ColumnString>(&needles_data);
|
|
|
|
|
|
|
|
std::vector<std::string_view> needles;
|
|
|
|
|
2022-06-29 10:37:42 +00:00
|
|
|
for (size_t i = 0; i < haystack_size; ++i)
|
|
|
|
{
|
2022-07-07 20:25:26 +00:00
|
|
|
needles.reserve(needles_offsets[i] - prev_needles_offset);
|
2022-06-29 10:37:42 +00:00
|
|
|
|
2022-07-07 20:25:26 +00:00
|
|
|
for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j)
|
2022-07-06 21:36:14 +00:00
|
|
|
{
|
2022-07-07 20:25:26 +00:00
|
|
|
needles.emplace_back(needles_data_string->getDataAt(j).toView());
|
2022-07-06 21:36:14 +00:00
|
|
|
}
|
2022-06-29 10:37:42 +00:00
|
|
|
|
|
|
|
auto searcher = Impl::createMultiSearcherInBigHaystack(needles); // sub-optimal
|
|
|
|
|
2022-07-07 20:25:26 +00:00
|
|
|
const auto * const haystack = &haystack_data[prev_haystack_offset];
|
|
|
|
const auto * haystack_end = haystack + haystack_offsets[i] - prev_haystack_offset - 1;
|
2022-06-29 10:37:42 +00:00
|
|
|
|
|
|
|
size_t iteration = 0;
|
|
|
|
while (searcher.hasMoreToSearch())
|
|
|
|
{
|
|
|
|
if (iteration == 0 || res[i] == 0)
|
|
|
|
{
|
|
|
|
res[i] = searcher.searchOneFirstIndex(haystack, haystack_end);
|
|
|
|
}
|
|
|
|
++iteration;
|
|
|
|
}
|
|
|
|
if (iteration == 0)
|
|
|
|
{
|
|
|
|
res[i] = 0;
|
|
|
|
}
|
2022-07-06 21:36:14 +00:00
|
|
|
|
2022-07-07 20:25:26 +00:00
|
|
|
prev_haystack_offset = haystack_offsets[i];
|
|
|
|
prev_needles_offset = needles_offsets[i];
|
2022-07-06 21:36:14 +00:00
|
|
|
needles.clear();
|
2022-06-29 10:37:42 +00:00
|
|
|
}
|
2022-06-26 16:45:16 +00:00
|
|
|
}
|
2020-03-29 17:04:16 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|