ClickHouse/src/Functions/MultiSearchFirstIndexImpl.h
Robert Schulze 3478db9fb6
Move check for regexp array size into implementations
- This is not needed for non-const regexp array arguments (the
  cardinality of arrays is fixed per column) but it cleans up the code
  and runs the check only in functions which have restrictions on the
  number of patterns.

- For functions using hyperscan, it was checked that the number of
  regexes is < 2^32. Removed the check because I don't think anyone will
  ever specify 4 billion patterns.
2022-06-26 15:38:12 +00:00

66 lines
2.4 KiB
C++

#pragma once
#include <vector>
#include <Columns/ColumnString.h>
namespace DB
{
/// Error codes referenced by this header. They are only declared here (`extern`);
/// the definitions are expected to be provided by another translation unit.
namespace ErrorCodes
{
/// Used by MultiSearchFirstIndexImpl::vectorConstant when more needles are passed than it accepts.
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
/// Multi-needle search over a column of strings with a constant set of needles.
/// For every haystack row the result is whatever `searchOneFirstIndex` of the searcher created by
/// `Impl::createMultiSearcherInBigHaystack` returns; a result of 0 is treated as "nothing found yet".
///
/// Name - supplies the function name via `Name::name` (used in error messages).
/// Impl - supplies the multi-searcher factory.
template <typename Name, typename Impl>
struct MultiSearchFirstIndexImpl
{
    using ResultType = UInt64;

    static constexpr auto name = Name::name;

    /// This implementation does not rely on hyperscan.
    static constexpr bool is_using_hyperscan = false;

    /// Tells whether offsets are used for the output, i.e. most likely
    /// whether the function returns ColumnVector or ColumnArray.
    static constexpr bool is_column_array = false;

    static auto getReturnType() { return std::make_shared<DataTypeNumber<ResultType>>(); }

    /// Fills `res` with one value per haystack row.
    ///
    /// haystack_data / haystack_offsets - the string column; `haystack_offsets[i]` is the end offset of row i
    ///     in `haystack_data`, and the last byte of every row is excluded from the search range.
    /// needles - the patterns to search for; at most std::numeric_limits<UInt8>::max() of them are accepted.
    /// res - output, resized to the number of rows.
    /// offsets and the two hyperscan limits are not used by this implementation.
    ///
    /// Throws Exception with NUMBER_OF_ARGUMENTS_DOESNT_MATCH if there are too many needles.
    static void vectorConstant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<std::string_view> & needles,
        PaddedPODArray<UInt64> & res,
        [[maybe_unused]] PaddedPODArray<UInt64> & offsets,
        size_t /*max_hyperscan_regexp_length*/,
        size_t /*max_hyperscan_regexp_total_length*/)
    {
        /// Volnitsky search stays fast only if the pattern number is stored in a single byte,
        /// which caps the number of needles.
        constexpr auto max_needles = std::numeric_limits<UInt8>::max();
        if (needles.size() > max_needles)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                "Number of arguments for function {} doesn't match: passed {}, should be at most {}",
                name, std::to_string(needles.size()), std::to_string(max_needles));

        auto searcher = Impl::createMultiSearcherInBigHaystack(needles);

        const size_t rows = haystack_offsets.size();
        res.resize(rows);

        /// Becomes true once at least one full pass over the rows has been made.
        bool searched_any_batch = false;

        while (searcher.hasMoreToSearch())
        {
            size_t row_begin = 0;
            for (size_t row = 0; row < rows; ++row)
            {
                const size_t row_end = haystack_offsets[row];

                /// Needles are traversed in increasing order across passes, so a non-zero result
                /// from an earlier pass is kept. On the very first pass `res` holds no valid data yet,
                /// therefore every row must be searched unconditionally.
                if (!searched_any_batch || res[row] == 0)
                {
                    const auto * haystack = &haystack_data[row_begin];
                    const auto * haystack_end = haystack + row_end - row_begin - 1;
                    res[row] = searcher.searchOneFirstIndex(haystack, haystack_end);
                }

                row_begin = row_end;
            }

            searched_any_batch = true;
        }

        /// No pass was made at all (the searcher had nothing to search): report "not found" for every row.
        if (!searched_any_batch)
            std::fill(res.begin(), res.end(), 0);
    }
};
}