2020-11-26 20:29:37 +00:00
|
|
|
#pragma once
|
|
|
|
|
2021-05-17 07:30:42 +00:00
|
|
|
#include <Functions/IFunction.h>
|
2020-11-26 20:29:37 +00:00
|
|
|
#include <Functions/FunctionFactory.h>
|
|
|
|
#include <Functions/FunctionHelpers.h>
|
|
|
|
#include <Columns/ColumnString.h>
|
|
|
|
#include <Columns/ColumnsNumber.h>
|
|
|
|
#include <DataTypes/DataTypesNumber.h>
|
|
|
|
#include <DataTypes/DataTypeString.h>
|
|
|
|
#include <Functions/Regexps.h>
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Error codes thrown by the code in this header.
/// They are only declared here (`extern`); this header does not define them.
namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int ILLEGAL_COLUMN;
    extern const int LOGICAL_ERROR;
}
|
|
|
|
|
|
|
|
/// Read-only pointer into a character buffer; used in this header as the scan cursor / end marker of a haystack.
using Pos = const char *;
|
|
|
|
|
|
|
|
template <class CountMatchesBase>
|
|
|
|
class FunctionCountMatches : public IFunction
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
static constexpr auto name = CountMatchesBase::name;
|
2021-06-01 12:20:52 +00:00
|
|
|
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionCountMatches<CountMatchesBase>>(); }
|
2020-11-26 20:29:37 +00:00
|
|
|
|
|
|
|
String getName() const override { return name; }
|
|
|
|
size_t getNumberOfArguments() const override { return 2; }
|
2021-06-22 16:21:23 +00:00
|
|
|
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
|
2020-11-26 20:29:37 +00:00
|
|
|
|
|
|
|
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
|
|
|
{
|
|
|
|
if (!isStringOrFixedString(arguments[1].type))
|
|
|
|
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
|
|
|
"Illegal type {} of second argument (pattern) of function {}. Must be String/FixedString.",
|
|
|
|
arguments[1].type->getName(), getName());
|
|
|
|
if (!isStringOrFixedString(arguments[0].type))
|
|
|
|
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
|
|
|
"Illegal type {} of first argument (haystack) of function {}. Must be String/FixedString.",
|
|
|
|
arguments[0].type->getName(), getName());
|
|
|
|
const auto * column = arguments[1].column.get();
|
|
|
|
if (!column || !checkAndGetColumnConstStringOrFixedString(column))
|
|
|
|
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
|
|
|
|
"The second argument of function {} should be a constant string with the pattern",
|
|
|
|
getName());
|
|
|
|
|
|
|
|
return std::make_shared<DataTypeUInt64>();
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
|
|
|
|
{
|
|
|
|
const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle occurred as already as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
accross parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to peridic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
Regexps::RegexpPtr re = Regexps::get<false /* like */, true /* is_no_capture */, CountMatchesBase::case_insensitive>(column_pattern->getValue<String>());
|
2020-11-26 20:29:37 +00:00
|
|
|
OptimizedRegularExpression::MatchVec matches;
|
|
|
|
|
|
|
|
const IColumn * column_haystack = arguments[0].column.get();
|
|
|
|
|
|
|
|
if (const ColumnString * col_str = checkAndGetColumn<ColumnString>(column_haystack))
|
|
|
|
{
|
|
|
|
auto result_column = ColumnUInt64::create();
|
|
|
|
|
|
|
|
const ColumnString::Chars & src_chars = col_str->getChars();
|
|
|
|
const ColumnString::Offsets & src_offsets = col_str->getOffsets();
|
|
|
|
|
|
|
|
ColumnUInt64::Container & vec_res = result_column->getData();
|
|
|
|
vec_res.resize(input_rows_count);
|
|
|
|
|
|
|
|
size_t size = src_offsets.size();
|
|
|
|
ColumnString::Offset current_src_offset = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]);
|
|
|
|
current_src_offset = src_offsets[i];
|
|
|
|
Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]) - 1;
|
|
|
|
|
|
|
|
StringRef str(pos, end - pos);
|
|
|
|
vec_res[i] = countMatches(str, re, matches);
|
|
|
|
}
|
|
|
|
|
|
|
|
return result_column;
|
|
|
|
}
|
|
|
|
else if (const ColumnConst * col_const_str = checkAndGetColumnConstStringOrFixedString(column_haystack))
|
|
|
|
{
|
|
|
|
StringRef str = col_const_str->getDataColumn().getDataAt(0);
|
|
|
|
uint64_t matches_count = countMatches(str, re, matches);
|
|
|
|
return result_type->createColumnConst(input_rows_count, matches_count);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()");
|
|
|
|
}
|
|
|
|
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle occurred as already as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
accross parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to peridic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
static uint64_t countMatches(StringRef src, Regexps::RegexpPtr & re, OptimizedRegularExpression::MatchVec & matches)
|
2020-11-26 20:29:37 +00:00
|
|
|
{
|
|
|
|
/// Only one match is required, no need to copy more.
|
|
|
|
static const unsigned matches_limit = 1;
|
|
|
|
|
|
|
|
Pos pos = reinterpret_cast<Pos>(src.data);
|
|
|
|
Pos end = reinterpret_cast<Pos>(src.data + src.size);
|
|
|
|
|
|
|
|
uint64_t match_count = 0;
|
|
|
|
while (true)
|
|
|
|
{
|
|
|
|
if (pos >= end)
|
|
|
|
break;
|
|
|
|
if (!re->match(pos, end - pos, matches, matches_limit))
|
|
|
|
break;
|
|
|
|
/// Progress should be made, but with empty match the progress will not be done.
|
|
|
|
/// Also note that simply check is pattern empty is not enough,
|
|
|
|
/// since for example "'[f]{0}'" will match zero bytes:
|
|
|
|
if (!matches[0].length)
|
|
|
|
break;
|
|
|
|
pos += matches[0].offset + matches[0].length;
|
|
|
|
match_count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return match_count;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
}
|