Updated extractAllGroupsHorizontal - flexible limit on number of matches per row.

If it is not set via the third argument, it defaults to the previously hardcoded
value of 1000.
This commit is contained in:
Vasily Nemkov 2021-07-29 15:36:55 +03:00
parent 32bd94a84b
commit ec77ba8bfc
5 changed files with 24 additions and 15 deletions

View File

@ -55,7 +55,8 @@ public:
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
size_t getNumberOfArguments() const override { return Kind == ExtractAllGroupsResultKind::HORIZONTAL ? 0 : 2; }
bool isVariadic() const override { return Kind == ExtractAllGroupsResultKind::HORIZONTAL; }
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
@ -66,7 +67,13 @@ public:
{"haystack", isStringOrFixedString, nullptr, "const String or const FixedString"},
{"needle", isStringOrFixedString, isColumnConst, "const String or const FixedString"},
};
validateFunctionArgumentTypes(*this, arguments, args);
FunctionArgumentDescriptors optional_args;
if constexpr (Kind == ExtractAllGroupsResultKind::HORIZONTAL)
{
optional_args.push_back(FunctionArgumentDescriptor{"max_matches_per_row", isUnsignedInteger, isColumnConst, "const Unsigned Int"});
}
validateFunctionArgumentTypes(*this, arguments, args, optional_args);
/// Two-dimensional array of strings, each `row` of top array represents matching groups.
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()));
@ -147,6 +154,10 @@ public:
}
else
{
/// Additional limit to fail fast on supposedly incorrect usage, arbitrary value.
static constexpr size_t MAX_MATCHES_PER_ROW = 1000;
const auto max_matches_per_row = arguments.size() >= 3 ? arguments[2].column->getUInt(0) : MAX_MATCHES_PER_ROW;
PODArray<StringPiece, 0> all_matches;
/// Number of times RE matched on each row of haystack column.
PODArray<size_t, 0> number_of_matches_per_row;
@ -172,16 +183,13 @@ public:
for (size_t group = 1; group <= groups_count; ++group)
all_matches.push_back(matched_groups[group]);
/// Additional limit to fail fast on supposedly incorrect usage, arbitrary value.
static constexpr size_t MAX_MATCHES_PER_ROW = 1000;
if (matches_per_row > MAX_MATCHES_PER_ROW)
++matches_per_row;
if (matches_per_row > max_matches_per_row)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too many matches per row (> {}) in the result of function {}",
MAX_MATCHES_PER_ROW, getName());
max_matches_per_row, getName());
pos = matched_groups[0].data() + std::max<size_t>(1, matched_groups[0].size());
++matches_per_row;
}
number_of_matches_per_row.push_back(matches_per_row);

View File

@ -1,4 +1,3 @@
0 groups, zero matches
1 group, multiple matches, String and FixedString
[['hello','world']]
[['hello','world']]

View File

@ -5,9 +5,13 @@ SELECT extractAllGroupsHorizontal('hello', 123); --{serverError 43} invalid arg
SELECT extractAllGroupsHorizontal(123, 'world'); --{serverError 43} invalid argument type
SELECT extractAllGroupsHorizontal('hello world', '((('); --{serverError 427} invalid re
SELECT extractAllGroupsHorizontal('hello world', materialize('\\w+')); --{serverError 44} non-cons needle
SELECT extractAllGroupsHorizontal('hello world', '(\\w+)', 'foobar'); --{serverError 43} invalid argument type
SELECT extractAllGroupsHorizontal('hello world', '(\\w+)', materialize(10)); --{serverError 44} non-const max_matches_per_row
SELECT extractAllGroupsHorizontal('hello world', '\\w+'); -- { serverError 36 } 0 groups
SELECT extractAllGroupsHorizontal('hello world', '(\\w+)', 0); -- { serverError 128 } too many groups matched per row
SELECT extractAllGroupsHorizontal('hello world', '(\\w+)', 1); -- { serverError 128 } too many groups matched per row
SELECT '0 groups, zero matches';
SELECT extractAllGroupsHorizontal('hello world', '\\w+'); -- { serverError 36 }
SELECT extractAllGroupsHorizontal('hello world', '(\\w+)', 1000000000) FORMAT Null; -- users now can set limit bigger than previous 1000 matches per row
SELECT '1 group, multiple matches, String and FixedString';
SELECT extractAllGroupsHorizontal('hello world', '(\\w+)');

View File

@ -1,4 +1,3 @@
0 groups, zero matches
1 group, multiple matches, String and FixedString
[['hello'],['world']]
[['hello'],['world']]

View File

@ -5,9 +5,8 @@ SELECT extractAllGroupsVertical('hello', 123); --{serverError 43} invalid argum
SELECT extractAllGroupsVertical(123, 'world'); --{serverError 43} invalid argument type
SELECT extractAllGroupsVertical('hello world', '((('); --{serverError 427} invalid re
SELECT extractAllGroupsVertical('hello world', materialize('\\w+')); --{serverError 44} non-const needle
SELECT '0 groups, zero matches';
SELECT extractAllGroupsVertical('hello world', '\\w+'); -- { serverError 36 }
SELECT extractAllGroupsVertical('hello world', '(\\w+)', 123); --{serverError 42} only 2 arguments
SELECT extractAllGroupsVertical('hello world', '\\w+'); -- { serverError 36 } 0 groups
SELECT '1 group, multiple matches, String and FixedString';
SELECT extractAllGroupsVertical('hello world', '(\\w+)');