diff --git a/src/DataTypes/tests/gtest_data_type_get_common_type.cpp b/src/DataTypes/tests/gtest_data_type_get_common_type.cpp index fd511bfbbb4..bd13de79ef6 100644 --- a/src/DataTypes/tests/gtest_data_type_get_common_type.cpp +++ b/src/DataTypes/tests/gtest_data_type_get_common_type.cpp @@ -14,11 +14,6 @@ static bool operator==(const IDataType & left, const IDataType & right) return left.equals(right); } -std::ostream & operator<<(std::ostream & ostr, const IDataType & dt) -{ - return ostr << dt.getName(); -} - } using namespace DB; diff --git a/src/Functions/extractAllGroups.cpp b/src/Functions/extractAllGroups.cpp deleted file mode 100644 index a79efe86356..00000000000 --- a/src/Functions/extractAllGroups.cpp +++ /dev/null @@ -1,127 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int ARGUMENT_OUT_OF_BOUND; - extern const int BAD_ARGUMENTS; -} - - -/** Match all groups of given input string with given re, return array of arrays of matches. 
- * - * SELECT extractAllGroups('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)') - * should produce: - * [['abc', '111'], ['def', '222'], ['ghi', '333']] - */ -class FunctionExtractAllGroups : public IFunction -{ -public: - static constexpr auto name = "extractAllGroups"; - static FunctionPtr create(const Context &) { return std::make_shared(); } - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 2; } - - bool useDefaultImplementationForConstants() const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - FunctionArgumentDescriptors args{ - {"haystack", isStringOrFixedString, nullptr, "const String or const FixedString"}, - {"needle", isStringOrFixedString, isColumnConst, "const String or const FixedString"}, - }; - validateFunctionArgumentTypes(*this, arguments, args); - - /// Two-dimensional array of strings, each `row` of top array represents matching groups. 
- return std::make_shared(std::make_shared(std::make_shared())); - } - - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override - { - const ColumnPtr column_haystack = block.getByPosition(arguments[0]).column; - const ColumnPtr column_needle = block.getByPosition(arguments[1]).column; - - const auto needle = typeid_cast(*column_needle).getValue(); - - if (needle.empty()) - throw Exception(getName() + " length of 'needle' argument must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND); - - const auto regexp = Regexps::get(needle); - const auto & re2 = regexp->getRE2(); - - if (!re2) - throw Exception("There is no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS); - - const size_t groups_count = re2->NumberOfCapturingGroups(); - - if (!groups_count) - throw Exception("There is no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS); - - // Including 0-group, which is the whole regexp. - PODArrayWithStackMemory matched_groups(groups_count + 1); - - ColumnArray::ColumnOffsets::MutablePtr root_offsets_col = ColumnArray::ColumnOffsets::create(); - ColumnArray::ColumnOffsets::MutablePtr nested_offsets_col = ColumnArray::ColumnOffsets::create(); - ColumnString::MutablePtr data_col = ColumnString::create(); - - auto & root_offsets_data = root_offsets_col->getData(); - auto & nested_offsets_data = nested_offsets_col->getData(); - - root_offsets_data.resize(input_rows_count); - ColumnArray::Offset current_root_offset = 0; - ColumnArray::Offset current_nested_offset = 0; - - for (size_t i = 0; i < input_rows_count; ++i) - { - StringRef current_row = column_haystack->getDataAt(i); - - // Extract all non-intersecting matches from haystack except group #0. 
- const auto * pos = current_row.data; - const auto * end = pos + current_row.size; - while (pos < end - && re2->Match(re2_st::StringPiece(pos, end - pos), - 0, end - pos, re2_st::RE2::UNANCHORED, matched_groups.data(), matched_groups.size())) - { - // 1 is to exclude group #0 which is whole re match. - for (size_t group = 1; group <= groups_count; ++group) - data_col->insertData(matched_groups[group].data(), matched_groups[group].size()); - - pos = matched_groups[0].data() + matched_groups[0].size(); - - current_nested_offset += groups_count; - nested_offsets_data.push_back(current_nested_offset); - - ++current_root_offset; - } - - root_offsets_data[i] = current_root_offset; - } - ColumnArray::MutablePtr nested_array_col = ColumnArray::create(std::move(data_col), std::move(nested_offsets_col)); - ColumnArray::MutablePtr root_array_col = ColumnArray::create(std::move(nested_array_col), std::move(root_offsets_col)); - block.getByPosition(result).column = std::move(root_array_col); - } -}; - -void registerFunctionExtractAllGroups(FunctionFactory & factory) -{ - factory.registerFunction(); -} - -} diff --git a/src/Functions/extractAllGroups.h b/src/Functions/extractAllGroups.h new file mode 100644 index 00000000000..a9206e7327e --- /dev/null +++ b/src/Functions/extractAllGroups.h @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + + +enum class ExtractAllGroupsResultKind +{ + VERTICAL, + HORIZONTAL +}; + + +/** Match all groups of given input string with given re, return array of arrays of matches. 
+ * + * Depending on `Impl::Kind`, result is either grouped by group id (Horizontal) or in order of appearance (Vertical): + * + * SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)') + * => + * [['abc', '111'], ['def', '222'], ['ghi', '333']] + * + * SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)') + * => + * [['abc', 'def', 'ghi'], ['111', '222', '333']] + */ +template +class FunctionExtractAllGroups : public IFunction +{ +public: + static constexpr auto Kind = Impl::Kind; + static constexpr auto name = Impl::Name; + + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + bool useDefaultImplementationForConstants() const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + FunctionArgumentDescriptors args{ + {"haystack", isStringOrFixedString, nullptr, "const String or const FixedString"}, + {"needle", isStringOrFixedString, isColumnConst, "const String or const FixedString"}, + }; + validateFunctionArgumentTypes(*this, arguments, args); + + /// Two-dimensional array of strings, each `row` of top array represents matching groups. 
+ return std::make_shared(std::make_shared(std::make_shared())); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override + { + static const auto MAX_GROUPS_COUNT = 128; + + const ColumnPtr column_haystack = block.getByPosition(arguments[0]).column; + const ColumnPtr column_needle = block.getByPosition(arguments[1]).column; + + const auto needle = typeid_cast(*column_needle).getValue(); + + if (needle.empty()) + throw Exception("Length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS); + + using StringPiece = typename Regexps::Regexp::StringPieceType; + const auto & regexp = Regexps::get(needle)->getRE2(); + + if (!regexp) + throw Exception("There are no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS); + + const size_t groups_count = regexp->NumberOfCapturingGroups(); + + if (!groups_count) + throw Exception("There are no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS); + + if (groups_count > MAX_GROUPS_COUNT - 1) + throw Exception("Too many groups in regexp: " + std::to_string(groups_count) + + ", max: " + std::to_string(MAX_GROUPS_COUNT - 1), + ErrorCodes::BAD_ARGUMENTS); + + // Including 0-group, which is the whole regexp. 
+ PODArrayWithStackMemory matched_groups(groups_count + 1); + + ColumnArray::ColumnOffsets::MutablePtr root_offsets_col = ColumnArray::ColumnOffsets::create(); + ColumnArray::ColumnOffsets::MutablePtr nested_offsets_col = ColumnArray::ColumnOffsets::create(); + ColumnString::MutablePtr data_col = ColumnString::create(); + + auto & root_offsets_data = root_offsets_col->getData(); + auto & nested_offsets_data = nested_offsets_col->getData(); + + ColumnArray::Offset current_root_offset = 0; + ColumnArray::Offset current_nested_offset = 0; + + if constexpr (Kind == ExtractAllGroupsResultKind::VERTICAL) + { + root_offsets_data.resize(input_rows_count); + for (size_t i = 0; i < input_rows_count; ++i) + { + StringRef current_row = column_haystack->getDataAt(i); + + // Extract all non-intersecting matches from haystack except group #0. + const auto * pos = current_row.data; + const auto * end = pos + current_row.size; + while (pos < end + && regexp->Match({pos, static_cast(end - pos)}, + 0, end - pos, regexp->UNANCHORED, matched_groups.data(), matched_groups.size())) + { + // 1 is to exclude group #0 which is whole re match. + for (size_t group = 1; group <= groups_count; ++group) + data_col->insertData(matched_groups[group].data(), matched_groups[group].size()); + // Step past the match; an empty match still advances one byte so the loop always terminates. + pos = matched_groups[0].data() + (matched_groups[0].empty() ? 1 : matched_groups[0].size()); + + current_nested_offset += groups_count; + nested_offsets_data.push_back(current_nested_offset); + + ++current_root_offset; + } + + root_offsets_data[i] = current_root_offset; + } + } + else + { + std::vector all_matches; + // number of times RE matched on each row of haystack column. + std::vector number_of_matches_per_row; + + // we expect RE to match multiple times on each row, `* 8` is arbitrary to reduce number of re-allocations. 
+ all_matches.reserve(input_rows_count * groups_count * 8); + number_of_matches_per_row.reserve(input_rows_count); + + for (size_t i = 0; i < input_rows_count; ++i) + { + size_t matches_per_row = 0; + + const auto & current_row = column_haystack->getDataAt(i); + + // Extract all non-intersecting matches from haystack except group #0. + const auto * pos = current_row.data; + const auto * end = pos + current_row.size; + while (pos < end + && regexp->Match({pos, static_cast(end - pos)}, + 0, end - pos, regexp->UNANCHORED, matched_groups.data(), matched_groups.size())) + { + // 1 is to exclude group #0 which is whole re match. + for (size_t group = 1; group <= groups_count; ++group) + all_matches.push_back(matched_groups[group]); + // Step past the match; an empty match still advances one byte so the loop always terminates. + pos = matched_groups[0].data() + (matched_groups[0].empty() ? 1 : matched_groups[0].size()); + + ++matches_per_row; + } + + number_of_matches_per_row.push_back(matches_per_row); + } + + { + size_t total_matched_groups_string_len = 0; + for (const auto & m : all_matches) + total_matched_groups_string_len += m.length(); + + data_col->reserve(total_matched_groups_string_len); + } + // Every input row produces `groups_count` inner arrays and exactly one outer array. + nested_offsets_col->reserve(input_rows_count * groups_count); + root_offsets_col->reserve(input_rows_count); + + // Re-arrange `all_matches` from: + // [ + // "ROW 0: 1st group 1st match", + // "ROW 0: 2nd group 1st match", + // ..., + // "ROW 0: 1st group 2nd match", + // "ROW 0: 2nd group 2nd match", + // ..., + // "ROW 1: 1st group 1st match", + // ... + // ] + // + // into column of 2D arrays: + // [ + // /* all matching groups from ROW 0 of haystack column */ + // ["ROW 0: 1st group 1st match", "ROW 0: 1st group 2nd match", ...], + // ["ROW 0: 2nd group 1st match", "ROW 0: 2nd group 2nd match", ...], + // ... + // ], + // [ + // /* all matching groups from ROW 1 of haystack column */ + // ["ROW 1: 1st group 1st match", ...], + // ... 
+ // ] + + size_t row_offset = 0; + for (const auto matches_per_row : number_of_matches_per_row) + { + const size_t next_row_offset = row_offset + matches_per_row * groups_count; + for (size_t group_id = 0; group_id < groups_count; ++group_id) + { + for (size_t i = row_offset + group_id; i < next_row_offset && i < all_matches.size(); i += groups_count) + { + const auto & match = all_matches[i]; + data_col->insertData(match.begin(), match.length()); + } + nested_offsets_col->insertValue(data_col->size()); + } + root_offsets_col->insertValue(nested_offsets_col->size()); + row_offset = next_row_offset; + } + } + + // Wrap the flat strings into inner arrays, then the inner arrays into the per-row outer arrays. + ColumnArray::MutablePtr nested_array_col = ColumnArray::create(std::move(data_col), std::move(nested_offsets_col)); + ColumnArray::MutablePtr root_array_col = ColumnArray::create(std::move(nested_array_col), std::move(root_offsets_col)); + block.getByPosition(result).column = std::move(root_array_col); + } +}; + +} diff --git a/src/Functions/extractAllGroupsHorizontal.cpp b/src/Functions/extractAllGroupsHorizontal.cpp new file mode 100644 index 00000000000..fba7483ba03 --- /dev/null +++ b/src/Functions/extractAllGroupsHorizontal.cpp @@ -0,0 +1,23 @@ +#include +#include + +namespace +{ + +struct HorizontalImpl +{ + static constexpr auto Kind = DB::ExtractAllGroupsResultKind::HORIZONTAL; + static constexpr auto Name = "extractAllGroupsHorizontal"; +}; + +} + +namespace DB +{ + +void registerFunctionExtractAllGroupsHorizontal(FunctionFactory & factory) +{ + factory.registerFunction>(); +} + +} diff --git a/src/Functions/extractAllGroupsVertical.cpp b/src/Functions/extractAllGroupsVertical.cpp new file mode 100644 index 00000000000..9cbd148b016 --- /dev/null +++ b/src/Functions/extractAllGroupsVertical.cpp @@ -0,0 +1,24 @@ +#include +#include + +namespace +{ + +struct VerticalImpl +{ + static constexpr auto Kind = DB::ExtractAllGroupsResultKind::VERTICAL; + static constexpr auto Name = 
"extractAllGroupsVertical"; +}; + +} + +namespace DB +{ + +void registerFunctionExtractAllGroupsVertical(FunctionFactory & factory) +{ + factory.registerFunction>(); + factory.registerAlias("extractAllGroups", VerticalImpl::Name, FunctionFactory::CaseInsensitive); +} + +} diff --git a/src/Functions/extractGroups.cpp b/src/Functions/extractGroups.cpp index 882147ef664..f24abd2d0ff 100644 --- a/src/Functions/extractGroups.cpp +++ b/src/Functions/extractGroups.cpp @@ -17,7 +17,6 @@ namespace DB namespace ErrorCodes { - extern const int ARGUMENT_OUT_OF_BOUND; extern const int BAD_ARGUMENTS; } @@ -49,7 +48,6 @@ public: }; validateFunctionArgumentTypes(*this, arguments, args); - /// Two-dimensional array of strings, each `row` of top array represents matching groups. return std::make_shared(std::make_shared()); } @@ -61,7 +59,7 @@ public: const auto needle = typeid_cast(*column_needle).getValue(); if (needle.empty()) - throw Exception(getName() + " length of 'needle' argument must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + throw Exception(getName() + " length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS); const auto regexp = Regexps::get(needle); const auto & re2 = regexp->getRE2(); diff --git a/src/Functions/registerFunctionsStringRegexp.cpp b/src/Functions/registerFunctionsStringRegexp.cpp index 350f7bd5d00..2a0a3c0ea1f 100644 --- a/src/Functions/registerFunctionsStringRegexp.cpp +++ b/src/Functions/registerFunctionsStringRegexp.cpp @@ -18,7 +18,8 @@ void registerFunctionMultiFuzzyMatchAny(FunctionFactory &); void registerFunctionMultiFuzzyMatchAnyIndex(FunctionFactory &); void registerFunctionMultiFuzzyMatchAllIndices(FunctionFactory &); void registerFunctionExtractGroups(FunctionFactory &); -void registerFunctionExtractAllGroups(FunctionFactory &); +void registerFunctionExtractAllGroupsVertical(FunctionFactory &); +void registerFunctionExtractAllGroupsHorizontal(FunctionFactory &); void 
registerFunctionsStringRegexp(FunctionFactory & factory) { @@ -37,7 +38,8 @@ void registerFunctionsStringRegexp(FunctionFactory & factory) registerFunctionMultiFuzzyMatchAnyIndex(factory); registerFunctionMultiFuzzyMatchAllIndices(factory); registerFunctionExtractGroups(factory); - registerFunctionExtractAllGroups(factory); + registerFunctionExtractAllGroupsVertical(factory); + registerFunctionExtractAllGroupsHorizontal(factory); } } diff --git a/tests/performance/extract.xml b/tests/performance/extract.xml index 71dd8ce775d..b370152c7b2 100644 --- a/tests/performance/extract.xml +++ b/tests/performance/extract.xml @@ -6,5 +6,6 @@ SELECT count() FROM test.hits WHERE NOT ignore(extract(URL, '(\\w+=\\w+)')) SELECT count() FROM test.hits WHERE NOT ignore(extractAll(URL, '(\\w+=\\w+)')) SELECT count() FROM test.hits WHERE NOT ignore(extractGroups(URL, '(\\w+)=(\\w+)')) - SELECT count() FROM test.hits WHERE NOT ignore(extractAllGroups(URL, '(\\w+)=(\\w+)')) + SELECT count() FROM test.hits WHERE NOT ignore(extractAllGroupsVertical(URL, '(\\w+)=(\\w+)')) + SELECT count() FROM test.hits WHERE NOT ignore(extractAllGroupsHorizontal(URL, '(\\w+)=(\\w+)')) diff --git a/tests/queries/0_stateless/01246_extractAllGroups.sql b/tests/queries/0_stateless/01246_extractAllGroups.sql deleted file mode 100644 index ade52117d76..00000000000 --- a/tests/queries/0_stateless/01246_extractAllGroups.sql +++ /dev/null @@ -1,51 +0,0 @@ --- error cases -SELECT extractAllGroups(); --{serverError 42} not enough arguments -SELECT extractAllGroups('hello'); --{serverError 42} not enough arguments -SELECT extractAllGroups('hello', 123); --{serverError 43} invalid argument type -SELECT extractAllGroups(123, 'world'); --{serverError 43} invalid argument type -SELECT extractAllGroups('hello world', '((('); --{serverError 427} invalid re -SELECT extractAllGroups('hello world', materialize('\\w+')); --{serverError 44} non-const needle - -SELECT '0 groups, zero matches'; -SELECT extractAllGroups('hello 
world', '\\w+'); -- { serverError 36 } - -SELECT '1 group, multiple matches, String and FixedString'; -SELECT extractAllGroups('hello world', '(\\w+)'); -SELECT extractAllGroups('hello world', CAST('(\\w+)' as FixedString(5))); -SELECT extractAllGroups(CAST('hello world' AS FixedString(12)), '(\\w+)'); -SELECT extractAllGroups(CAST('hello world' AS FixedString(12)), CAST('(\\w+)' as FixedString(5))); -SELECT extractAllGroups(materialize(CAST('hello world' AS FixedString(12))), '(\\w+)'); -SELECT extractAllGroups(materialize(CAST('hello world' AS FixedString(12))), CAST('(\\w+)' as FixedString(5))); - -SELECT 'mutiple groups, multiple matches'; -SELECT extractAllGroups('abc=111, def=222, ghi=333 "jkl mno"="444 foo bar"', '("[^"]+"|\\w+)=("[^"]+"|\\w+)'); - -SELECT 'big match'; -SELECT - length(haystack), length(matches[1]), length(matches), arrayMap((x) -> length(x), arrayMap(x -> x[1], matches)) -FROM ( - SELECT - repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack, - extractAllGroups(haystack, '(abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz)') AS matches - FROM numbers(3) -); - -SELECT 'lots of matches'; -SELECT - length(haystack), length(matches[1]), length(matches), arrayReduce('sum', arrayMap((x) -> length(x), arrayMap(x -> x[1], matches))) -FROM ( - SELECT - repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack, - extractAllGroups(haystack, '(\\w)') AS matches - FROM numbers(3) -); - -SELECT 'lots of groups'; -SELECT - length(haystack), length(matches[1]), length(matches), arrayMap((x) -> length(x), arrayMap(x -> x[1], matches)) -FROM ( - SELECT - repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack, - extractAllGroups(haystack, repeat('(\\w)', 100)) AS matches - FROM numbers(3) -); diff --git a/tests/queries/0_stateless/01246_extractAllGroupsHorizontal.reference 
b/tests/queries/0_stateless/01246_extractAllGroupsHorizontal.reference new file mode 100644 index 00000000000..13e717485d8 --- /dev/null +++ b/tests/queries/0_stateless/01246_extractAllGroupsHorizontal.reference @@ -0,0 +1,22 @@ +0 groups, zero matches +1 group, multiple matches, String and FixedString +[['hello','world']] +[['hello','world']] +[['hello','world']] +[['hello','world']] +[['hello','world']] +[['hello','world']] +mutiple groups, multiple matches +[['abc','def','ghi','"jkl mno"'],['111','222','333','"444 foo bar"']] +big match +0 1 0 [] +260 1 1 [156] +520 1 3 [156,156,156] +lots of matches +0 1 0 0 +260 1 260 260 +520 1 520 520 +lots of groups +0 100 0 [] +260 100 2 [1,1] +520 100 5 [1,1,1,1,1] diff --git a/tests/queries/0_stateless/01246_extractAllGroupsHorizontal.sql b/tests/queries/0_stateless/01246_extractAllGroupsHorizontal.sql new file mode 100644 index 00000000000..b7a71415a9d --- /dev/null +++ b/tests/queries/0_stateless/01246_extractAllGroupsHorizontal.sql @@ -0,0 +1,51 @@ +-- error cases +SELECT extractAllGroupsHorizontal(); --{serverError 42} not enough arguments +SELECT extractAllGroupsHorizontal('hello'); --{serverError 42} not enough arguments +SELECT extractAllGroupsHorizontal('hello', 123); --{serverError 43} invalid argument type +SELECT extractAllGroupsHorizontal(123, 'world'); --{serverError 43} invalid argument type +SELECT extractAllGroupsHorizontal('hello world', '((('); --{serverError 427} invalid re +SELECT extractAllGroupsHorizontal('hello world', materialize('\\w+')); --{serverError 44} non-const needle + +SELECT '0 groups, zero matches'; +SELECT extractAllGroupsHorizontal('hello world', '\\w+'); -- { serverError 36 } + +SELECT '1 group, multiple matches, String and FixedString'; +SELECT extractAllGroupsHorizontal('hello world', '(\\w+)'); +SELECT extractAllGroupsHorizontal('hello world', CAST('(\\w+)' as FixedString(5))); +SELECT extractAllGroupsHorizontal(CAST('hello world' AS FixedString(12)), '(\\w+)'); +SELECT 
extractAllGroupsHorizontal(CAST('hello world' AS FixedString(12)), CAST('(\\w+)' as FixedString(5))); +SELECT extractAllGroupsHorizontal(materialize(CAST('hello world' AS FixedString(12))), '(\\w+)'); +SELECT extractAllGroupsHorizontal(materialize(CAST('hello world' AS FixedString(12))), CAST('(\\w+)' as FixedString(5))); + +SELECT 'mutiple groups, multiple matches'; +SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333 "jkl mno"="444 foo bar"', '("[^"]+"|\\w+)=("[^"]+"|\\w+)'); + +SELECT 'big match'; +SELECT + length(haystack), length(matches), length(matches[1]), arrayMap((x) -> length(x), matches[1]) +FROM ( + SELECT + repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack, + extractAllGroupsHorizontal(haystack, '(abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz)') AS matches + FROM numbers(3) +); + +SELECT 'lots of matches'; +SELECT + length(haystack), length(matches), length(matches[1]), arrayReduce('sum', arrayMap((x) -> length(x), matches[1])) +FROM ( + SELECT + repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack, + extractAllGroupsHorizontal(haystack, '(\\w)') AS matches + FROM numbers(3) +); + +SELECT 'lots of groups'; +SELECT + length(haystack), length(matches), length(matches[1]), arrayMap((x) -> length(x), matches[1]) +FROM ( + SELECT + repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack, + extractAllGroupsHorizontal(haystack, repeat('(\\w)', 100)) AS matches + FROM numbers(3) +); diff --git a/tests/queries/0_stateless/01246_extractAllGroups.reference b/tests/queries/0_stateless/01246_extractAllGroupsVertical.reference similarity index 100% rename from tests/queries/0_stateless/01246_extractAllGroups.reference rename to tests/queries/0_stateless/01246_extractAllGroupsVertical.reference diff --git a/tests/queries/0_stateless/01246_extractAllGroupsVertical.sql 
b/tests/queries/0_stateless/01246_extractAllGroupsVertical.sql new file mode 100644 index 00000000000..8edc3f3e741 --- /dev/null +++ b/tests/queries/0_stateless/01246_extractAllGroupsVertical.sql @@ -0,0 +1,51 @@ +-- error cases +SELECT extractAllGroupsVertical(); --{serverError 42} not enough arguments +SELECT extractAllGroupsVertical('hello'); --{serverError 42} not enough arguments +SELECT extractAllGroupsVertical('hello', 123); --{serverError 43} invalid argument type +SELECT extractAllGroupsVertical(123, 'world'); --{serverError 43} invalid argument type +SELECT extractAllGroupsVertical('hello world', '((('); --{serverError 427} invalid re +SELECT extractAllGroupsVertical('hello world', materialize('\\w+')); --{serverError 44} non-const needle + +SELECT '0 groups, zero matches'; +SELECT extractAllGroupsVertical('hello world', '\\w+'); -- { serverError 36 } + +SELECT '1 group, multiple matches, String and FixedString'; +SELECT extractAllGroupsVertical('hello world', '(\\w+)'); +SELECT extractAllGroupsVertical('hello world', CAST('(\\w+)' as FixedString(5))); +SELECT extractAllGroupsVertical(CAST('hello world' AS FixedString(12)), '(\\w+)'); +SELECT extractAllGroupsVertical(CAST('hello world' AS FixedString(12)), CAST('(\\w+)' as FixedString(5))); +SELECT extractAllGroupsVertical(materialize(CAST('hello world' AS FixedString(12))), '(\\w+)'); +SELECT extractAllGroupsVertical(materialize(CAST('hello world' AS FixedString(12))), CAST('(\\w+)' as FixedString(5))); + +SELECT 'mutiple groups, multiple matches'; +SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333 "jkl mno"="444 foo bar"', '("[^"]+"|\\w+)=("[^"]+"|\\w+)'); + +SELECT 'big match'; +SELECT + length(haystack), length(matches[1]), length(matches), arrayMap((x) -> length(x), arrayMap(x -> x[1], matches)) +FROM ( + SELECT + repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack, + extractAllGroupsVertical(haystack, 
'(abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz)') AS matches + FROM numbers(3) +); + +SELECT 'lots of matches'; +SELECT + length(haystack), length(matches[1]), length(matches), arrayReduce('sum', arrayMap((x) -> length(x), arrayMap(x -> x[1], matches))) +FROM ( + SELECT + repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack, + extractAllGroupsVertical(haystack, '(\\w)') AS matches + FROM numbers(3) +); + +SELECT 'lots of groups'; +SELECT + length(haystack), length(matches[1]), length(matches), arrayMap((x) -> length(x), arrayMap(x -> x[1], matches)) +FROM ( + SELECT + repeat('abcdefghijklmnopqrstuvwxyz', number * 10) AS haystack, + extractAllGroupsVertical(haystack, repeat('(\\w)', 100)) AS matches + FROM numbers(3) +); diff --git a/tests/queries/0_stateless/01275_extract_groups_check.sql b/tests/queries/0_stateless/01275_extract_groups_check.sql index 2dd236f2a3b..f8bc5943a78 100644 --- a/tests/queries/0_stateless/01275_extract_groups_check.sql +++ b/tests/queries/0_stateless/01275_extract_groups_check.sql @@ -1,5 +1,5 @@ -SELECT extractGroups('hello', ''); -- { serverError 69 } -SELECT extractAllGroups('hello', ''); -- { serverError 69 } +SELECT extractGroups('hello', ''); -- { serverError 36 } +SELECT extractAllGroups('hello', ''); -- { serverError 36 } SELECT extractGroups('hello', ' '); -- { serverError 36 } SELECT extractAllGroups('hello', ' '); -- { serverError 36 }