ClickHouse/src/Functions/extractAllGroups.h

#pragma once
#include <Columns/ColumnArray.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Functions/Regexps.h>
#include <Interpreters/Context.h>
#include <Core/Settings.h>

#include <memory>
#include <string>
#include <vector>

#include <Core/iostream_debug_helpers.h>


namespace DB
{

/// Error codes thrown by the functions in this header; they are only declared here (extern)
/// and must be defined elsewhere.
namespace ErrorCodes
{
    /// Thrown for an empty needle, a regexp without capturing groups, or one with too many groups.
    extern const int BAD_ARGUMENTS;
    /// Thrown by the Horizontal variant when a single row produces more matches than allowed.
    extern const int TOO_LARGE_ARRAY_SIZE;
}


/// Selects how FunctionExtractAllGroups lays out its two-dimensional result.
enum class ExtractAllGroupsResultKind
{
    /// Matches in order of appearance: one inner array per match, holding every group of that match.
    VERTICAL = 0,
    /// Matches grouped by group id: one inner array per capturing group, holding that group from every match.
    HORIZONTAL = 1,
};


/** Match all groups of given input string with given re, return array of arrays of matches.
 *
 * Depending on `Impl::Kind`, result is either grouped by group id (Horizontal) or in order of appearance (Vertical):
 *
 *  SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
 * =>
 *   [['abc', '111'], ['def', '222'], ['ghi', '333']]
 *
 *  SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
 * =>
 *   [['abc', 'def', 'ghi'], ['111', '222', '333']]
 *
 * `Impl` must provide `Kind` (an ExtractAllGroupsResultKind) and `Name` (returned by getName()).
 */
template <typename Impl>
class FunctionExtractAllGroups : public IFunction
{
    /// Only read by the Horizontal variant, to obtain the `regexp_max_matches_per_row` setting.
    ContextPtr context;

public:
    static constexpr auto Kind = Impl::Kind;
    static constexpr auto name = Impl::Name;

    explicit FunctionExtractAllGroups(ContextPtr context_)
        : context(context_)
    {}

    /// Factory: builds a shared instance bound to the given context.
    static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionExtractAllGroups>(context); }

    String getName() const override { return name; }

    /// Exactly two arguments: haystack and needle (the regular expression).
    size_t getNumberOfArguments() const override { return 2; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    bool useDefaultImplementationForConstants() const override { return true; }

    /// The needle (argument #1) is declared as always constant; executeImpl relies on that when casting it to ColumnConst.
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }

    /// Validates the argument types and returns Array(Array(String)).
    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        /// NOTE(review): the haystack has no column validator (nullptr), i.e. it is not required to be constant,
        /// yet its expected-type text says "const". Left unchanged because it is a runtime string - confirm and fix separately.
        FunctionArgumentDescriptors args{
            {"haystack", &isStringOrFixedString<IDataType>, nullptr, "const String or const FixedString"},
            {"needle", &isStringOrFixedString<IDataType>, isColumnConst, "const String or const FixedString"},
        };
        validateFunctionArgumentTypes(*this, arguments, args);

        /// Two-dimensional array of strings, each `row` of top array represents matching groups.
        return std::make_shared<DataTypeArray>(std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()));
    }

    /** Runs the regexp over every row of the haystack column and builds the Array(Array(String)) result.
      *
      * Throws BAD_ARGUMENTS if the needle is empty, if no RE2 object or no capturing group is available,
      * or if there are more than MAX_GROUPS_COUNT - 1 capturing groups.
      * The Horizontal variant additionally throws TOO_LARGE_ARRAY_SIZE when one row yields more matches
      * than the `regexp_max_matches_per_row` setting.
      */
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        /// Upper bound on the size of `matched_groups` (capturing groups plus group #0).
        /// NOTE(review): it is also passed as the second template argument of PODArrayWithStackMemory below -
        /// confirm whether that argument is an element count or a size in bytes.
        static constexpr size_t MAX_GROUPS_COUNT = 128;

        const ColumnPtr column_haystack = arguments[0].column;
        const ColumnPtr column_needle = arguments[1].column;

        const auto needle = typeid_cast<const ColumnConst &>(*column_needle).getValue<String>();

        if (needle.empty())
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Length of 'needle' argument must be greater than 0.");

        using StringPiece = typename Regexps::Regexp::StringPieceType;
        const Regexps::Regexp holder = Regexps::createRegexp<false, false, false>(needle);
        const auto & regexp = holder.getRE2();

        /// No RE2 object is reported with the same message as a pattern without capturing groups.
        if (!regexp)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "There are no groups in regexp: {}", needle);

        const size_t groups_count = regexp->NumberOfCapturingGroups();

        if (!groups_count)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "There are no groups in regexp: {}", needle);

        if (groups_count > MAX_GROUPS_COUNT - 1)
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                            "Too many groups in regexp: {}, max: {}",
                            groups_count, MAX_GROUPS_COUNT - 1);

        // Including 0-group, which is the whole regexp.
        PODArrayWithStackMemory<StringPiece, MAX_GROUPS_COUNT> matched_groups(groups_count + 1);

        ColumnArray::ColumnOffsets::MutablePtr root_offsets_col = ColumnArray::ColumnOffsets::create();
        ColumnArray::ColumnOffsets::MutablePtr nested_offsets_col = ColumnArray::ColumnOffsets::create();
        ColumnString::MutablePtr data_col = ColumnString::create();

        /// Invokes `on_match()` once per non-intersecting match found in `haystack`, scanning left to right.
        /// During each call `matched_groups` holds the current match (index 0 is the whole match,
        /// indices 1..groups_count are the capturing groups).
        auto for_each_match = [&](std::string_view haystack, auto && on_match)
        {
            const char * pos = haystack.data();
            const char * const end = pos + haystack.size();

            while (pos < end)
            {
                const size_t remaining = static_cast<size_t>(end - pos);
                if (!regexp->Match({pos, remaining},
                        0, remaining, regexp->UNANCHORED, matched_groups.data(), matched_groups.size()))
                    break;

                on_match();

                /// If match is empty - it's technically Ok but we have to shift one character nevertheless
                /// to avoid infinite loop.
                pos = matched_groups[0].data() + std::max<size_t>(1, matched_groups[0].size());
            }
        };

        if constexpr (Kind == ExtractAllGroupsResultKind::VERTICAL)
        {
            auto & root_offsets_data = root_offsets_col->getData();
            auto & nested_offsets_data = nested_offsets_col->getData();

            /// Running totals: number of matches and number of group strings emitted so far.
            ColumnArray::Offset current_root_offset = 0;
            ColumnArray::Offset current_nested_offset = 0;

            root_offsets_data.resize(input_rows_count);
            for (size_t i = 0; i < input_rows_count; ++i)
            {
                for_each_match(column_haystack->getDataAt(i).toView(), [&]
                {
                    // 1 is to exclude group #0 which is whole re match.
                    for (size_t group = 1; group <= groups_count; ++group)
                        data_col->insertData(matched_groups[group].data(), matched_groups[group].size());

                    /// Every match becomes one inner array of exactly `groups_count` strings.
                    current_nested_offset += groups_count;
                    nested_offsets_data.push_back(current_nested_offset);

                    ++current_root_offset;
                });

                root_offsets_data[i] = current_root_offset;
            }
        }
        else
        {
            /// Additional limit to fail fast on supposedly incorrect usage.
            const auto max_matches_per_row = context->getSettingsRef().regexp_max_matches_per_row;

            /// Capturing groups of all matches of all rows, in order of appearance.
            PODArray<StringPiece, 0> all_matches;
            /// Number of times RE matched on each row of haystack column.
            PODArray<size_t, 0> number_of_matches_per_row;

            /// We expect RE to match multiple times on each row, `* 8` is arbitrary to reduce number of re-allocations.
            all_matches.reserve(input_rows_count * groups_count * 8);
            number_of_matches_per_row.reserve(input_rows_count);

            for (size_t i = 0; i < input_rows_count; ++i)
            {
                size_t matches_per_row = 0;

                for_each_match(column_haystack->getDataAt(i).toView(), [&]
                {
                    // 1 is to exclude group #0 which is whole re match.
                    for (size_t group = 1; group <= groups_count; ++group)
                        all_matches.push_back(matched_groups[group]);

                    ++matches_per_row;
                    if (matches_per_row > max_matches_per_row)
                        throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
                                "Too many matches per row (> {}) in the result of function {}",
                                max_matches_per_row, getName());
                });

                number_of_matches_per_row.push_back(matches_per_row);
            }

            {
                size_t total_matched_groups_string_len = 0;
                for (const auto & m : all_matches)
                    total_matched_groups_string_len += m.size();

                /// NOTE(review): this passes a total byte length; confirm whether ColumnString::reserve
                /// expects a number of rows or a number of bytes.
                data_col->reserve(total_matched_groups_string_len);
            }

            /// The loop below inserts `groups_count` nested offsets and one root offset for every input row.
            nested_offsets_col->reserve(input_rows_count * groups_count);
            root_offsets_col->reserve(input_rows_count);

            // Re-arrange `all_matches` from:
            // [
            //      "ROW 0: 1st group 1st match",
            //      "ROW 0: 2nd group 1st match",
            //      ...,
            //      "ROW 0: 1st group 2nd match",
            //      "ROW 0: 2nd group 2nd match",
            //      ...,
            //      "ROW 1: 1st group 1st match",
            //      ...
            // ]
            //
            // into column of 2D arrays:
            // [
            //      /* all matching groups from ROW 0 of haystack column */
            //      ["ROW 0: 1st group 1st match", "ROW 0: 1st group 2nd match", ...],
            //      ["ROW 0: 2nd group 1st match", "ROW 0: 2nd group 2nd match", ...],
            //      ...
            // ],
            // [
            //      /* all matching groups from ROW 1 of haystack column */
            //      ["ROW 1: 1st group 1st match", ...],
            //      ...
            // ]

            size_t row_offset = 0;
            for (const auto matches_per_row : number_of_matches_per_row)
            {
                const size_t next_row_offset = row_offset + matches_per_row * groups_count;
                for (size_t group_id = 0; group_id < groups_count; ++group_id)
                {
                    /// Stride of `groups_count` visits the same group in each successive match of this row.
                    for (size_t i = row_offset + group_id; i < next_row_offset && i < all_matches.size(); i += groups_count)
                    {
                        const auto & match = all_matches[i];
                        data_col->insertData(match.data(), match.size());
                    }
                    nested_offsets_col->insertValue(data_col->size());
                }
                root_offsets_col->insertValue(nested_offsets_col->size());
                row_offset = next_row_offset;
            }
        }

        ColumnArray::MutablePtr nested_array_col = ColumnArray::create(std::move(data_col), std::move(nested_offsets_col));
        ColumnArray::MutablePtr root_array_col = ColumnArray::create(std::move(nested_array_col), std::move(root_offsets_col));
        return root_array_col;
    }
};

}
Check for #pragma once in headers 2020-10-10 18:37:02 +00:00			`#pragma once`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`#include <Columns/ColumnArray.h>`
			`#include <Columns/ColumnConst.h>`
			`#include <Columns/ColumnString.h>`
			`#include <DataTypes/DataTypeArray.h>`
			`#include <DataTypes/DataTypeString.h>`
			`#include <Functions/FunctionHelpers.h>`
Function move file 2021-05-17 07:30:42 +00:00			`#include <Functions/IFunction.h>`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`#include <Functions/Regexps.h>`
setting regexp_max_matches_per_row instead of 3rd argument 2021-07-30 09:20:02 +00:00			`#include <Interpreters/Context.h>`
			`#include <Core/Settings.h>`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00
			`#include <memory>`
			`#include <string>`
			`#include <vector>`

			`#include <Core/iostream_debug_helpers.h>`

Fail fast in incorrect usage of extractAllGroups 2021-01-21 23:48:26 +00:00
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`namespace DB`
			`{`

			`namespace ErrorCodes`
			`{`
			`extern const int BAD_ARGUMENTS;`
Fail fast in incorrect usage of extractAllGroups 2021-01-21 23:48:26 +00:00			`extern const int TOO_LARGE_ARRAY_SIZE;`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`}`


			`enum class ExtractAllGroupsResultKind`
			`{`
			`VERTICAL,`
			`HORIZONTAL`
			`};`


			`/** Match all groups of given input string with given re, return array of arrays of matches.`
			`*`
Fix half of typos 2020-08-08 00:47:03 +00:00			* Depending on `Impl::Kind`, result is either grouped by group id (Horizontal) or in order of appearance (Vertical):
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`*`
			`* SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"\|\\w+)=("[^"]+"\|\\w+)')`
			`* =>`
			`* [['abc', '111'], ['def', '222'], ['ghi', '333']]`
			`*`
			`* SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"\|\\w+)=("[^"]+"\|\\w+)')`
			`* =>`
			`* [['abc', 'def', 'ghi'], ['111', '222', '333']`
			`*/`
			`template <typename Impl>`
			`class FunctionExtractAllGroups : public IFunction`
			`{`
setting regexp_max_matches_per_row instead of 3rd argument 2021-07-30 09:20:02 +00:00			`ContextPtr context;`

extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`public:`
			`static constexpr auto Kind = Impl::Kind;`
			`static constexpr auto name = Impl::Name;`

Fix clang-tidy warnings in Disks, Formats, Functions folders 2022-03-12 18:05:50 +00:00			`explicit FunctionExtractAllGroups(ContextPtr context_)`
setting regexp_max_matches_per_row instead of 3rd argument 2021-07-30 09:20:02 +00:00			`: context(context_)`
			`{}`

			`static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionExtractAllGroups>(context); }`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00
			`String getName() const override { return name; }`

setting regexp_max_matches_per_row instead of 3rd argument 2021-07-30 09:20:02 +00:00			`size_t getNumberOfArguments() const override { return 2; }`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00
Try to simplify code 2021-06-22 16:21:23 +00:00			`bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /arguments/) const override { return true; }`
Mark all Functions as sutable or not for executing as short circuit arguments 2021-04-29 14:48:26 +00:00
Minor modifications after merging #11554 2020-06-12 14:03:00 +00:00			`bool useDefaultImplementationForConstants() const override { return true; }`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }`

			`DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override`
			`{`
			`FunctionArgumentDescriptors args{`
Fixed tests 2021-09-30 11:35:24 +00:00			`{"haystack", &isStringOrFixedString<IDataType>, nullptr, "const String or const FixedString"},`
			`{"needle", &isStringOrFixedString<IDataType>, isColumnConst, "const String or const FixedString"},`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`};`
setting regexp_max_matches_per_row instead of 3rd argument 2021-07-30 09:20:02 +00:00			`validateFunctionArgumentTypes(*this, arguments, args);`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00
			/// Two-dimensional array of strings, each `row` of top array represents matching groups.
			`return std::make_shared<DataTypeArray>(std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()));`
			`}`

Refactor IFunction to execute with const arguments 2020-11-17 13:24:45 +00:00			`ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`{`
			`static const auto MAX_GROUPS_COUNT = 128;`

Part 4. 2020-10-17 21:41:50 +00:00			`const ColumnPtr column_haystack = arguments[0].column;`
			`const ColumnPtr column_needle = arguments[1].column;`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00
			`const auto needle = typeid_cast<const ColumnConst &>(*column_needle).getValue<String>();`

			`if (needle.empty())`
			`throw Exception("Length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS);`

			`using StringPiece = typename Regexps::Regexp::StringPieceType;`
Measure and rework internal re2 caching This commit is based on local benchmarks of ClickHouse's re2 caching. Question 1: ----------------------------------------------------------- Is pattern caching useful for queries with const LIKE/REGEX patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T; The short answer is: no. Runtime is (unsurprisingly) dominated by pattern evaluation + other stuff going on in queries, but definitely not pattern compilation. For space reasons, I omit details of the local experiments. (Side note: the current caching scheme is unbounded in size which poses a DoS risk (think of multi-tenancy). This risk is more pronounced when unbounded caching is used with non-const patterns ..., see next question) Question 2: ----------------------------------------------------------- Is pattern caching useful for queries with non-const LIKE/REGEX patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T; I benchmarked five caching strategies: 1. no caching as a baseline (= recompile for each row) 2. unbounded cache (= threadsafe global hash-map) 3. LRU cache (= threadsafe global hash-map + LRU queue) 4. lightweight local cache 1 (= not threadsafe local hashmap with collision list which grows to a certain size (here: 10 elements) and afterwards never changes) 5. lightweight local cache 2 (not threadsafe local hashmap without collision list in which a collision replaces the stored element, idea by Alexey) ... using a haystack of 2 mio strings and A). 2 mio distinct simple patterns B). 10 simple patterns C) 2 mio distinct complex patterns D) 10 complex patterns Fo A) and C), caching does not help but these queries still allow to judge the static overhead of caching on query runtimes. B) and D) are extreme but common cases in practice. They include queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' : '%pattern2%'). Caching should help significantly. 
Because LIKE patterns are internally translated to re2 expressions, I show only measurements for MATCH queries. Results in sec, averaged over on multiple measurements; 1.A): 2.12 B): 1.68 C): 9.75 D): 9.45 2.A): 2.17 B): 1.73 C): 9.78 D): 9.47 3.A): 9.8 B): 0.63 C): 31.8 D): 0.98 4.A): 2.14 B): 0.29 C): 9.82 D): 0.41 5.A) 2.12 / 2.15 / 2.26 B) 1.51 / 0.43 / 0.30 C) 9.97 / 9.88 / 10.13 D) 5.70 / 0.42 / 0.43 (10/100/1000 buckets, resp. 10/1/0.1% collision rate) Evaluation: 1. This is the baseline. It was surprised that complex patterns (C, D) slow down the queries so badly compared to simple patterns (A, B). The runtime includes evaluation costs, but as caching only helps with compilation, and looking at 4.D and 5.D, compilation makes up over 90% of the runtime! 2. No speedup compared to 1, probably due to locking overhead. The cache is unbounded, and in experiments with data sets > 2 mio rows, 2. is the only scheme to throw OOM exceptions which is not acceptable. 3. Unique patterns (A and C) lead to thrashing of the LRU cache and very bad runtimes due to LRU queue maintenance and locking. Works pretty well however with few distinct patterns (B and D). 4. This scheme is tailored to queries B and D where it performs pretty good. More importantly, the caching is lightweight enough to not deteriorate performance on datasets A and C. 5. After some tuning of the hash map size, 100 buckets seem optimal to be in the same ballpark with 10 distinct patterns as 4. Performance also does not deteriorate on A and C compared to the baseline. Unlike 4., this scheme behaves LRU-like and can adjust to changing pattern distributions. As a conclusion, this commit implementes two things: 1. Based on Q1, pattern search with const needle no longer uses caching. This applies to LIKE and MATCH + a few (exotic) other SQL functions. The code for the unbounded caching was removed. 2. Based on Q2, pattern search with non-const needles now use method 5. 
2022-05-27 10:40:53 +00:00			`const Regexps::Regexp holder = Regexps::createRegexp<false, false, false>(needle);`
			`const auto & regexp = holder.getRE2();`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00
			`if (!regexp)`
			`throw Exception("There are no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS);`

			`const size_t groups_count = regexp->NumberOfCapturingGroups();`

			`if (!groups_count)`
			`throw Exception("There are no groups in regexp: " + needle, ErrorCodes::BAD_ARGUMENTS);`

			`if (groups_count > MAX_GROUPS_COUNT - 1)`
			`throw Exception("Too many groups in regexp: " + std::to_string(groups_count)`
			`+ ", max: " + std::to_string(MAX_GROUPS_COUNT - 1),`
			`ErrorCodes::BAD_ARGUMENTS);`

			`// Including 0-group, which is the whole regexp.`
			`PODArrayWithStackMemory<StringPiece, MAX_GROUPS_COUNT> matched_groups(groups_count + 1);`

			`ColumnArray::ColumnOffsets::MutablePtr root_offsets_col = ColumnArray::ColumnOffsets::create();`
			`ColumnArray::ColumnOffsets::MutablePtr nested_offsets_col = ColumnArray::ColumnOffsets::create();`
			`ColumnString::MutablePtr data_col = ColumnString::create();`

			`auto & root_offsets_data = root_offsets_col->getData();`
			`auto & nested_offsets_data = nested_offsets_col->getData();`

			`ColumnArray::Offset current_root_offset = 0;`
			`ColumnArray::Offset current_nested_offset = 0;`

			`if constexpr (Kind == ExtractAllGroupsResultKind::VERTICAL)`
			`{`
			`root_offsets_data.resize(input_rows_count);`
			`for (size_t i = 0; i < input_rows_count; ++i)`
			`{`
Reduce some usage of StringRef 2022-08-17 12:32:13 +00:00			`std::string_view current_row = column_haystack->getDataAt(i).toView();`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00
			`// Extract all non-intersecting matches from haystack except group #0.`
Reduce some usage of StringRef 2022-08-17 12:32:13 +00:00			`const auto * pos = current_row.data();`
			`const auto * end = pos + current_row.size();`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`while (pos < end`
			`&& regexp->Match({pos, static_cast<size_t>(end - pos)},`
			`0, end - pos, regexp->UNANCHORED, matched_groups.data(), matched_groups.size()))`
			`{`
			`// 1 is to exclude group #0 which is whole re match.`
			`for (size_t group = 1; group <= groups_count; ++group)`
			`data_col->insertData(matched_groups[group].data(), matched_groups[group].size());`

Fix error in "extractAllGroups" function 2020-09-16 21:19:58 +00:00			`/// If match is empty - it's technically Ok but we have to shift one character nevertheless`
			`/// to avoid infinite loop.`
			`pos = matched_groups[0].data() + std::max<size_t>(1, matched_groups[0].size());`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00
			`current_nested_offset += groups_count;`
			`nested_offsets_data.push_back(current_nested_offset);`

			`++current_root_offset;`
			`}`

			`root_offsets_data[i] = current_root_offset;`
			`}`
			`}`
			`else`
			`{`
setting regexp_max_matches_per_row instead of 3rd argument 2021-07-30 09:20:02 +00:00			`/// Additional limit to fail fast on supposedly incorrect usage.`
			`const auto max_matches_per_row = context->getSettingsRef().regexp_max_matches_per_row;`
Updated extractAllGroupsHorizontal - flexible limit on number of matches per row. If it is not set via third argument, it deafults to previously hardcoded value 1000. 2021-07-29 12:36:55 +00:00
Fail fast in incorrect usage of extractAllGroups 2021-01-21 23:48:26 +00:00			`PODArray<StringPiece, 0> all_matches;`
			`/// Number of times RE matched on each row of haystack column.`
			`PODArray<size_t, 0> number_of_matches_per_row;`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00
Fail fast in incorrect usage of extractAllGroups 2021-01-21 23:48:26 +00:00			/// We expect RE to match multiple times on each row, `* 8` is arbitrary to reduce number of re-allocations.
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`all_matches.reserve(input_rows_count * groups_count * 8);`
			`number_of_matches_per_row.reserve(input_rows_count);`

			`for (size_t i = 0; i < input_rows_count; ++i)`
			`{`
			`size_t matches_per_row = 0;`

			`const auto & current_row = column_haystack->getDataAt(i);`

			`// Extract all non-intersecting matches from haystack except group #0.`
			`const auto * pos = current_row.data;`
			`const auto * end = pos + current_row.size;`
			`while (pos < end`
			`&& regexp->Match({pos, static_cast<size_t>(end - pos)},`
			`0, end - pos, regexp->UNANCHORED, matched_groups.data(), matched_groups.size()))`
			`{`
			`// 1 is to exclude group #0 which is whole re match.`
			`for (size_t group = 1; group <= groups_count; ++group)`
			`all_matches.push_back(matched_groups[group]);`

Updated extractAllGroupsHorizontal - flexible limit on number of matches per row. If it is not set via third argument, it deafults to previously hardcoded value 1000. 2021-07-29 12:36:55 +00:00			`++matches_per_row;`
			`if (matches_per_row > max_matches_per_row)`
Fixed erroneus failure of extractAllGroupsHorizontal on large columns 2021-04-13 10:50:58 +00:00			`throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,`
			`"Too many matches per row (> {}) in the result of function {}",`
Updated extractAllGroupsHorizontal - flexible limit on number of matches per row. If it is not set via third argument, it deafults to previously hardcoded value 1000. 2021-07-29 12:36:55 +00:00			`max_matches_per_row, getName());`
Fail fast in incorrect usage of extractAllGroups 2021-01-21 23:48:26 +00:00
Fix error in "extractAllGroups" function 2020-09-16 21:19:58 +00:00			`pos = matched_groups[0].data() + std::max<size_t>(1, matched_groups[0].size());`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`}`

			`number_of_matches_per_row.push_back(matches_per_row);`
			`}`

			`{`
			`size_t total_matched_groups_string_len = 0;`
			`for (const auto & m : all_matches)`
			`total_matched_groups_string_len += m.length();`

			`data_col->reserve(total_matched_groups_string_len);`
			`}`

			`nested_offsets_col->reserve(matched_groups.size());`
			`root_offsets_col->reserve(groups_count);`

			// Re-arrange `all_matches` from:
			`// [`
			`// "ROW 0: 1st group 1st match",`
			`// "ROW 0: 2nd group 1st match",`
			`// ...,`
			`// "ROW 0: 1st group 2nd match",`
			`// "ROW 0: 2nd group 2nd match",`
			`// ...,`
			`// "ROW 1: 1st group 1st match",`
			`// ...`
			`// ]`
			`//`
			`// into column of 2D arrays:`
			`// [`
			`// /* all matchig groups from ROW 0 of haystack column */`
			`// ["ROW 0: 1st group 1st match", "ROW 0: 1st group 2nd match", ...],`
			`// ["ROW 0: 2nd group 1st match", "ROW 0: 2nd group 2nd match", ...],`
			`// ...`
			`// ],`
			`// [`
			`// /* all matchig groups from row 1 of haystack column */`
			`// ["ROW 1: 1st group 1st match", ...],`
			`// ...`
			`// ]`

			`size_t row_offset = 0;`
			`for (const auto matches_per_row : number_of_matches_per_row)`
			`{`
			`const size_t next_row_offset = row_offset + matches_per_row * groups_count;`
			`for (size_t group_id = 0; group_id < groups_count; ++group_id)`
			`{`
			`for (size_t i = row_offset + group_id; i < next_row_offset && i < all_matches.size(); i += groups_count)`
			`{`
			`const auto & match = all_matches[i];`
			`data_col->insertData(match.begin(), match.length());`
			`}`
			`nested_offsets_col->insertValue(data_col->size());`
			`}`
			`root_offsets_col->insertValue(nested_offsets_col->size());`
			`row_offset = next_row_offset;`
			`}`
			`}`

			`ColumnArray::MutablePtr nested_array_col = ColumnArray::create(std::move(data_col), std::move(nested_offsets_col));`
			`ColumnArray::MutablePtr root_array_col = ColumnArray::create(std::move(nested_array_col), std::move(root_offsets_col));`
Part 4. 2020-10-17 21:41:50 +00:00			`return root_array_col;`
extractAllGroupsHorizontal and extractAllGroupsVertical Split tests, fixed some error messages Fixed test and error reporting of extractGroups 2020-06-09 09:47:59 +00:00			`}`
			`};`

			`}`