ClickHouse/src/Functions/countMatches.h

#pragma once

#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/Regexps.h>


namespace DB
{

/// Error codes thrown by the code in this header.
/// They are only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int ILLEGAL_COLUMN;
    extern const int LOGICAL_ERROR;
}

/// Read-only pointer into raw string bytes; used below to walk through a haystack while matching.
using Pos = const char *;

/// Function that counts non-overlapping matches of a constant regular expression in a haystack.
///
/// CountMatchesBase must provide:
///  - `name`             - the name under which the function is exposed;
///  - `case_insensitive` - passed to Regexps::get() when the pattern is compiled.
template <class CountMatchesBase>
class FunctionCountMatches : public IFunction
{
public:
    static constexpr auto name = CountMatchesBase::name;
    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionCountMatches<CountMatchesBase>>(); }

    String getName() const override { return name; }
    size_t getNumberOfArguments() const override { return 2; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    /// Validates the arguments and returns the result type (UInt64).
    ///
    /// Throws ILLEGAL_TYPE_OF_ARGUMENT if either argument is not String/FixedString,
    /// and ILLEGAL_COLUMN if the pattern (second argument) is not a constant.
    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        if (!isStringOrFixedString(arguments[1].type))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "Illegal type {} of second argument (pattern) of function {}. Must be String/FixedString.",
                arguments[1].type->getName(), getName());
        if (!isStringOrFixedString(arguments[0].type))
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
                "Illegal type {} of first argument (haystack) of function {}. Must be String/FixedString.",
                arguments[0].type->getName(), getName());
        const auto * column = arguments[1].column.get();
        if (!column || !checkAndGetColumnConstStringOrFixedString(column))
            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
                "The second argument of function {} should be a constant string with the pattern",
                getName());

        return std::make_shared<DataTypeUInt64>();
    }

    /// Counts matches of the constant pattern (arguments[1]) in every row of the haystack (arguments[0]).
    ///
    /// Returns a constant column for a constant haystack, otherwise a full UInt64 column
    /// with one counter per row.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
    {
        const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
        /// getReturnTypeImpl() already verifies this, but never dereference a null pointer if it was bypassed.
        if (!column_pattern)
            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
                "The second argument of function {} should be a constant string with the pattern",
                getName());

        Regexps::RegexpPtr re = Regexps::get<false /* like */, true /* is_no_capture */, CountMatchesBase::case_insensitive>(column_pattern->getValue<String>());
        /// Reused for every row to avoid re-allocating the match buffer.
        OptimizedRegularExpression::MatchVec matches;

        const IColumn * column_haystack = arguments[0].column.get();

        if (const ColumnString * col_str = checkAndGetColumn<ColumnString>(column_haystack))
        {
            auto result_column = ColumnUInt64::create();

            const ColumnString::Chars & src_chars = col_str->getChars();
            const ColumnString::Offsets & src_offsets = col_str->getOffsets();

            ColumnUInt64::Container & vec_res = result_column->getData();
            vec_res.resize(input_rows_count);

            const size_t rows = src_offsets.size();
            ColumnString::Offset row_begin = 0;

            for (size_t i = 0; i < rows; ++i)
            {
                const ColumnString::Offset row_end = src_offsets[i];

                Pos pos = reinterpret_cast<Pos>(&src_chars[row_begin]);
                /// The last byte of every row is excluded from the haystack (hence the "- 1").
                Pos end = reinterpret_cast<Pos>(&src_chars[row_end]) - 1;

                vec_res[i] = countMatches(StringRef(pos, end - pos), re, matches);
                row_begin = row_end;
            }

            return result_column;
        }
        else if (const ColumnConst * col_const_str = checkAndGetColumnConstStringOrFixedString(column_haystack))
        {
            StringRef str = col_const_str->getDataColumn().getDataAt(0);
            uint64_t matches_count = countMatches(str, re, matches);
            return result_type->createColumnConst(input_rows_count, matches_count);
        }
        else if (isFixedString(arguments[0].type))
        {
            /// Non-constant FixedString haystack: it is accepted by getReturnTypeImpl(), so it must be
            /// handled here instead of ending up in the LOGICAL_ERROR branch below.
            /// Rows are read through the generic IColumn interface.
            auto result_column = ColumnUInt64::create();

            ColumnUInt64::Container & vec_res = result_column->getData();
            vec_res.resize(input_rows_count);

            for (size_t i = 0; i < input_rows_count; ++i)
                vec_res[i] = countMatches(column_haystack->getDataAt(i), re, matches);

            return result_column;
        }
        else
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()");
    }

    /// Counts consecutive non-overlapping matches of `re` in `src`.
    ///
    /// `matches` is a scratch buffer owned by the caller (it is overwritten on every search).
    /// Counting stops at the end of the haystack, at the first failed search,
    /// or at the first zero-length match.
    static uint64_t countMatches(StringRef src, Regexps::RegexpPtr & re, OptimizedRegularExpression::MatchVec & matches)
    {
        /// Only one match is required, no need to copy more.
        static constexpr unsigned matches_limit = 1;

        Pos pos = src.data;
        Pos end = src.data + src.size;

        uint64_t match_count = 0;
        while (pos < end)
        {
            if (!re->match(pos, end - pos, matches, matches_limit))
                break;
            /// Progress should be made, but with an empty match no progress would be done.
            /// Also note that simply checking whether the pattern is empty is not enough,
            /// since for example "'[f]{0}'" will match zero bytes:
            if (!matches[0].length)
                break;
            /// The offset is relative to `pos`; continue right after the end of this match.
            pos += matches[0].offset + matches[0].length;
            ++match_count;
        }

        return match_count;
    }
};

}
Fix and optimize countMatches()/countMatchesCaseInsensitive() - Update after IFunction interfaces changes - move type checks into FunctionCountMatches::getReturnTypeImpl() - Use StringRef over String - Separate out logic for counting sub matches into separate helper - Do not copy other regular expression matches, only the first - Add some comments - Set is_no_capture, to avoid check for number of subpatterns - Add countMatchesCaseInsensitive() - Reguster functions in case-sensitive manner, since this is not SQL standard 2020-11-26 20:29:37 +00:00			`#pragma once`

Function move file 2021-05-17 07:30:42 +00:00			`#include <Functions/IFunction.h>`
Fix and optimize countMatches()/countMatchesCaseInsensitive() - Update after IFunction interfaces changes - move type checks into FunctionCountMatches::getReturnTypeImpl() - Use StringRef over String - Separate out logic for counting sub matches into separate helper - Do not copy other regular expression matches, only the first - Add some comments - Set is_no_capture, to avoid check for number of subpatterns - Add countMatchesCaseInsensitive() - Reguster functions in case-sensitive manner, since this is not SQL standard 2020-11-26 20:29:37 +00:00			`#include <Functions/FunctionFactory.h>`
			`#include <Functions/FunctionHelpers.h>`
			`#include <Columns/ColumnString.h>`
			`#include <Columns/ColumnsNumber.h>`
			`#include <DataTypes/DataTypesNumber.h>`
			`#include <DataTypes/DataTypeString.h>`
			`#include <Functions/Regexps.h>`


			`namespace DB`
			`{`

			`namespace ErrorCodes`
			`{`
			`extern const int ILLEGAL_TYPE_OF_ARGUMENT;`
			`extern const int ILLEGAL_COLUMN;`
			`extern const int LOGICAL_ERROR;`
			`}`

			`using Pos = const char *;`

			`template <class CountMatchesBase>`
			`class FunctionCountMatches : public IFunction`
			`{`
			`public:`
			`static constexpr auto name = CountMatchesBase::name;`
Rename ContextConstPtr to ContextPtr. 2021-06-01 12:20:52 +00:00			`static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionCountMatches<CountMatchesBase>>(); }`
Fix and optimize countMatches()/countMatchesCaseInsensitive() - Update after IFunction interfaces changes - move type checks into FunctionCountMatches::getReturnTypeImpl() - Use StringRef over String - Separate out logic for counting sub matches into separate helper - Do not copy other regular expression matches, only the first - Add some comments - Set is_no_capture, to avoid check for number of subpatterns - Add countMatchesCaseInsensitive() - Reguster functions in case-sensitive manner, since this is not SQL standard 2020-11-26 20:29:37 +00:00
			`String getName() const override { return name; }`
			`size_t getNumberOfArguments() const override { return 2; }`
Try to simplify code 2021-06-22 16:21:23 +00:00			`bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }`
Fix and optimize countMatches()/countMatchesCaseInsensitive() - Update after IFunction interfaces changes - move type checks into FunctionCountMatches::getReturnTypeImpl() - Use StringRef over String - Separate out logic for counting sub matches into separate helper - Do not copy other regular expression matches, only the first - Add some comments - Set is_no_capture, to avoid check for number of subpatterns - Add countMatchesCaseInsensitive() - Reguster functions in case-sensitive manner, since this is not SQL standard 2020-11-26 20:29:37 +00:00
			`DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override`
			`{`
			`if (!isStringOrFixedString(arguments[1].type))`
			`throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,`
			`"Illegal type {} of second argument (pattern) of function {}. Must be String/FixedString.",`
			`arguments[1].type->getName(), getName());`
			`if (!isStringOrFixedString(arguments[0].type))`
			`throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,`
			`"Illegal type {} of first argument (haystack) of function {}. Must be String/FixedString.",`
			`arguments[0].type->getName(), getName());`
			`const auto * column = arguments[1].column.get();`
			`if (!column || !checkAndGetColumnConstStringOrFixedString(column))`
			`throw Exception(ErrorCodes::ILLEGAL_COLUMN,`
			`"The second argument of function {} should be a constant string with the pattern",`
			`getName());`

			`return std::make_shared<DataTypeUInt64>();`
			`}`

			`ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override`
			`{`
			`const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());`
Cache compiled regexps when evaluating non-const needles Needles in a (non-const) needle column may repeat and this commit allows to skip compilation for known needles. Out of the different design alternatives (see below, if someone is interested), we now maintain - one global pattern cache, - with a fixed size of 42k elements currently, - and use LRU as eviction strategy. ------------------------------------------------------------------------ (sorry for the wall of text, dumping it here not for reading but just for reference) Write-up about considered design alternatives: 1. Keep the current global cache of const needles. For non-const needles, probe the cache but don't store values in it. Pros: need to maintain just a single cache, no problem with cache pollution assuming there are few distinct constant needles Cons: only useful if a non-const needle occurred as already as a const needle --> overall too simplistic 2. Keep the current global cache for const needles. For non-const needles, create a local (e.g. per-query) cache Pros: unlike (1.), non-const needles can be skipped even if they did not occur yet, no pollution of the const pattern cache when there are very many non-const needles (e.g. large / highly distinct needle columns). Cons: caches may explode "horizontally", i.e. we'll end up with the const cache + caches for Q1, Q2, ... QN, this makes it harder to control the overall space consumption, also patterns residing in different caches cannot be reused between queries, another difficulty is that the concept of "query" does not really exist at matching level - there are only column chunks and we'd potentially end up with 1 cache / chunk 3. Queries with const and non-const needles insert into the same global cache. Pros: the advantages of (2.) + allows to reuse compiled patterns accross parallel queries Cons: needs an eviction strategy to control cache size and pollution (and btw. (2.) also needs eviction strategies for the individual caches) 4. 
Queries with const needle use global cache, queries with non-const needle use a different global cache --> Overall similar to (3) but ignores the (likely) edge case that const and non-const needles overlap. In sum, (3.) seems the simplest and most beneficial approach. Eviction strategies: 0. Don't ever evict --> cache may grow infinitely and eventually make the system unusable (may even pose a DoS risk) 1. Flush the cache after a certain threshold is exceeded --> very simple but may lead to peridic performance drops 2. Use LRU --> more graceful performance degradation at threshold but comes with a (constant) performance overhead to maintain the LRU queue In sum, given that the pattern compilation in RE2 should be quite costly (pattern-to-DFA/NFA), LRU may be acceptable. 2022-05-25 19:22:45 +00:00			`Regexps::RegexpPtr re = Regexps::get<false /* like /, true / is_no_capture */, CountMatchesBase::case_insensitive>(column_pattern->getValue<String>());`
Fix and optimize countMatches()/countMatchesCaseInsensitive() - Update after IFunction interfaces changes - move type checks into FunctionCountMatches::getReturnTypeImpl() - Use StringRef over String - Separate out logic for counting sub matches into separate helper - Do not copy other regular expression matches, only the first - Add some comments - Set is_no_capture, to avoid check for number of subpatterns - Add countMatchesCaseInsensitive() - Reguster functions in case-sensitive manner, since this is not SQL standard 2020-11-26 20:29:37 +00:00			`OptimizedRegularExpression::MatchVec matches;`

			`const IColumn * column_haystack = arguments[0].column.get();`

			`if (const ColumnString * col_str = checkAndGetColumn<ColumnString>(column_haystack))`
			`{`
			`auto result_column = ColumnUInt64::create();`

			`const ColumnString::Chars & src_chars = col_str->getChars();`
			`const ColumnString::Offsets & src_offsets = col_str->getOffsets();`

			`ColumnUInt64::Container & vec_res = result_column->getData();`
			`vec_res.resize(input_rows_count);`

			`size_t size = src_offsets.size();`
			`ColumnString::Offset current_src_offset = 0;`

			`for (size_t i = 0; i < size; ++i)`
			`{`
			`Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]);`
			`current_src_offset = src_offsets[i];`
			`Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]) - 1;`

			`StringRef str(pos, end - pos);`
			`vec_res[i] = countMatches(str, re, matches);`
			`}`

			`return result_column;`
			`}`
			`else if (const ColumnConst * col_const_str = checkAndGetColumnConstStringOrFixedString(column_haystack))`
			`{`
			`StringRef str = col_const_str->getDataColumn().getDataAt(0);`
			`uint64_t matches_count = countMatches(str, re, matches);`
			`return result_type->createColumnConst(input_rows_count, matches_count);`
			`}`
			`else`
			`throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()");`
			`}`

Cache compiled regexps when evaluating non-const needles Needles in a (non-const) needle column may repeat and this commit allows to skip compilation for known needles. Out of the different design alternatives (see below, if someone is interested), we now maintain - one global pattern cache, - with a fixed size of 42k elements currently, - and use LRU as eviction strategy. ------------------------------------------------------------------------ (sorry for the wall of text, dumping it here not for reading but just for reference) Write-up about considered design alternatives: 1. Keep the current global cache of const needles. For non-const needles, probe the cache but don't store values in it. Pros: need to maintain just a single cache, no problem with cache pollution assuming there are few distinct constant needles Cons: only useful if a non-const needle occurred as already as a const needle --> overall too simplistic 2. Keep the current global cache for const needles. For non-const needles, create a local (e.g. per-query) cache Pros: unlike (1.), non-const needles can be skipped even if they did not occur yet, no pollution of the const pattern cache when there are very many non-const needles (e.g. large / highly distinct needle columns). Cons: caches may explode "horizontally", i.e. we'll end up with the const cache + caches for Q1, Q2, ... QN, this makes it harder to control the overall space consumption, also patterns residing in different caches cannot be reused between queries, another difficulty is that the concept of "query" does not really exist at matching level - there are only column chunks and we'd potentially end up with 1 cache / chunk 3. Queries with const and non-const needles insert into the same global cache. Pros: the advantages of (2.) + allows to reuse compiled patterns accross parallel queries Cons: needs an eviction strategy to control cache size and pollution (and btw. (2.) also needs eviction strategies for the individual caches) 4. 
Queries with const needle use global cache, queries with non-const needle use a different global cache --> Overall similar to (3) but ignores the (likely) edge case that const and non-const needles overlap. In sum, (3.) seems the simplest and most beneficial approach. Eviction strategies: 0. Don't ever evict --> cache may grow infinitely and eventually make the system unusable (may even pose a DoS risk) 1. Flush the cache after a certain threshold is exceeded --> very simple but may lead to peridic performance drops 2. Use LRU --> more graceful performance degradation at threshold but comes with a (constant) performance overhead to maintain the LRU queue In sum, given that the pattern compilation in RE2 should be quite costly (pattern-to-DFA/NFA), LRU may be acceptable. 2022-05-25 19:22:45 +00:00			`static uint64_t countMatches(StringRef src, Regexps::RegexpPtr & re, OptimizedRegularExpression::MatchVec & matches)`
Fix and optimize countMatches()/countMatchesCaseInsensitive() - Update after IFunction interfaces changes - move type checks into FunctionCountMatches::getReturnTypeImpl() - Use StringRef over String - Separate out logic for counting sub matches into separate helper - Do not copy other regular expression matches, only the first - Add some comments - Set is_no_capture, to avoid check for number of subpatterns - Add countMatchesCaseInsensitive() - Reguster functions in case-sensitive manner, since this is not SQL standard 2020-11-26 20:29:37 +00:00			`{`
			`/// Only one match is required, no need to copy more.`
			`static const unsigned matches_limit = 1;`

			`Pos pos = reinterpret_cast<Pos>(src.data);`
			`Pos end = reinterpret_cast<Pos>(src.data + src.size);`

			`uint64_t match_count = 0;`
			`while (true)`
			`{`
			`if (pos >= end)`
			`break;`
			`if (!re->match(pos, end - pos, matches, matches_limit))`
			`break;`
			`/// Progress should be made, but with empty match the progress will not be done.`
			`/// Also note that simply check is pattern empty is not enough,`
			`/// since for example "'[f]{0}'" will match zero bytes:`
			`if (!matches[0].length)`
			`break;`
			`pos += matches[0].offset + matches[0].length;`
			`match_count++;`
			`}`

			`return match_count;`
			`}`
			`};`

			`}`