Merge pull request #62692 from ClickHouse/revert-62392-opt_memchr

Revert "Speed up `splitByRegexp`"
This commit is contained in:
Robert Schulze 2024-04-16 12:39:40 +00:00 committed by GitHub
commit 38d02d78d8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 3 additions and 94 deletions

View File

@ -1,10 +1,8 @@
#include <Columns/ColumnConst.h>
#include <DataTypes/IDataType.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/FunctionTokens.h>
#include <Functions/FunctionFactory.h>
#include <Functions/Regexps.h>
#include <base/map.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/assert_cast.h>
@ -104,7 +102,7 @@ public:
return false;
}
++pos;
pos += 1;
token_end = pos;
++splits;
}
@ -150,69 +148,11 @@ public:
using FunctionSplitByRegexp = FunctionTokens<SplitByRegexpImpl>;
/// Fallback splitByRegexp to splitByChar when its 1st argument is a trivial char for better performance
class SplitByRegexpOverloadResolver : public IFunctionOverloadResolver
{
public:
static constexpr auto name = "splitByRegexp";
static FunctionOverloadResolverPtr create(ContextPtr context) { return std::make_unique<SplitByRegexpOverloadResolver>(context); }
explicit SplitByRegexpOverloadResolver(ContextPtr context_)
: context(context_)
, split_by_regexp(FunctionSplitByRegexp::create(context)) {}
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return SplitByRegexpImpl::getNumberOfArguments(); }
bool isVariadic() const override { return SplitByRegexpImpl::isVariadic(); }
FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
{
if (patternIsTrivialChar(arguments))
return FunctionFactory::instance().getImpl("splitByChar", context)->build(arguments);
else
return std::make_unique<FunctionToFunctionBaseAdaptor>(
split_by_regexp, collections::map<DataTypes>(arguments, [](const auto & elem) { return elem.type; }), return_type);
}
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
return split_by_regexp->getReturnTypeImpl(arguments);
}
private:
bool patternIsTrivialChar(const ColumnsWithTypeAndName & arguments) const
{
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
if (!col)
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"Illegal column {} of first argument of function {}. "
"Must be constant string.",
arguments[0].column->getName(),
getName());
String pattern = col->getValue<String>();
if (pattern.size() == 1)
{
OptimizedRegularExpression re = Regexps::createRegexp<false, false, false>(pattern);
std::string required_substring;
bool is_trivial;
bool required_substring_is_prefix;
re.getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix);
return is_trivial && required_substring == pattern;
}
return false;
}
ContextPtr context;
FunctionPtr split_by_regexp;
};
}
/// Registers the function classes listed below with `factory`.
/// NOTE(review): two registrations are visible here (the overload resolver and the plain
/// FunctionSplitByRegexp). They look like the two sides of one diff hunk — confirm that only
/// one of them is present in the real file.
REGISTER_FUNCTION(SplitByRegexp)
{
factory.registerFunction<SplitByRegexpOverloadResolver>();
factory.registerFunction<FunctionSplitByRegexp>();
}
}

View File

@ -1,5 +1,3 @@
<test>
<query>with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByChar(' ', materialize(s)) as w from numbers(1000000)</query>
<query>with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp(' ', materialize(s)) as w from numbers(1000000)</query>
<query>with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp('\s+', materialize(s)) as w from numbers(100000)</query>
</test>

View File

@ -5,15 +5,3 @@
['gbye','bug']
['']
[]
Test fallback of splitByRegexp to splitByChar if regexp is trivial
['a','b','c']
['a','b','c']
['','','','','','']
['a^b^c']
['a$b$c']
['a)b)c']
['a','b','c']
['a','b','c']
['a','b','c']
['a|b|c']
['a\\b\\c']

View File

@ -3,20 +3,3 @@ select splitByRegexp('', 'abcde');
select splitByRegexp('<[^<>]*>', x) from (select arrayJoin(['<h1>hello<h2>world</h2></h1>', 'gbye<split>bug']) x);
select splitByRegexp('ab', '');
select splitByRegexp('', '');
SELECT 'Test fallback of splitByRegexp to splitByChar if regexp is trivial';
select splitByRegexp(' ', 'a b c');
select splitByRegexp('-', 'a-b-c');
select splitByRegexp('.', 'a.b.c');
select splitByRegexp('^', 'a^b^c');
select splitByRegexp('$', 'a$b$c');
select splitByRegexp('+', 'a+b+c'); -- { serverError CANNOT_COMPILE_REGEXP }
select splitByRegexp('?', 'a?b?c'); -- { serverError CANNOT_COMPILE_REGEXP }
select splitByRegexp('(', 'a(b(c'); -- { serverError CANNOT_COMPILE_REGEXP }
select splitByRegexp(')', 'a)b)c');
select splitByRegexp('[', 'a[b[c'); -- { serverError CANNOT_COMPILE_REGEXP }
select splitByRegexp(']', 'a]b]c');
select splitByRegexp('{', 'a{b{c');
select splitByRegexp('}', 'a}b}c');
select splitByRegexp('|', 'a|b|c');
select splitByRegexp('\\', 'a\\b\\c');