mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Merge pull request #62692 from ClickHouse/revert-62392-opt_memchr
Revert "Speed up `splitByRegexp`"
This commit is contained in:
commit
38d02d78d8
@ -1,10 +1,8 @@
|
||||
#include <Columns/ColumnConst.h>
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/FunctionTokens.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/Regexps.h>
|
||||
#include <base/map.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <Common/assert_cast.h>
|
||||
|
||||
@ -104,7 +102,7 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
++pos;
|
||||
pos += 1;
|
||||
token_end = pos;
|
||||
++splits;
|
||||
}
|
||||
@ -150,69 +148,11 @@ public:
|
||||
|
||||
/// The `splitByRegexp` function type: the generic FunctionTokens template instantiated with SplitByRegexpImpl.
using FunctionSplitByRegexp = FunctionTokens<SplitByRegexpImpl>;
|
||||
|
||||
/// Fallback splitByRegexp to splitByChar when its 1st argument is a trivial char for better performance
|
||||
class SplitByRegexpOverloadResolver : public IFunctionOverloadResolver
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "splitByRegexp";
|
||||
static FunctionOverloadResolverPtr create(ContextPtr context) { return std::make_unique<SplitByRegexpOverloadResolver>(context); }
|
||||
|
||||
explicit SplitByRegexpOverloadResolver(ContextPtr context_)
|
||||
: context(context_)
|
||||
, split_by_regexp(FunctionSplitByRegexp::create(context)) {}
|
||||
|
||||
String getName() const override { return name; }
|
||||
size_t getNumberOfArguments() const override { return SplitByRegexpImpl::getNumberOfArguments(); }
|
||||
bool isVariadic() const override { return SplitByRegexpImpl::isVariadic(); }
|
||||
|
||||
FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
|
||||
{
|
||||
if (patternIsTrivialChar(arguments))
|
||||
return FunctionFactory::instance().getImpl("splitByChar", context)->build(arguments);
|
||||
else
|
||||
return std::make_unique<FunctionToFunctionBaseAdaptor>(
|
||||
split_by_regexp, collections::map<DataTypes>(arguments, [](const auto & elem) { return elem.type; }), return_type);
|
||||
}
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
{
|
||||
return split_by_regexp->getReturnTypeImpl(arguments);
|
||||
}
|
||||
|
||||
private:
|
||||
bool patternIsTrivialChar(const ColumnsWithTypeAndName & arguments) const
|
||||
{
|
||||
const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
|
||||
if (!col)
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Illegal column {} of first argument of function {}. "
|
||||
"Must be constant string.",
|
||||
arguments[0].column->getName(),
|
||||
getName());
|
||||
|
||||
String pattern = col->getValue<String>();
|
||||
if (pattern.size() == 1)
|
||||
{
|
||||
OptimizedRegularExpression re = Regexps::createRegexp<false, false, false>(pattern);
|
||||
|
||||
std::string required_substring;
|
||||
bool is_trivial;
|
||||
bool required_substring_is_prefix;
|
||||
re.getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix);
|
||||
return is_trivial && required_substring == pattern;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
ContextPtr context;
|
||||
FunctionPtr split_by_regexp;
|
||||
};
|
||||
}
|
||||
|
||||
/// Registers `splitByRegexp` in the function factory through its overload resolver.
///
/// FunctionSplitByRegexp is not registered on its own: the resolver already delegates to it,
/// and NOTE(review) it presumably exposes the same SQL name via SplitByRegexpImpl, so a second
/// registration would collide with the resolver's "splitByRegexp" — confirm against
/// FunctionFactory::registerFunction.
REGISTER_FUNCTION(SplitByRegexp)
{
    factory.registerFunction<SplitByRegexpOverloadResolver>();
}
|
||||
|
||||
}
|
||||
|
@ -1,5 +1,3 @@
|
||||
<test>
|
||||
<query>with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByChar(' ', materialize(s)) as w from numbers(1000000)</query>
|
||||
<query>with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp(' ', materialize(s)) as w from numbers(1000000)</query>
|
||||
<query>with 'Many years later as he faced the firing squad, Colonel Aureliano Buendia was to remember that distant afternoon when his father took him to discover ice.' as s select splitByRegexp('\s+', materialize(s)) as w from numbers(100000)</query>
|
||||
</test>
|
||||
|
@ -5,15 +5,3 @@
|
||||
['gbye','bug']
|
||||
['']
|
||||
[]
|
||||
Test fallback of splitByRegexp to splitByChar if regexp is trivial
|
||||
['a','b','c']
|
||||
['a','b','c']
|
||||
['','','','','','']
|
||||
['a^b^c']
|
||||
['a$b$c']
|
||||
['a)b)c']
|
||||
['a','b','c']
|
||||
['a','b','c']
|
||||
['a','b','c']
|
||||
['a|b|c']
|
||||
['a\\b\\c']
|
||||
|
@ -3,20 +3,3 @@ select splitByRegexp('', 'abcde');
|
||||
select splitByRegexp('<[^<>]*>', x) from (select arrayJoin(['<h1>hello<h2>world</h2></h1>', 'gbye<split>bug']) x);
|
||||
select splitByRegexp('ab', '');
|
||||
select splitByRegexp('', '');
|
||||
|
||||
SELECT 'Test fallback of splitByRegexp to splitByChar if regexp is trivial';
|
||||
select splitByRegexp(' ', 'a b c');
|
||||
select splitByRegexp('-', 'a-b-c');
|
||||
select splitByRegexp('.', 'a.b.c');
|
||||
select splitByRegexp('^', 'a^b^c');
|
||||
select splitByRegexp('$', 'a$b$c');
|
||||
select splitByRegexp('+', 'a+b+c'); -- { serverError CANNOT_COMPILE_REGEXP }
|
||||
select splitByRegexp('?', 'a?b?c'); -- { serverError CANNOT_COMPILE_REGEXP }
|
||||
select splitByRegexp('(', 'a(b(c'); -- { serverError CANNOT_COMPILE_REGEXP }
|
||||
select splitByRegexp(')', 'a)b)c');
|
||||
select splitByRegexp('[', 'a[b[c'); -- { serverError CANNOT_COMPILE_REGEXP }
|
||||
select splitByRegexp(']', 'a]b]c');
|
||||
select splitByRegexp('{', 'a{b{c');
|
||||
select splitByRegexp('}', 'a}b}c');
|
||||
select splitByRegexp('|', 'a|b|c');
|
||||
select splitByRegexp('\\', 'a\\b\\c');
|
||||
|
Loading…
Reference in New Issue
Block a user