mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Merge pull request #50642 from johanngan/regexptree-bad-opt
Revert invalid RegExpTreeDictionary optimization
This commit is contained in:
commit
6e9c08bbf4
@ -129,17 +129,6 @@ struct RegExpTreeDictionary::RegexTreeNode
|
||||
return searcher.Match(haystack, 0, size, re2_st::RE2::Anchor::UNANCHORED, nullptr, 0);
|
||||
}
|
||||
|
||||
/// Check whether this node provides every attribute requested by the query.
|
||||
bool containsAll(const std::unordered_map<String, const DictionaryAttribute &> & matching_attributes) const
|
||||
{
|
||||
for (const auto & [key, value] : matching_attributes)
|
||||
{
|
||||
if (!attributes.contains(key))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
struct AttributeValue
|
||||
{
|
||||
Field field;
|
||||
@ -691,9 +680,6 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
|
||||
if (node_ptr->match(reinterpret_cast<const char *>(keys_data.data()) + offset, length))
|
||||
{
|
||||
match_result.insertNodeID(node_ptr->id);
|
||||
/// When this node is a leaf and contains all the required attributes, it means a match.
|
||||
if (node_ptr->containsAll(attributes) && node_ptr->children.empty())
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -11,3 +11,19 @@
|
||||
(['ClickHouse Documentation','ClickHouse'],[0,1],['/en'],['ClickHouse'])
|
||||
(['Documentation','GitHub'],[2,3],[NULL],[])
|
||||
(['Documentation','GitHub'],[2,3],[NULL],[])
|
||||
ClickHouse
|
||||
['ClickHouse']
|
||||
ClickHouse Documentation
|
||||
['ClickHouse Documentation','ClickHouse','Documentation']
|
||||
GitHub Documentation
|
||||
['GitHub Documentation','GitHub']
|
||||
Documentation
|
||||
['Documentation']
|
||||
ClickHouse
|
||||
['ClickHouse']
|
||||
ClickHouse Documentation
|
||||
['ClickHouse Documentation','ClickHouse','Documentation']
|
||||
GitHub Documentation
|
||||
['GitHub Documentation','GitHub']
|
||||
Documentation
|
||||
['Documentation']
|
||||
|
@ -175,6 +175,70 @@ select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'pare
|
||||
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs', 2);
|
||||
"
|
||||
|
||||
# Test that things work the same for "simple" regexps that go through Hyperscan and "complex" regexps that go through RE2.
|
||||
# An easy way to force the use of RE2 is to disable Hyperscan.
|
||||
# This tree is deliberately constructed so that text might (falsely) match leaf nodes without matching their corresponding parent nodes.
|
||||
cat > "$yaml" <<EOL
|
||||
- regexp: 'clickhouse\.com'
|
||||
tag: 'ClickHouse'
|
||||
paths:
|
||||
- regexp: 'docs'
|
||||
tag: 'ClickHouse Documentation'
|
||||
|
||||
- regexp: 'github\.com'
|
||||
tag: 'GitHub'
|
||||
paths:
|
||||
- regexp: 'docs'
|
||||
tag: 'GitHub Documentation'
|
||||
|
||||
- regexp: '/docs(/|$)'
|
||||
tag: 'Documentation'
|
||||
EOL
|
||||
|
||||
$CLICKHOUSE_CLIENT -n --query="
|
||||
drop dictionary if exists regexp_dict3;
|
||||
create dictionary regexp_dict3
|
||||
(
|
||||
regexp String,
|
||||
tag String
|
||||
)
|
||||
PRIMARY KEY(regexp)
|
||||
SOURCE(YAMLRegExpTree(PATH '$yaml'))
|
||||
LIFETIME(0)
|
||||
LAYOUT(regexp_tree)
|
||||
SETTINGS(regexp_dict_allow_hyperscan = true);
|
||||
|
||||
select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
|
||||
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
|
||||
select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
|
||||
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
|
||||
select dictGet('regexp_dict3', 'tag', 'docs.github.com');
|
||||
select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
|
||||
select dictGet('regexp_dict3', 'tag', '/docs');
|
||||
select dictGetAll('regexp_dict3', 'tag', '/docs');
|
||||
|
||||
drop dictionary if exists regexp_dict3;
|
||||
create dictionary regexp_dict3
|
||||
(
|
||||
regexp String,
|
||||
tag String
|
||||
)
|
||||
PRIMARY KEY(regexp)
|
||||
SOURCE(YAMLRegExpTree(PATH '$yaml'))
|
||||
LIFETIME(0)
|
||||
LAYOUT(regexp_tree)
|
||||
SETTINGS(regexp_dict_allow_hyperscan = false);
|
||||
|
||||
select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
|
||||
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
|
||||
select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
|
||||
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
|
||||
select dictGet('regexp_dict3', 'tag', 'docs.github.com');
|
||||
select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
|
||||
select dictGet('regexp_dict3', 'tag', '/docs');
|
||||
select dictGetAll('regexp_dict3', 'tag', '/docs');
|
||||
"
|
||||
|
||||
$CLICKHOUSE_CLIENT -n --query="
|
||||
drop dictionary regexp_dict1;
|
||||
drop dictionary regexp_dict2;
|
||||
|
Loading…
Reference in New Issue
Block a user