Merge pull request #50642 from johanngan/regexptree-bad-opt

Revert invalid RegExpTreeDictionary optimization
This commit is contained in:
Alexey Milovidov 2023-06-07 13:00:20 +03:00 committed by GitHub
commit 6e9c08bbf4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 80 additions and 14 deletions

View File

@ -129,17 +129,6 @@ struct RegExpTreeDictionary::RegexTreeNode
return searcher.Match(haystack, 0, size, re2_st::RE2::Anchor::UNANCHORED, nullptr, 0);
}
/// Tells whether every attribute name requested by the query is also present in this node's `attributes`.
/// Only the keys of `matching_attributes` are inspected; the mapped values are not used.
bool containsAll(const std::unordered_map<String, const DictionaryAttribute &> & matching_attributes) const
{
    auto requested = matching_attributes.begin();
    const auto requested_end = matching_attributes.end();
    while (requested != requested_end)
    {
        const bool present = attributes.contains(requested->first);
        if (!present)
            return false;
        ++requested;
    }
    /// No requested name was missing (trivially so when the request is empty).
    return true;
}
struct AttributeValue
{
Field field;
@ -691,9 +680,6 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
if (node_ptr->match(reinterpret_cast<const char *>(keys_data.data()) + offset, length))
{
match_result.insertNodeID(node_ptr->id);
/// When this node is leaf and contains all the required attributes, it means a match.
if (node_ptr->containsAll(attributes) && node_ptr->children.empty())
break;
}
}

View File

@ -11,3 +11,19 @@
(['ClickHouse Documentation','ClickHouse'],[0,1],['/en'],['ClickHouse'])
(['Documentation','GitHub'],[2,3],[NULL],[])
(['Documentation','GitHub'],[2,3],[NULL],[])
ClickHouse
['ClickHouse']
ClickHouse Documentation
['ClickHouse Documentation','ClickHouse','Documentation']
GitHub Documentation
['GitHub Documentation','GitHub']
Documentation
['Documentation']
ClickHouse
['ClickHouse']
ClickHouse Documentation
['ClickHouse Documentation','ClickHouse','Documentation']
GitHub Documentation
['GitHub Documentation','GitHub']
Documentation
['Documentation']

View File

@ -175,6 +175,70 @@ select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'pare
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs', 2);
"
# Test that results are the same for "simple" regexps that go through Hyperscan and "complex" regexps that go through RE2.
# An easy way to force the use of RE2 is to disable Hyperscan (regexp_dict_allow_hyperscan = false).
# The tree written below is deliberately constructed so that a key can match the regexp of a leaf node ('docs')
# without matching the regexp of that leaf's parent node; such a leaf would be a false match.
cat > "$yaml" <<EOL
- regexp: 'clickhouse\.com'
tag: 'ClickHouse'
paths:
- regexp: 'docs'
tag: 'ClickHouse Documentation'
- regexp: 'github\.com'
tag: 'GitHub'
paths:
- regexp: 'docs'
tag: 'GitHub Documentation'
- regexp: '/docs(/|$)'
tag: 'Documentation'
EOL
# Create regexp_dict3 from "$yaml" twice: first with regexp_dict_allow_hyperscan = true, then with it set to false.
# The same dictGet / dictGetAll queries on the 'tag' attribute are issued against each variant.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
regexp String,
tag String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = true);
select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGet('regexp_dict3', 'tag', 'docs.github.com');
select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
select dictGet('regexp_dict3', 'tag', '/docs');
select dictGetAll('regexp_dict3', 'tag', '/docs');
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
regexp String,
tag String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = false);
select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGet('regexp_dict3', 'tag', 'docs.github.com');
select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
select dictGet('regexp_dict3', 'tag', '/docs');
select dictGetAll('regexp_dict3', 'tag', '/docs');
"
$CLICKHOUSE_CLIENT -n --query="
drop dictionary regexp_dict1;
drop dictionary regexp_dict2;