ClickHouse/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh
johanngan be8e048799 Revert invalid RegExpTreeDictionary optimization
This reverts the following commits:
- e77dd81036
- e8527e720b

Additionally, functional tests are added.

When scanning complex regexp nodes sequentially with RE2, the old code
has an optimization to break out of the loop early upon finding a leaf
node that matches. This is an invalid optimization because there's no
guarantee that it's actually a VALID match, because its parents might
NOT have matched. Semantically, a user would expect this match to be
discarded and for the search to continue. Instead, since we skipped
matching after the first false positive, subsequent nodes that would
have matched are missing from the output value. This affects both
dictGet and dictGetAll.

It's difficult to distinguish a true positive from a false positive
while looping through complex_regexp_nodes because we would have to scan
all the parents of a matching node to confirm a true positive. Trying to
do this might actually end up being slower than just scanning every
complex regexp node, because complex_regexp_nodes is only a subset of
all the tree nodes; we may end up duplicating scanning work
that Vectorscan has already done, depending on whether the parent nodes
are "simple" or "complex". So instead of trying to fix this
optimization, just remove it entirely.
2023-06-06 16:28:44 -05:00

249 lines
6.9 KiB
Bash
Executable File

#!/usr/bin/env bash
# Tags: use-vectorscan, no-fasttest, no-parallel

# Functional test for regexp_tree dictionaries backed by a YAMLRegExpTree source.
# The script rewrites a single YAML file several times and (re)loads
# dictionaries from it, printing the results of dictGet / dictGetAll lookups.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Discover the server's user_files directory: reading a file that does not
# exist makes the server report the path it looked at. The awk program takes
# the 9th field of the line containing "Exception" and strips the
# "/nonexist.txt" suffix from it.
# NOTE(review): this deliberately keeps the bare `clickhouse-client`; extra
# output from a wrapped client could shift the field position — confirm before
# switching to $CLICKHOUSE_CLIENT.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

# Abort early if discovery failed; otherwise every path below would be rooted at "/".
: "${USER_FILES_PATH:?could not determine the user_files path}"

mkdir -p "$USER_FILES_PATH/test_02504"
yaml=$USER_FILES_PATH/test_02504/test.yaml
# Scenario 1: basic dictGet lookups.
# The tree has one flat rule that uses a capture-group back-reference (\1) and
# one parent rule ('Android') whose `versions` children each set `version`.
# Child rules must be indented under `versions:`; YAML nesting is what
# expresses the parent/child relation.
# The delimiter is quoted so the backslashes in the regexps reach the file verbatim.
cat > "$yaml" <<'EOL'
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
  name: 'TencentOS'
  version: '\1'

- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
  name: 'Android'
  versions:
    - regexp: '33/tclwebkit'
      version: '13'
    - regexp: '3[12]/tclwebkit'
      version: '12'
    - regexp: '30/tclwebkit'
      version: '11'
    - regexp: '29/tclwebkit'
      version: '10'
EOL

# (Re)create the dictionary on top of the YAML file and query it.
# The inputs '999/...' and '28/...' match the parent pattern but none of the
# listed children, so they presumably exercise the declared default for
# `version` — verify against the .reference file.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict1;
create dictionary regexp_dict1
(
    regexp String,
    name String,
    version Nullable(String) default 'default',
    lucky Int64
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = true);
select dictGet('regexp_dict1', ('name', 'version'), 'Linux/123.45.67 tlinux');
select dictGet('regexp_dict1', ('name', 'version'), '31/tclwebkit1024');
select dictGet('regexp_dict1', ('name', 'version'), '999/tclwebkit1024');
select dictGet('regexp_dict1', ('name', 'version'), '28/tclwebkit1024');
"
# Scenario 2: reloading must fail on a malformed attribute.
# Same tree as before plus a '28/tclwebkit' child that has an empty `version`
# and a non-numeric value for `lucky`, which the dictionary declares as Int64.
# Child rules are nested under `versions:`; the quoted delimiter keeps the
# backslashes in the regexps verbatim.
cat > "$yaml" <<'EOL'
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
  name: 'TencentOS'
  version: '\1'

- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
  name: 'Android'
  versions:
    - regexp: '33/tclwebkit'
      version: '13'
    - regexp: '3[12]/tclwebkit'
      version: '12'
    - regexp: '30/tclwebkit'
      version: '11'
    - regexp: '29/tclwebkit'
      version: '10'
    - regexp: '28/tclwebkit'
      version:
      lucky: 'abcde'
EOL

# The trailing hint tells the client that server error 489 is the expected outcome.
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1; -- { serverError 489 }
"
# Scenario 3: reloading must fail when a rule's `regexp` key has no value.
# `name` and `version` are indented so they belong to the same rule as
# `regexp`; the quoted delimiter keeps '\1' verbatim.
cat > "$yaml" <<'EOL'
- regexp:
  name: 'TencentOS'
  version: '\1'
EOL

# The trailing hint tells the client that server error 318 is the expected outcome.
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1; -- { serverError 318 }
"
# Scenario 4: a successful reload with two rules that share a `name`.
# Both use several capture groups, and `version` is assembled from the
# back-references \2 and \3. The regexps are unquoted YAML scalars.
# Keys of one rule are indented under its leading '- '; the quoted delimiter
# keeps the backslashes verbatim.
cat > "$yaml" <<'EOL'
- name: BlackBerry WebKit
  regexp: (PlayBook).{1,200}RIM Tablet OS (\d+)\.(\d+)\.(\d+)
  version: '\2.\3'

- name: BlackBerry WebKit
  regexp: (Black[bB]erry|BB10).{1,200}Version/(\d+)\.(\d+)\.(\d+)
  version: '\2.\3'
EOL

# Reload regexp_dict1 from the new file and look up one user agent per rule.
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1;
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.3+ (KHTML, like Gecko) Version/10.0.9.388 Mobile Safari/537.3+');
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 1.0.0; en-US) AppleWebKit/534.8+ (KHTML, like Gecko) Version/0.0.1 Safari/534.8+');
"
# Scenario 5: non-String attribute types.
# A single rule supplies string values for Boolean, UUID, Date, DateTime and
# Array(Int64) attributes. All attribute keys are indented under the rule.
cat > "$yaml" <<'EOL'
- regexp: 'abc'
  col_bool: 'true'
  col_uuid: '61f0c404-5cb3-11e7-907b-a6006ad3dba0'
  col_date: '2023-01-01'
  col_datetime: '2023-01-01 01:01:01'
  col_array: '[1,2,3,-1,-2,-3]'
EOL

# Drop any leftover from an aborted earlier run first, as is done for the
# other dictionaries in this test, so that `create` cannot fail on a name clash.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict2;
create dictionary regexp_dict2
(
    regexp String,
    col_bool Boolean,
    col_uuid UUID,
    col_date Date,
    col_datetime DateTime,
    col_array Array(Int64)
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);
select dictGet('regexp_dict2', ('col_bool','col_uuid', 'col_date', 'col_datetime', 'col_array'), 'abc');
"
# Scenario 6: dictGetAll.
# Three top-level rules; the first has one child under `paths`, which also sets
# `captured` (from back-reference \1) and `parent`.
# The child is nested under `paths:`. The quoted delimiter keeps the
# backslashes and the '$' anchor in '/docs(/|$)' verbatim.
cat > "$yaml" <<'EOL'
- regexp: 'clickhouse\.com'
  tag: 'ClickHouse'
  topological_index: 1
  paths:
    - regexp: 'clickhouse\.com/docs(.*)'
      tag: 'ClickHouse Documentation'
      topological_index: 0
      captured: '\1'
      parent: 'ClickHouse'

- regexp: '/docs(/|$)'
  tag: 'Documentation'
  topological_index: 2

- regexp: 'github.com'
  tag: 'GitHub'
  topological_index: 3
  captured: 'NULL'
EOL

# dictGetAll
# Each input is queried twice: once with no extra argument and once with a
# trailing 2 (presumably a cap on the number of returned matches — see the
# dictGetAll documentation).
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
    regexp String,
    tag String,
    topological_index Int64,
    captured Nullable(String),
    parent String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com', 2);
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com/docs/en');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com/docs/en', 2);
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs', 2);
"
# Test that things work the same for "simple" regexps that go through Hyperscan and "complex" regexps that go through RE2.
# An easy way to force the use of RE2 is to disable Hyperscan.
# This tree is constructed purposely so that text might (falsely) match leaf nodes without matching their corresponding parent nodes:
# both parents have a child whose pattern is just 'docs', so an input such as
# '/docs' matches the children's pattern while matching neither parent.
# The children are nested under `paths:`. The quoted delimiter keeps the
# backslashes and the '$' anchor verbatim.
cat > "$yaml" <<'EOL'
- regexp: 'clickhouse\.com'
  tag: 'ClickHouse'
  paths:
    - regexp: 'docs'
      tag: 'ClickHouse Documentation'

- regexp: 'github\.com'
  tag: 'GitHub'
  paths:
    - regexp: 'docs'
      tag: 'GitHub Documentation'

- regexp: '/docs(/|$)'
  tag: 'Documentation'
EOL

# The same eight lookups run twice against an identically defined dictionary:
# first with regexp_dict_allow_hyperscan = true, then with it set to false.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
    regexp String,
    tag String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = true);
select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGet('regexp_dict3', 'tag', 'docs.github.com');
select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
select dictGet('regexp_dict3', 'tag', '/docs');
select dictGetAll('regexp_dict3', 'tag', '/docs');
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
    regexp String,
    tag String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = false);
select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGet('regexp_dict3', 'tag', 'docs.github.com');
select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
select dictGet('regexp_dict3', 'tag', '/docs');
select dictGetAll('regexp_dict3', 'tag', '/docs');
"
# Cleanup: drop every dictionary this test created ...
$CLICKHOUSE_CLIENT -n --query="
drop dictionary regexp_dict1;
drop dictionary regexp_dict2;
drop dictionary regexp_dict3;
"

# ... and remove the YAML fixture directory. The ':?' expansion aborts instead
# of running `rm -rf /test_02504` should USER_FILES_PATH be empty or unset;
# '--' ends option parsing before the path.
rm -rf -- "${USER_FILES_PATH:?}/test_02504"