mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-19 04:42:37 +00:00
be8e048799
This reverts the following commits:
- e77dd81036
- e8527e720b
Additionally, functional tests are added. When scanning complex regexp nodes sequentially with RE2, the old code has an optimization to break out of the loop early upon finding a leaf node that matches. This is an invalid optimization because there's no guarantee that it's actually a VALID match, because its parents might NOT have matched. Semantically, a user would expect this match to be discarded and for the search to continue. Instead, since we skipped matching after the first false positive, subsequent nodes that would have matched are missing from the output value. This affects both dictGet and dictGetAll. It's difficult to distinguish a true positive from a false positive while looping through complex_regexp_nodes because we would have to scan all the parents of a matching node to confirm a true positive. Trying to do this might actually end up being slower than just scanning every complex regexp node, because complex_regexp_nodes is only a subset of all the tree nodes; we may end up duplicating work with scanning that Vectorscan has already done, depending on whether the parent nodes are "simple" or "complex". So instead of trying to fix this optimization, just remove it entirely.
249 lines
6.9 KiB
Bash
Executable File
249 lines
6.9 KiB
Bash
Executable File
#!/usr/bin/env bash
# Tags: use-vectorscan, no-fasttest, no-parallel

# Functional test for regexp_tree dictionaries backed by a YAMLRegExpTree
# source. Each section below rewrites one YAML file, (re)loads a dictionary
# from it and prints the results of dictGet / dictGetAll lookups.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Probe for a base directory: query a file that does not exist, keep the line
# of client output containing "Exception", and take its 9th whitespace-separated
# field with "/nonexist.txt" removed.
# NOTE(review): presumably this yields the server's user_files directory, as the
# variable name suggests - confirm against the exception message format.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

# Quoted so a path containing whitespace or glob characters is passed as one word.
mkdir -p "$USER_FILES_PATH/test_02504"

# Single YAML file that every section below overwrites.
yaml="$USER_FILES_PATH/test_02504/test.yaml"

# Section 1: a two-level tree. The 'Android' rule has four children under
# `versions`, each supplying its own `version`.
# The heredoc delimiter is quoted so the regexes (backslashes included) are
# written to the file verbatim, with no shell expansion.
cat > "$yaml" <<'EOL'
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
  name: 'TencentOS'
  version: '\1'

- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
  name: 'Android'
  versions:
    - regexp: '33/tclwebkit'
      version: '13'
    - regexp: '3[12]/tclwebkit'
      version: '12'
    - regexp: '30/tclwebkit'
      version: '11'
    - regexp: '29/tclwebkit'
      version: '10'
EOL

# Create regexp_dict1 from the file and look up four keys:
#   - one matching the first top-level rule,
#   - one matching the Android rule and its '3[12]/tclwebkit' child,
#   - two ('999/...' and '28/...') matching the Android rule but none of its children.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict1;
create dictionary regexp_dict1
(
    regexp String,
    name String,
    version Nullable(String) default 'default',
    lucky Int64
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = true);

select dictGet('regexp_dict1', ('name', 'version'), 'Linux/123.45.67 tlinux');
select dictGet('regexp_dict1', ('name', 'version'), '31/tclwebkit1024');
select dictGet('regexp_dict1', ('name', 'version'), '999/tclwebkit1024');
select dictGet('regexp_dict1', ('name', 'version'), '28/tclwebkit1024');
"

# Section 2: the same tree plus a '28/tclwebkit' child that has an empty
# `version` and a non-numeric `lucky` value ('abcde'; regexp_dict1 declares
# `lucky` as Int64). Reloading the dictionary is expected to fail.
# The heredoc delimiter is quoted so the regexes are written verbatim.
cat > "$yaml" <<'EOL'
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
  name: 'TencentOS'
  version: '\1'

- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
  name: 'Android'
  versions:
    - regexp: '33/tclwebkit'
      version: '13'
    - regexp: '3[12]/tclwebkit'
      version: '12'
    - regexp: '30/tclwebkit'
      version: '11'
    - regexp: '29/tclwebkit'
      version: '10'
    - regexp: '28/tclwebkit'
      version:
      lucky: 'abcde'
EOL

# The trailing SQL comment is a test hint: the statement must raise server error 489.
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1; -- { serverError 489 }
"

# Section 3: a node whose `regexp` key has no value. Reloading the dictionary
# is expected to fail.
# The heredoc delimiter is quoted so '\1' is written verbatim.
cat > "$yaml" <<'EOL'
- regexp:
  name: 'TencentOS'
  version: '\1'
EOL

# The trailing SQL comment is a test hint: the statement must raise server error 318.
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1; -- { serverError 318 }
"

# Section 4: two flat rules written as unquoted YAML scalars, each with several
# capture groups; `version` is built from groups 2 and 3 ('\2.\3').
# The heredoc delimiter is quoted so the backslashes are written verbatim.
cat > "$yaml" <<'EOL'
- name: BlackBerry WebKit
  regexp: (PlayBook).{1,200}RIM Tablet OS (\d+)\.(\d+)\.(\d+)
  version: '\2.\3'
- name: BlackBerry WebKit
  regexp: (Black[bB]erry|BB10).{1,200}Version/(\d+)\.(\d+)\.(\d+)
  version: '\2.\3'
EOL

# Reload regexp_dict1 from the new file and look up one user-agent string per rule.
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1;
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.3+ (KHTML, like Gecko) Version/10.0.9.388 Mobile Safari/537.3+');
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 1.0.0; en-US) AppleWebKit/534.8+ (KHTML, like Gecko) Version/0.0.1 Safari/534.8+');
"

# Section 5: one rule whose attributes are given as YAML strings but declared in
# the dictionary as Boolean, UUID, Date, DateTime and Array(Int64).
cat > "$yaml" <<'EOL'
- regexp: 'abc'
  col_bool: 'true'
  col_uuid: '61f0c404-5cb3-11e7-907b-a6006ad3dba0'
  col_date: '2023-01-01'
  col_datetime: '2023-01-01 01:01:01'
  col_array: '[1,2,3,-1,-2,-3]'
EOL

# Drop any regexp_dict2 left over from an aborted earlier run before creating it,
# matching how regexp_dict1 and regexp_dict3 are handled in this script.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict2;
create dictionary regexp_dict2
(
    regexp String,
    col_bool Boolean,
    col_uuid UUID,
    col_date Date,
    col_datetime DateTime,
    col_array Array(Int64)
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);

select dictGet('regexp_dict2', ('col_bool','col_uuid', 'col_date', 'col_datetime', 'col_array'), 'abc');
"

# Section 6: tree for dictGetAll. Three top-level rules; the 'clickhouse\.com'
# rule has one child under `paths`. Not every node sets `captured` or `parent`.
# The heredoc delimiter is quoted: the body contains backslashes and a literal
# '$' that must reach the file without shell expansion.
cat > "$yaml" <<'EOL'
- regexp: 'clickhouse\.com'
  tag: 'ClickHouse'
  topological_index: 1
  paths:
    - regexp: 'clickhouse\.com/docs(.*)'
      tag: 'ClickHouse Documentation'
      topological_index: 0
      captured: '\1'
      parent: 'ClickHouse'

- regexp: '/docs(/|$)'
  tag: 'Documentation'
  topological_index: 2

- regexp: 'github.com'
  tag: 'GitHub'
  topological_index: 3
  captured: 'NULL'
EOL

# dictGetAll
# Each key is queried twice: without and with a trailing argument of 2.
# NOTE(review): the trailing argument presumably caps the number of returned
# matches - confirm against the dictGetAll documentation.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
    regexp String,
    tag String,
    topological_index Int64,
    captured Nullable(String),
    parent String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);

select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com', 2);

select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com/docs/en');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com/docs/en', 2);

select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs', 2);
"

# Test that things work the same for "simple" regexps that go through Hyperscan and "complex" regexps that go through RE2.
# An easy way to force the use of RE2 is to disable Hyperscan.
# This tree is constructed purposely so that text might (falsely) match leaf nodes without matching their corresponding parent nodes
# The heredoc delimiter is quoted: the body contains backslashes and a literal
# '$' that must reach the file without shell expansion.
cat > "$yaml" <<'EOL'
- regexp: 'clickhouse\.com'
  tag: 'ClickHouse'
  paths:
    - regexp: 'docs'
      tag: 'ClickHouse Documentation'

- regexp: 'github\.com'
  tag: 'GitHub'
  paths:
    - regexp: 'docs'
      tag: 'GitHub Documentation'

- regexp: '/docs(/|$)'
  tag: 'Documentation'
EOL

# Run the identical create-and-query batch twice, first with
# regexp_dict_allow_hyperscan = true and then with false, so both passes are
# guaranteed to issue the same lookups in the same order.
for allow_hyperscan in true false; do
    $CLICKHOUSE_CLIENT -n --query="
    drop dictionary if exists regexp_dict3;
    create dictionary regexp_dict3
    (
        regexp String,
        tag String
    )
    PRIMARY KEY(regexp)
    SOURCE(YAMLRegExpTree(PATH '$yaml'))
    LIFETIME(0)
    LAYOUT(regexp_tree)
    SETTINGS(regexp_dict_allow_hyperscan = $allow_hyperscan);

    select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
    select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
    select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
    select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
    select dictGet('regexp_dict3', 'tag', 'docs.github.com');
    select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
    select dictGet('regexp_dict3', 'tag', '/docs');
    select dictGetAll('regexp_dict3', 'tag', '/docs');
    "
done

# Teardown: drop the three dictionaries created by this script, then remove the
# scratch directory that held the YAML file.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary regexp_dict1;
drop dictionary regexp_dict2;
drop dictionary regexp_dict3;
"

rm -rf "$USER_FILES_PATH/test_02504"