ClickHouse/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh
johanngan bcb058f999 Add case insensitive and dot-all modes to RegExpTree dictionary
The new per-dictionary settings control regex match semantics around
case sensitivity and the '.' wildcard with newlines. They must be set at
the dictionary level since they're applied to regex engines at
pattern-compile-time.

- regexp_dict_flag_case_insensitive: case insensitive matching
- regexp_dict_flag_dotall: '.' matches all characters including newlines

They correspond to HS_FLAG_CASELESS and HS_FLAG_DOTALL in Vectorscan
and case_sensitive and dot_nl in RE2. These are the most useful options
compatible with the internal behavior of RegExpTreeDictionary around
splitting up simple and complex patterns between Vectorscan and RE2.

The alternative is to use (?i) and/or (?s) for all patterns. However,
(?s) isn't handled properly by OptimizedRegularExpression::analyze().
And while (?i) is, it still causes the dictionary to treat the pattern
as "complex" for sequential scanning with RE2 rather than multi-matching
with Vectorscan, even though Vectorscan supports case insensitive
literal matching. Setting dictionary-wide flags is both more convenient,
and circumvents these problems.
2023-09-06 11:28:53 -05:00

305 lines
8.5 KiB
Bash
Executable File

#!/usr/bin/env bash
# Tags: use-vectorscan, no-fasttest, no-parallel
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
mkdir -p $USER_FILES_PATH/test_02504
yaml=$USER_FILES_PATH/test_02504/test.yaml
cat > "$yaml" <<EOL
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
name: 'TencentOS'
version: '\1'
- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
name: 'Android'
versions:
- regexp: '33/tclwebkit'
version: '13'
- regexp: '3[12]/tclwebkit'
version: '12'
- regexp: '30/tclwebkit'
version: '11'
- regexp: '29/tclwebkit'
version: '10'
EOL
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict1;
create dictionary regexp_dict1
(
regexp String,
name String,
version Nullable(String) default 'default',
lucky Int64
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = true);
select dictGet('regexp_dict1', ('name', 'version'), 'Linux/123.45.67 tlinux');
select dictGet('regexp_dict1', ('name', 'version'), '31/tclwebkit1024');
select dictGet('regexp_dict1', ('name', 'version'), '999/tclwebkit1024');
select dictGet('regexp_dict1', ('name', 'version'), '28/tclwebkit1024');
"
cat > "$yaml" <<EOL
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
name: 'TencentOS'
version: '\1'
- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
name: 'Android'
versions:
- regexp: '33/tclwebkit'
version: '13'
- regexp: '3[12]/tclwebkit'
version: '12'
- regexp: '30/tclwebkit'
version: '11'
- regexp: '29/tclwebkit'
version: '10'
- regexp: '28/tclwebkit'
version:
lucky: 'abcde'
EOL
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1; -- { serverError 489 }
"
cat > "$yaml" <<EOL
- regexp:
name: 'TencentOS'
version: '\1'
EOL
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1; -- { serverError 318 }
"
cat > "$yaml" <<EOL
- name: BlackBerry WebKit
regexp: (PlayBook).{1,200}RIM Tablet OS (\d+)\.(\d+)\.(\d+)
version: '\2.\3'
- name: BlackBerry WebKit
regexp: (Black[bB]erry|BB10).{1,200}Version/(\d+)\.(\d+)\.(\d+)
version: '\2.\3'
EOL
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1;
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.3+ (KHTML, like Gecko) Version/10.0.9.388 Mobile Safari/537.3+');
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 1.0.0; en-US) AppleWebKit/534.8+ (KHTML, like Gecko) Version/0.0.1 Safari/534.8+');
"
cat > "$yaml" <<EOL
- regexp: 'abc'
col_bool: 'true'
col_uuid: '61f0c404-5cb3-11e7-907b-a6006ad3dba0'
col_date: '2023-01-01'
col_datetime: '2023-01-01 01:01:01'
col_array: '[1,2,3,-1,-2,-3]'
EOL
$CLICKHOUSE_CLIENT -n --query="
create dictionary regexp_dict2
(
regexp String,
col_bool Boolean,
col_uuid UUID,
col_date Date,
col_datetime DateTime,
col_array Array(Int64)
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);
select dictGet('regexp_dict2', ('col_bool','col_uuid', 'col_date', 'col_datetime', 'col_array'), 'abc');
"
cat > "$yaml" <<EOL
- regexp: 'clickhouse\.com'
tag: 'ClickHouse'
topological_index: 1
paths:
- regexp: 'clickhouse\.com/docs(.*)'
tag: 'ClickHouse Documentation'
topological_index: 0
captured: '\1'
parent: 'ClickHouse'
- regexp: '/docs(/|$)'
tag: 'Documentation'
topological_index: 2
- regexp: 'github.com'
tag: 'GitHub'
topological_index: 3
captured: 'NULL'
EOL
# dictGetAll
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
regexp String,
tag String,
topological_index Int64,
captured Nullable(String),
parent String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com', 2);
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com/docs/en');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com/docs/en', 2);
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs', 2);
"
# Test that things work the same for "simple" regexps that go through Hyperscan and "complex" regexps that go through RE2.
# An easy way to force the use of RE2 is to disable Hyperscan.
# This tree is constructed purposely so that text might (falsely) match leaf nodes without matching their corresponding parent nodes
cat > "$yaml" <<EOL
- regexp: 'clickhouse\.com'
tag: 'ClickHouse'
paths:
- regexp: 'docs'
tag: 'ClickHouse Documentation'
- regexp: 'github\.com'
tag: 'GitHub'
paths:
- regexp: 'docs'
tag: 'GitHub Documentation'
- regexp: '/docs(/|$)'
tag: 'Documentation'
EOL
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
regexp String,
tag String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = true);
select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGet('regexp_dict3', 'tag', 'docs.github.com');
select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
select dictGet('regexp_dict3', 'tag', '/docs');
select dictGetAll('regexp_dict3', 'tag', '/docs');
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
regexp String,
tag String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = false);
select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGet('regexp_dict3', 'tag', 'docs.github.com');
select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
select dictGet('regexp_dict3', 'tag', '/docs');
select dictGetAll('regexp_dict3', 'tag', '/docs');
"
# Test case-insensitive and dot-all match modes
cat > "$yaml" <<EOL
- regexp: 'foo'
pattern: 'foo'
- regexp: '(?i)foo'
pattern: '(?i)foo'
- regexp: '(?-i)foo'
pattern: '(?-i)foo'
- regexp: 'hello.*world'
pattern: 'hello.*world'
- regexp: '(?i)hello.*world'
pattern: '(?i)hello.*world'
- regexp: '(?-i)hello.*world'
pattern: '(?-i)hello.*world'
EOL
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict4;
create dictionary regexp_dict4
(
regexp String,
pattern String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);
select dictGetAll('regexp_dict4', 'pattern', 'foo');
select dictGetAll('regexp_dict4', 'pattern', 'FOO');
select dictGetAll('regexp_dict4', 'pattern', 'hello world');
select dictGetAll('regexp_dict4', 'pattern', 'hello\nworld');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO WORLD');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO\nWORLD');
drop dictionary if exists regexp_dict4;
create dictionary regexp_dict4
(
regexp String,
pattern String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_flag_case_insensitive = true, regexp_dict_flag_dotall = true);
select dictGetAll('regexp_dict4', 'pattern', 'foo');
select dictGetAll('regexp_dict4', 'pattern', 'FOO');
select dictGetAll('regexp_dict4', 'pattern', 'hello world');
select dictGetAll('regexp_dict4', 'pattern', 'hello\nworld');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO WORLD');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO\nWORLD');
"
$CLICKHOUSE_CLIENT -n --query="
drop dictionary regexp_dict1;
drop dictionary regexp_dict2;
drop dictionary regexp_dict3;
drop dictionary regexp_dict4;
"
rm -rf "$USER_FILES_PATH/test_02504"