#!/usr/bin/env bash

# Tags: use-vectorscan, no-fasttest, no-parallel

# Absolute directory of this script, used to locate the shared test configuration.
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# NOTE(review): shell_config.sh is not visible here; it presumably defines
# $CLICKHOUSE_CLIENT, which the rest of this script invokes.
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Work out where the server looks for user files by asking it to read a file
# that does not exist: from the output line containing "Exception", awk takes
# the 9th whitespace-separated field and strips "/nonexist.txt" from it.
# NOTE(review): this relies on the path being the 9th word of the error text.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

# Stop here if detection produced nothing, so no directory is created directly under "/".
mkdir -p "${USER_FILES_PATH:?could not determine the user files path}/test_02504"

# Every scenario below rewrites this one YAML file before (re)loading a dictionary from it.
yaml="$USER_FILES_PATH/test_02504/test.yaml"

# Scenario: one flat rule whose `version` is a back-reference to its capture
# group, and one parent rule whose child rules (under `versions`) each set a
# literal `version`.
# The heredoc delimiter is quoted so the regexps (backslashes, `$`) are written
# verbatim and never subject to shell expansion.
cat > "$yaml" <<'EOL'
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
  name: 'TencentOS'
  version: '\1'

- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
  name: 'Android'
  versions:
    - regexp: '33/tclwebkit'
      version: '13'
    - regexp: '3[12]/tclwebkit'
      version: '12'
    - regexp: '30/tclwebkit'
      version: '11'
    - regexp: '29/tclwebkit'
      version: '10'
EOL

# Create regexp_dict1 from the YAML file and probe it with four keys: one for
# the flat rule, one that fits the child regexp '3[12]/tclwebkit', and two
# ('999/...', '28/...') that none of the child regexps above fits textually.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict1;
create dictionary regexp_dict1
(
    regexp String,
    name String,
    version Nullable(String) default 'default',
    lucky Int64
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = true);

select dictGet('regexp_dict1', ('name', 'version'), 'Linux/123.45.67 tlinux');
select dictGet('regexp_dict1', ('name', 'version'), '31/tclwebkit1024');
select dictGet('regexp_dict1', ('name', 'version'), '999/tclwebkit1024');
select dictGet('regexp_dict1', ('name', 'version'), '28/tclwebkit1024');
"

# Scenario: same tree as before plus a child rule with an empty `version` and
# `lucky: 'abcde'`. regexp_dict1 declares `lucky` as Int64, so the reload is
# annotated to expect server error 489.
# NOTE(review): presumably the non-numeric `lucky` value is what triggers it.
cat > "$yaml" <<'EOL'
- regexp: 'Linux/(\d+[\.\d]*).+tlinux'
  name: 'TencentOS'
  version: '\1'

- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)'
  name: 'Android'
  versions:
    - regexp: '33/tclwebkit'
      version: '13'
    - regexp: '3[12]/tclwebkit'
      version: '12'
    - regexp: '30/tclwebkit'
      version: '11'
    - regexp: '29/tclwebkit'
      version: '10'
    - regexp: '28/tclwebkit'
      version:
      lucky: 'abcde'
EOL

$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1; -- { serverError 489 }
"

# Scenario: a rule whose `regexp` key has no value. The reload is annotated to
# expect server error 318.
cat > "$yaml" <<'EOL'
- regexp:
  name: 'TencentOS'
  version: '\1'
EOL

$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1; -- { serverError 318 }
"

# Scenario: two rules written as unquoted YAML scalars, each with several
# capture groups and a `version` built from the 2nd and 3rd groups ('\2.\3').
cat > "$yaml" <<'EOL'
- name: BlackBerry WebKit
  regexp: (PlayBook).{1,200}RIM Tablet OS (\d+)\.(\d+)\.(\d+)
  version: '\2.\3'
- name: BlackBerry WebKit
  regexp: (Black[bB]erry|BB10).{1,200}Version/(\d+)\.(\d+)\.(\d+)
  version: '\2.\3'
EOL

# Reload regexp_dict1 from the rewritten file and look up two user-agent strings.
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1;
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.3+ (KHTML, like Gecko) Version/10.0.9.388 Mobile Safari/537.3+');
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 1.0.0; en-US) AppleWebKit/534.8+ (KHTML, like Gecko) Version/0.0.1 Safari/534.8+');
"

# Scenario: a single rule whose attributes are given as YAML strings, while the
# dictionary declares them as Boolean, UUID, Date, DateTime and Array(Int64).
cat > "$yaml" <<'EOL'
- regexp: 'abc'
  col_bool: 'true'
  col_uuid: '61f0c404-5cb3-11e7-907b-a6006ad3dba0'
  col_date: '2023-01-01'
  col_datetime: '2023-01-01 01:01:01'
  col_array: '[1,2,3,-1,-2,-3]'
EOL

# `drop ... if exists` mirrors how regexp_dict1 and regexp_dict3 are created,
# so a leftover dictionary from an aborted run does not break the create.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict2;
create dictionary regexp_dict2
(
    regexp String,
    col_bool Boolean,
    col_uuid UUID,
    col_date Date,
    col_datetime DateTime,
    col_array Array(Int64)
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);

select dictGet('regexp_dict2', ('col_bool','col_uuid', 'col_date', 'col_datetime', 'col_array'), 'abc');
"

# Scenario for dictGetAll: three top-level rules, one of which has a nested
# rule under `paths`. Each rule carries a `topological_index`, and some set
# `captured` and/or `parent`.
cat > "$yaml" <<'EOL'
- regexp: 'clickhouse\.com'
  tag: 'ClickHouse'
  topological_index: 1
  paths:
    - regexp: 'clickhouse\.com/docs(.*)'
      tag: 'ClickHouse Documentation'
      topological_index: 0
      captured: '\1'
      parent: 'ClickHouse'

- regexp: '/docs(/|$)'
  tag: 'Documentation'
  topological_index: 2

- regexp: 'github.com'
  tag: 'GitHub'
  topological_index: 3
  captured: 'NULL'
EOL

# dictGetAll
# Each key is queried twice: without a fourth argument and with a fourth
# argument of 2.
# NOTE(review): the fourth argument is presumably a cap on the number of
# returned matches — confirm against the dictGetAll documentation.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
    regexp String,
    tag String,
    topological_index Int64,
    captured Nullable(String),
    parent String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);

select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com', 2);

select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com/docs/en');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'clickhouse.com/docs/en', 2);

select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs');
select dictGetAll('regexp_dict3', ('tag', 'topological_index', 'captured', 'parent'), 'github.com/clickhouse/tree/master/docs', 2);
"

# Test that results are the same for "simple" regexps that go through Hyperscan
# and "complex" regexps that go through RE2.
# An easy way to force the use of RE2 is to disable Hyperscan.
# This tree is constructed on purpose so that a text might (falsely) match a
# leaf node without matching that leaf's parent node.
cat > "$yaml" <<'EOL'
- regexp: 'clickhouse\.com'
  tag: 'ClickHouse'
  paths:
    - regexp: 'docs'
      tag: 'ClickHouse Documentation'

- regexp: 'github\.com'
  tag: 'GitHub'
  paths:
    - regexp: 'docs'
      tag: 'GitHub Documentation'

- regexp: '/docs(/|$)'
  tag: 'Documentation'
EOL

# Run the identical create-and-query sequence twice: first with Hyperscan
# allowed, then with it disabled. Only the SETTINGS value differs between the
# two passes, so the queries are written once.
for allow_hyperscan in true false; do
    $CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict3;
create dictionary regexp_dict3
(
    regexp String,
    tag String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = $allow_hyperscan);

select dictGet('regexp_dict3', 'tag', 'clickhouse.com');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com');
select dictGet('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGetAll('regexp_dict3', 'tag', 'clickhouse.com/docs');
select dictGet('regexp_dict3', 'tag', 'docs.github.com');
select dictGetAll('regexp_dict3', 'tag', 'docs.github.com');
select dictGet('regexp_dict3', 'tag', '/docs');
select dictGetAll('regexp_dict3', 'tag', '/docs');
"
done

# Cleanup: drop the three dictionaries this script created.
$CLICKHOUSE_CLIENT -n --query="
drop dictionary regexp_dict1;
drop dictionary regexp_dict2;
drop dictionary regexp_dict3;
"

# Remove the scratch directory. `:?` aborts instead of running `rm -rf` on
# "/test_02504" if USER_FILES_PATH is empty or unset; `--` ends option parsing.
rm -rf -- "${USER_FILES_PATH:?}/test_02504"