mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 00:22:29 +00:00
address comments and add one more test
This commit is contained in:
parent
baa345fa64
commit
eb76041312
@ -157,10 +157,10 @@ namespace
|
||||
/// Hyperscan is not good at processing regexes that contain bounded repetitions such as {0,200}.
|
||||
/// Such patterns make regex compilation slow or cause it to fail, so we pick out these heavy regular expressions and
|
||||
/// process them with re2.
|
||||
struct ComplexRegexChecker
|
||||
struct RegexChecker
|
||||
{
|
||||
re2_st::RE2 searcher;
|
||||
ComplexRegexChecker() : searcher(R"(\{([\d]+),([\d]+)\})") {}
|
||||
RegexChecker() : searcher(R"(\{([\d]+),([\d]+)\})") {}
|
||||
|
||||
static bool isFigureLargerThanFifty(const String & str)
|
||||
try
|
||||
@ -174,15 +174,15 @@ namespace
|
||||
}
|
||||
|
||||
[[maybe_unused]]
|
||||
bool check(const String & data) const
|
||||
bool isSimpleRegex(const String & regex) const
|
||||
{
|
||||
|
||||
re2_st::StringPiece haystack(data.data(), data.size());
|
||||
re2_st::StringPiece haystack(regex.data(), regex.size());
|
||||
re2_st::StringPiece matches[10];
|
||||
size_t start_pos = 0;
|
||||
while (start_pos < data.size())
|
||||
while (start_pos < regex.size())
|
||||
{
|
||||
if (searcher.Match(haystack, start_pos, data.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10))
|
||||
if (searcher.Match(haystack, start_pos, regex.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10))
|
||||
{
|
||||
const auto & match = matches[0];
|
||||
start_pos += match.length();
|
||||
@ -207,7 +207,7 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
|
||||
auto keys_column = block.getByName(kKeys).column;
|
||||
auto values_column = block.getByName(kValues).column;
|
||||
|
||||
ComplexRegexChecker checker;
|
||||
RegexChecker checker;
|
||||
|
||||
size_t size = block.rows();
|
||||
for (size_t i = 0; i < size; i++)
|
||||
@ -253,7 +253,7 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
|
||||
}
|
||||
regex_nodes.emplace(id, node);
|
||||
#if USE_VECTORSCAN
|
||||
if (use_vectorscan && !checker.check(regex))
|
||||
if (use_vectorscan && !checker.isSimpleRegex(regex))
|
||||
{
|
||||
simple_regexps.push_back(regex);
|
||||
regexp_ids.push_back(id);
|
||||
@ -312,6 +312,9 @@ void RegExpTreeDictionary::loadData()
|
||||
if (simple_regexps.empty() && complex_regexp_nodes.empty())
|
||||
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "There are no available regular expression. Please check your config");
|
||||
LOG_INFO(logger, "There are {} simple regexps and {} complex regexps", simple_regexps.size(), complex_regexp_nodes.size());
|
||||
/// If none of the regexps can work with hyperscan, we should turn this flag off to avoid exceptions.
|
||||
if (simple_regexps.empty())
|
||||
use_vectorscan = false;
|
||||
if (!use_vectorscan)
|
||||
return;
|
||||
#if USE_VECTORSCAN
|
||||
|
@ -2,4 +2,6 @@
|
||||
('Android','12')
|
||||
('Android','default')
|
||||
('Android','default')
|
||||
('BlackBerry WebKit','10.0')
|
||||
('BlackBerry WebKit','1.0')
|
||||
(true,'61f0c404-5cb3-11e7-907b-a6006ad3dba0','2023-01-01','2023-01-01 01:01:01',[1,2,3,-1,-2,-3])
|
||||
|
@ -42,7 +42,8 @@ create dictionary regexp_dict1
|
||||
PRIMARY KEY(regexp)
|
||||
SOURCE(YAMLRegExpTree(PATH '$yaml'))
|
||||
LIFETIME(0)
|
||||
LAYOUT(regexp_tree);
|
||||
LAYOUT(regexp_tree)
|
||||
SETTINGS(regexp_dict_allow_hyperscan = true);
|
||||
|
||||
select dictGet('regexp_dict1', ('name', 'version'), 'Linux/123.45.67 tlinux');
|
||||
select dictGet('regexp_dict1', ('name', 'version'), '31/tclwebkit1024');
|
||||
@ -84,6 +85,22 @@ EOL
|
||||
$CLICKHOUSE_CLIENT -n --query="
|
||||
system reload dictionary regexp_dict1; -- { serverError 318 }
|
||||
"
|
||||
|
||||
cat > "$yaml" <<EOL
|
||||
- name: BlackBerry WebKit
|
||||
regexp: (PlayBook).{1,200}RIM Tablet OS (\d+)\.(\d+)\.(\d+)
|
||||
version: '\2.\3'
|
||||
- name: BlackBerry WebKit
|
||||
regexp: (Black[bB]erry|BB10).{1,200}Version/(\d+)\.(\d+)\.(\d+)
|
||||
version: '\2.\3'
|
||||
EOL
|
||||
|
||||
$CLICKHOUSE_CLIENT -n --query="
|
||||
system reload dictionary regexp_dict1;
|
||||
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.3+ (KHTML, like Gecko) Version/10.0.9.388 Mobile Safari/537.3+');
|
||||
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 1.0.0; en-US) AppleWebKit/534.8+ (KHTML, like Gecko) Version/0.0.1 Safari/534.8+');
|
||||
"
|
||||
|
||||
cat > "$yaml" <<EOL
|
||||
- regexp: 'abc'
|
||||
col_bool: 'true'
|
||||
|
Loading…
Reference in New Issue
Block a user