From eb760413125fbfc1679c19e8cb26647cb1847374 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Mon, 6 Feb 2023 17:26:20 +0100 Subject: [PATCH] address comments and add one more test --- src/Dictionaries/RegExpTreeDictionary.cpp | 19 +++++++++++-------- ...04_regexp_dictionary_yaml_source.reference | 2 ++ .../02504_regexp_dictionary_yaml_source.sh | 19 ++++++++++++++++++- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index ca1fc4e48bc..8f82e642462 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -157,10 +157,10 @@ namespace /// hyper scan is not good at processing regex containing {0, 200} /// This will make re compilation slow and failed. So we select this heavy regular expressions and /// process it with re2. - struct ComplexRegexChecker + struct RegexChecker { re2_st::RE2 searcher; - ComplexRegexChecker() : searcher(R"(\{([\d]+),([\d]+)\})") {} + RegexChecker() : searcher(R"(\{([\d]+),([\d]+)\})") {} static bool isFigureLargerThanFifty(const String & str) try @@ -174,15 +174,15 @@ namespace } [[maybe_unused]] - bool check(const String & data) const + bool isSimpleRegex(const String & regex) const { - re2_st::StringPiece haystack(data.data(), data.size()); + re2_st::StringPiece haystack(regex.data(), regex.size()); re2_st::StringPiece matches[10]; size_t start_pos = 0; - while (start_pos < data.size()) + while (start_pos < regex.size()) { - if (searcher.Match(haystack, start_pos, data.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10)) + if (searcher.Match(haystack, start_pos, regex.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10)) { const auto & match = matches[0]; start_pos += match.length(); @@ -207,7 +207,7 @@ void RegExpTreeDictionary::initRegexNodes(Block & block) auto keys_column = block.getByName(kKeys).column; auto values_column = block.getByName(kValues).column; - ComplexRegexChecker checker; 
+ RegexChecker checker; size_t size = block.rows(); for (size_t i = 0; i < size; i++) @@ -253,7 +253,7 @@ void RegExpTreeDictionary::initRegexNodes(Block & block) } regex_nodes.emplace(id, node); #if USE_VECTORSCAN - if (use_vectorscan && !checker.check(regex)) + if (use_vectorscan && !checker.isSimpleRegex(regex)) { simple_regexps.push_back(regex); regexp_ids.push_back(id); @@ -312,6 +312,9 @@ void RegExpTreeDictionary::loadData() if (simple_regexps.empty() && complex_regexp_nodes.empty()) throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "There are no available regular expression. Please check your config"); LOG_INFO(logger, "There are {} simple regexps and {} complex regexps", simple_regexps.size(), complex_regexp_nodes.size()); + /// If none of the regexps can be handled by hyperscan, turn this flag off to avoid exceptions. + if (simple_regexps.empty()) + use_vectorscan = false; if (!use_vectorscan) return; #if USE_VECTORSCAN diff --git a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference index 6d5372cc484..dfcd170e8f4 100644 --- a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference +++ b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference @@ -2,4 +2,6 @@ ('Android','12') ('Android','default') ('Android','default') +('BlackBerry WebKit','10.0') +('BlackBerry WebKit','1.0') (true,'61f0c404-5cb3-11e7-907b-a6006ad3dba0','2023-01-01','2023-01-01 01:01:01',[1,2,3,-1,-2,-3]) diff --git a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh index 8d80f9f1fea..1b5a9cdeea4 100755 --- a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh +++ b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh @@ -42,7 +42,8 @@ create dictionary regexp_dict1 PRIMARY KEY(regexp) SOURCE(YAMLRegExpTree(PATH '$yaml')) LIFETIME(0) 
-LAYOUT(regexp_tree); +LAYOUT(regexp_tree) +SETTINGS(regexp_dict_allow_hyperscan = true); select dictGet('regexp_dict1', ('name', 'version'), 'Linux/123.45.67 tlinux'); select dictGet('regexp_dict1', ('name', 'version'), '31/tclwebkit1024'); @@ -84,6 +85,22 @@ EOL $CLICKHOUSE_CLIENT -n --query=" system reload dictionary regexp_dict1; -- { serverError 318 } " + +cat > "$yaml" < "$yaml" <