From ea5976180969fafb7dc7db8b5be5ff2d9c2976f0 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 16 May 2023 15:25:04 +0200 Subject: [PATCH 1/3] fix OptimizeRegularExpression --- src/Common/OptimizedRegularExpression.cpp | 33 +++++++++++++++-------- src/Common/tests/gtest_optimize_re.cpp | 32 ++++++++++++---------- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp index 68f5b86877e..92b3ad32ecd 100644 --- a/src/Common/OptimizedRegularExpression.cpp +++ b/src/Common/OptimizedRegularExpression.cpp @@ -63,12 +63,13 @@ const char * analyzeImpl( bool is_first_call = begin == regexp.data(); int depth = 0; is_trivial = true; + bool is_prefix = true; required_substring.clear(); bool has_alternative_on_depth_0 = false; bool has_case_insensitive_flag = false; /// Substring with a position. - using Substring = std::pair; + using Substring = std::pair; using Substrings = std::vector; Substrings trivial_substrings(1); @@ -98,6 +99,9 @@ const char * analyzeImpl( auto finish_non_trivial_char = [&](bool create_new_substr = true) { + is_trivial = false; + if (create_new_substr) + is_prefix = false; if (depth != 0) return; @@ -106,6 +110,7 @@ const char * analyzeImpl( if (alter.suffix) { alter.literal += last_substring->first; + alter.suffix = false; } } @@ -126,16 +131,24 @@ const char * analyzeImpl( if (alter.prefix) { alter.literal = last_substring->first + alter.literal; + alter.prefix = is_prefix; } } if (group_required_string.prefix) + { last_substring->first += group_required_string.literal; + last_substring->second = is_prefix; + } else { finish_non_trivial_char(); last_substring->first = group_required_string.literal; + last_substring->second = false; } + + is_prefix = is_prefix && group_required_string.prefix && group_required_string.suffix; + /// if we can still append, no need to finish it. e.g. abc(de)fg should capture abcdefg if (!last_substring->first.empty() && !group_required_string.suffix) { @@ -185,7 +198,6 @@ const char * analyzeImpl( goto ordinary; default: /// all other escape sequences are not supported - is_trivial = false; finish_non_trivial_char(); break; } @@ -196,6 +208,7 @@ const char * analyzeImpl( case '|': is_trivial = false; + is_prefix = false; ++pos; if (depth == 0) { @@ -205,6 +218,7 @@ const char * analyzeImpl( break; case '(': + /// bracket does not break is_prefix. for example abc(d) has a prefix 'abcd' is_trivial = false; if (!in_square_braces) { @@ -258,7 +272,6 @@ const char * analyzeImpl( case '[': in_square_braces = true; ++depth; - is_trivial = false; finish_non_trivial_char(); ++pos; break; @@ -270,7 +283,6 @@ const char * analyzeImpl( --depth; if (depth == 0) in_square_braces = false; - is_trivial = false; finish_non_trivial_char(); ++pos; break; @@ -284,7 +296,6 @@ const char * analyzeImpl( break; case '^': case '$': case '.': case '+': - is_trivial = false; finish_non_trivial_char(); ++pos; break; @@ -296,7 +307,6 @@ const char * analyzeImpl( case '?': [[fallthrough]]; case '*': - is_trivial = false; if (depth == 0 && !last_substring->first.empty() && !in_square_braces) { last_substring->first.resize(last_substring->first.size() - 1); @@ -318,8 +328,9 @@ const char * analyzeImpl( default: if (depth == 0 && !in_curly_braces && !in_square_braces) { + /// record the first position of last string. if (last_substring->first.empty()) - last_substring->second = pos - begin; + last_substring->second = is_prefix; last_substring->first.push_back(*pos); } ++pos; @@ -328,10 +339,9 @@ const char * analyzeImpl( } finish: - finish_non_trivial_char(false); - if (!is_trivial) { + finish_non_trivial_char(false); /// we calculate required substring even though has_alternative_on_depth_0. /// we will clear the required substring after putting it to alternatives. if (!has_case_insensitive_flag) @@ -357,7 +367,7 @@ finish: if (max_length >= MIN_LENGTH_FOR_STRSTR || (!is_first_call && max_length > 0)) { required_substring.literal = candidate_it->first; - required_substring.prefix = candidate_it->second == 0; + required_substring.prefix = candidate_it->second; required_substring.suffix = candidate_it + 1 == trivial_substrings.end(); } } @@ -365,7 +375,8 @@ finish: else if (!trivial_substrings.empty()) { required_substring.literal = trivial_substrings.front().first; - required_substring.prefix = trivial_substrings.front().second == 0; + /// trivial string means the whole regex is a simple string literal, so the prefix and suffix should be true. + required_substring.prefix = true; required_substring.suffix = true; } diff --git a/src/Common/tests/gtest_optimize_re.cpp b/src/Common/tests/gtest_optimize_re.cpp index 556700f1fcc..3710666d336 100644 --- a/src/Common/tests/gtest_optimize_re.cpp +++ b/src/Common/tests/gtest_optimize_re.cpp @@ -4,37 +4,40 @@ TEST(OptimizeRE, analyze) { - auto test_f = [](const std::string & regexp, const std::string & answer, std::vector expect_alternatives = {}, bool trival_expected = false) + auto test_f = [](const std::string & regexp, const std::string & required, std::vector expect_alternatives = {}, bool trival_expected = false, bool prefix_expected = false) { - std::string required; + std::string answer; bool is_trivial; bool is_prefix; std::vector alternatives; - OptimizedRegularExpression::analyze(regexp, required, is_trivial, is_prefix, alternatives); + OptimizedRegularExpression::analyze(regexp, answer, is_trivial, is_prefix, alternatives); std::cerr << regexp << std::endl; EXPECT_EQ(required, answer); EXPECT_EQ(alternatives, expect_alternatives); EXPECT_EQ(is_trivial, trival_expected); + EXPECT_EQ(is_prefix, prefix_expected); }; - test_f("abc", "abc", {}, true); + test_f("abc", "abc", {}, true, true); test_f("c([^k]*)de", ""); - test_f("abc(de)fg", "abcdefg"); - test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}); - test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}); + test_f("abc(de)fg", "abcdefg", {}, false, true); + test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}, false, true); + test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}, false, true); test_f("abc|fgk|xyz", "", {"abc","fgk", "xyz"}); - test_f("(abc)", "abc"); + test_f("(abc)", "abc", {}, false, true); test_f("(abc|fgk)", "", {"abc","fgk"}); test_f("(abc|fgk)(e|f|zkh|)", "", {"abc","fgk"}); test_f("abc(abc|fg)xyzz", "xyzz", {"abcabcxyzz","abcfgxyzz"}); + test_f("((abc|fg)kkk*)xyzz", "xyzz", {"abckk", "fgkk"}); + test_f("abc(*(abc|fg)*)xyzz", "xyzz"); test_f("abc[k]xyzz", "xyzz"); test_f("(abc[k]xyzz)", "xyzz"); - test_f("abc((de)fg(hi))jk", "abcdefghijk"); - test_f("abc((?:de)fg(?:hi))jk", "abcdefghijk"); - test_f("abc((de)fghi+zzz)jk", "abcdefghi"); - test_f("abc((de)fg(hi))?jk", "abc"); - test_f("abc((de)fghi?zzz)jk", "abcdefgh"); + test_f("abc((de)fg(hi))jk", "abcdefghijk", {}, false, true); + test_f("abc((?:de)fg(?:hi))jk", "abcdefghijk", {}, false, true); + test_f("abc((de)fghi+zzz)jk", "abcdefghi", {}, false, true); + test_f("abc((de)fg(hi))?jk", "abc", {}, false, true); + test_f("abc((de)fghi?zzz)jk", "abcdefgh", {}, false, true); test_f("abc(*cd)jk", "cdjk"); - test_f(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"}); + test_f(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"}, false, true); test_f("abc(abc|fg)?xyzz", "xyzz"); test_f("abc(abc|fg){0,1}xyzz", "xyzz"); test_f("abc(abc|fg)xyzz|bcdd?k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bc"}); @@ -43,4 +46,5 @@ TEST(OptimizeRE, analyze) test_f(R"([Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Daumoa(?:-feedfetcher|)|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|))", "", {"pider-", "bingbot", "Yeti-", "Yeti", "Catchpoint bot", "Catchpoint", "harlotte", "Daumoa-feedfetcher", "Daumoa", "-Googlebot", "Googlebot"}); test_f("abc|(:?xx|yy|zz|x?)def", "", {"abc", "def"}); test_f("abc|(:?xx|yy|zz|x?){1,2}def", "", {"abc", "def"}); + test_f(R"(\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z)", "/k8s1"); } From 35f00f72b3bad1025453181f785c20c9945dda36 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 16 May 2023 22:42:43 +0200 Subject: [PATCH 2/3] add functional test --- tests/queries/0_stateless/02751_match_constant_needle.reference | 1 + tests/queries/0_stateless/02751_match_constant_needle.sql | 1 + 2 files changed, 2 insertions(+) create mode 100644 tests/queries/0_stateless/02751_match_constant_needle.reference create mode 100644 tests/queries/0_stateless/02751_match_constant_needle.sql diff --git a/tests/queries/0_stateless/02751_match_constant_needle.reference b/tests/queries/0_stateless/02751_match_constant_needle.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02751_match_constant_needle.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02751_match_constant_needle.sql b/tests/queries/0_stateless/02751_match_constant_needle.sql new file mode 100644 index 00000000000..71bdcc7cb0a --- /dev/null +++ b/tests/queries/0_stateless/02751_match_constant_needle.sql @@ -0,0 +1 @@ +select match('default/k8s1', '\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z'); From a257ff6cf3ba1124e861452e4b3b52ada3ea2d5c Mon Sep 17 00:00:00 2001 From: Han Fei Date: Mon, 22 May 2023 10:41:22 +0200 Subject: [PATCH 3/3] address comment --- src/Common/OptimizedRegularExpression.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp index 92b3ad32ecd..f2fe922ef19 100644 --- a/src/Common/OptimizedRegularExpression.cpp +++ b/src/Common/OptimizedRegularExpression.cpp @@ -68,7 +68,7 @@ const char * analyzeImpl( bool has_alternative_on_depth_0 = false; bool has_case_insensitive_flag = false; - /// Substring with a position. + /// Substring with is_prefix. using Substring = std::pair; using Substrings = std::vector;