From ea5976180969fafb7dc7db8b5be5ff2d9c2976f0 Mon Sep 17 00:00:00 2001
From: Han Fei <hanfei19910905@gmail.com>
Date: Tue, 16 May 2023 15:25:04 +0200
Subject: [PATCH 1/3] fix OptimizeRegularExpression

---
 src/Common/OptimizedRegularExpression.cpp | 33 +++++++++++++++--------
 src/Common/tests/gtest_optimize_re.cpp    | 32 ++++++++++++----------
 2 files changed, 40 insertions(+), 25 deletions(-)
diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp
index 68f5b86877e..92b3ad32ecd 100644
--- a/src/Common/OptimizedRegularExpression.cpp
+++ b/src/Common/OptimizedRegularExpression.cpp
@@ -63,12 +63,13 @@ const char * analyzeImpl(
     bool is_first_call = begin == regexp.data();
     int depth = 0;
     is_trivial = true;
+    bool is_prefix = true;
     required_substring.clear();
     bool has_alternative_on_depth_0 = false;
     bool has_case_insensitive_flag = false;
 
     /// Substring with a position.
-    using Substring = std::pair<std::string, size_t>;
+    using Substring = std::pair<std::string, bool>;
     using Substrings = std::vector<Substring>;
 
     Substrings trivial_substrings(1);
@@ -98,6 +99,9 @@ const char * analyzeImpl(
 
     auto finish_non_trivial_char = [&](bool create_new_substr = true)
     {
+        is_trivial = false;
+        if (create_new_substr)
+            is_prefix = false;
         if (depth != 0)
             return;
 
@@ -106,6 +110,7 @@ const char * analyzeImpl(
             if (alter.suffix)
             {
                 alter.literal += last_substring->first;
+                alter.suffix = false;
             }
         }
 
@@ -126,16 +131,24 @@ const char * analyzeImpl(
             if (alter.prefix)
             {
                 alter.literal = last_substring->first + alter.literal;
+                alter.prefix = is_prefix;
             }
         }
 
         if (group_required_string.prefix)
+        {
             last_substring->first += group_required_string.literal;
+            last_substring->second = is_prefix;
+        }
         else
         {
             finish_non_trivial_char();
             last_substring->first = group_required_string.literal;
+            last_substring->second = false;
         }
+
+        is_prefix = is_prefix && group_required_string.prefix && group_required_string.suffix;
+
         /// if we can still append, no need to finish it. e.g. abc(de)fg should capture abcdefg
         if (!last_substring->first.empty() && !group_required_string.suffix)
         {
@@ -185,7 +198,6 @@ const char * analyzeImpl(
                         goto ordinary;
                     default:
                         /// all other escape sequences are not supported
-                        is_trivial = false;
                         finish_non_trivial_char();
                         break;
                 }
@@ -196,6 +208,7 @@ const char * analyzeImpl(
 
             case '|':
                 is_trivial = false;
+                is_prefix = false;
                 ++pos;
                 if (depth == 0)
                 {
@@ -205,6 +218,7 @@ const char * analyzeImpl(
                 break;
 
             case '(':
+                /// bracket does not break is_prefix. for example abc(d) has a prefix 'abcd'
                 is_trivial = false;
                 if (!in_square_braces)
                 {
@@ -258,7 +272,6 @@ const char * analyzeImpl(
             case '[':
                 in_square_braces = true;
                 ++depth;
-                is_trivial = false;
                 finish_non_trivial_char();
                 ++pos;
                 break;
@@ -270,7 +283,6 @@ const char * analyzeImpl(
                 --depth;
                 if (depth == 0)
                     in_square_braces = false;
-                is_trivial = false;
                 finish_non_trivial_char();
                 ++pos;
                 break;
@@ -284,7 +296,6 @@ const char * analyzeImpl(
                 break;
 
             case '^': case '$': case '.': case '+':
-                is_trivial = false;
                 finish_non_trivial_char();
                 ++pos;
                 break;
@@ -296,7 +307,6 @@ const char * analyzeImpl(
             case '?':
                 [[fallthrough]];
             case '*':
-                is_trivial = false;
                 if (depth == 0 && !last_substring->first.empty() && !in_square_braces)
                 {
                     last_substring->first.resize(last_substring->first.size() - 1);
@@ -318,8 +328,9 @@ const char * analyzeImpl(
             default:
                 if (depth == 0 && !in_curly_braces && !in_square_braces)
                 {
+                    /// record the first position of last string.
                     if (last_substring->first.empty())
-                        last_substring->second = pos - begin;
+                        last_substring->second = is_prefix;
                     last_substring->first.push_back(*pos);
                 }
                 ++pos;
@@ -328,10 +339,9 @@ const char * analyzeImpl(
     }
 finish:
 
-    finish_non_trivial_char(false);
-
     if (!is_trivial)
     {
+        finish_non_trivial_char(false);
         /// we calculate required substring even though has_alternative_on_depth_0.
         /// we will clear the required substring after putting it to alternatives.
         if (!has_case_insensitive_flag)
@@ -357,7 +367,7 @@ finish:
             if (max_length >= MIN_LENGTH_FOR_STRSTR || (!is_first_call && max_length > 0))
             {
                 required_substring.literal = candidate_it->first;
-                required_substring.prefix = candidate_it->second == 0;
+                required_substring.prefix = candidate_it->second;
                 required_substring.suffix = candidate_it + 1 == trivial_substrings.end();
             }
         }
@@ -365,7 +375,8 @@ finish:
     else if (!trivial_substrings.empty())
     {
         required_substring.literal = trivial_substrings.front().first;
-        required_substring.prefix = trivial_substrings.front().second == 0;
+        /// trivial string means the whole regex is a simple string literal, so the prefix and suffix should be true.
+        required_substring.prefix = true;
         required_substring.suffix = true;
     }
 
diff --git a/src/Common/tests/gtest_optimize_re.cpp b/src/Common/tests/gtest_optimize_re.cpp
index 556700f1fcc..3710666d336 100644
--- a/src/Common/tests/gtest_optimize_re.cpp
+++ b/src/Common/tests/gtest_optimize_re.cpp
@@ -4,37 +4,40 @@
 
 TEST(OptimizeRE, analyze)
 {
-    auto test_f = [](const std::string & regexp, const std::string & answer, std::vector<std::string> expect_alternatives = {}, bool trival_expected = false)
+    auto test_f = [](const std::string & regexp, const std::string & required, std::vector<std::string> expect_alternatives = {}, bool trival_expected = false, bool prefix_expected = false)
     {
-        std::string required;
+        std::string answer;
         bool is_trivial;
         bool is_prefix;
         std::vector<std::string> alternatives;
-        OptimizedRegularExpression::analyze(regexp, required, is_trivial, is_prefix, alternatives);
+        OptimizedRegularExpression::analyze(regexp, answer, is_trivial, is_prefix, alternatives);
         std::cerr << regexp << std::endl;
         EXPECT_EQ(required, answer);
         EXPECT_EQ(alternatives, expect_alternatives);
         EXPECT_EQ(is_trivial, trival_expected);
+        EXPECT_EQ(is_prefix, prefix_expected);
     };
-    test_f("abc", "abc", {}, true);
+    test_f("abc", "abc", {}, true, true);
     test_f("c([^k]*)de", "");
-    test_f("abc(de)fg", "abcdefg");
-    test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"});
-    test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"});
+    test_f("abc(de)fg", "abcdefg", {}, false, true);
+    test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}, false, true);
+    test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}, false, true);
     test_f("abc|fgk|xyz", "", {"abc","fgk", "xyz"});
-    test_f("(abc)", "abc");
+    test_f("(abc)", "abc", {}, false, true);
     test_f("(abc|fgk)", "", {"abc","fgk"});
     test_f("(abc|fgk)(e|f|zkh|)", "", {"abc","fgk"});
     test_f("abc(abc|fg)xyzz", "xyzz", {"abcabcxyzz","abcfgxyzz"});
+    test_f("((abc|fg)kkk*)xyzz", "xyzz", {"abckk", "fgkk"});
+    test_f("abc(*(abc|fg)*)xyzz", "xyzz");
     test_f("abc[k]xyzz", "xyzz");
     test_f("(abc[k]xyzz)", "xyzz");
-    test_f("abc((de)fg(hi))jk", "abcdefghijk");
-    test_f("abc((?:de)fg(?:hi))jk", "abcdefghijk");
-    test_f("abc((de)fghi+zzz)jk", "abcdefghi");
-    test_f("abc((de)fg(hi))?jk", "abc");
-    test_f("abc((de)fghi?zzz)jk", "abcdefgh");
+    test_f("abc((de)fg(hi))jk", "abcdefghijk", {}, false, true);
+    test_f("abc((?:de)fg(?:hi))jk", "abcdefghijk", {}, false, true);
+    test_f("abc((de)fghi+zzz)jk", "abcdefghi", {}, false, true);
+    test_f("abc((de)fg(hi))?jk", "abc", {}, false, true);
+    test_f("abc((de)fghi?zzz)jk", "abcdefgh", {}, false, true);
     test_f("abc(*cd)jk", "cdjk");
-    test_f(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"});
+    test_f(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"}, false, true);
     test_f("abc(abc|fg)?xyzz", "xyzz");
     test_f("abc(abc|fg){0,1}xyzz", "xyzz");
     test_f("abc(abc|fg)xyzz|bcdd?k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bc"});
@@ -43,4 +46,5 @@ TEST(OptimizeRE, analyze)
     test_f(R"([Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Daumoa(?:-feedfetcher|)|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|))", "", {"pider-", "bingbot", "Yeti-", "Yeti", "Catchpoint bot", "Catchpoint", "harlotte", "Daumoa-feedfetcher", "Daumoa", "-Googlebot", "Googlebot"});
     test_f("abc|(:?xx|yy|zz|x?)def", "", {"abc", "def"});
     test_f("abc|(:?xx|yy|zz|x?){1,2}def", "", {"abc", "def"});
+    test_f(R"(\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z)", "/k8s1");
 }

From 35f00f72b3bad1025453181f785c20c9945dda36 Mon Sep 17 00:00:00 2001
From: Han Fei <hanfei19910905@gmail.com>
Date: Tue, 16 May 2023 22:42:43 +0200
Subject: [PATCH 2/3] add functional test

---
 tests/queries/0_stateless/02751_match_constant_needle.reference | 1 +
 tests/queries/0_stateless/02751_match_constant_needle.sql       | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 tests/queries/0_stateless/02751_match_constant_needle.reference
 create mode 100644 tests/queries/0_stateless/02751_match_constant_needle.sql

diff --git a/tests/queries/0_stateless/02751_match_constant_needle.reference b/tests/queries/0_stateless/02751_match_constant_needle.reference
new file mode 100644
index 00000000000..d00491fd7e5
--- /dev/null
+++ b/tests/queries/0_stateless/02751_match_constant_needle.reference
@@ -0,0 +1 @@
+1
diff --git a/tests/queries/0_stateless/02751_match_constant_needle.sql b/tests/queries/0_stateless/02751_match_constant_needle.sql
new file mode 100644
index 00000000000..71bdcc7cb0a
--- /dev/null
+++ b/tests/queries/0_stateless/02751_match_constant_needle.sql
@@ -0,0 +1 @@
+select match('default/k8s1', '\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z');

From a257ff6cf3ba1124e861452e4b3b52ada3ea2d5c Mon Sep 17 00:00:00 2001
From: Han Fei <hanfei19910905@gmail.com>
Date: Mon, 22 May 2023 10:41:22 +0200
Subject: [PATCH 3/3] address comment

---
 src/Common/OptimizedRegularExpression.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp
index 92b3ad32ecd..f2fe922ef19 100644
--- a/src/Common/OptimizedRegularExpression.cpp
+++ b/src/Common/OptimizedRegularExpression.cpp
@@ -68,7 +68,7 @@ const char * analyzeImpl(
     bool has_alternative_on_depth_0 = false;
     bool has_case_insensitive_flag = false;
 
-    /// Substring with a position.
+    /// Substring with is_prefix.
     using Substring = std::pair<std::string, bool>;
     using Substrings = std::vector<Substring>;