mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00
Merge pull request #49919 from hanfei1991/hanfei/fix-optimize-regexp-prefix
fix `is_prefix` in OptimizeRegularExpression
This commit is contained in:
commit
a2c0a65344
@ -63,12 +63,13 @@ const char * analyzeImpl(
|
||||
bool is_first_call = begin == regexp.data();
|
||||
int depth = 0;
|
||||
is_trivial = true;
|
||||
bool is_prefix = true;
|
||||
required_substring.clear();
|
||||
bool has_alternative_on_depth_0 = false;
|
||||
bool has_case_insensitive_flag = false;
|
||||
|
||||
/// Substring with a position.
|
||||
using Substring = std::pair<std::string, size_t>;
|
||||
/// Substring with is_prefix.
|
||||
using Substring = std::pair<std::string, bool>;
|
||||
using Substrings = std::vector<Substring>;
|
||||
|
||||
Substrings trivial_substrings(1);
|
||||
@ -98,6 +99,9 @@ const char * analyzeImpl(
|
||||
|
||||
auto finish_non_trivial_char = [&](bool create_new_substr = true)
|
||||
{
|
||||
is_trivial = false;
|
||||
if (create_new_substr)
|
||||
is_prefix = false;
|
||||
if (depth != 0)
|
||||
return;
|
||||
|
||||
@ -106,6 +110,7 @@ const char * analyzeImpl(
|
||||
if (alter.suffix)
|
||||
{
|
||||
alter.literal += last_substring->first;
|
||||
alter.suffix = false;
|
||||
}
|
||||
}
|
||||
|
||||
@ -126,16 +131,24 @@ const char * analyzeImpl(
|
||||
if (alter.prefix)
|
||||
{
|
||||
alter.literal = last_substring->first + alter.literal;
|
||||
alter.prefix = is_prefix;
|
||||
}
|
||||
}
|
||||
|
||||
if (group_required_string.prefix)
|
||||
{
|
||||
last_substring->first += group_required_string.literal;
|
||||
last_substring->second = is_prefix;
|
||||
}
|
||||
else
|
||||
{
|
||||
finish_non_trivial_char();
|
||||
last_substring->first = group_required_string.literal;
|
||||
last_substring->second = false;
|
||||
}
|
||||
|
||||
is_prefix = is_prefix && group_required_string.prefix && group_required_string.suffix;
|
||||
|
||||
/// if we can still append, no need to finish it. e.g. abc(de)fg should capture abcdefg
|
||||
if (!last_substring->first.empty() && !group_required_string.suffix)
|
||||
{
|
||||
@ -185,7 +198,6 @@ const char * analyzeImpl(
|
||||
goto ordinary;
|
||||
default:
|
||||
/// all other escape sequences are not supported
|
||||
is_trivial = false;
|
||||
finish_non_trivial_char();
|
||||
break;
|
||||
}
|
||||
@ -196,6 +208,7 @@ const char * analyzeImpl(
|
||||
|
||||
case '|':
|
||||
is_trivial = false;
|
||||
is_prefix = false;
|
||||
++pos;
|
||||
if (depth == 0)
|
||||
{
|
||||
@ -205,6 +218,7 @@ const char * analyzeImpl(
|
||||
break;
|
||||
|
||||
case '(':
|
||||
/// bracket does not break is_prefix. for example abc(d) has a prefix 'abcd'
|
||||
is_trivial = false;
|
||||
if (!in_square_braces)
|
||||
{
|
||||
@ -258,7 +272,6 @@ const char * analyzeImpl(
|
||||
case '[':
|
||||
in_square_braces = true;
|
||||
++depth;
|
||||
is_trivial = false;
|
||||
finish_non_trivial_char();
|
||||
++pos;
|
||||
break;
|
||||
@ -270,7 +283,6 @@ const char * analyzeImpl(
|
||||
--depth;
|
||||
if (depth == 0)
|
||||
in_square_braces = false;
|
||||
is_trivial = false;
|
||||
finish_non_trivial_char();
|
||||
++pos;
|
||||
break;
|
||||
@ -284,7 +296,6 @@ const char * analyzeImpl(
|
||||
break;
|
||||
|
||||
case '^': case '$': case '.': case '+':
|
||||
is_trivial = false;
|
||||
finish_non_trivial_char();
|
||||
++pos;
|
||||
break;
|
||||
@ -296,7 +307,6 @@ const char * analyzeImpl(
|
||||
case '?':
|
||||
[[fallthrough]];
|
||||
case '*':
|
||||
is_trivial = false;
|
||||
if (depth == 0 && !last_substring->first.empty() && !in_square_braces)
|
||||
{
|
||||
last_substring->first.resize(last_substring->first.size() - 1);
|
||||
@ -318,8 +328,9 @@ const char * analyzeImpl(
|
||||
default:
|
||||
if (depth == 0 && !in_curly_braces && !in_square_braces)
|
||||
{
|
||||
/// record the first position of last string.
|
||||
if (last_substring->first.empty())
|
||||
last_substring->second = pos - begin;
|
||||
last_substring->second = is_prefix;
|
||||
last_substring->first.push_back(*pos);
|
||||
}
|
||||
++pos;
|
||||
@ -328,10 +339,9 @@ const char * analyzeImpl(
|
||||
}
|
||||
finish:
|
||||
|
||||
finish_non_trivial_char(false);
|
||||
|
||||
if (!is_trivial)
|
||||
{
|
||||
finish_non_trivial_char(false);
|
||||
/// we calculate required substring even though has_alternative_on_depth_0.
|
||||
/// we will clear the required substring after putting it to alternatives.
|
||||
if (!has_case_insensitive_flag)
|
||||
@ -357,7 +367,7 @@ finish:
|
||||
if (max_length >= MIN_LENGTH_FOR_STRSTR || (!is_first_call && max_length > 0))
|
||||
{
|
||||
required_substring.literal = candidate_it->first;
|
||||
required_substring.prefix = candidate_it->second == 0;
|
||||
required_substring.prefix = candidate_it->second;
|
||||
required_substring.suffix = candidate_it + 1 == trivial_substrings.end();
|
||||
}
|
||||
}
|
||||
@ -365,7 +375,8 @@ finish:
|
||||
else if (!trivial_substrings.empty())
|
||||
{
|
||||
required_substring.literal = trivial_substrings.front().first;
|
||||
required_substring.prefix = trivial_substrings.front().second == 0;
|
||||
/// trivial string means the whole regex is a simple string literal, so the prefix and suffix should be true.
|
||||
required_substring.prefix = true;
|
||||
required_substring.suffix = true;
|
||||
}
|
||||
|
||||
|
@ -4,37 +4,40 @@
|
||||
|
||||
TEST(OptimizeRE, analyze)
|
||||
{
|
||||
auto test_f = [](const std::string & regexp, const std::string & answer, std::vector<std::string> expect_alternatives = {}, bool trival_expected = false)
|
||||
auto test_f = [](const std::string & regexp, const std::string & required, std::vector<std::string> expect_alternatives = {}, bool trival_expected = false, bool prefix_expected = false)
|
||||
{
|
||||
std::string required;
|
||||
std::string answer;
|
||||
bool is_trivial;
|
||||
bool is_prefix;
|
||||
std::vector<std::string> alternatives;
|
||||
OptimizedRegularExpression::analyze(regexp, required, is_trivial, is_prefix, alternatives);
|
||||
OptimizedRegularExpression::analyze(regexp, answer, is_trivial, is_prefix, alternatives);
|
||||
std::cerr << regexp << std::endl;
|
||||
EXPECT_EQ(required, answer);
|
||||
EXPECT_EQ(alternatives, expect_alternatives);
|
||||
EXPECT_EQ(is_trivial, trival_expected);
|
||||
EXPECT_EQ(is_prefix, prefix_expected);
|
||||
};
|
||||
test_f("abc", "abc", {}, true);
|
||||
test_f("abc", "abc", {}, true, true);
|
||||
test_f("c([^k]*)de", "");
|
||||
test_f("abc(de)fg", "abcdefg");
|
||||
test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"});
|
||||
test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"});
|
||||
test_f("abc(de)fg", "abcdefg", {}, false, true);
|
||||
test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}, false, true);
|
||||
test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}, false, true);
|
||||
test_f("abc|fgk|xyz", "", {"abc","fgk", "xyz"});
|
||||
test_f("(abc)", "abc");
|
||||
test_f("(abc)", "abc", {}, false, true);
|
||||
test_f("(abc|fgk)", "", {"abc","fgk"});
|
||||
test_f("(abc|fgk)(e|f|zkh|)", "", {"abc","fgk"});
|
||||
test_f("abc(abc|fg)xyzz", "xyzz", {"abcabcxyzz","abcfgxyzz"});
|
||||
test_f("((abc|fg)kkk*)xyzz", "xyzz", {"abckk", "fgkk"});
|
||||
test_f("abc(*(abc|fg)*)xyzz", "xyzz");
|
||||
test_f("abc[k]xyzz", "xyzz");
|
||||
test_f("(abc[k]xyzz)", "xyzz");
|
||||
test_f("abc((de)fg(hi))jk", "abcdefghijk");
|
||||
test_f("abc((?:de)fg(?:hi))jk", "abcdefghijk");
|
||||
test_f("abc((de)fghi+zzz)jk", "abcdefghi");
|
||||
test_f("abc((de)fg(hi))?jk", "abc");
|
||||
test_f("abc((de)fghi?zzz)jk", "abcdefgh");
|
||||
test_f("abc((de)fg(hi))jk", "abcdefghijk", {}, false, true);
|
||||
test_f("abc((?:de)fg(?:hi))jk", "abcdefghijk", {}, false, true);
|
||||
test_f("abc((de)fghi+zzz)jk", "abcdefghi", {}, false, true);
|
||||
test_f("abc((de)fg(hi))?jk", "abc", {}, false, true);
|
||||
test_f("abc((de)fghi?zzz)jk", "abcdefgh", {}, false, true);
|
||||
test_f("abc(*cd)jk", "cdjk");
|
||||
test_f(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"});
|
||||
test_f(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"}, false, true);
|
||||
test_f("abc(abc|fg)?xyzz", "xyzz");
|
||||
test_f("abc(abc|fg){0,1}xyzz", "xyzz");
|
||||
test_f("abc(abc|fg)xyzz|bcdd?k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bc"});
|
||||
@ -43,4 +46,5 @@ TEST(OptimizeRE, analyze)
|
||||
test_f(R"([Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Daumoa(?:-feedfetcher|)|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|))", "", {"pider-", "bingbot", "Yeti-", "Yeti", "Catchpoint bot", "Catchpoint", "harlotte", "Daumoa-feedfetcher", "Daumoa", "-Googlebot", "Googlebot"});
|
||||
test_f("abc|(:?xx|yy|zz|x?)def", "", {"abc", "def"});
|
||||
test_f("abc|(:?xx|yy|zz|x?){1,2}def", "", {"abc", "def"});
|
||||
test_f(R"(\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z)", "/k8s1");
|
||||
}
|
||||
|
@ -0,0 +1 @@
|
||||
1
|
@ -0,0 +1 @@
|
||||
select match('default/k8s1', '\\A(?:(?:[-0-9_a-z]+(?:\\.[-0-9_a-z]+)*)/k8s1)\\z');
|
Loading…
Reference in New Issue
Block a user