Process regexp flag groups such as (?s) and (?-s) correctly

This commit is contained in:
Han Fei 2024-08-15 11:47:41 +02:00
parent 008d02880b
commit 9f6e472b0c
2 changed files with 26 additions and 16 deletions

View File

@ -244,33 +244,41 @@ const char * analyzeImpl(
is_trivial = false; is_trivial = false;
if (!in_square_braces) if (!in_square_braces)
{ {
/// Check for case-insensitive flag. /// it means flag negation
if (pos + 1 < end && pos[1] == '?') /// there are various possible flags
/// actually only imsU are supported by re2
auto is_flag_char = [](char x)
{ {
for (size_t offset = 2; pos + offset < end; ++offset) return x == '-' || x == 'i' || x == 'm' || x == 's' || x == 'U' || x == 'u';
};
/// Check for case-insensitive flag.
if (pos + 2 < end && pos[1] == '?' && is_flag_char(pos[2]))
{
size_t offset = 2;
for (; pos + offset < end; ++offset)
{ {
if (pos[offset] == '-' /// it means flag negation if (pos[offset] == 'i')
/// various possible flags, actually only imsU are supported by re2
|| (pos[offset] >= 'a' && pos[offset] <= 'z')
|| (pos[offset] >= 'A' && pos[offset] <= 'Z'))
{ {
if (pos[offset] == 'i') /// Actually it can be negated case-insensitive flag. But we don't care.
{ has_case_insensitive_flag = true;
/// Actually it can be negated case-insensitive flag. But we don't care.
has_case_insensitive_flag = true;
break;
}
} }
else else if (!is_flag_char(pos[offset]))
break; break;
} }
pos += offset;
/// if this group only contains flags, we have nothing to do.
if (*pos == ')')
{
++pos;
break;
}
} }
/// (?:regex) means non-capturing parentheses group /// (?:regex) means non-capturing parentheses group
if (pos + 2 < end && pos[1] == '?' && pos[2] == ':') else if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
{ {
pos += 2; pos += 2;
} }
if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<'))) else if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<')))
{ {
pos = skipNameCapturingGroup(pos, pos[2] == 'P' ? 3: 2, end); pos = skipNameCapturingGroup(pos, pos[2] == 'P' ? 3: 2, end);
} }

View File

@ -19,6 +19,8 @@ TEST(OptimizeRE, analyze)
}; };
test_f("abc", "abc", {}, true, true); test_f("abc", "abc", {}, true, true);
test_f("c([^k]*)de", ""); test_f("c([^k]*)de", "");
test_f("(?-s)bob", "bob", {}, false, true);
test_f("(?s)bob", "bob", {}, false, true);
test_f("abc(de)fg", "abcdefg", {}, false, true); test_f("abc(de)fg", "abcdefg", {}, false, true);
test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}, false, true); test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}, false, true);
test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}, false, true); test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}, false, true);