Merge pull request #11649 from ClickHouse/case-insensitive-regexp

Allow case-insensitive regexps; added a test
This commit is contained in:
alexey-milovidov 2020-06-14 07:55:09 +03:00 committed by GitHub
commit b9b725a39f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 51 additions and 2 deletions

View File

@ -38,6 +38,7 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
required_substring_is_prefix = false; required_substring_is_prefix = false;
required_substring.clear(); required_substring.clear();
bool has_alternative_on_depth_0 = false; bool has_alternative_on_depth_0 = false;
bool has_case_insensitive_flag = false;
/// Substring with a position. /// Substring with a position.
using Substring = std::pair<std::string, size_t>; using Substring = std::pair<std::string, size_t>;
@ -65,7 +66,17 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
switch (*pos) switch (*pos)
{ {
case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{': case '|':
case '(':
case ')':
case '^':
case '$':
case '.':
case '[':
case '?':
case '*':
case '+':
case '{':
if (depth == 0 && !in_curly_braces && !in_square_braces) if (depth == 0 && !in_curly_braces && !in_square_braces)
{ {
if (last_substring->first.empty()) if (last_substring->first.empty())
@ -110,6 +121,28 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
trivial_substrings.resize(trivial_substrings.size() + 1); trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back(); last_substring = &trivial_substrings.back();
} }
/// Check for case-insensitive flag.
if (pos + 1 < end && pos[1] == '?')
{
for (size_t offset = 2; pos + offset < end; ++offset)
{
if (pos[offset] == '-' /// it means flag negation
/// various possible flags, actually only imsU are supported by re2
|| (pos[offset] >= 'a' && pos[offset] <= 'z')
|| (pos[offset] >= 'A' && pos[offset] <= 'Z'))
{
if (pos[offset] == 'i')
{
/// Actually it can be negated case-insensitive flag. But we don't care.
has_case_insensitive_flag = true;
break;
}
}
else
break;
}
}
} }
++pos; ++pos;
break; break;
@ -209,7 +242,7 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
if (!is_trivial) if (!is_trivial)
{ {
if (!has_alternative_on_depth_0) if (!has_alternative_on_depth_0 && !has_case_insensitive_flag)
{ {
/// We choose the non-alternative substring of the maximum length for first search. /// We choose the non-alternative substring of the maximum length for first search.

View File

@ -0,0 +1,8 @@
1
1
1
1
1
1
1
1

View File

@ -0,0 +1,8 @@
-- Baseline: flag-free pattern identical to the haystack.
SELECT match('Too late', 'Too late');

-- Leading (?i) flag, pattern written in the same case as the haystack.
SELECT match('Too late', '(?i)Too late');

-- Leading (?i) flag, pattern differs from the haystack only by letter case.
SELECT match('Too late', '(?i)too late');

-- Flag scoped to a group via the (?i:...) form.
SELECT match('Too late', '(?i:too late)');

-- (?i) combined with a counted repetition in curly braces.
SELECT match('Too late', '(?i)to{2} late');

-- (?i) followed later by a '(?' sequence that carries no flag letters.
SELECT match('Too late', '(?i)to(?)o late');

-- (?i) combined with the '+' quantifier.
SELECT match('Too late', '(?i)to+ late');

-- (?i) combined with a non-capturing group that contains an alternation.
SELECT match('Too late', '(?i)to(?:o|o) late');