Merge pull request #11649 from ClickHouse/case-insensitive-regexp

Allow case-insensitive regexps; added a test
This commit is contained in:
alexey-milovidov 2020-06-14 07:55:09 +03:00 committed by GitHub
commit b9b725a39f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 51 additions and 2 deletions

View File

@ -38,6 +38,7 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
required_substring_is_prefix = false;
required_substring.clear();
bool has_alternative_on_depth_0 = false;
bool has_case_insensitive_flag = false;
/// Substring with a position.
using Substring = std::pair<std::string, size_t>;
@ -65,7 +66,17 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
switch (*pos)
{
case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{':
case '|':
case '(':
case ')':
case '^':
case '$':
case '.':
case '[':
case '?':
case '*':
case '+':
case '{':
if (depth == 0 && !in_curly_braces && !in_square_braces)
{
if (last_substring->first.empty())
@ -110,6 +121,28 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
/// Detect an inline case-insensitive flag in a "(?flags" construct, e.g. "(?i)" or "(?i:...)".
/// Presumably pos points at an opening '(' here (the enclosing case label is not visible in this hunk),
/// so pos[1] == '?' means the parenthesis introduces flags or a special group rather than a plain group.
if (pos + 1 < end && pos[1] == '?')
{
/// Walk the characters that follow "(?"; the loop condition keeps every read before end.
for (size_t offset = 2; pos + offset < end; ++offset)
{
if (pos[offset] == '-' /// '-' means flag negation
/// Any ASCII letter is accepted as a possible flag, although re2 actually supports only i, m, s and U.
|| (pos[offset] >= 'a' && pos[offset] <= 'z')
|| (pos[offset] >= 'A' && pos[offset] <= 'Z'))
{
if (pos[offset] == 'i')
{
/// This may actually be a negated case-insensitive flag (an 'i' after '-'), but we do not
/// distinguish the two cases: seeing 'i' anywhere in the flag list is enough to set the marker.
has_case_insensitive_flag = true;
break;
}
}
else
break; /// The first character that is neither '-' nor a letter (e.g. ':' or ')') ends the flag list.
}
}
}
++pos;
break;
@ -209,7 +242,7 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
if (!is_trivial)
{
if (!has_alternative_on_depth_0)
if (!has_alternative_on_depth_0 && !has_case_insensitive_flag)
{
/// We choose the non-alternative substring of the maximum length for first search.

View File

@ -0,0 +1,8 @@
1
1
1
1
1
1
1
1

View File

@ -0,0 +1,8 @@
-- Exercises match() with case-insensitive flags embedded in the pattern.
-- Every query tests the haystack 'Too late'; the accompanying reference output is eight lines of 1,
-- so each pattern is presumably expected to match.
-- Baseline: no flag, pattern identical to the haystack.
SELECT match('Too late', 'Too late');
-- Leading (?i) flag, pattern written in the same case as the haystack.
select match('Too late', '(?i)Too late');
-- Leading (?i) flag, pattern differs from the haystack only in letter case.
select match('Too late', '(?i)too late');
-- Flag applied through the group form (?i:...).
select match('Too late', '(?i:too late)');
-- (?i) combined with a {n} repetition.
select match('Too late', '(?i)to{2} late');
-- (?i) combined with a "(?)" group that carries no flag letters.
select match('Too late', '(?i)to(?)o late');
-- (?i) combined with the + quantifier.
select match('Too late', '(?i)to+ late');
-- (?i) combined with a non-capturing group containing an alternation.
select match('Too late', '(?i)to(?:o|o) late');