2017-03-10 17:52:36 +00:00
|
|
|
#pragma once
|
|
|
|
|
Disallow LIKE patterns with trailing escape
Trailing escape ('ab\') is disallowed in SQL, in standardese:
"If an escape character is specified, then [...] If there is not a
partitioning of the string PVC into substrings such that each substring
has length 1 (one) or 2, no substring of length 1 (one) is the escape
character ECV, and each substring of length 2 is the escape character
ECV followed by either the escape character ECV, an <underscore>
character, or the <percent> character, then an exception condition is
raised: data exception - invalid escape sequence."
I first thought this was already checked higher up in the stack, at least
for const needles, as single trailing backslashes ('ab\') are rejected,
but then I realized that ClickHouse quotes by default. I.e., double
trailing backslashes ('ab\\') are not rejected, but when interpreted as a
LIKE needle ('ab\') they should be.
2022-06-01 19:06:37 +00:00
|
|
|
#include <Common/Exception.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/types.h>
|
2017-03-10 17:52:36 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2020-07-05 15:57:59 +00:00
|
|
|
|
Disallow LIKE patterns with trailing escape
Trailing escape ('ab\') is disallowed in SQL, in standardese:
"If an escape character is specified, then [...] If there is not a
partitioning of the string PVC into substrings such that each substring
has length 1 (one) or 2, no substring of length 1 (one) is the escape
character ECV, and each substring of length 2 is the escape character
ECV followed by either the escape character ECV, an <underscore>
character, or the <percent> character, then an exception condition is
raised: data exception - invalid escape sequence."
I first thought this is checked already higher up in the stack, at least
for const needles, as single trailing backslashes ('ab\') are rejected,
but then I realized that ClickHouse quotes by default. I.e., double
trailing backslashes ('ab\\') are not rejected but when interpreted as
LIKE needle ('ab\') they should.
2022-06-01 19:06:37 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
|
|
|
|
}
|
|
|
|
|
2020-07-05 15:57:59 +00:00
|
|
|
/// Transforms the [I]LIKE expression into regexp re2. For example, abc%def -> ^abc.*def$
|
2022-05-24 19:29:43 +00:00
|
|
|
inline String likePatternToRegexp(std::string_view pattern)
|
2017-03-10 17:52:36 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
String res;
|
|
|
|
res.reserve(pattern.size() * 2);
|
Disallow LIKE patterns with trailing escape
Trailing escape ('ab\') is disallowed in SQL, in standardese:
"If an escape character is specified, then [...] If there is not a
partitioning of the string PVC into substrings such that each substring
has length 1 (one) or 2, no substring of length 1 (one) is the escape
character ECV, and each substring of length 2 is the escape character
ECV followed by either the escape character ECV, an <underscore>
character, or the <percent> character, then an exception condition is
raised: data exception - invalid escape sequence."
I first thought this is checked already higher up in the stack, at least
for const needles, as single trailing backslashes ('ab\') are rejected,
but then I realized that ClickHouse quotes by default. I.e., double
trailing backslashes ('ab\\') are not rejected but when interpreted as
LIKE needle ('ab\') they should.
2022-06-01 19:06:37 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
const char * pos = pattern.data();
|
Disallow LIKE patterns with trailing escape
Trailing escape ('ab\') is disallowed in SQL, in standardese:
"If an escape character is specified, then [...] If there is not a
partitioning of the string PVC into substrings such that each substring
has length 1 (one) or 2, no substring of length 1 (one) is the escape
character ECV, and each substring of length 2 is the escape character
ECV followed by either the escape character ECV, an <underscore>
character, or the <percent> character, then an exception condition is
raised: data exception - invalid escape sequence."
I first thought this is checked already higher up in the stack, at least
for const needles, as single trailing backslashes ('ab\') are rejected,
but then I realized that ClickHouse quotes by default. I.e., double
trailing backslashes ('ab\\') are not rejected but when interpreted as
LIKE needle ('ab\') they should.
2022-06-01 19:06:37 +00:00
|
|
|
const char * const end = pattern.begin() + pattern.size();
|
2017-03-10 17:52:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (pos < end && *pos == '%')
|
|
|
|
++pos;
|
|
|
|
else
|
|
|
|
res = "^";
|
2017-03-10 17:52:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (pos < end)
|
|
|
|
{
|
|
|
|
switch (*pos)
|
|
|
|
{
|
|
|
|
case '^': case '$': case '.': case '[': case '|': case '(': case ')': case '?': case '*': case '+': case '{':
|
|
|
|
res += '\\';
|
|
|
|
res += *pos;
|
|
|
|
break;
|
|
|
|
case '%':
|
|
|
|
if (pos + 1 != end)
|
|
|
|
res += ".*";
|
|
|
|
else
|
|
|
|
return res;
|
|
|
|
break;
|
|
|
|
case '_':
|
|
|
|
res += ".";
|
|
|
|
break;
|
|
|
|
case '\\':
|
Disallow LIKE patterns with trailing escape
Trailing escape ('ab\') is disallowed in SQL, in standardese:
"If an escape character is specified, then [...] If there is not a
partitioning of the string PVC into substrings such that each substring
has length 1 (one) or 2, no substring of length 1 (one) is the escape
character ECV, and each substring of length 2 is the escape character
ECV followed by either the escape character ECV, an <underscore>
character, or the <percent> character, then an exception condition is
raised: data exception - invalid escape sequence."
I first thought this is checked already higher up in the stack, at least
for const needles, as single trailing backslashes ('ab\') are rejected,
but then I realized that ClickHouse quotes by default. I.e., double
trailing backslashes ('ab\\') are not rejected but when interpreted as
LIKE needle ('ab\') they should.
2022-06-01 19:06:37 +00:00
|
|
|
if (pos + 1 == end)
|
|
|
|
throw Exception(ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE, "Invalid escape sequence");
|
2021-04-25 00:58:44 +00:00
|
|
|
/// Known escape sequences.
|
Disallow LIKE patterns with trailing escape
Trailing escape ('ab\') is disallowed in SQL, in standardese:
"If an escape character is specified, then [...] If there is not a
partitioning of the string PVC into substrings such that each substring
has length 1 (one) or 2, no substring of length 1 (one) is the escape
character ECV, and each substring of length 2 is the escape character
ECV followed by either the escape character ECV, an <underscore>
character, or the <percent> character, then an exception condition is
raised: data exception - invalid escape sequence."
I first thought this is checked already higher up in the stack, at least
for const needles, as single trailing backslashes ('ab\') are rejected,
but then I realized that ClickHouse quotes by default. I.e., double
trailing backslashes ('ab\\') are not rejected but when interpreted as
LIKE needle ('ab\') they should.
2022-06-01 19:06:37 +00:00
|
|
|
if (pos[1] == '%' || pos[1] == '_')
|
2021-04-25 00:58:44 +00:00
|
|
|
{
|
|
|
|
res += pos[1];
|
|
|
|
++pos;
|
|
|
|
}
|
Disallow LIKE patterns with trailing escape
Trailing escape ('ab\') is disallowed in SQL, in standardese:
"If an escape character is specified, then [...] If there is not a
partitioning of the string PVC into substrings such that each substring
has length 1 (one) or 2, no substring of length 1 (one) is the escape
character ECV, and each substring of length 2 is the escape character
ECV followed by either the escape character ECV, an <underscore>
character, or the <percent> character, then an exception condition is
raised: data exception - invalid escape sequence."
I first thought this is checked already higher up in the stack, at least
for const needles, as single trailing backslashes ('ab\') are rejected,
but then I realized that ClickHouse quotes by default. I.e., double
trailing backslashes ('ab\\') are not rejected but when interpreted as
LIKE needle ('ab\') they should.
2022-06-01 19:06:37 +00:00
|
|
|
else if (pos[1] == '\\')
|
2021-04-25 00:58:44 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
res += "\\\\";
|
2021-04-25 00:58:44 +00:00
|
|
|
++pos;
|
|
|
|
}
|
2017-04-01 07:20:54 +00:00
|
|
|
else
|
|
|
|
{
|
2021-04-25 00:58:44 +00:00
|
|
|
/// Unknown escape sequence treated literally: as backslash and the following character.
|
|
|
|
res += "\\\\";
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
res += *pos;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
++pos;
|
|
|
|
}
|
2017-03-10 17:52:36 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
res += '$';
|
|
|
|
return res;
|
2017-03-10 17:52:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|