Merge pull request #10903 from ClickHouse/lexer-unicode-whitespaces

Support for unicode whitespaces in queries.
This commit is contained in:
alexey-milovidov 2020-05-15 10:31:59 +03:00 committed by GitHub
commit 985b83a228
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 78 additions and 1 deletions

View File

@ -3,6 +3,7 @@
#include <string> #include <string>
#include <cstring> #include <cstring>
#include <cstddef> #include <cstddef>
#include <cstdint>
#include <type_traits> #include <type_traits>
@ -131,6 +132,69 @@ inline char alternateCaseIfAlphaASCII(char c)
return c ^ 0x20; return c ^ 0x20;
} }
/// Skips a leading run of whitespace in the byte range [pos, end).
///
/// A byte accepted by isWhitespaceASCII is skipped as a single byte. In addition,
/// a fixed set of Unicode whitespace-like code points is recognized by its
/// UTF-8 byte sequence (see the table below) and skipped as a whole sequence.
///
/// Returns a pointer to the first byte that does not start such a character,
/// or `end` if the whole range was consumed. Returns `pos` unchanged when the
/// range is empty or does not start with whitespace. A multi-byte sequence that
/// is cut off by `end` is not skipped.
inline const char * skipWhitespacesUTF8(const char * pos, const char * end)
{
    /// https://en.wikipedia.org/wiki/Whitespace_character
    /// with some adjustments.

    /// Code points: 0085 00A0 180E 2000..200A 2028..2029 200B..200D 202F 205F 2060 3000 FEFF
    /// The corresponding UTF-8 is: C285 C2A0 E1A08E E28080..E2808A E280A8..E280A9 E2808B..E2808D E280AF E2819F E281A0 E38080 EFBBBF

    /// We check for these bytes directly in UTF8 for simplicity reasons.

    /** C2
      *    85
      *    A0
      * E1 A0 8E
      * E2
      *    80
      *       80..8A
      *       A8..A9
      *       8B..8D
      *       AF
      *    81
      *       9F
      *       A0
      * E3 80 80
      * EF BB BF
      */

    while (pos < end)
    {
        if (isWhitespaceASCII(*pos))
        {
            ++pos;
            continue;
        }

        const uint8_t * upos = reinterpret_cast<const uint8_t *>(pos);

        /// Compare the number of remaining bytes instead of computing `pos + N`:
        /// forming a pointer more than one past the end of the buffer is undefined
        /// behaviour, and `pos + 2` would do exactly that when one byte is left.
        const std::ptrdiff_t remaining = end - pos;

        if (remaining >= 2 && upos[0] == 0xC2 && (upos[1] == 0x85 || upos[1] == 0xA0))
        {
            pos += 2;
        }
        else if (remaining >= 3
            && ((upos[0] == 0xE1 && upos[1] == 0xA0 && upos[2] == 0x8E)
                || (upos[0] == 0xE2
                    && ((upos[1] == 0x80
                            /// 80..8A and 8B..8D are adjacent, so they are tested as one range 80..8D.
                            && ((upos[2] >= 0x80 && upos[2] <= 0x8D)
                                || (upos[2] >= 0xA8 && upos[2] <= 0xA9)
                                || (upos[2] == 0xAF)))
                        || (upos[1] == 0x81 && (upos[2] == 0x9F || upos[2] == 0xA0))))
                || (upos[0] == 0xE3 && upos[1] == 0x80 && upos[2] == 0x80)
                || (upos[0] == 0xEF && upos[1] == 0xBB && upos[2] == 0xBF)))
        {
            pos += 3;
        }
        else
        {
            /// Not a recognized whitespace sequence: stop here.
            break;
        }
    }

    return pos;
}
inline bool equalsCaseInsensitive(char a, char b) inline bool equalsCaseInsensitive(char a, char b)
{ {
return a == b || (isAlphaASCII(a) && alternateCaseIfAlphaASCII(a) == b); return a == b || (isAlphaASCII(a) && alternateCaseIfAlphaASCII(a) == b);

View File

@ -316,7 +316,14 @@ Token Lexer::nextTokenImpl()
return Token(TokenType::BareWord, token_begin, pos); return Token(TokenType::BareWord, token_begin, pos);
} }
else else
return Token(TokenType::Error, token_begin, ++pos); {
/// We will also skip unicode whitespaces in UTF-8 to support queries copy-pasted from MS Word and similar.
pos = skipWhitespacesUTF8(pos, end);
if (pos > token_begin)
return Token(TokenType::Whitespace, token_begin, pos);
else
return Token(TokenType::Error, token_begin, ++pos);
}
} }
} }

View File

@ -0,0 +1,3 @@
1
2
3

View File

@ -0,0 +1,3 @@
SELECT1;
SELECT 2;
…  SELECT1 +2;