Fancy quotes

This commit is contained in:
Alexey Milovidov 2024-04-30 05:15:35 +02:00
parent 7b1c39acbc
commit a501887f15

View File

@ -11,8 +11,9 @@ namespace
{
/// This must be consistent with functions in ReadHelpers.h
template <char quote, TokenType success_token, TokenType error_token>
Token quotedString(const char *& pos, const char * const token_begin, const char * const end)
template <char quote>
Token quotedString(const char *& pos, const char * const token_begin, const char * const end,
TokenType success_token, TokenType error_token)
{
++pos;
while (true)
@ -45,6 +46,37 @@ Token quotedString(const char *& pos, const char * const token_begin, const char
}
}
Token quotedStringWithUnicodeQuotes(const char *& pos, const char * const token_begin, const char * const end,
char expected_end_byte, TokenType success_token, TokenType error_token)
{
/// : e2 80 98
/// : e2 80 99
/// “: e2 80 9c
/// ”: e2 80 9d
while (true)
{
pos = find_first_symbols<'\xE2', '\\'>(pos, end);
if (pos + 2 >= end)
return Token(error_token, token_begin, end);
if (pos[0] == '\xE2' && pos[1] == '\x80' && pos[2] == expected_end_byte)
{
pos += 3;
return Token(success_token, token_begin, pos);
}
if (*pos == '\\')
{
++pos;
if (pos >= end)
return Token(error_token, token_begin, end);
++pos;
continue;
}
}
}
Token quotedHexOrBinString(const char *& pos, const char * const token_begin, const char * const end)
{
constexpr char quote = '\'';
@ -224,11 +256,11 @@ Token Lexer::nextTokenImpl()
}
case '\'':
return quotedString<'\'', TokenType::StringLiteral, TokenType::ErrorSingleQuoteIsNotClosed>(pos, token_begin, end);
return quotedString<'\''>(pos, token_begin, end, TokenType::StringLiteral, TokenType::ErrorSingleQuoteIsNotClosed);
case '"':
return quotedString<'"', TokenType::QuotedIdentifier, TokenType::ErrorDoubleQuoteIsNotClosed>(pos, token_begin, end);
return quotedString<'"'>(pos, token_begin, end, TokenType::QuotedIdentifier, TokenType::ErrorDoubleQuoteIsNotClosed);
case '`':
return quotedString<'`', TokenType::QuotedIdentifier, TokenType::ErrorBackQuoteIsNotClosed>(pos, token_begin, end);
return quotedString<'`'>(pos, token_begin, end, TokenType::QuotedIdentifier, TokenType::ErrorBackQuoteIsNotClosed);
case '(':
return Token(TokenType::OpeningRoundBracket, token_begin, ++pos);
@ -434,6 +466,16 @@ Token Lexer::nextTokenImpl()
pos += 3;
return Token(TokenType::Minus, token_begin, pos);
}
/// Unicode quoted string, Hello or “World”.
if (pos + 5 < end && pos[0] == '\xE2' && pos[1] == '\x80' && (pos[2] == '\x98' || pos[2] == '\x9C'))
{
const char expected_end_byte = pos[2] + 1;
pos += 3;
TokenType success_token = pos[2] == '\x98' ? TokenType::StringLiteral : TokenType::QuotedIdentifier;
TokenType error_token = pos[2] == '\x98' ? TokenType::ErrorSingleQuoteIsNotClosed : TokenType::ErrorDoubleQuoteIsNotClosed;
return quotedStringWithUnicodeQuotes(pos, token_begin, end, expected_end_byte, success_token, error_token);
}
/// Other characters starting at E2 can be parsed, see skipWhitespacesUTF8
[[fallthrough]];
}