Using lexer (development) [#CLICKHOUSE-2].

This commit is contained in:
Alexey Milovidov 2017-07-13 04:54:51 +03:00 committed by alexey-milovidov
parent f6ff1f7e72
commit d4974b0724
3 changed files with 54 additions and 35 deletions

View File

@ -277,10 +277,10 @@ private:
ErrorCodes::SYNTAX_ERROR};
};
auto match = [&](const char * str)
auto match = [&pos](const char * str) mutable
{
size_t length = strlen(str);
if (pos + length < end && 0 == memcmp(pos, str, length))
if (pos + length <= end && 0 == memcmp(pos, str, length))
{
pos += length;
return true;

View File

@ -431,7 +431,7 @@ bool ParserNumber::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
char * pos_double = buf;
errno = 0; /// Functions strto* don't clear errno.
Float64 float_value = std::strtod(buf, &pos_double);
if (pos_double == buf || errno == ERANGE)
if (pos_double != buf + pos->size() || errno == ERANGE)
{
expected = "number";
return false;
@ -475,7 +475,7 @@ bool ParserUnsignedInteger::parseImpl(Pos & pos, ASTPtr & node, Expected & expec
UInt64 x = 0;
ReadBufferFromMemory in(pos->begin, pos->size());
if (!tryReadIntText(x, in) || in.count() == 0)
if (!tryReadIntText(x, in) || in.count() != pos->size())
{
expected = "unsigned integer";
return false;
@ -506,6 +506,12 @@ bool ParserStringLiteral::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte
return false;
}
if (in.count() != pos->size())
{
expected = "string literal";
return false;
}
++pos;
node = std::make_shared<ASTLiteral>(StringRange(pos->begin, pos->end), s);
return true;

View File

@ -98,40 +98,52 @@ Token Lexer::nextTokenImpl()
{
/// The task is not to parse a number or check correctness, but only to skip it.
/// 0x, 0b
bool hex = false;
if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
{
if (pos[1] == 'x' || pos[1] == 'X')
hex = true;
pos += 2;
}
else
++pos;
while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
++pos;
/// decimal point
if (pos < end && *pos == '.')
/// Disambiguation: if the previous token was a dot, then we parse only a simple integer,
/// so that chained tuple access operators (x.1.1) work.
/// Otherwise it would be tokenized as x . 1.1, not as x . 1 . 1
if (prev_significant_token_type == TokenType::Dot)
{
++pos;
while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
++pos;
}
/// exponentiation (base 10 or base 2)
if (pos + 1 < end && (hex ? (*pos == 'p' || *pos == 'P') : (*pos == 'e' || *pos == 'E')))
{
++pos;
/// sign of exponent. It is always decimal.
if (pos + 1 < end && (*pos == '-' || *pos == '+'))
++pos;
while (pos < end && isNumericASCII(*pos))
++pos;
}
else
{
/// 0x, 0b
bool hex = false;
if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
{
if (pos[1] == 'x' || pos[1] == 'X')
hex = true;
pos += 2;
}
else
++pos;
while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
++pos;
/// decimal point
if (pos < end && *pos == '.')
{
++pos;
while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
++pos;
}
/// exponentiation (base 10 or base 2)
if (pos + 1 < end && (hex ? (*pos == 'p' || *pos == 'P') : (*pos == 'e' || *pos == 'E')))
{
++pos;
/// sign of exponent. It is always decimal.
if (pos + 1 < end && (*pos == '-' || *pos == '+'))
++pos;
while (pos < end && isNumericASCII(*pos))
++pos;
}
}
/// word character cannot go just after number (SELECT 123FROM)
if (pos < end && isWordCharASCII(*pos))
@ -168,13 +180,14 @@ Token Lexer::nextTokenImpl()
case '.': /// qualifier, tuple access operator or start of floating point number
{
/// Just after identifier or complex expression.
/// Just after identifier or complex expression or number (for chained tuple access like x.1.1 to work properly).
if (pos > begin
&& (!(pos + 1 < end && isNumericASCII(pos[1]))
|| prev_significant_token_type == TokenType::ClosingRoundBracket
|| prev_significant_token_type == TokenType::ClosingSquareBracket
|| prev_significant_token_type == TokenType::BareWord
|| prev_significant_token_type == TokenType::QuotedIdentifier))
|| prev_significant_token_type == TokenType::QuotedIdentifier
|| prev_significant_token_type == TokenType::Number))
return Token(TokenType::Dot, token_begin, ++pos);
++pos;