Using lexer (development) [#CLICKHOUSE-2].

Alexey Milovidov 2017-07-13 04:12:13 +03:00 committed by alexey-milovidov
parent 2e6ceeba1c
commit f6ff1f7e72
6 changed files with 44 additions and 17 deletions

@@ -48,6 +48,15 @@ Token quotedString(const char *& pos, const char * const token_begin, const char
 Token Lexer::nextToken()
 {
+    Token res = nextTokenImpl();
+    if (res.isSignificant())
+        prev_significant_token_type = res.type;
+    return res;
+}
+
+Token Lexer::nextTokenImpl()
+{
     if (pos >= end)
         return Token(TokenType::EndOfStream, end, end);
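
The wrapper above makes nextToken() record the type of the last significant (non-whitespace, non-comment) token before handing out the result of nextTokenImpl(), so later scanning decisions survive intervening whitespace and comments. A minimal compilable sketch of the pattern, with simplified stand-in types rather than the real ClickHouse headers:

#include <iostream>

enum class TokenType { Whitespace, Comment, BareWord, Dot, Number, EndOfStream };

struct Token
{
    TokenType type = TokenType::EndOfStream;
    bool isSignificant() const { return type != TokenType::Whitespace && type != TokenType::Comment; }
};

class Lexer
{
public:
    Token nextToken()
    {
        Token res = nextTokenImpl();
        if (res.isSignificant())
            prev_significant_token_type = res.type;  /// survives across whitespace and comments
        return res;
    }

private:
    /// Stub: the real scanner lives here and can consult prev_significant_token_type.
    Token nextTokenImpl() { return Token{TokenType::EndOfStream}; }

    TokenType prev_significant_token_type = TokenType::Whitespace;  /// "no previous token" sentinel
};

int main()
{
    Lexer lexer;
    std::cout << (lexer.nextToken().isSignificant() ? "significant\n" : "insignificant\n");
}
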
@@ -91,7 +100,7 @@ Token Lexer::nextToken()
         /// 0x, 0b
         bool hex = false;
-        if (pos < end - 2 && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
+        if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
         {
             if (pos[1] == 'x' || pos[1] == 'X')
                 hex = true;
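
The pos < end - 2 → pos + 2 < end rewrite here (and its siblings below) is not cosmetic: end - 2 is computed before the comparison, and when the whole input is shorter than two characters it yields a pointer before the start of the buffer, which is undefined behavior for C++ pointer arithmetic. Comparing pos + 2 against end expresses the same algebraic condition without the underflow. A small sketch of the prefix test in this safer style; the helper name is illustrative, not from the commit:

#include <cassert>
#include <cstring>

/// True if at least "0x"/"0b" plus one more character remains,
/// i.e. the same `pos + 2 < end` condition as in the hunk above.
static bool hasNumberPrefix(const char * pos, const char * end)
{
    return pos + 2 < end
        && pos[0] == '0'
        && (pos[1] == 'x' || pos[1] == 'X' || pos[1] == 'b' || pos[1] == 'B');
}

int main()
{
    const char * hex = "0xFF";
    const char * tiny = "0";  /// with `pos < end - 2` this input would compute a pointer before the buffer
    assert(hasNumberPrefix(hex, hex + std::strlen(hex)));
    assert(!hasNumberPrefix(tiny, tiny + std::strlen(tiny)));
}
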
@@ -117,7 +126,7 @@ Token Lexer::nextToken()
             ++pos;

             /// sign of exponent. It is always decimal.
-            if (pos < end - 1 && (*pos == '-' || *pos == '+'))
+            if (pos + 1 < end && (*pos == '-' || *pos == '+'))
                 ++pos;

             while (pos < end && isNumericASCII(*pos))
@@ -160,7 +169,12 @@ Token Lexer::nextToken()
         case '.':   /// qualifier, tuple access operator or start of floating point number
         {
             /// Just after identifier or complex expression.
-            if (pos > begin && (pos[-1] == ')' || pos[-1] == ']' || isAlphaNumericASCII(pos[-1])))
+            if (pos > begin
+                && (!(pos + 1 < end && isNumericASCII(pos[1]))
+                    || prev_significant_token_type == TokenType::ClosingRoundBracket
+                    || prev_significant_token_type == TokenType::ClosingSquareBracket
+                    || prev_significant_token_type == TokenType::BareWord
+                    || prev_significant_token_type == TokenType::QuotedIdentifier))
                 return Token(TokenType::Dot, token_begin, ++pos);

             ++pos;
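
The widened condition decides between the two readings of '.': it lexes as a Dot token (qualifier or tuple access operator) when no digit follows it, or when the previous significant token ends an expression, i.e. a closing bracket, a bare word, or a quoted identifier. Thus t.1 and f(x).1 produce tuple access, while a leading .1 (where the previous-token sentinel is still Whitespace) falls through to number parsing. A compilable restatement of just this predicate, with simplified types and std::isdigit standing in for isNumericASCII:

#include <cassert>
#include <cctype>

enum class TokenType { Whitespace, BareWord, QuotedIdentifier,
                       ClosingRoundBracket, ClosingSquareBracket };

/// Mirrors the condition in the hunk above; pos points at the '.'.
static bool dotIsOperator(const char * begin, const char * pos, const char * end,
                          TokenType prev)
{
    return pos > begin
        && (!(pos + 1 < end && std::isdigit((unsigned char) pos[1]))
            || prev == TokenType::ClosingRoundBracket
            || prev == TokenType::ClosingSquareBracket
            || prev == TokenType::BareWord
            || prev == TokenType::QuotedIdentifier);
}

int main()
{
    const char * q1 = "t.1";   /// dot follows the bare word `t` -> tuple access
    assert(dotIsOperator(q1, q1 + 1, q1 + 3, TokenType::BareWord));

    const char * q2 = " .1";   /// no previous significant token, digit follows -> number 0.1
    assert(!dotIsOperator(q2, q2 + 1, q2 + 3, TokenType::Whitespace));
}
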
@@ -168,12 +182,12 @@ Token Lexer::nextToken()
             ++pos;

             /// exponentiation
-            if (pos < end - 1 && (*pos == 'e' || *pos == 'E'))
+            if (pos + 1 < end && (*pos == 'e' || *pos == 'E'))
             {
                 ++pos;

                 /// sign of exponent
-                if (pos < end - 1 && (*pos == '-' || *pos == '+'))
+                if (pos + 1 < end && (*pos == '-' || *pos == '+'))
                     ++pos;

                 while (pos < end && isNumericASCII(*pos))
@@ -215,7 +229,7 @@ Token Lexer::nextToken()
         else
         {
             ++pos;
-            while (pos <= end - 2)
+            while (pos + 2 <= end)
             {
                 /// This means that nested multiline comments are not supported.
                 if (pos[0] == '*' && pos[1] == '/')

@@ -44,7 +44,9 @@ enum class TokenType
     Greater,
     LessOrEquals,
     GreaterOrEquals,
-    Concatenation,         /// ||
+    Concatenation,         /// String concatenation operator: ||
+
+    /// Order is important. EndOfStream goes after all usual tokens, and special error tokens go after EndOfStream.

     EndOfStream,
@@ -71,6 +73,10 @@ struct Token
     Token() = default;
     Token(TokenType type, const char * begin, const char * end) : type(type), begin(begin), end(end) {}
+
+    bool isSignificant() const { return type != TokenType::Whitespace && type != TokenType::Comment; }
+    bool isError() const { return type > TokenType::EndOfStream; }
+    bool isEnd() const { return type == TokenType::EndOfStream; }
 };
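
These one-line predicates lean on the ordering contract spelled out in the enum comment: ordinary tokens compare less than EndOfStream and error tokens compare greater, so isError() is a single comparison rather than a case list. A sketch of the idea with a trimmed-down enum; the error names here are placeholders, not the full ClickHouse set:

enum class TokenType
{
    BareWord,
    Number,
    Whitespace,
    Comment,

    /// Order is important: EndOfStream goes after all usual tokens,
    /// and error tokens go after EndOfStream.
    EndOfStream,

    ErrorSingleQuoteIsNotClosed,
    ErrorMultilineCommentIsNotClosed,
};

struct Token
{
    TokenType type = TokenType::EndOfStream;
    bool isError() const { return type > TokenType::EndOfStream; }
    bool isEnd() const { return type == TokenType::EndOfStream; }
};

/// Scoped-enum comparisons follow declaration order, so the contract is checkable at compile time.
static_assert(TokenType::Comment < TokenType::EndOfStream, "usual tokens precede EndOfStream");
static_assert(TokenType::ErrorSingleQuoteIsNotClosed > TokenType::EndOfStream, "error tokens follow it");

int main()
{
    Token t{TokenType::ErrorMultilineCommentIsNotClosed};
    return t.isError() ? 0 : 1;
}
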
@@ -84,6 +90,11 @@ private:
     const char * const begin;
     const char * pos;
     const char * const end;
+
+    Token nextTokenImpl();
+
+    /// This is needed to disambiguate tuple access operator from floating point number (.1).
+    TokenType prev_significant_token_type = TokenType::Whitespace;  /// No previous token.
 };

}

@@ -91,6 +91,7 @@ bool ParserSampleRatio::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
     if (!parseDecimal(pos->begin, pos->end, numerator))
         return false;
+    ++pos;

     bool has_slash = pos->type == TokenType::Slash;
@@ -100,6 +101,7 @@ bool ParserSampleRatio::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
         if (!parseDecimal(pos->begin, pos->end, denominator))
             return false;
+        ++pos;

     res.numerator = numerator.numerator * denominator.denominator;
     res.denominator = numerator.denominator * denominator.numerator;
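
For context, the two lines closing this hunk are rational-number division: writing each side of SAMPLE x/y as a fraction, (a/b) / (c/d) = (a·d) / (b·c). A tiny sketch with hypothetical types; the real code keeps the parsed decimals in a similar numerator/denominator form:

#include <cassert>
#include <cstdint>

struct Ratio { uint64_t numerator = 0, denominator = 1; };

/// (a/b) / (c/d) == (a*d) / (b*c), matching the two assignments above.
static Ratio divide(const Ratio & x, const Ratio & y)
{
    return {x.numerator * y.denominator, x.denominator * y.numerator};
}

int main()
{
    Ratio res = divide({3, 1}, {10, 1});  /// SAMPLE 3/10
    assert(res.numerator == 3 && res.denominator == 10);
}
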

@@ -30,13 +30,12 @@ public:
         if (index < data.size())
             return data[index];

-        if (!data.empty() && data.back().type == TokenType::EndOfStream)
+        if (!data.empty() && data.back().isEnd())
             return data.back();

         Token token = lexer.nextToken();

-        if (token.type != TokenType::Whitespace
-            && token.type != TokenType::Comment)
+        if (token.isSignificant())
             data.emplace_back(token);
     }
 }
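
The rewritten lookup loop reads from the lexer on demand, caches only significant tokens, and sticks at the trailing EndOfStream token rather than reading past it. A self-contained sketch of the same buffering strategy, with a stubbed lexer and simplified types:

#include <cstddef>
#include <vector>

enum class TokenType { BareWord, Whitespace, Comment, EndOfStream };

struct Token
{
    TokenType type = TokenType::EndOfStream;
    bool isSignificant() const { return type != TokenType::Whitespace && type != TokenType::Comment; }
    bool isEnd() const { return type == TokenType::EndOfStream; }
};

struct Lexer { Token nextToken() { return {}; } };  /// stub: always returns EndOfStream

class Tokens
{
public:
    const Token & operator[](size_t index)
    {
        while (true)
        {
            if (index < data.size())
                return data[index];

            /// Once EndOfStream is cached, hand it out for any further index.
            if (!data.empty() && data.back().isEnd())
                return data.back();

            Token token = lexer.nextToken();
            if (token.isSignificant())  /// whitespace and comments are never stored
                data.emplace_back(token);
        }
    }

private:
    Lexer lexer;
    std::vector<Token> data;
};

int main()
{
    Tokens tokens;
    return tokens[5].isEnd() ? 0 : 1;  /// indexes beyond the end saturate at EndOfStream
}
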
@@ -72,6 +71,7 @@ public:
     bool isValid() { return get().type < TokenType::EndOfStream; }

+    /// The rightmost token we have looked at.
     const Token & max() { return tokens->max(); }
 };

@@ -122,7 +122,7 @@ ASTPtr tryParseQuery(
     Tokens tokens(pos, end);
     TokenIterator token_iterator(tokens);

-    if (token_iterator->type == TokenType::EndOfStream
+    if (token_iterator->isEnd()
         || token_iterator->type == TokenType::Semicolon)
     {
         out_error_message = "Empty query";
@@ -137,7 +137,7 @@ ASTPtr tryParseQuery(
     const char * max_parsed_pos = token_iterator.max().begin;

     /// Lexical error
-    if (!parse_res && token_iterator->type > TokenType::EndOfStream)
+    if (!parse_res && token_iterator->isError())
     {
         expected = "any valid token";
         out_error_message = getSyntaxErrorMessage(begin, end, max_parsed_pos, expected, hilite, description);
@@ -150,7 +150,7 @@ ASTPtr tryParseQuery(
     insert = typeid_cast<ASTInsertQuery *>(res.get());

     if (parse_res
-        && token_iterator->type != TokenType::EndOfStream
+        && !token_iterator->isEnd()
         && token_iterator->type != TokenType::Semicolon
         && !(insert && insert->data))
     {
@@ -164,7 +164,7 @@ ASTPtr tryParseQuery(
     /// If multi-statements are not allowed, then after semicolon, there must be no non-space characters.
     if (parse_res && !allow_multi_statements
-        && token_iterator->type != TokenType::EndOfStream
+        && !token_iterator->isEnd()
         && !(insert && insert->data))
     {
         out_error_message = getSyntaxErrorMessage(begin, end, max_parsed_pos, nullptr, hilite,

@@ -73,7 +73,7 @@ int main(int argc, char ** argv)
     {
         Token token = lexer.nextToken();

-        if (token.type == TokenType::EndOfStream)
+        if (token.isEnd())
             break;

         writeChar(' ', out);
@@ -89,7 +89,7 @@ int main(int argc, char ** argv)
         writeChar(' ', out);

-        if (token.type >= TokenType::Error)
+        if (token.isError())
             return 1;
     }
@@ -98,7 +98,7 @@ int main(int argc, char ** argv)
     Tokens tokens(query.data(), query.data() + query.size());
     TokenIterator token(tokens);

-    while (token->type != TokenType::EndOfStream)
+    while (!token->isEnd())
     {
         auto it = hilite.find(token->type);
         if (it != hilite.end())