mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-23 18:20:50 +00:00
Using lexer (development) [#CLICKHOUSE-2].
This commit is contained in:
parent
2e6ceeba1c
commit
f6ff1f7e72
@ -48,6 +48,15 @@ Token quotedString(const char *& pos, const char * const token_begin, const char
|
||||
|
||||
|
||||
/// Returns the next token produced by nextTokenImpl().
/// If that token is significant (Token::isSignificant(): neither Whitespace nor Comment),
/// its type is stored in prev_significant_token_type before the token is returned.
Token Lexer::nextToken()
|
||||
{
|
||||
Token res = nextTokenImpl();
|
||||
/// Remember the last significant token type; the '.' case uses it to tell
/// a tuple access operator from the start of a floating point number.
if (res.isSignificant())
|
||||
prev_significant_token_type = res.type;
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
Token Lexer::nextTokenImpl()
|
||||
{
|
||||
if (pos >= end)
|
||||
return Token(TokenType::EndOfStream, end, end);
|
||||
@ -91,7 +100,7 @@ Token Lexer::nextToken()
|
||||
|
||||
/// 0x, 0b
|
||||
bool hex = false;
|
||||
if (pos < end - 2 && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
|
||||
if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
|
||||
{
|
||||
if (pos[1] == 'x' || pos[1] == 'X')
|
||||
hex = true;
|
||||
@ -117,7 +126,7 @@ Token Lexer::nextToken()
|
||||
++pos;
|
||||
|
||||
/// sign of exponent. It is always decimal.
|
||||
if (pos < end - 1 && (*pos == '-' || *pos == '+'))
|
||||
if (pos + 1 < end && (*pos == '-' || *pos == '+'))
|
||||
++pos;
|
||||
|
||||
while (pos < end && isNumericASCII(*pos))
|
||||
@ -160,7 +169,12 @@ Token Lexer::nextToken()
|
||||
case '.': /// qualifier, tuple access operator or start of floating point number
|
||||
{
|
||||
/// Just after identifier or complex expression.
|
||||
if (pos > begin && (pos[-1] == ')' || pos[-1] == ']' || isAlphaNumericASCII(pos[-1])))
|
||||
if (pos > begin
|
||||
&& (!(pos + 1 < end && isNumericASCII(pos[1]))
|
||||
|| prev_significant_token_type == TokenType::ClosingRoundBracket
|
||||
|| prev_significant_token_type == TokenType::ClosingSquareBracket
|
||||
|| prev_significant_token_type == TokenType::BareWord
|
||||
|| prev_significant_token_type == TokenType::QuotedIdentifier))
|
||||
return Token(TokenType::Dot, token_begin, ++pos);
|
||||
|
||||
++pos;
|
||||
@ -168,12 +182,12 @@ Token Lexer::nextToken()
|
||||
++pos;
|
||||
|
||||
/// exponentiation
|
||||
if (pos < end - 1 && (*pos == 'e' || *pos == 'E'))
|
||||
if (pos + 1 < end && (*pos == 'e' || *pos == 'E'))
|
||||
{
|
||||
++pos;
|
||||
|
||||
/// sign of exponent
|
||||
if (pos < end - 1 && (*pos == '-' || *pos == '+'))
|
||||
if (pos + 1 < end && (*pos == '-' || *pos == '+'))
|
||||
++pos;
|
||||
|
||||
while (pos < end && isNumericASCII(*pos))
|
||||
@ -215,7 +229,7 @@ Token Lexer::nextToken()
|
||||
else
|
||||
{
|
||||
++pos;
|
||||
while (pos <= end - 2)
|
||||
while (pos + 2 <= end)
|
||||
{
|
||||
/// This means that nested multiline comments are not supported.
|
||||
if (pos[0] == '*' && pos[1] == '/')
|
||||
|
@ -44,7 +44,9 @@ enum class TokenType
|
||||
Greater,
|
||||
LessOrEquals,
|
||||
GreaterOrEquals,
|
||||
Concatenation, /// ||
|
||||
Concatenation, /// String concatenation operator: ||
|
||||
|
||||
/// Order is important. EndOfStream goes after all usual tokens, and special error tokens go after EndOfStream.
|
||||
|
||||
EndOfStream,
|
||||
|
||||
@ -71,6 +73,10 @@ struct Token
|
||||
|
||||
Token() = default;
|
||||
Token(TokenType type, const char * begin, const char * end) : type(type), begin(begin), end(end) {}
|
||||
|
||||
bool isSignificant() const { return type != TokenType::Whitespace && type != TokenType::Comment; }
|
||||
bool isError() const { return type > TokenType::EndOfStream; }
|
||||
bool isEnd() const { return type == TokenType::EndOfStream; }
|
||||
};
|
||||
|
||||
|
||||
@ -84,6 +90,11 @@ private:
|
||||
const char * const begin;
|
||||
const char * pos;
|
||||
const char * const end;
|
||||
|
||||
Token nextTokenImpl();
|
||||
|
||||
/// This is needed to disambiguate tuple access operator from floating point number (.1).
|
||||
TokenType prev_significant_token_type = TokenType::Whitespace; /// No previous token.
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -91,6 +91,7 @@ bool ParserSampleRatio::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
|
||||
|
||||
if (!parseDecimal(pos->begin, pos->end, numerator))
|
||||
return false;
|
||||
++pos;
|
||||
|
||||
bool has_slash = pos->type == TokenType::Slash;
|
||||
|
||||
@ -100,6 +101,7 @@ bool ParserSampleRatio::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
|
||||
|
||||
if (!parseDecimal(pos->begin, pos->end, denominator))
|
||||
return false;
|
||||
++pos;
|
||||
|
||||
res.numerator = numerator.numerator * denominator.denominator;
|
||||
res.denominator = numerator.denominator * denominator.numerator;
|
||||
|
@ -30,13 +30,12 @@ public:
|
||||
if (index < data.size())
|
||||
return data[index];
|
||||
|
||||
if (!data.empty() && data.back().type == TokenType::EndOfStream)
|
||||
if (!data.empty() && data.back().isEnd())
|
||||
return data.back();
|
||||
|
||||
Token token = lexer.nextToken();
|
||||
|
||||
if (token.type != TokenType::Whitespace
|
||||
&& token.type != TokenType::Comment)
|
||||
if (token.isSignificant())
|
||||
data.emplace_back(token);
|
||||
}
|
||||
}
|
||||
@ -72,6 +71,7 @@ public:
|
||||
|
||||
bool isValid() { return get().type < TokenType::EndOfStream; }
|
||||
|
||||
/// Rightmost token we have looked at.
|
||||
const Token & max() { return tokens->max(); }
|
||||
};
|
||||
|
||||
|
@ -122,7 +122,7 @@ ASTPtr tryParseQuery(
|
||||
Tokens tokens(pos, end);
|
||||
TokenIterator token_iterator(tokens);
|
||||
|
||||
if (token_iterator->type == TokenType::EndOfStream
|
||||
if (token_iterator->isEnd()
|
||||
|| token_iterator->type == TokenType::Semicolon)
|
||||
{
|
||||
out_error_message = "Empty query";
|
||||
@ -137,7 +137,7 @@ ASTPtr tryParseQuery(
|
||||
const char * max_parsed_pos = token_iterator.max().begin;
|
||||
|
||||
/// Lexical error
|
||||
if (!parse_res && token_iterator->type > TokenType::EndOfStream)
|
||||
if (!parse_res && token_iterator->isError())
|
||||
{
|
||||
expected = "any valid token";
|
||||
out_error_message = getSyntaxErrorMessage(begin, end, max_parsed_pos, expected, hilite, description);
|
||||
@ -150,7 +150,7 @@ ASTPtr tryParseQuery(
|
||||
insert = typeid_cast<ASTInsertQuery *>(res.get());
|
||||
|
||||
if (parse_res
|
||||
&& token_iterator->type != TokenType::EndOfStream
|
||||
&& !token_iterator->isEnd()
|
||||
&& token_iterator->type != TokenType::Semicolon
|
||||
&& !(insert && insert->data))
|
||||
{
|
||||
@ -164,7 +164,7 @@ ASTPtr tryParseQuery(
|
||||
|
||||
/// If multi-statements are not allowed, then after semicolon, there must be no non-space characters.
|
||||
if (parse_res && !allow_multi_statements
|
||||
&& token_iterator->type != TokenType::EndOfStream
|
||||
&& !token_iterator->isEnd()
|
||||
&& !(insert && insert->data))
|
||||
{
|
||||
out_error_message = getSyntaxErrorMessage(begin, end, max_parsed_pos, nullptr, hilite,
|
||||
|
@ -73,7 +73,7 @@ int main(int argc, char ** argv)
|
||||
{
|
||||
Token token = lexer.nextToken();
|
||||
|
||||
if (token.type == TokenType::EndOfStream)
|
||||
if (token.isEnd())
|
||||
break;
|
||||
|
||||
writeChar(' ', out);
|
||||
@ -89,7 +89,7 @@ int main(int argc, char ** argv)
|
||||
|
||||
writeChar(' ', out);
|
||||
|
||||
if (token.type >= TokenType::Error)
|
||||
if (token.isError())
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -98,7 +98,7 @@ int main(int argc, char ** argv)
|
||||
Tokens tokens(query.data(), query.data() + query.size());
|
||||
TokenIterator token(tokens);
|
||||
|
||||
while (token->type != TokenType::EndOfStream)
|
||||
while (token->type.isEnd())
|
||||
{
|
||||
auto it = hilite.find(token->type);
|
||||
if (it != hilite.end())
|
||||
|
Loading…
Reference in New Issue
Block a user