ClickHouse/dbms/Parsers/Lexer.h

119 lines
3.2 KiB
C++
Raw Normal View History

2017-07-09 00:04:05 +00:00
#pragma once
#include <stddef.h>
2017-07-09 00:04:05 +00:00
namespace DB
{

/** X-macro listing every token kind the lexer can produce.
  * Invoke it with a one-argument macro M; M is expanded once per token, in declaration order.
  * The order matters: Token::isError() treats every kind declared after EndOfStream as an error.
  * Only block comments may be used inside the list, because all lines are spliced into one.
  */
#define APPLY_FOR_TOKENS(M) \
    M(Whitespace) \
    M(Comment) \
    \
    M(BareWord) /** Either keyword (SELECT) or identifier (column) */ \
    \
    M(Number) /** Always non-negative. No leading plus. 123 or something like 123.456e12, 0x123p12 */ \
    M(StringLiteral) /** 'hello word', 'hello''word', 'hello\'word\\' */ \
    \
    M(QuotedIdentifier) /** "x", `x` */ \
    \
    M(OpeningRoundBracket) \
    M(ClosingRoundBracket) \
    \
    M(OpeningSquareBracket) \
    M(ClosingSquareBracket) \
    \
    M(OpeningCurlyBrace) \
    M(ClosingCurlyBrace) \
    \
    M(Comma) \
    M(Semicolon) \
    M(Dot) /** Compound identifiers, like a.b or tuple access operator a.1, (x, y).2. */ \
    /** Needs to be distinguished from a floating point number with omitted integer part: .1 */ \
    \
    M(Asterisk) /** Could be used as multiplication operator or on its own: "SELECT *" */ \
    \
    M(Plus) \
    M(Minus) \
    M(Slash) \
    M(Percent) \
    M(Arrow) /** ->. Should be distinguished from minus operator. */ \
    M(QuestionMark) \
    M(Colon) \
    M(Equals) \
    M(NotEquals) \
    M(Less) \
    M(Greater) \
    M(LessOrEquals) \
    M(GreaterOrEquals) \
    M(Concatenation) /** String concatenation operator: || */ \
    \
    M(At) /** @. Used only for specifying user names. */ \
    \
    /** Order is important. EndOfStream goes after all usual tokens, and special error tokens go after EndOfStream. */ \
    \
    M(EndOfStream) \
    \
    /** Something unrecognized. */ \
    M(Error) \
    /** Something is wrong and we have more information. */ \
    M(ErrorMultilineCommentIsNotClosed) \
    M(ErrorSingleQuoteIsNotClosed) \
    M(ErrorDoubleQuoteIsNotClosed) \
    M(ErrorBackQuoteIsNotClosed) \
    M(ErrorSingleExclamationMark) \
    M(ErrorSinglePipeMark) \
    M(ErrorWrongNumber) \
    M(ErrorMaxQuerySizeExceeded)


/// Token kinds, generated from APPLY_FOR_TOKENS. Enumerators keep the order of that list.
enum class TokenType
{
#define M(TOKEN) TOKEN,
APPLY_FOR_TOKENS(M)
#undef M
};

/// Both functions are only declared here; their definitions live outside this header.
/// NOTE(review): presumably they map a token kind to human-readable text - confirm in the implementation.
const char * getTokenName(TokenType type);
const char * getErrorTokenDescription(TokenType type);


/// A token: its kind plus the half-open range [begin, end) of characters it refers to.
/// The token stores raw pointers only; it does not own the characters.
struct Token
{
    /// Default member initializers make `Token t;` yield the same state as `Token{}`
    /// instead of leaving the fields indeterminate.
    TokenType type = TokenType::Whitespace;
    const char * begin = nullptr;
    const char * end = nullptr;

    /// Length of the token in chars. Expects end >= begin.
    size_t size() const { return static_cast<size_t>(end - begin); }

    Token() = default;
    Token(TokenType type_, const char * begin_, const char * end_) : type(type_), begin(begin_), end(end_) {}

    /// False for whitespace and comments, true for everything else (including EndOfStream and errors).
    bool isSignificant() const { return type != TokenType::Whitespace && type != TokenType::Comment; }

    /// Relies on the declaration order: all error kinds are listed after EndOfStream.
    bool isError() const { return type > TokenType::EndOfStream; }

    bool isEnd() const { return type == TokenType::EndOfStream; }
};


/// Produces tokens one at a time from the character range [begin_, end_).
/// The lexer keeps pointers into the caller's buffer and does not copy it.
class Lexer
{
public:
    /// NOTE(review): the meaning of max_query_size_ == 0 is not visible in this header
    /// (presumably "no limit") - confirm against nextToken()/nextTokenImpl().
    Lexer(const char * begin_, const char * end_, size_t max_query_size_ = 0)
        : begin(begin_), pos(begin_), end(end_), max_query_size(max_query_size_) {}

    Token nextToken();

private:
    const char * const begin;
    const char * pos;
    const char * const end;

    const size_t max_query_size;

    Token nextTokenImpl();

    /// This is needed to disambiguate tuple access operator from floating point number (.1).
    TokenType prev_significant_token_type = TokenType::Whitespace;  /// No previous token.
};

}