ClickHouse/dbms/src/Parsers/Lexer.h

#pragma once

#include <stddef.h>


namespace DB
{

/** X-macro listing every token kind: expands to M(Name) once per token, in the order written here.
  * The order is significant: regular tokens come first, then EndOfStream, then the error tokens,
  * so that "is an error" can be expressed as a single comparison against EndOfStream.
  * Only block comments may be used inside the list, because every line is spliced by a trailing backslash.
  */
#define APPLY_FOR_TOKENS(M) \
    M(Whitespace) \
    M(Comment) \
    \
    M(BareWord)               /** Either keyword (SELECT) or identifier (column) */ \
    \
    M(Number)                 /** Always non-negative. No leading plus. 123 or something like 123.456e12, 0x123p12 */ \
    M(StringLiteral)          /** 'hello word', 'hello''word', 'hello\'word\\' */ \
    \
    M(QuotedIdentifier)       /** "x", `x` */ \
    \
    M(OpeningRoundBracket) \
    M(ClosingRoundBracket) \
    \
    M(OpeningSquareBracket) \
    M(ClosingSquareBracket) \
    \
    M(OpeningCurlyBrace) \
    M(ClosingCurlyBrace) \
    \
    M(Comma) \
    M(Semicolon) \
    M(Dot)                    /** Compound identifiers, like a.b or tuple access operator a.1, (x, y).2. */ \
                              /** Needs to be distinguished from a floating point number with omitted integer part: .1 */ \
    \
    M(Asterisk)               /** Could be used as multiplication operator or on its own: "SELECT *" */ \
    \
    M(Plus) \
    M(Minus) \
    M(Slash) \
    M(Percent) \
    M(Arrow)                  /** ->. Should be distinguished from minus operator. */ \
    M(QuestionMark) \
    M(Colon) \
    M(Equals) \
    M(NotEquals) \
    M(Less) \
    M(Greater) \
    M(LessOrEquals) \
    M(GreaterOrEquals) \
    M(Concatenation)          /** String concatenation operator: || */ \
    \
    /** Order is important. EndOfStream goes after all usual tokens, and special error tokens go after EndOfStream. */ \
    \
    M(EndOfStream) \
    \
    /** Something unrecognized. */ \
    M(Error) \
    /** Something is wrong and we have more information. */ \
    M(ErrorMultilineCommentIsNotClosed) \
    M(ErrorSingleQuoteIsNotClosed) \
    M(ErrorDoubleQuoteIsNotClosed) \
    M(ErrorBackQuoteIsNotClosed) \
    M(ErrorSingleExclamationMark) \
    M(ErrorSinglePipeMark) \
    M(ErrorWrongNumber) \
    M(ErrorMaxQuerySizeExceeded) \


/// Kind of a token. One enumerator is generated per APPLY_FOR_TOKENS entry, in the same order,
/// so relational comparisons between enumerators follow the order of that list.
enum class TokenType
{
#define DECLARE_TOKEN_TYPE(NAME) NAME,
    APPLY_FOR_TOKENS(DECLARE_TOKEN_TYPE)
#undef DECLARE_TOKEN_TYPE
};

/// Maps a token type to a C string. Defined outside this header.
/// NOTE(review): presumably the enumerator's identifier as text — confirm against the definition.
const char * getTokenName(TokenType type);
/// Maps a token type to a C string describing the error. Defined outside this header.
/// NOTE(review): the result for non-error token types is not visible from here — confirm before relying on it.
const char * getErrorTokenDescription(TokenType type);


/** A single lexeme: its kind plus the character range [begin, end) it occupies.
  * Only pointers are stored; the token neither owns nor copies the text they delimit.
  */
struct Token
{
    /// The defaults are exactly what value-initialization (`Token{}`) already produced:
    /// the first enumerator and a null range. With them, a plain `Token t;` is an empty,
    /// insignificant token instead of holding indeterminate values.
    TokenType type = TokenType::Whitespace;
    const char * begin = nullptr;
    const char * end = nullptr;

    /// Number of characters covered by the token.
    size_t size() const { return static_cast<size_t>(end - begin); }

    Token() = default;
    Token(TokenType type, const char * begin, const char * end) : type(type), begin(begin), end(end) {}

    /// False for whitespace and comments, true for every other kind.
    bool isSignificant() const { return type != TokenType::Whitespace && type != TokenType::Comment; }
    /// Error kinds are the enumerators declared after EndOfStream, hence the ordering comparison.
    bool isError() const { return type > TokenType::EndOfStream; }
    /// True only for the EndOfStream token.
    bool isEnd() const { return type == TokenType::EndOfStream; }
};


/** Splits the character range [begin, end) into tokens, handing out one token per nextToken() call.
  * The range is referenced through pointers, not copied.
  */
class Lexer
{
public:
    /// NOTE(review): max_query_size == 0 presumably means "no limit"; the check lives in the
    /// implementation, which is not visible here — confirm.
    Lexer(const char * begin, const char * end, size_t max_query_size = 0)
        : begin(begin)
        , pos(begin)
        , end(end)
        , max_query_size(max_query_size)
    {
    }

    /// Returns the next token. Defined outside this header.
    Token nextToken();

private:
    /// Helper for nextToken(). Defined outside this header.
    Token nextTokenImpl();

    const char * const begin;   /// Start of the input; never changes.
    const char * pos;           /// Current read position; starts at begin.
    const char * const end;     /// End of the input; never changes.

    const size_t max_query_size;

    /// Kind of the previous significant token; needed to tell the tuple access operator
    /// apart from a floating point number with omitted integer part (.1).
    /// Whitespace stands for "no previous token yet".
    TokenType prev_significant_token_type = TokenType::Whitespace;
};

}
Added lexer [#CLICKHOUSE-2]. 2017-07-09 00:04:05 +00:00			`#pragma once`

Using lexer (development) [#CLICKHOUSE-2]. 2017-07-12 19:20:57 +00:00			`#include <stddef.h>`

Added lexer [#CLICKHOUSE-2]. 2017-07-09 00:04:05 +00:00
			`namespace DB`
			`{`

Using lexer: better error messages [#CLICKHOUSE-2]. 2017-07-13 04:20:56 +00:00			`#define APPLY_FOR_TOKENS(M) \`
			`M(Whitespace) \`
			`M(Comment) \`
			`\`
			`M(BareWord) /** Either keyword (SELECT) or identifier (column) */ \`
			`\`
			`M(Number) /** Always non-negative. No leading plus. 123 or something like 123.456e12, 0x123p12 */ \`
			`M(StringLiteral) /** 'hello word', 'hello''word', 'hello\'word\\' */ \`
			`\`
			M(QuotedIdentifier) /** "x", `x` */ \
			`\`
			`M(OpeningRoundBracket) \`
			`M(ClosingRoundBracket) \`
			`\`
			`M(OpeningSquareBracket) \`
			`M(ClosingSquareBracket) \`
			`\`
fix style and add tests 2019-05-25 13:43:52 +00:00			`M(OpeningCurlyBrace) \`
			`M(ClosingCurlyBrace) \`
new 2019-05-18 21:07:23 +00:00			`\`
Using lexer: better error messages [#CLICKHOUSE-2]. 2017-07-13 04:20:56 +00:00			`M(Comma) \`
			`M(Semicolon) \`
			`M(Dot) /** Compound identifiers, like a.b or tuple access operator a.1, (x, y).2. */ \`
			`/** Need to be distinguished from floating point number with omitted integer part: .1 */ \`
			`\`
			`M(Asterisk) /** Could be used as multiplication operator or on its own: "SELECT *" */ \`
			`\`
			`M(Plus) \`
			`M(Minus) \`
			`M(Slash) \`
			`M(Percent) \`
			`M(Arrow) /** ->. Should be distinguished from minus operator. */ \`
			`M(QuestionMark) \`
			`M(Colon) \`
			`M(Equals) \`
			`M(NotEquals) \`
			`M(Less) \`
			`M(Greater) \`
			`M(LessOrEquals) \`
			`M(GreaterOrEquals) \`
			`M(Concatenation) /** String concatenation operator: \|\| */ \`
			`\`
			`/** Order is important. EndOfStream goes after all usual tokens, and special error tokens goes after EndOfStream. */ \`
			`\`
			`M(EndOfStream) \`
			`\`
			`/** Something unrecognized. */ \`
			`M(Error) \`
			`/** Something is wrong and we have more information. */ \`
			`M(ErrorMultilineCommentIsNotClosed) \`
			`M(ErrorSingleQuoteIsNotClosed) \`
			`M(ErrorDoubleQuoteIsNotClosed) \`
			`M(ErrorBackQuoteIsNotClosed) \`
			`M(ErrorSingleExclamationMark) \`
			`M(ErrorSinglePipeMark) \`
			`M(ErrorWrongNumber) \`
minor fixes 2018-03-26 19:41:55 +00:00			`M(ErrorMaxQuerySizeExceeded) \`

Using lexer: better error messages [#CLICKHOUSE-2]. 2017-07-13 04:20:56 +00:00
Added lexer [#CLICKHOUSE-2]. 2017-07-09 00:04:05 +00:00			`enum class TokenType`
			`{`
Using lexer: better error messages [#CLICKHOUSE-2]. 2017-07-13 04:20:56 +00:00			`#define M(TOKEN) TOKEN,`
			`APPLY_FOR_TOKENS(M)`
			`#undef M`
Added lexer [#CLICKHOUSE-2]. 2017-07-09 00:04:05 +00:00			`};`

Using lexer: better error messages [#CLICKHOUSE-2]. 2017-07-13 04:20:56 +00:00			`const char * getTokenName(TokenType type);`
			`const char * getErrorTokenDescription(TokenType type);`

Added lexer [#CLICKHOUSE-2]. 2017-07-09 00:04:05 +00:00
			`struct Token`
			`{`
			`TokenType type;`
			`const char * begin;`
			`const char * end;`

Using lexer (development) [#CLICKHOUSE-2]. 2017-07-12 19:20:57 +00:00			`size_t size() const { return end - begin; }`
Added lexer [#CLICKHOUSE-2]. 2017-07-09 00:04:05 +00:00
			`Token() = default;`
			`Token(TokenType type, const char * begin, const char * end) : type(type), begin(begin), end(end) {}`
Using lexer (development) [#CLICKHOUSE-2]. 2017-07-13 01:12:13 +00:00
			`bool isSignificant() const { return type != TokenType::Whitespace && type != TokenType::Comment; }`
			`bool isError() const { return type > TokenType::EndOfStream; }`
			`bool isEnd() const { return type == TokenType::EndOfStream; }`
Added lexer [#CLICKHOUSE-2]. 2017-07-09 00:04:05 +00:00			`};`


			`class Lexer`
			`{`
			`public:`
minor fixes 2018-03-26 19:41:55 +00:00			`Lexer(const char * begin, const char * end, size_t max_query_size = 0)`
			`: begin(begin), pos(begin), end(end), max_query_size(max_query_size) {}`
Added lexer [#CLICKHOUSE-2]. 2017-07-09 00:04:05 +00:00			`Token nextToken();`

			`private:`
			`const char * const begin;`
			`const char * pos;`
			`const char * const end;`
Using lexer (development) [#CLICKHOUSE-2]. 2017-07-13 01:12:13 +00:00
minor fixes 2018-03-26 19:41:55 +00:00			`const size_t max_query_size;`

Using lexer (development) [#CLICKHOUSE-2]. 2017-07-13 01:12:13 +00:00			`Token nextTokenImpl();`

			`/// This is needed to disambiguate tuple access operator from floating point number (.1).`
			`TokenType prev_significant_token_type = TokenType::Whitespace; /// No previous token.`
Added lexer [#CLICKHOUSE-2]. 2017-07-09 00:04:05 +00:00			`};`

			`}`