From ad598d5dfcc5224abd602a011d1f857fdd7fe4d5 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Sun, 9 Jul 2017 03:04:05 +0300
Subject: [PATCH] Added lexer [#CLICKHOUSE-2].

---
Notes from review (this section is ignored when the patch is applied):
 * The #include targets, the template parameter list of quotedString, the template
   arguments of find_first_symbols and of std::map were missing in the copy under
   review and are reconstructed from usage. Please confirm them against the tree.
 * Fixed: the EndOfStream token had a null end pointer; the ErrorWordWithoutWhitespace
   token was constructed from an integer instead of a pointer and did not advance the
   position; the search for the end of a single-line comment was given a character
   in place of the start position; two error token types were missing from the
   highlight table of the test program.
 * Line counts in the hunk headers and in the diffstat are updated. Blob ids in the
   'index' lines are kept from the original patch and do not match the new content.

 dbms/src/Parsers/Lexer.cpp            | 282 ++++++++++++++++++++++++++
 dbms/src/Parsers/Lexer.h              |  93 +++++++++
 dbms/src/Parsers/tests/CMakeLists.txt |   3 +
 dbms/src/Parsers/tests/lexer.cpp      | 102 ++++++++++
 4 files changed, 480 insertions(+)
 create mode 100644 dbms/src/Parsers/Lexer.cpp
 create mode 100644 dbms/src/Parsers/Lexer.h
 create mode 100644 dbms/src/Parsers/tests/lexer.cpp

diff --git a/dbms/src/Parsers/Lexer.cpp b/dbms/src/Parsers/Lexer.cpp
new file mode 100644
index 00000000000..19eab4683d4
--- /dev/null
+++ b/dbms/src/Parsers/Lexer.cpp
@@ -0,0 +1,282 @@
+#include <Parsers/Lexer.h>
+#include <Common/StringUtils.h>
+#include <common/find_first_symbols.h>
+
+
+namespace DB
+{
+
+namespace
+{
+
+/** Consumes a quoted string or a quoted identifier; 'pos' must point to the opening quote.
+  * A doubled quote and any character after a backslash do not terminate the token.
+  * Returns 'success_token' for [token_begin, pos) when the closing quote is found,
+  *  or 'error_token' for [token_begin, end) when the input ends before that.
+  * This must be consistent with functions in ReadHelpers.h
+  */
+template <char quote, TokenType success_token, TokenType error_token>
+Token quotedString(const char *& pos, const char * const token_begin, const char * const end)
+{
+    ++pos;
+    while (true)
+    {
+        pos = find_first_symbols<quote, '\\'>(pos, end);
+        if (pos >= end)
+            return Token(error_token, token_begin, end);
+
+        if (*pos == quote)
+        {
+            ++pos;
+            /// Doubled quote is an escaped quote inside the token.
+            if (pos < end && *pos == quote)
+            {
+                ++pos;
+                continue;
+            }
+            return Token(success_token, token_begin, pos);
+        }
+
+        if (*pos == '\\')
+        {
+            ++pos;
+            /// Backslash is the last character of the input: the quote is not closed.
+            if (pos >= end)
+                return Token(error_token, token_begin, end);
+            ++pos;
+            continue;
+        }
+
+        /// find_first_symbols is expected to stop only at the quote character or at a backslash.
+        __builtin_unreachable();
+    }
+}
+
+}
+
+
+/** Returns the token that starts at the current position and moves the position past it.
+  * Whitespace and comments are returned as tokens as well.
+  * Once the input is exhausted, every call returns an empty EndOfStream token.
+  */
+Token Lexer::nextToken()
+{
+    if (pos >= end)
+        return Token(TokenType::EndOfStream, end, end);
+
+    const char * const token_begin = pos;
+
+    auto commentUntilEndOfLine = [&]() mutable
+    {
+        pos = find_first_symbols<'\n'>(pos, end);    /// This means that newline in single-line comment cannot be escaped.
+        return Token(TokenType::Comment, token_begin, pos);
+    };
+
+    switch (*pos)
+    {
+        case ' ':
+        case '\t':
+        case '\n':
+        case '\r':
+        case '\f':
+        case '\v':
+        {
+            ++pos;
+            while (pos < end && isWhitespaceASCII(*pos))
+                ++pos;
+            return Token(TokenType::Whitespace, token_begin, pos);
+        }
+
+        case 'a'...'z':
+        case 'A'...'Z':
+        case '_':
+        {
+            /// The word is glued to the preceding word character. Report a single character
+            ///  and step over it, so that repeated calls cannot get stuck at the same position.
+            if (pos > begin && isWordCharASCII(pos[-1]))
+                return Token(TokenType::ErrorWordWithoutWhitespace, token_begin, ++pos);
+            ++pos;
+            while (pos < end && isWordCharASCII(*pos))
+                ++pos;
+            return Token(TokenType::BareWord, token_begin, pos);
+        }
+
+        case '0'...'9':
+        {
+            /// 0x, 0b
+            if (pos < end - 2 && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b'))
+            {
+                pos += 2;
+            }
+
+            while (pos < end && isNumericASCII(*pos))
+                ++pos;
+
+            /// decimal point
+            if (pos < end && *pos == '.')
+            {
+                ++pos;
+                while (pos < end && isNumericASCII(*pos))
+                    ++pos;
+            }
+
+            /// exponentiation
+            if (pos + 1 < end && (*pos == 'e' || *pos == 'p'))
+            {
+                ++pos;
+
+                /// sign of exponent
+                if (pos < end - 1 && (*pos == '-' || *pos == '+'))
+                    ++pos;
+
+                while (pos < end && isNumericASCII(*pos))
+                    ++pos;
+            }
+
+            return Token(TokenType::Number, token_begin, pos);
+        }
+
+        case '\'':
+            return quotedString<'\'', TokenType::StringLiteral, TokenType::ErrorSingleQuoteIsNotClosed>(pos, token_begin, end);
+        case '"':
+            return quotedString<'"', TokenType::QuotedIdentifier, TokenType::ErrorDoubleQuoteIsNotClosed>(pos, token_begin, end);
+        case '`':
+            return quotedString<'`', TokenType::QuotedIdentifier, TokenType::ErrorBackQuoteIsNotClosed>(pos, token_begin, end);
+
+        case '(':
+            return Token(TokenType::OpeningRoundBracket, token_begin, ++pos);
+        case ')':
+            return Token(TokenType::ClosingRoundBracket, token_begin, ++pos);
+        case '[':
+            return Token(TokenType::OpeningSquareBracket, token_begin, ++pos);
+        case ']':
+            return Token(TokenType::ClosingSquareBracket, token_begin, ++pos);
+
+        case ',':
+            return Token(TokenType::Comma, token_begin, ++pos);
+        case ';':
+            return Token(TokenType::Semicolon, token_begin, ++pos);
+
+        case '.':   /// qualifier, tuple access operator or start of floating point number
+        {
+            /// Just after identifier or complex expression.
+            if (pos > begin && (pos[-1] == ')' || pos[-1] == ']' || isAlphaNumericASCII(pos[-1])))
+                return Token(TokenType::Dot, token_begin, ++pos);
+
+            ++pos;
+            while (pos < end && isNumericASCII(*pos))
+                ++pos;
+
+            /// exponentiation
+            if (pos < end - 1 && (*pos == 'e' || *pos == 'p'))
+            {
+                ++pos;
+
+                /// sign of exponent
+                if (pos < end - 1 && (*pos == '-' || *pos == '+'))
+                    ++pos;
+
+                while (pos < end && isNumericASCII(*pos))
+                    ++pos;
+            }
+
+            return Token(TokenType::Number, token_begin, pos);
+        }
+
+        case '+':
+            return Token(TokenType::Plus, token_begin, ++pos);
+        case '-':   /// minus (-), arrow (->) or start of comment (--)
+        {
+            ++pos;
+            if (pos < end && *pos == '>')
+                return Token(TokenType::Arrow, token_begin, ++pos);
+
+            if (pos < end && *pos == '-')
+            {
+                ++pos;
+                return commentUntilEndOfLine();
+            }
+
+            return Token(TokenType::Minus, token_begin, pos);
+        }
+        case '*':
+            ++pos;
+            return Token(TokenType::Asterisk, token_begin, pos);
+        case '/':   /// division (/) or start of comment (//, /*)
+        {
+            ++pos;
+            if (pos < end && (*pos == '/' || *pos == '*'))
+            {
+                if (*pos == '/')
+                {
+                    ++pos;
+                    return commentUntilEndOfLine();
+                }
+                else
+                {
+                    ++pos;
+                    while (pos <= end - 2)
+                    {
+                        /// This means that nested multiline comments are not supported.
+                        if (pos[0] == '*' && pos[1] == '/')
+                        {
+                            pos += 2;
+                            return Token(TokenType::Comment, token_begin, pos);
+                        }
+                        ++pos;
+                    }
+                    return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end);
+                }
+            }
+            return Token(TokenType::Division, token_begin, pos);
+        }
+        case '%':
+            return Token(TokenType::Modulo, token_begin, ++pos);
+        case '=':   /// =, ==
+        {
+            ++pos;
+            if (pos < end && *pos == '=')
+                ++pos;
+            return Token(TokenType::Equals, token_begin, pos);
+        }
+        case '!':   /// !=
+        {
+            ++pos;
+            if (pos < end && *pos == '=')
+                return Token(TokenType::NotEquals, token_begin, ++pos);
+            return Token(TokenType::ErrorSingleExclamationMark, token_begin, pos);
+        }
+        case '<':   /// <, <=, <>
+        {
+            ++pos;
+            if (pos < end && *pos == '=')
+                return Token(TokenType::LessOrEquals, token_begin, ++pos);
+            if (pos < end && *pos == '>')
+                return Token(TokenType::NotEquals, token_begin, ++pos);
+            return Token(TokenType::Less, token_begin, pos);
+        }
+        case '>':   /// >, >=
+        {
+            ++pos;
+            if (pos < end && *pos == '=')
+                return Token(TokenType::GreaterOrEquals, token_begin, ++pos);
+            return Token(TokenType::Greater, token_begin, pos);
+        }
+        case '?':
+            return Token(TokenType::QuestionMark, token_begin, ++pos);
+        case ':':
+            return Token(TokenType::Colon, token_begin, ++pos);
+        case '|':
+        {
+            ++pos;
+            if (pos < end && *pos == '|')
+                return Token(TokenType::Concatenation, token_begin, ++pos);
+            return Token(TokenType::ErrorSinglePipeMark, token_begin, pos);
+        }
+
+        default:
+            return Token(TokenType::Error, token_begin, ++pos);
+    }
+}
+
+}
diff --git a/dbms/src/Parsers/Lexer.h b/dbms/src/Parsers/Lexer.h
new file mode 100644
index 00000000000..d0edd575586
--- /dev/null
+++ b/dbms/src/Parsers/Lexer.h
@@ -0,0 +1,93 @@
+#pragma once
+
+
+namespace DB
+{
+
+enum class TokenType
+{
+    Whitespace,
+    Comment,
+
+    BareWord,               /// Either keyword (SELECT) or identifier (column)
+
+    Number,                 /// Always non-negative. 123 or something like 123.456e12, 0x123p12
+    StringLiteral,          /// 'hello word', 'hello''word', 'hello\'word\\'
+
+    QuotedIdentifier,       /// "x", `x`
+
+    OpeningRoundBracket,
+    ClosingRoundBracket,
+
+    OpeningSquareBracket,
+    ClosingSquareBracket,
+
+    Comma,
+    Semicolon,
+    Dot,                    /// Compound identifiers, like a.b or tuple access operator a.1, (x, y).2.
+                            /// Need to be distinguished from floating point number with omitted integer part: .1
+
+    Asterisk,               /// Could be used as multiplication operator or on its own: SELECT *
+
+    Plus,
+    Minus,
+    Division,
+    Modulo,
+    Arrow,                  /// ->. Should be distinguished from minus operator.
+    QuestionMark,
+    Colon,
+    Equals,
+    NotEquals,
+    Less,
+    Greater,
+    LessOrEquals,
+    GreaterOrEquals,
+    Concatenation,          /// ||
+
+    EndOfStream,
+
+    /// Something unrecognized.
+    Error,
+    /// Something is wrong and we have more information.
+    ErrorMultilineCommentIsNotClosed,
+    ErrorSingleQuoteIsNotClosed,
+    ErrorDoubleQuoteIsNotClosed,
+    ErrorBackQuoteIsNotClosed,
+    ErrorSingleExclamationMark,
+    ErrorSinglePipeMark,
+    ErrorWordWithoutWhitespace,
+};
+
+
+/// Range [begin, end) of the lexed text together with the type of the token. The text is not copied.
+struct Token
+{
+    TokenType type;
+    const char * begin;
+    const char * end;
+
+    auto size() const { return end - begin; }
+
+    Token() = default;
+    Token(TokenType type, const char * begin, const char * end) : type(type), begin(begin), end(end) {}
+};
+
+
+/// Splits the text in [begin, end) into tokens, one token per call of nextToken.
+/// Only pointers are stored, so the text must stay valid while the lexer and its tokens are in use.
+class Lexer
+{
+public:
+    Lexer(const char * begin, const char * end) : begin(begin), pos(begin), end(end) {}
+    Token nextToken();
+
+private:
+    const char * const begin;
+    const char * pos;
+    const char * const end;
+
+    /// Declared only; Lexer.cpp does not define it yet.
+    void skipWhitespacesAndComments();
+};
+
+}
diff --git a/dbms/src/Parsers/tests/CMakeLists.txt b/dbms/src/Parsers/tests/CMakeLists.txt
index 01c0b2013c7..85471fa2a81 100644
--- a/dbms/src/Parsers/tests/CMakeLists.txt
+++ b/dbms/src/Parsers/tests/CMakeLists.txt
@@ -2,6 +2,9 @@ include_directories (${CMAKE_CURRENT_BINARY_DIR})
 
 set(SRCS )
 
+add_executable (lexer lexer.cpp ${SRCS})
+target_link_libraries (lexer dbms)
+
 add_executable (select_parser select_parser.cpp ${SRCS})
 target_link_libraries (select_parser dbms)
 
diff --git a/dbms/src/Parsers/tests/lexer.cpp b/dbms/src/Parsers/tests/lexer.cpp
new file mode 100644
index 00000000000..59ff8c0d085
--- /dev/null
+++ b/dbms/src/Parsers/tests/lexer.cpp
@@ -0,0 +1,102 @@
+#include <map>
+#include <Parsers/Lexer.h>
+#include <Core/Types.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadBufferFromFileDescriptor.h>
+#include <IO/WriteBufferFromFileDescriptor.h>
+
+
+/** How to test:
+  * for i in ~/work/ClickHouse/dbms/tests/queries/0_stateless/*.sql; do echo $i; grep -q 'FORMAT' $i || ./lexer < $i || break; done
+  */
+
+
+using namespace DB;
+
+/// Terminal escape sequence that is written before a token of each type.
+std::map<TokenType, const char *> hilite =
+{
+    {TokenType::Whitespace, "\033[0;44m"},
+    {TokenType::Comment, "\033[1;46m"},
+    {TokenType::BareWord, "\033[1m"},
+    {TokenType::Number, "\033[1;36m"},
+    {TokenType::StringLiteral, "\033[1;32m"},
+    {TokenType::QuotedIdentifier, "\033[1;35m"},
+
+    {TokenType::OpeningRoundBracket, "\033[1;33m"},
+    {TokenType::ClosingRoundBracket, "\033[1;33m"},
+    {TokenType::OpeningSquareBracket, "\033[1;33m"},
+    {TokenType::ClosingSquareBracket, "\033[1;33m"},
+
+    {TokenType::Comma, "\033[1;33m"},
+    {TokenType::Semicolon, "\033[1;33m"},
+    {TokenType::Dot, "\033[1;33m"},
+    {TokenType::Asterisk, "\033[1;33m"},
+    {TokenType::Plus, "\033[1;33m"},
+    {TokenType::Minus, "\033[1;33m"},
+    {TokenType::Division, "\033[1;33m"},
+    {TokenType::Modulo, "\033[1;33m"},
+    {TokenType::Arrow, "\033[1;33m"},
+    {TokenType::QuestionMark, "\033[1;33m"},
+    {TokenType::Colon, "\033[1;33m"},
+    {TokenType::Equals, "\033[1;33m"},
+    {TokenType::NotEquals, "\033[1;33m"},
+    {TokenType::Less, "\033[1;33m"},
+    {TokenType::Greater, "\033[1;33m"},
+    {TokenType::LessOrEquals, "\033[1;33m"},
+    {TokenType::GreaterOrEquals, "\033[1;33m"},
+    {TokenType::Concatenation, "\033[1;33m"},
+
+    {TokenType::EndOfStream, ""},
+
+    {TokenType::Error, "\033[0;41m"},
+    {TokenType::ErrorMultilineCommentIsNotClosed, "\033[0;41m"},
+    {TokenType::ErrorSingleQuoteIsNotClosed, "\033[0;41m"},
+    {TokenType::ErrorDoubleQuoteIsNotClosed, "\033[0;41m"},
+    {TokenType::ErrorBackQuoteIsNotClosed, "\033[0;41m"},
+    {TokenType::ErrorSingleExclamationMark, "\033[0;41m"},
+    {TokenType::ErrorSinglePipeMark, "\033[0;41m"},
+    {TokenType::ErrorWordWithoutWhitespace, "\033[0;41m"},
+};
+
+
+/// Reads a query from stdin and writes its tokens to stdout, separated by spaces and highlighted.
+/// Stops with exit code 1 right after writing the first error token.
+int main(int, char **)
+{
+    String query;
+    ReadBufferFromFileDescriptor in(STDIN_FILENO);
+    WriteBufferFromFileDescriptor out(STDOUT_FILENO);
+    readStringUntilEOF(query, in);
+
+    Lexer lexer(query.data(), query.data() + query.size());
+
+    while (true)
+    {
+        Token token = lexer.nextToken();
+
+        if (token.type == TokenType::EndOfStream)
+            break;
+
+        writeChar(' ', out);
+
+        auto it = hilite.find(token.type);
+        if (it != hilite.end())
+            writeCString(it->second, out);
+
+        writeString(token.begin, token.size(), out);
+
+        if (it != hilite.end())
+            writeCString("\033[0m", out);
+
+        writeChar(' ', out);
+
+        if (token.type >= TokenType::Error)
+            return 1;
+    }
+
+    writeChar('\n', out);
+
+    return 0;
+}