Added lexer [#CLICKHOUSE-2].

This commit is contained in:
Alexey Milovidov 2017-07-09 03:04:05 +03:00 committed by alexey-milovidov
parent e8170fde2a
commit ad598d5dfc
4 changed files with 457 additions and 0 deletions

268
dbms/src/Parsers/Lexer.cpp Normal file
View File

@ -0,0 +1,268 @@
#include <Parsers/Lexer.h>
#include <Common/StringUtils.h>
#include <common/find_first_symbols.h>
namespace DB
{
namespace
{
/** Scans a quoted token whose opening quote is at `pos`.
  * This must be consistent with functions in ReadHelpers.h
  *
  * Inside the quotes two things are special:
  *  - a doubled quote character stands for one literal quote and does not end the token;
  *  - a backslash makes the scanner skip the character that follows it, whatever it is.
  *
  * `pos` is advanced past the consumed bytes.
  * Returns `success_token` covering [token_begin, pos) when the closing quote is found,
  *  otherwise `error_token` covering [token_begin, end).
  */
template <char quote, TokenType success_token, TokenType error_token>
Token quotedString(const char *& pos, const char * const token_begin, const char * const end)
{
    /// Step over the opening quote.
    ++pos;

    for (;;)
    {
        /// Jump straight to the next character that can change the scanner state.
        pos = find_first_symbols<quote, '\\'>(pos, end);

        /// Input ended before the closing quote.
        if (pos >= end)
            return Token(error_token, token_begin, end);

        const char special = *pos;
        ++pos;

        if (special == quote)
        {
            /// A single quote terminates the token, a doubled one is part of its content.
            const bool doubled = pos < end && *pos == quote;
            if (!doubled)
                return Token(success_token, token_begin, pos);

            ++pos;
            continue;
        }

        if (special == '\\')
        {
            /// Backslash as the very last byte: nothing to escape and no closing quote.
            if (pos >= end)
                return Token(error_token, token_begin, end);

            /// Skip the escaped character.
            ++pos;
            continue;
        }

        /// find_first_symbols stops only at the two characters handled above.
        __builtin_unreachable();
    }
}
}
/** Returns the token that starts at the current position and moves the position past it.
  *
  * - At the end of input an empty EndOfStream token located at `end` is returned;
  *   calling again keeps returning it.
  * - Nothing is skipped silently: whitespace and comments are returned as tokens of their own.
  * - Malformed input is reported through one of the Error* token types.
  *   Every error token consumes at least one byte, so repeated calls always make progress.
  *
  * The task is to find token boundaries, not to validate or convert values.
  */
Token Lexer::nextToken()
{
    /// The end of an empty token must be a real pointer into the buffer (not a null pointer),
    ///  otherwise Token::size() is meaningless.
    if (pos >= end)
        return Token(TokenType::EndOfStream, end, end);

    const char * const token_begin = pos;

    /// Consumes the rest of the current line; the '\n' itself is left for the next token.
    /// This means that newline in single-line comment cannot be escaped.
    auto commentUntilEndOfLine = [&]()
    {
        pos = find_first_symbols<'\n'>(pos, end);
        return Token(TokenType::Comment, token_begin, pos);
    };

    /// Consumes a (possibly empty) run of characters accepted by isNumericASCII.
    auto skipDigits = [&]()
    {
        while (pos < end && isNumericASCII(*pos))
            ++pos;
    };

    /// Consumes an optional exponent: 'e' or 'p', an optional sign, then digits.
    /// The marker is taken only if at least one more byte follows it.
    /// Bounds are written as `pos < end - 1` so that no pointer beyond `end` is ever formed.
    auto skipExponent = [&]()
    {
        if (pos < end - 1 && (*pos == 'e' || *pos == 'p'))
        {
            ++pos;

            /// sign of exponent
            if (pos < end - 1 && (*pos == '-' || *pos == '+'))
                ++pos;

            skipDigits();
        }
    };

    switch (*pos)
    {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\f':
        case '\v':
        {
            ++pos;
            while (pos < end && isWhitespaceASCII(*pos))
                ++pos;
            return Token(TokenType::Whitespace, token_begin, pos);
        }

        case 'a'...'z':
        case 'A'...'Z':
        case '_':
        {
            /// A word that starts immediately after another word character (for example, after a number:
            ///  SELECT 123FROM). Report the offending character and step over it to guarantee progress.
            if (pos > begin && isWordCharASCII(pos[-1]))
                return Token(TokenType::ErrorWordWithoutWhitespace, token_begin, ++pos);

            ++pos;
            while (pos < end && isWordCharASCII(*pos))
                ++pos;
            return Token(TokenType::BareWord, token_begin, pos);
        }

        case '0'...'9':
        {
            /// 0x, 0b. The prefix is taken only if something follows it.
            /// NOTE(review): after the prefix only isNumericASCII characters are consumed; presumably that
            ///  excludes hex letters a-f, so 0x1F would end at 'F' - confirm and extend if needed.
            if (pos < end - 2 && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b'))
                pos += 2;

            skipDigits();

            /// decimal point
            if (pos < end && *pos == '.')
            {
                ++pos;
                skipDigits();
            }

            /// exponentiation
            skipExponent();

            return Token(TokenType::Number, token_begin, pos);
        }

        case '\'':
            return quotedString<'\'', TokenType::StringLiteral, TokenType::ErrorSingleQuoteIsNotClosed>(pos, token_begin, end);
        case '"':
            return quotedString<'"', TokenType::QuotedIdentifier, TokenType::ErrorDoubleQuoteIsNotClosed>(pos, token_begin, end);
        case '`':
            return quotedString<'`', TokenType::QuotedIdentifier, TokenType::ErrorBackQuoteIsNotClosed>(pos, token_begin, end);

        case '(':
            return Token(TokenType::OpeningRoundBracket, token_begin, ++pos);
        case ')':
            return Token(TokenType::ClosingRoundBracket, token_begin, ++pos);
        case '[':
            return Token(TokenType::OpeningSquareBracket, token_begin, ++pos);
        case ']':
            return Token(TokenType::ClosingSquareBracket, token_begin, ++pos);

        case ',':
            return Token(TokenType::Comma, token_begin, ++pos);
        case ';':
            return Token(TokenType::Semicolon, token_begin, ++pos);

        case '.':   /// qualifier, tuple access operator or start of floating point number
        {
            /// Just after identifier or complex expression.
            if (pos > begin && (pos[-1] == ')' || pos[-1] == ']' || isAlphaNumericASCII(pos[-1])))
                return Token(TokenType::Dot, token_begin, ++pos);

            /// Otherwise it is a number with omitted integer part: .1, .5e-3
            ++pos;
            skipDigits();

            /// exponentiation
            skipExponent();

            return Token(TokenType::Number, token_begin, pos);
        }

        case '+':
            return Token(TokenType::Plus, token_begin, ++pos);

        case '-':   /// minus (-), arrow (->) or start of comment (--)
        {
            ++pos;
            if (pos < end && *pos == '>')
                return Token(TokenType::Arrow, token_begin, ++pos);

            if (pos < end && *pos == '-')
            {
                ++pos;
                return commentUntilEndOfLine();
            }

            return Token(TokenType::Minus, token_begin, pos);
        }

        case '*':
            ++pos;
            return Token(TokenType::Asterisk, token_begin, pos);

        case '/':   /// division (/) or start of comment (//, /*)
        {
            ++pos;
            if (pos < end && (*pos == '/' || *pos == '*'))
            {
                if (*pos == '/')
                {
                    ++pos;
                    return commentUntilEndOfLine();
                }
                else
                {
                    ++pos;
                    while (pos <= end - 2)
                    {
                        /// This means that nested multiline comments are not supported.
                        if (pos[0] == '*' && pos[1] == '/')
                        {
                            pos += 2;
                            return Token(TokenType::Comment, token_begin, pos);
                        }
                        ++pos;
                    }

                    /// The error token covers everything up to `end`, so the position must follow it;
                    ///  otherwise the next call would lex the tail of the input a second time.
                    pos = end;
                    return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end);
                }
            }
            return Token(TokenType::Division, token_begin, pos);
        }

        case '%':
            return Token(TokenType::Modulo, token_begin, ++pos);

        case '=':   /// =, ==
        {
            ++pos;
            if (pos < end && *pos == '=')
                ++pos;
            return Token(TokenType::Equals, token_begin, pos);
        }

        case '!':   /// !=
        {
            ++pos;
            if (pos < end && *pos == '=')
                return Token(TokenType::NotEquals, token_begin, ++pos);
            return Token(TokenType::ErrorSingleExclamationMark, token_begin, pos);
        }

        case '<':   /// <, <=, <>
        {
            ++pos;
            if (pos < end && *pos == '=')
                return Token(TokenType::LessOrEquals, token_begin, ++pos);
            if (pos < end && *pos == '>')
                return Token(TokenType::NotEquals, token_begin, ++pos);
            return Token(TokenType::Less, token_begin, pos);
        }

        case '>':   /// >, >=
        {
            ++pos;
            if (pos < end && *pos == '=')
                return Token(TokenType::GreaterOrEquals, token_begin, ++pos);
            return Token(TokenType::Greater, token_begin, pos);
        }

        case '?':
            return Token(TokenType::QuestionMark, token_begin, ++pos);
        case ':':
            return Token(TokenType::Colon, token_begin, ++pos);

        case '|':   /// only || is valid
        {
            ++pos;
            if (pos < end && *pos == '|')
                return Token(TokenType::Concatenation, token_begin, ++pos);
            return Token(TokenType::ErrorSinglePipeMark, token_begin, pos);
        }

        default:
            /// Unrecognized byte: report it alone and move on.
            return Token(TokenType::Error, token_begin, ++pos);
    }
}
}

89
dbms/src/Parsers/Lexer.h Normal file
View File

@ -0,0 +1,89 @@
#pragma once
namespace DB
{
/// Kinds of tokens produced by Lexer::nextToken.
/// The order of enumerators is significant: all error kinds are listed from Error onwards,
///  and the lexer test tool detects an error token with `type >= TokenType::Error`.
enum class TokenType
{
Whitespace,
Comment,
BareWord, /// Either keyword (SELECT) or identifier (column)
Number, /// Always non-negative. 123 or something like 123.456e12, 0x123p12
StringLiteral, /// 'hello word', 'hello''word', 'hello\'word\\'
QuotedIdentifier, /// "x", `x`
OpeningRoundBracket,
ClosingRoundBracket,
OpeningSquareBracket,
ClosingSquareBracket,
Comma,
Semicolon,
Dot, /// Compound identifiers, like a.b or tuple access operator a.1, (x, y).2.
/// Needs to be distinguished from a floating point number with omitted integer part: .1
Asterisk, /// Could be used as multiplication operator or on its own: SELECT *
Plus,
Minus,
Division,
Modulo,
Arrow, /// ->. Should be distinguished from minus operator.
QuestionMark,
Colon,
Equals, /// Both = and ==
NotEquals, /// Both != and <>
Less,
Greater,
LessOrEquals,
GreaterOrEquals,
Concatenation, /// ||
EndOfStream, /// No more input.
/// Something unrecognized.
Error,
/// Something is wrong and we have more information.
ErrorMultilineCommentIsNotClosed,
ErrorSingleQuoteIsNotClosed,
ErrorDoubleQuoteIsNotClosed,
ErrorBackQuoteIsNotClosed,
ErrorSingleExclamationMark, /// ! that is not followed by =
ErrorSinglePipeMark, /// | that is not followed by |
ErrorWordWithoutWhitespace, /// A word that starts right after another word character.
};
struct Token
{
TokenType type;
const char * begin;
const char * end;
auto size() const { return end - begin; }
Token() = default;
Token(TokenType type, const char * begin, const char * end) : type(type), begin(begin), end(end) {}
};
/// Splits the byte range [begin, end) into tokens, one per call of nextToken.
/// The range is only referenced, not copied, so it must stay valid while the lexer is in use.
class Lexer
{
public:
    /// Lexing starts at `begin_`.
    Lexer(const char * begin_, const char * end_)
        : begin(begin_), pos(begin_), end(end_)
    {
    }

    /// Returns the token at the current position and advances past it.
    Token nextToken();

private:
    /// Start of the input; lets the lexer look at the character before the current one.
    const char * const begin;
    /// Current position, begin <= pos <= end.
    const char * pos;
    /// One past the last byte of the input.
    const char * const end;

    /// NOTE(review): only declared here; no definition is visible alongside nextToken - confirm it is needed.
    void skipWhitespacesAndComments();
};
}

View File

@ -2,6 +2,9 @@ include_directories (${CMAKE_CURRENT_BINARY_DIR})
# No extra sources are shared between the test executables.
set(SRCS )
# Standalone tool built from lexer.cpp; linked against the dbms library.
add_executable (lexer lexer.cpp ${SRCS})
target_link_libraries (lexer dbms)
# Standalone tool built from select_parser.cpp; linked against the dbms library.
add_executable (select_parser select_parser.cpp ${SRCS})
target_link_libraries (select_parser dbms)

View File

@ -0,0 +1,97 @@
#include <map>
#include <Parsers/Lexer.h>
#include <Core/Types.h>
#include <IO/ReadBufferFromFileDescriptor.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
/** How to test:
* for i in ~/work/ClickHouse/dbms/tests/queries/0_stateless/*.sql; do echo $i; grep -q 'FORMAT' $i || ./lexer < $i || break; done
*/
using namespace DB;
/// Terminal escape sequence printed before a token of the given type.
/// Every TokenType has an entry, including all error kinds, so that no token is printed unhighlighted.
/// Punctuation and operators share one colour; all error kinds share another.
std::map<TokenType, const char *> hilite =
{
    {TokenType::Whitespace,                         "\033[0;44m"},
    {TokenType::Comment,                            "\033[1;46m"},
    {TokenType::BareWord,                           "\033[1m"},
    {TokenType::Number,                             "\033[1;36m"},
    {TokenType::StringLiteral,                      "\033[1;32m"},
    {TokenType::QuotedIdentifier,                   "\033[1;35m"},

    {TokenType::OpeningRoundBracket,                "\033[1;33m"},
    {TokenType::ClosingRoundBracket,                "\033[1;33m"},
    {TokenType::OpeningSquareBracket,               "\033[1;33m"},
    {TokenType::ClosingSquareBracket,               "\033[1;33m"},
    {TokenType::Comma,                              "\033[1;33m"},
    {TokenType::Semicolon,                          "\033[1;33m"},
    {TokenType::Dot,                                "\033[1;33m"},
    {TokenType::Asterisk,                           "\033[1;33m"},
    {TokenType::Plus,                               "\033[1;33m"},
    {TokenType::Minus,                              "\033[1;33m"},
    {TokenType::Division,                           "\033[1;33m"},
    {TokenType::Modulo,                             "\033[1;33m"},
    {TokenType::Arrow,                              "\033[1;33m"},
    {TokenType::QuestionMark,                       "\033[1;33m"},
    {TokenType::Colon,                              "\033[1;33m"},
    {TokenType::Equals,                             "\033[1;33m"},
    {TokenType::NotEquals,                          "\033[1;33m"},
    {TokenType::Less,                               "\033[1;33m"},
    {TokenType::Greater,                            "\033[1;33m"},
    {TokenType::LessOrEquals,                       "\033[1;33m"},
    {TokenType::GreaterOrEquals,                    "\033[1;33m"},
    {TokenType::Concatenation,                      "\033[1;33m"},

    {TokenType::EndOfStream,                        ""},

    {TokenType::Error,                              "\033[0;41m"},
    {TokenType::ErrorMultilineCommentIsNotClosed,   "\033[0;41m"},
    {TokenType::ErrorSingleQuoteIsNotClosed,        "\033[0;41m"},
    {TokenType::ErrorDoubleQuoteIsNotClosed,        "\033[0;41m"},
    {TokenType::ErrorBackQuoteIsNotClosed,          "\033[0;41m"},
    {TokenType::ErrorSingleExclamationMark,         "\033[0;41m"},
    /// These two error kinds were absent from the table and were printed without highlighting.
    {TokenType::ErrorSinglePipeMark,                "\033[0;41m"},
    {TokenType::ErrorWordWithoutWhitespace,         "\033[0;41m"},
};
int main(int argc, char ** argv)
{
String query;
ReadBufferFromFileDescriptor in(STDIN_FILENO);
WriteBufferFromFileDescriptor out(STDOUT_FILENO);
readStringUntilEOF(query, in);
Lexer lexer(query.data(), query.data() + query.size());
while (true)
{
Token token = lexer.nextToken();
if (token.type == TokenType::EndOfStream)
break;
writeChar(' ', out);
auto it = hilite.find(token.type);
if (it != hilite.end())
writeCString(it->second, out);
writeString(token.begin, token.size(), out);
if (it != hilite.end())
writeCString("\033[0m", out);
writeChar(' ', out);
if (token.type >= TokenType::Error)
return 1;
}
writeChar('\n', out);
return 0;
}