Using lexer (incomplete) [#CLICKHOUSE-2].

2024-09-23 18:20:50 +00:00 · 2017-07-12 04:49:20 +03:00 · 2017-07-12 04:49:20 +03:00 · d815b766fa
commit d815b766fa
parent 4b1eeee1a3
8 changed files with 70 additions and 49 deletions
--- a/dbms/src/Parsers/Lexer.cpp
+++ b/dbms/src/Parsers/Lexer.cpp
@@ -221,10 +221,10 @@ Token Lexer::nextToken()
                    return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end);
                }
            }
-            return Token(TokenType::Division, token_begin, pos);
+            return Token(TokenType::Slash, token_begin, pos);
        }
        case '%':
-            return Token(TokenType::Modulo, token_begin, ++pos);
+            return Token(TokenType::Percent, token_begin, ++pos);
        case '=':   /// =, ==
        {
            ++pos;
--- a/dbms/src/Parsers/Lexer.h
+++ b/dbms/src/Parsers/Lexer.h
@@ -31,8 +31,8 @@ enum class TokenType

    Plus,
    Minus,
-    Division,
-    Modulo,
+    Slash,
+    Percent,
    Arrow,                  /// ->. Should be distinguished from minus operator.
    QuestionMark,
    Colon,
--- a/dbms/src/Parsers/ParserInsertQuery.cpp
+++ b/dbms/src/Parsers/ParserInsertQuery.cpp
@@ -71,17 +71,13 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
    /// VALUES or FORMAT or SELECT
    if (s_values.ignore(pos, expected))
    {
-        data = pos;
+        data = pos->begin;
    }
    else if (s_format.ignore(pos, expected))
    {
        if (!name_p.parse(pos, format, expected))
            return false;

-        /// Data starts after the first newline, if there is one, or after all the whitespace characters, otherwise.
-        const char * data_begin = pos->end;
-
-        ws_without_nl.ignore(pos);
        if (pos->type == TokenType::Semicolon)
            throw Exception("You have excessive ';' symbol before data for INSERT.\n"
                "Example:\n\n"
@@ -91,10 +87,17 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
                "\n"
                "Note that there is no ';' in first line.", ErrorCodes::SYNTAX_ERROR);

-        if (pos.isValid() && *pos == '\n')
-            ++pos;
+        /// Data starts after the first newline, if there is one, or after all the whitespace characters, otherwise.
+        data = pos->begin;

-        data = pos;
+        while (data < end && (*data == ' ' || *data == '\t' || *data == '\f'))
+            ++data;
+
+        if (data < end && *data == '\r')
+            ++data;
+
+        if (data < end && *data == '\n')
+            ++data;
    }
    else if (s_select.ignore(pos, expected))
    {
@@ -108,7 +111,7 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
        return false;
    }

-    auto query = std::make_shared<ASTInsertQuery>(StringRange(begin, data ? data : pos));
+    auto query = std::make_shared<ASTInsertQuery>(StringRange(begin, pos));
    node = query;

    if (database)
@@ -121,7 +124,7 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)

    query->columns = columns;
    query->select = select;
-    query->data = data != end ? data : NULL;
+    query->data = data != end ? data : nullptr;
    query->end = end;

    if (columns)
--- a/dbms/src/Parsers/ParserInsertQuery.h
+++ b/dbms/src/Parsers/ParserInsertQuery.h
@@ -26,10 +26,11 @@ class ParserInsertQuery : public IParserBase
 {
 private:
    const char * end;
-protected:
+
+    const char * getName() const override { return "INSERT query"; }
+    bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override;
+public:
    ParserInsertQuery(const char * end) : end(end) {}
-    const char * getName() const { return "INSERT query"; }
-    bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected);
 };

 }
--- a/dbms/src/Parsers/ParserQuery.h
+++ b/dbms/src/Parsers/ParserQuery.h
@@ -10,10 +10,11 @@ class ParserQuery : public IParserBase
 {
 private:
    const char * end;
-protected:
-    ParserInsertQuery(const char * end) : end(end) {}
-    const char * getName() const { return "Query"; }
-    bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected);
+
+    const char * getName() const override { return "Query"; }
+    bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override;
+public:
+    ParserQuery(const char * end) : end(end) {}
 };

 }
--- a/dbms/src/Parsers/ParserSampleRatio.cpp
+++ b/dbms/src/Parsers/ParserSampleRatio.cpp
@@ -16,7 +16,7 @@ static bool parseDecimal(const char * pos, const char * end, ASTSampleRatio::Rat
    UInt64 num_after = 0;
    Int64 exponent = 0;

-    IParser::Pos pos_after_first_num = tryReadIntText(num_before, pos, end);
+    const char * pos_after_first_num = tryReadIntText(num_before, pos, end);

    bool has_num_before_point = pos_after_first_num > pos;
    pos = pos_after_first_num;
@@ -32,7 +32,7 @@ static bool parseDecimal(const char * pos, const char * end, ASTSampleRatio::Rat

    if (has_point)
    {
-        IParser::Pos pos_after_second_num = tryReadIntText(num_after, pos, end);
+        const char * pos_after_second_num = tryReadIntText(num_after, pos, end);
        number_of_digits_after_point = pos_after_second_num - pos;
        pos = pos_after_second_num;
    }
@@ -42,7 +42,7 @@ static bool parseDecimal(const char * pos, const char * end, ASTSampleRatio::Rat
    if (has_exponent)
    {
        ++pos;
-        IParser::Pos pos_after_exponent = tryReadIntText(exponent, pos, end);
+        const char * pos_after_exponent = tryReadIntText(exponent, pos, end);

        if (pos_after_exponent == pos)
            return false;
@@ -92,7 +92,7 @@ bool ParserSampleRatio::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
    if (!parseDecimal(pos->begin, pos->end, numerator))
        return false;

-    bool has_slash = pos.isValid() && *pos == '/';
+    bool has_slash = pos->type == TokenType::Slash;

    if (has_slash)
    {
--- a/dbms/src/Parsers/parseQuery.cpp
+++ b/dbms/src/Parsers/parseQuery.cpp
@@ -1,6 +1,8 @@
 #include <Parsers/parseQuery.h>
 #include <Parsers/ParserQuery.h>
 #include <Parsers/ASTInsertQuery.h>
+#include <Parsers/Lexer.h>
+#include <Parsers/TokenIterator.h>
 #include <Common/StringUtils.h>
 #include <Common/typeid_cast.h>
 #include <IO/WriteHelpers.h>
@@ -24,7 +26,7 @@ static std::pair<size_t, size_t> getLineAndCol(const char * begin, const char *
    size_t line = 0;

    const char * nl;
-    while (nullptr != (nl = reinterpret_cast<IParser::Pos>(memchr(begin, '\n', pos - begin))))
+    while (nullptr != (nl = reinterpret_cast<const char *>(memchr(begin, '\n', pos - begin))))
    {
        ++line;
        begin = nl + 1;
@@ -117,7 +119,11 @@ ASTPtr tryParseQuery(
    const std::string & description,
    bool allow_multi_statements)
 {
-    if (pos == end || *pos == ';')
+    Tokens tokens(pos, end);
+    TokenIterator token_iterator(tokens);
+
+    if (token_iterator->type == TokenType::EndOfStream
+        || token_iterator->type == TokenType::Semicolon)
    {
        out_error_message = "Empty query";
        return nullptr;
@@ -125,33 +131,42 @@ ASTPtr tryParseQuery(

    Expected expected = "";
    const char * begin = pos;
-    const char * max_parsed_pos = pos;

    ASTPtr res;
-    bool parse_res = parser.parse(pos, res, expected);
+    bool parse_res = parser.parse(token_iterator, res, expected);

-    /// Parsed query must end with end of data or semicolon.
-    if (!parse_res || (pos.isValid() && *pos != ';'))
+    /// Lexical error
+    if (!parse_res && token_iterator->type > TokenType::EndOfStream)
+    {
+        expected = "any valid token";
+        out_error_message = getSyntaxErrorMessage(begin, end, token_iterator->begin, expected, hilite, description);
+        return nullptr;
+    }
+
+    /// Excessive input after query. Parsed query must end with end of data or semicolon.
+    if (parse_res && token_iterator->type != TokenType::EndOfStream && token_iterator->type != TokenType::Semicolon)
    {
-        if (!expected || !*expected)
        expected = "end of query";
-        out_error_message = getSyntaxErrorMessage(begin, expected, hilite, description);
+        out_error_message = getSyntaxErrorMessage(begin, end, token_iterator->begin, expected, hilite, description);
        return nullptr;
    }

    /// If multi-statements are not allowed, then after semicolon, there must be no non-space characters.
-    if (!allow_multi_statements && pos.isValid() && *pos == ';')
-    {
-        ++pos;
-        while (pos.isValid() && isWhitespaceASCII(*pos))
-            ++pos;
+    while (token_iterator->type == TokenType::Semicolon)
+        ++token_iterator;

-        if (pos.isValid())
+    if (!allow_multi_statements && token_iterator->type != TokenType::EndOfStream)
    {
-            out_error_message = getSyntaxErrorMessage(begin, end, pos, nullptr, hilite,
+        out_error_message = getSyntaxErrorMessage(begin, end, token_iterator->begin, nullptr, hilite,
            (description.empty() ? std::string() : std::string(". ")) + "Multi-statements are not allowed");
        return nullptr;
    }
+
+    /// Parse error.
+    if (!parse_res)
+    {
+        out_error_message = getSyntaxErrorMessage(begin, end, token_iterator->begin, expected, hilite, description);
+        return nullptr;
    }

    return res;
@@ -189,15 +204,16 @@ ASTPtr parseQuery(
 std::pair<const char *, bool> splitMultipartQuery(const std::string & queries, std::vector<std::string> & queries_list)
 {
    ASTPtr ast;
-    ParserQuery parser;

    const char * begin = queries.data(); /// begin of current query
    const char * pos = begin; /// parser moves pos from begin to the end of current query
    const char * end = begin + queries.size();

+    ParserQuery parser(end);
+
    queries_list.clear();

-    while (pos.isValid())
+    while (pos < end)
    {
        begin = pos;

--- a/dbms/src/Parsers/tests/lexer.cpp
+++ b/dbms/src/Parsers/tests/lexer.cpp
@@ -34,8 +34,8 @@ std::map<TokenType, const char *> hilite =
    {TokenType::Asterisk, "\033[1;33m"},
    {TokenType::Plus, "\033[1;33m"},
    {TokenType::Minus, "\033[1;33m"},
-    {TokenType::Division, "\033[1;33m"},
-    {TokenType::Modulo, "\033[1;33m"},
+    {TokenType::Slash, "\033[1;33m"},
+    {TokenType::Percent, "\033[1;33m"},
    {TokenType::Arrow, "\033[1;33m"},
    {TokenType::QuestionMark, "\033[1;33m"},
    {TokenType::Colon, "\033[1;33m"},