Using lexer (development) [#CLICKHOUSE-2].

2024-11-23 08:02:02 +00:00 · 2017-07-13 04:54:51 +03:00 · 2017-07-13 04:54:51 +03:00 · d4974b0724
commit d4974b0724
parent f6ff1f7e72
3 changed files with 54 additions and 35 deletions
--- a/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h
+++ b/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h
@ -277,10 +277,10 @@ private:
                ErrorCodes::SYNTAX_ERROR};
        };

-        auto match = [&](const char * str)
+        auto match = [&pos](const char * str) mutable
        {
            size_t length = strlen(str);
-            if (pos + length < end && 0 == memcmp(pos, str, length))
+            if (pos + length <= end && 0 == memcmp(pos, str, length))
            {
                pos += length;
                return true;
--- a/dbms/src/Parsers/ExpressionElementParsers.cpp
+++ b/dbms/src/Parsers/ExpressionElementParsers.cpp
@ -431,7 +431,7 @@ bool ParserNumber::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
    char * pos_double = buf;
    errno = 0;    /// Functions strto* don't clear errno.
    Float64 float_value = std::strtod(buf, &pos_double);
-    if (pos_double == buf || errno == ERANGE)
+    if (pos_double != buf + pos->size() || errno == ERANGE)
    {
        expected = "number";
        return false;
@ -475,7 +475,7 @@ bool ParserUnsignedInteger::parseImpl(Pos & pos, ASTPtr & node, Expected & expec

    UInt64 x = 0;
    ReadBufferFromMemory in(pos->begin, pos->size());
-    if (!tryReadIntText(x, in) || in.count() == 0)
+    if (!tryReadIntText(x, in) || in.count() != pos->size())
    {
        expected = "unsigned integer";
        return false;
@ -506,6 +506,12 @@ bool ParserStringLiteral::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte
        return false;
    }

+    if (in.count() != pos->size())
+    {
+        expected = "string literal";
+        return false;
+    }
+
    ++pos;
    node = std::make_shared<ASTLiteral>(StringRange(pos->begin, pos->end), s);
    return true;
--- a/dbms/src/Parsers/Lexer.cpp
+++ b/dbms/src/Parsers/Lexer.cpp
@ -98,40 +98,52 @@ Token Lexer::nextTokenImpl()
        {
            /// The task is not to parse a number or check correctness, but only to skip it.

-            /// 0x, 0b
-            bool hex = false;
-            if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
-            {
-                if (pos[1] == 'x' || pos[1] == 'X')
-                    hex = true;
-                pos += 2;
-            }
-            else
-                ++pos;
-
-            while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
-                ++pos;
-
-            /// decimal point
-            if (pos < end && *pos == '.')
+            /// Disambiguation: if the previous token was a dot, then we parse only a simple integer,
+            ///  so that chained tuple access operators (x.1.1) work.
+            ///  Otherwise it would be tokenized as x . 1.1, not as x . 1 . 1
+            if (prev_significant_token_type == TokenType::Dot)
            {
                ++pos;
-                while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
-                    ++pos;
-            }
-
-            /// exponentation (base 10 or base 2)
-            if (pos + 1 < end && (hex ? (*pos == 'p' || *pos == 'P') : (*pos == 'e' || *pos == 'E')))
-            {
-                ++pos;
-
-                /// sign of exponent. It is always decimal.
-                if (pos + 1 < end && (*pos == '-' || *pos == '+'))
-                    ++pos;
-
                while (pos < end && isNumericASCII(*pos))
                    ++pos;
            }
+            else
+            {
+                /// 0x, 0b
+                bool hex = false;
+                if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
+                {
+                    if (pos[1] == 'x' || pos[1] == 'X')
+                        hex = true;
+                    pos += 2;
+                }
+                else
+                    ++pos;
+
+                while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
+                    ++pos;
+
+                /// decimal point
+                if (pos < end && *pos == '.')
+                {
+                    ++pos;
+                    while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos)))
+                        ++pos;
+                }
+
+                /// exponentiation (base 10 or base 2)
+                if (pos + 1 < end && (hex ? (*pos == 'p' || *pos == 'P') : (*pos == 'e' || *pos == 'E')))
+                {
+                    ++pos;
+
+                    /// sign of exponent. It is always decimal.
+                    if (pos + 1 < end && (*pos == '-' || *pos == '+'))
+                        ++pos;
+
+                    while (pos < end && isNumericASCII(*pos))
+                        ++pos;
+                }
+            }

            /// word character cannot go just after number (SELECT 123FROM)
            if (pos < end && isWordCharASCII(*pos))
@ -168,13 +180,14 @@ Token Lexer::nextTokenImpl()

        case '.':   /// qualifier, tuple access operator or start of floating point number
        {
-            /// Just after identifier or complex expression.
+            /// Just after identifier or complex expression or number (for chained tuple access like x.1.1 to work properly).
            if (pos > begin
                && (!(pos + 1 < end && isNumericASCII(pos[1]))
                    || prev_significant_token_type == TokenType::ClosingRoundBracket
                    || prev_significant_token_type == TokenType::ClosingSquareBracket
                    || prev_significant_token_type == TokenType::BareWord
-                    || prev_significant_token_type == TokenType::QuotedIdentifier))
+                    || prev_significant_token_type == TokenType::QuotedIdentifier
+                    || prev_significant_token_type == TokenType::Number))
                return Token(TokenType::Dot, token_begin, ++pos);

            ++pos;