Merge pull request #43925 from jh0x/improve-number-separator

Fixes https://github.com/ClickHouse/ClickHouse/issues/28967
This commit is contained in:
Vladimir C 2022-12-13 12:37:32 +01:00 committed by GitHub
commit ef45590413
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 429 additions and 38 deletions

View File

@ -156,6 +156,20 @@ inline bool isValidIdentifier(std::string_view str)
&& !(str.size() == strlen("null") && 0 == strncasecmp(str.data(), "null", strlen("null"))); && !(str.size() == strlen("null") && 0 == strncasecmp(str.data(), "null", strlen("null")));
} }
/// Decides whether the character at `pos` is an underscore acting as a digit separator
/// inside a numeric literal (e.g. the `_` in 1_000 or 0xdead_beef).
///
/// An underscore is accepted only when it is immediately followed by a digit and it is
/// not the first character of the current block of digits.
///
/// @param is_start_of_block  true if no character of the current digit block has been accepted yet
///                           (callers set it right after a prefix, a decimal point or an exponent marker).
/// @param is_hex             if true the following character is checked with isHexDigit, otherwise with isNumericASCII.
/// @param pos                candidate character.
/// @param end                one past the last readable character.
/// @return true only for a well-placed separator; false for anything else, including an empty range.
inline bool isNumberSeparator(bool is_start_of_block, bool is_hex, const char * pos, const char * end)
{
    /// Never read past the end of the input: an empty range cannot contain a separator.
    if (pos >= end || *pos != '_')
        return false;

    if (is_start_of_block)
        return false; // e.g. _123, 12e_3

    if (pos + 1 == end)
        return false; // e.g. 12_

    const char next = pos[1];
    if (!(is_hex ? isHexDigit(next) : isNumericASCII(next)))
        return false; // e.g. 1__2, 1_., 1_e, 1_p, 1_;

    return true;
}
/// Works assuming isAlphaASCII. /// Works assuming isAlphaASCII.
inline char toLowerIfAlphaASCII(char c) inline char toLowerIfAlphaASCII(char c)
{ {

View File

@ -830,21 +830,65 @@ bool ParserNumber::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
if (!pos.isValid()) if (!pos.isValid())
return false; return false;
/** Maximum length of number. 319 symbols is enough to write maximum double in decimal form. auto try_read_float = [&](const char * it, const char * end)
* Copy is needed to use strto* functions, which require 0-terminated string. {
*/ char * str_end;
static constexpr size_t MAX_LENGTH_OF_NUMBER = 319; errno = 0; /// Functions strto* don't clear errno.
Float64 float_value = std::strtod(it, &str_end);
if (str_end == end && errno != ERANGE)
{
if (float_value < 0)
throw Exception("Logical error: token number cannot begin with minus, but parsed float number is less than zero.", ErrorCodes::LOGICAL_ERROR);
if (pos->size() > MAX_LENGTH_OF_NUMBER) if (negative)
float_value = -float_value;
res = float_value;
auto literal = std::make_shared<ASTLiteral>(res);
literal->begin = literal_begin;
literal->end = ++pos;
node = literal;
return true;
}
expected.add(pos, "number");
return false;
};
/// NaN and Inf
if (pos->type == TokenType::BareWord)
{
return try_read_float(pos->begin, pos->end);
}
if (pos->type != TokenType::Number)
{ {
expected.add(pos, "number"); expected.add(pos, "number");
return false; return false;
} }
/** Maximum length of number. 319 symbols is enough to write maximum double in decimal form.
* Copy is needed to use strto* functions, which require 0-terminated string.
*/
static constexpr size_t MAX_LENGTH_OF_NUMBER = 319;
char buf[MAX_LENGTH_OF_NUMBER + 1]; char buf[MAX_LENGTH_OF_NUMBER + 1];
size_t size = pos->size(); size_t buf_size = 0;
memcpy(buf, pos->begin, size); for (const auto * it = pos->begin; it != pos->end; ++it)
{
if (*it != '_')
buf[buf_size++] = *it;
if (unlikely(buf_size > MAX_LENGTH_OF_NUMBER))
{
expected.add(pos, "number");
return false;
}
}
size_t size = buf_size;
buf[size] = 0; buf[size] = 0;
char * start_pos = buf; char * start_pos = buf;
@ -915,29 +959,7 @@ bool ParserNumber::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
return true; return true;
} }
char * pos_double = buf; return try_read_float(buf, buf + buf_size);
errno = 0; /// Functions strto* don't clear errno.
Float64 float_value = std::strtod(buf, &pos_double);
if (pos_double == buf + pos->size() && errno != ERANGE)
{
if (float_value < 0)
throw Exception("Logical error: token number cannot begin with minus, but parsed float number is less than zero.", ErrorCodes::LOGICAL_ERROR);
if (negative)
float_value = -float_value;
res = float_value;
auto literal = std::make_shared<ASTLiteral>(res);
literal->begin = literal_begin;
literal->end = ++pos;
node = literal;
return true;
}
expected.add(pos, "number");
return false;
} }

View File

@ -105,44 +105,71 @@ Token Lexer::nextTokenImpl()
if (prev_significant_token_type == TokenType::Dot) if (prev_significant_token_type == TokenType::Dot)
{ {
++pos; ++pos;
while (pos < end && isNumericASCII(*pos)) while (pos < end && (isNumericASCII(*pos) || isNumberSeparator(false, false, pos, end)))
++pos; ++pos;
} }
else else
{ {
bool start_of_block = false;
/// 0x, 0b /// 0x, 0b
bool hex = false; bool hex = false;
if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B')) if (pos + 2 < end && *pos == '0' && (pos[1] == 'x' || pos[1] == 'b' || pos[1] == 'X' || pos[1] == 'B'))
{ {
bool is_valid = false;
if (pos[1] == 'x' || pos[1] == 'X') if (pos[1] == 'x' || pos[1] == 'X')
hex = true; {
pos += 2; if (isHexDigit(pos[2]))
{
hex = true;
is_valid = true; // hex
}
}
else if (pos[2] == '0' || pos[2] == '1')
is_valid = true; // bin
if (is_valid)
{
pos += 2;
start_of_block = true;
}
else
++pos; // consume the leading zero - could be an identifier
} }
else else
++pos; ++pos;
while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos))) while (pos < end && ((hex ? isHexDigit(*pos) : isNumericASCII(*pos)) || isNumberSeparator(start_of_block, hex, pos, end)))
{
++pos; ++pos;
start_of_block = false;
}
/// decimal point /// decimal point
if (pos < end && *pos == '.') if (pos < end && *pos == '.')
{ {
start_of_block = true;
++pos; ++pos;
while (pos < end && (hex ? isHexDigit(*pos) : isNumericASCII(*pos))) while (pos < end && ((hex ? isHexDigit(*pos) : isNumericASCII(*pos)) || isNumberSeparator(start_of_block, hex, pos, end)))
{
++pos; ++pos;
start_of_block = false;
}
} }
/// exponentiation (base 10 or base 2) /// exponentiation (base 10 or base 2)
if (pos + 1 < end && (hex ? (*pos == 'p' || *pos == 'P') : (*pos == 'e' || *pos == 'E'))) if (pos + 1 < end && (hex ? (*pos == 'p' || *pos == 'P') : (*pos == 'e' || *pos == 'E')))
{ {
start_of_block = true;
++pos; ++pos;
/// sign of exponent. It is always decimal. /// sign of exponent. It is always decimal.
if (pos + 1 < end && (*pos == '-' || *pos == '+')) if (pos + 1 < end && (*pos == '-' || *pos == '+'))
++pos; ++pos;
while (pos < end && isNumericASCII(*pos)) while (pos < end && (isNumericASCII(*pos) || isNumberSeparator(start_of_block, false, pos, end)))
{
++pos; ++pos;
start_of_block = false;
}
} }
} }
@ -201,21 +228,29 @@ Token Lexer::nextTokenImpl()
|| prev_significant_token_type == TokenType::Number)) || prev_significant_token_type == TokenType::Number))
return Token(TokenType::Dot, token_begin, ++pos); return Token(TokenType::Dot, token_begin, ++pos);
bool start_of_block = true;
++pos; ++pos;
while (pos < end && isNumericASCII(*pos)) while (pos < end && (isNumericASCII(*pos) || isNumberSeparator(start_of_block, false, pos, end)))
{
++pos; ++pos;
start_of_block = false;
}
/// exponentiation /// exponentiation
if (pos + 1 < end && (*pos == 'e' || *pos == 'E')) if (pos + 1 < end && (*pos == 'e' || *pos == 'E'))
{ {
start_of_block = true;
++pos; ++pos;
/// sign of exponent /// sign of exponent
if (pos + 1 < end && (*pos == '-' || *pos == '+')) if (pos + 1 < end && (*pos == '-' || *pos == '+'))
++pos; ++pos;
while (pos < end && isNumericASCII(*pos)) while (pos < end && (isNumericASCII(*pos) || isNumberSeparator(start_of_block, false, pos, end)))
{
++pos; ++pos;
start_of_block = false;
}
} }
return Token(TokenType::Number, token_begin, pos); return Token(TokenType::Number, token_begin, pos);

View File

@ -0,0 +1,40 @@
#!/usr/bin/expect -f

# Checks that a bare `0b` / `0x` (a binary/hex prefix with no digits) is reported by
# clickhouse-client as an unknown identifier.

set script_dir [file dirname $argv0]
set script_name [file tail $argv0]

# Write expect's internal diagnostics to a per-test debug log; keep the terminal quiet.
exp_internal -f $env(CLICKHOUSE_TMP)/${script_name}.debuglog 0
log_user 0

set timeout 60
match_max 100000
set stty_init "rows 25 cols 120"

# Fallback patterns applied after every explicit `expect`:
# keep going on eof, exit with status 1 on timeout.
expect_after {
    eof { exp_continue }
    timeout { exit 1 }
}

spawn bash
send "source ${script_dir}/../shell_config.sh\r"

# Each prefix is tried bare, with a trailing ';' and with a trailing ' ;'.
foreach prefix {0b 0x} {
    foreach tail {"" ";" " ;"} {
        send "\$CLICKHOUSE_CLIENT --query 'select ${prefix}${tail}'\r"
        expect "DB::Exception: Missing columns: '${prefix}' while processing query: 'SELECT `${prefix}`', required columns: '${prefix}'. (UNKNOWN_IDENTIFIER)"
    }
}

send "exit\r"
expect eof

View File

@ -0,0 +1,126 @@
1234
1234
1234
1234
1234
1234
-1234
-1234
-1234
12.34
12.34
12.34
12.34
-12.34
-12.34
-12.34
-12.34
3.4e22
3.4e22
3.4e22
3.4e22
3.4e22
3.4e22
3.4e22
3.4e22
3.4e-20
3.4e-20
3.4e-20
3.4e-20
-3.4e22
-3.4e22
-3.4e22
-3.4e22
-3.4e22
-3.4e22
-3.4e22
-3.4e22
-3.4e-20
-3.4e-20
-3.4e-20
-3.4e-20
1.34e21
1.34e21
1.34e21
1.34e21
1.34e21
1.34e21
1.34e21
1.34e21
1.34e-21
1.34e-21
1.34e-21
1.34e-21
-1.34e21
-1.34e21
-1.34e21
-1.34e21
-1.34e21
-1.34e21
-1.34e21
-1.34e21
-1.34e-21
-1.34e-21
-1.34e-21
-1.34e-21
-340000000000000000000
-340000000000000000000
-340000000000000000000
-340000000000000000000
-340000000000000000000
-340000000000000000000
-340000000000000000000
-340000000000000000000
-3.4e-22
-3.4e-22
-3.4e-22
-3.4e-22
nan
nan
inf
inf
-inf
inf
inf
-inf
inf
inf
-inf
15
15
15
-15
-15
-15
4660
4660
4660
-4660
-4660
-4660
238
238
1.1376953125
1.1376953125
-1.1376953125
-1.1376953125
0.9296875
0.9296875
2.275390625
2.275390625
2.275390625
2.275390625
2.275390625
2.275390625
0.56884765625
0.56884765625
0.56884765625
-2.275390625
-2.275390625
-2.275390625
-2.275390625
-2.275390625
-2.275390625
-0.56884765625
-0.56884765625
-0.56884765625

View File

@ -0,0 +1,154 @@
-- Decimal literals with `_` used as a digit separator.
-- Each group starts with the plain spelling of a value, followed by the same value
-- written with separators in the integer part, the fraction and/or the exponent.
SELECT 1234; -- Positive integer (+ implied)
SELECT 1_234;
SELECT 1_2_3_4;
SELECT +1234; -- Positive integer (+ explicit)
SELECT +1_234;
SELECT +1_2_3_4;
SELECT -1234; -- Negative integer
SELECT -1_234;
SELECT -1_2_3_4;
SELECT 12.34; -- Positive floating point with . notation
SELECT 12.3_4;
SELECT 1_2.34;
SELECT 1_2.3_4;
SELECT -12.34; -- Negative floating point with . notation
SELECT -12.3_4;
SELECT -1_2.34;
SELECT -1_2.3_4;
SELECT 34e21; -- Positive floating point with positive scientific notation (+ implied)
SELECT 3_4e21;
SELECT 34e2_1;
SELECT 3_4e2_1;
SELECT 34e+21; -- Positive floating point with positive scientific notation (+ explicit)
SELECT 3_4e+21;
SELECT 34e+2_1;
SELECT 3_4e+2_1;
SELECT 34e-21; -- Positive floating point with negative scientific notation
SELECT 3_4e-21;
SELECT 34e-2_1;
SELECT 3_4e-2_1;
SELECT -34e21; -- Negative floating point with positive scientific notation (+ implied)
SELECT -3_4e21;
SELECT -34e2_1;
SELECT -3_4e2_1;
SELECT -34e+21; -- Negative floating point with positive scientific notation (+ explicit)
SELECT -3_4e+21;
SELECT -34e+2_1;
SELECT -3_4e+2_1;
SELECT -34e-21; -- Negative floating point with negative scientific notation
SELECT -3_4e-21;
SELECT -34e-2_1;
SELECT -3_4e-2_1;
SELECT 1.34e21; -- Positive floating point (with .) with positive scientific notation (+ implied)
SELECT 1.3_4e21;
SELECT 1.34e2_1;
SELECT 1.3_4e2_1;
SELECT 1.34e+21; -- Positive floating point (with .) with positive scientific notation (+ explicit)
SELECT 1.3_4e+21;
SELECT 1.34e+2_1;
SELECT 1.3_4e+2_1;
SELECT 1.34e-21; -- Positive floating point (with .) with negative scientific notation
SELECT 1.3_4e-21;
SELECT 1.34e-2_1;
SELECT 1.3_4e-2_1;
SELECT -1.34e21; -- Negative floating point (with .) with positive scientific notation (+ implied)
SELECT -1.3_4e21;
SELECT -1.34e2_1;
SELECT -1.3_4e2_1;
SELECT -1.34e+21; -- Negative floating point (with .) with positive scientific notation (+ explicit)
SELECT -1.3_4e+21;
SELECT -1.34e+2_1;
SELECT -1.3_4e+2_1;
SELECT -1.34e-21; -- Negative floating point (with .) with negative scientific notation
SELECT -1.3_4e-21;
SELECT -1.34e-2_1;
SELECT -1.3_4e-2_1;
SELECT -.34e21; -- Negative floating point with leading . (no integer part) with positive scientific notation (+ implied)
SELECT -.3_4e21;
SELECT -.34e2_1;
SELECT -.3_4e2_1;
SELECT -.34e+21; -- Negative floating point with leading . (no integer part) with positive scientific notation (+ explicit)
SELECT -.3_4e+21;
SELECT -.34e+2_1;
SELECT -.3_4e+2_1;
SELECT -.34e-21; -- Negative floating point with leading . (no integer part) with negative scientific notation
SELECT -.3_4e-21;
SELECT -.34e-2_1;
SELECT -.3_4e-2_1;
-- Special floating point values, spelled in different letter cases and with an optional sign.
SELECT NaN; -- Specials
SELECT nan;
SELECT inf;
SELECT +inf;
SELECT -inf;
SELECT Inf;
SELECT +Inf;
SELECT -Inf;
SELECT INF;
SELECT +INF;
SELECT -INF;
-- Literals with a 0b / 0x prefix: plain spelling first, then the same value with `_` between digits.
SELECT 0b1111; -- Binary
SELECT 0b1_111;
SELECT 0b1_1_1_1;
SELECT -0b1111;
SELECT -0b1_111;
SELECT -0b1_1_1_1;
SELECT 0x1234; -- Hex
SELECT 0x1_234;
SELECT 0x1_2_3_4;
SELECT -0x1234;
SELECT -0x1_234;
SELECT -0x1_2_3_4;
SELECT 0xee;
SELECT 0xe_e;
SELECT 0x1.234; -- Hex fractions
SELECT 0x1.2_3_4;
SELECT -0x1.234;
SELECT -0x1.2_3_4;
SELECT 0x0.ee;
SELECT 0x0.e_e;
-- Hex literals with a `p` exponent, with and without an explicit exponent sign.
SELECT 0x1.234p01; -- Hex scientific notation
SELECT 0x1.2_34p01;
SELECT 0x1.234p0_1;
SELECT 0x1.234p+01;
SELECT 0x1.2_34p+01;
SELECT 0x1.2_34p+0_1;
SELECT 0x1.234p-01;
SELECT 0x1.2_34p-01;
SELECT 0x1.2_34p-0_1;
SELECT -0x1.234p01;
SELECT -0x1.2_34p01;
SELECT -0x1.2_34p0_1;
SELECT -0x1.234p+01;
SELECT -0x1.2_34p+01;
SELECT -0x1.2_34p+0_1;
SELECT -0x1.234p-01;
SELECT -0x1.2_34p-01;
SELECT -0x1.2_34p-0_1;
-- Things that are not a number
-- A misplaced `_` (leading, trailing, doubled, or next to `.`, `e`, `p`, `x`) and a bare 0b / 0x prefix
-- must not be accepted as a numeric literal. The hint at the end of each line names the error
-- the query is expected to raise; lines without a hint are expected to succeed.
select _1000; -- { serverError UNKNOWN_IDENTIFIER }
-- Same spelling, but a column with that name exists, so the query is valid.
select _1000 FROM (SELECT 1 AS _1000) FORMAT Null;
select -_1; -- { serverError UNKNOWN_IDENTIFIER }
-- Same spelling, but a column named _1 exists, so the query is valid.
select -_1 FROM (SELECT -1 AS _1) FORMAT Null;
select +_1; -- { clientError SYNTAX_ERROR }
select 1__0; -- { serverError UNKNOWN_IDENTIFIER }
select 1_; -- { serverError UNKNOWN_IDENTIFIER }
select 1_ ; -- { serverError UNKNOWN_IDENTIFIER }
select 10_; -- { serverError UNKNOWN_IDENTIFIER }
select 1_e5; -- { serverError UNKNOWN_IDENTIFIER }
select 1e_5; -- { serverError UNKNOWN_IDENTIFIER }
select 1e5_; -- { serverError UNKNOWN_IDENTIFIER }
select 1e_; -- { serverError UNKNOWN_IDENTIFIER }
select 1_.; -- { clientError SYNTAX_ERROR }
select 1e_1; -- { serverError UNKNOWN_IDENTIFIER }
select 0_x2; -- { serverError UNKNOWN_IDENTIFIER }
select 0x2_p2; -- { serverError UNKNOWN_IDENTIFIER }
select 0x2p_2; -- { serverError UNKNOWN_IDENTIFIER }
select 0x2p2_; -- { serverError UNKNOWN_IDENTIFIER }
-- Prefix without any digits after it.
select 0b; -- { serverError UNKNOWN_IDENTIFIER }
select 0b ; -- { serverError UNKNOWN_IDENTIFIER }
select 0x; -- { serverError UNKNOWN_IDENTIFIER }
select 0x ; -- { serverError UNKNOWN_IDENTIFIER }
select 0x_; -- { serverError UNKNOWN_IDENTIFIER }
select 0x_1; -- { serverError UNKNOWN_IDENTIFIER }