Merge pull request #43785 from mo-avatar/binhexstringliterals

This commit is contained in:
Vladimir C 2022-12-22 10:56:18 +01:00 committed by GitHub
commit 6a0210fb0f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 236 additions and 66 deletions

View File

@ -0,0 +1,76 @@
#pragma once
#include <Common/hex.h>
namespace DB
{
/// Decodes the hex digits in [pos, end) and writes the resulting bytes through `out`,
/// followed by one terminating zero byte. `out` is left pointing just past that terminator.
/// When the number of digits is odd, the first digit is decoded on its own into the first byte.
/// `word_size` is the step by which the input is advanced after each unhex2() call.
/// NOTE(review): unhex2() is only handed a pointer, so it presumably always reads two digits;
/// a word_size other than 2 would then skip or re-read digits — confirm before passing anything else.
static void inline hexStringDecode(const char * pos, const char * end, char *& out, size_t word_size = 2)
{
    const bool has_lone_leading_digit = ((end - pos) & 1) != 0;
    if (has_lone_leading_digit)
        *out++ = unhex(*pos++);

    for (; pos < end; pos += word_size)
        *out++ = unhex2(pos);

    *out++ = '\0';
}
static void inline binStringDecode(const char * pos, const char * end, char *& out)
{
if (pos == end)
{
*out = '\0';
++out;
return;
}
UInt8 left = 0;
/// end - pos is the length of input.
/// (length & 7) to make remain bits length mod 8 is zero to split.
/// e.g. the length is 9 and the input is "101000001",
/// first left_cnt is 1, left is 0, right shift, pos is 1, left = 1
/// then, left_cnt is 0, remain input is '01000001'.
for (UInt8 left_cnt = (end - pos) & 7; left_cnt > 0; --left_cnt)
{
left = left << 1;
if (*pos != '0')
left += 1;
++pos;
}
if (left != 0 || end - pos == 0)
{
*out = left;
++out;
}
assert((end - pos) % 8 == 0);
while (end - pos != 0)
{
UInt8 c = 0;
for (UInt8 i = 0; i < 8; ++i)
{
c = c << 1;
if (*pos != '0')
c += 1;
++pos;
}
*out = c;
++out;
}
*out = '\0';
++out;
}
}

View File

@ -4,7 +4,7 @@
#include <Columns/ColumnVector.h>
#include <Columns/ColumnsNumber.h>
#include <Common/BitHelpers.h>
#include <Common/hex.h>
#include <Common/BinStringDecodeHelper.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
@ -126,20 +126,7 @@ struct UnhexImpl
/// Decodes the hex digits in [pos, end) into bytes written through `out`, followed by a zero byte.
/// NOTE(review): this hunk is rendered without +/- markers, so the inline loop and the
/// hexStringDecode() call appear one after another. Presumably the inline loop is the removed
/// code and only the call remains after the change — confirm against the real file.
static void decode(const char * pos, const char * end, char *& out)
{
/// An odd number of digits: the first digit is decoded on its own.
if ((end - pos) & 1)
{
*out = unhex(*pos);
++out;
++pos;
}
/// One output byte per unhex2() call; the input advances by word_size each time.
while (pos < end)
{
*out = unhex2(pos);
pos += word_size;
++out;
}
/// Terminating zero byte.
*out = '\0';
++out;
/// Same decoding, delegated to the shared helper.
hexStringDecode(pos, end, out, word_size);
}
};
@ -233,52 +220,7 @@ struct UnbinImpl
/// Decodes the binary digits in [pos, end) into bytes written through `out`, followed by a zero byte.
/// NOTE(review): this hunk is rendered without +/- markers, so the inline implementation and the
/// binStringDecode() call appear one after another. Presumably the inline code is the removed
/// part and only the call remains after the change — confirm against the real file.
static void decode(const char * pos, const char * end, char *& out)
{
/// Empty input: emit only the terminating zero byte.
if (pos == end)
{
*out = '\0';
++out;
return;
}
UInt8 left = 0;
/// end - pos is the length of the input.
/// The first (length & 7) digits are consumed separately so that the remaining length is a multiple of 8.
/// E.g. for the 9-digit input "101000001": left_cnt starts at 1, the first digit '1' is shifted
/// into `left` (left = 1), and the remaining input is "01000001".
for (UInt8 left_cnt = (end - pos) & 7; left_cnt > 0; --left_cnt)
{
left = left << 1;
if (*pos != '0')
left += 1;
++pos;
}
/// The leading group is written when it is non-zero or when it was the entire input.
if (left != 0 || end - pos == 0)
{
*out = left;
++out;
}
assert((end - pos) % 8 == 0);
/// Every remaining run of eight digits becomes one byte, most significant bit first.
while (end - pos != 0)
{
UInt8 c = 0;
for (UInt8 i = 0; i < 8; ++i)
{
c = c << 1;
if (*pos != '0')
c += 1;
++pos;
}
*out = c;
++out;
}
/// Terminating zero byte.
*out = '\0';
++out;
/// Same decoding, delegated to the shared helper.
binStringDecode(pos, end, out);
}
};

View File

@ -8,6 +8,7 @@
#include <Parsers/DumpASTNode.h>
#include <Common/typeid_cast.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/BinStringDecodeHelper.h>
#include <Parsers/ASTAsterisk.h>
#include <Parsers/ASTCollation.h>
@ -986,6 +987,38 @@ bool ParserUnsignedInteger::parseImpl(Pos & pos, ASTPtr & node, Expected & expec
return true;
}
/// Builds an ASTLiteral from `str`, records the current token position as its begin and the
/// advanced position as its end, and stores the literal in `node`. `pos` is advanced by one.
/// Always returns true.
inline static bool makeStringLiteral(IParser::Pos & pos, ASTPtr & node, String str)
{
    auto string_literal = std::make_shared<ASTLiteral>(str);

    string_literal->begin = pos;
    string_literal->end = ++pos;

    node = string_literal;
    return true;
}
/// Turns the current hex (`hex == true`) or binary (`hex == false`) string-literal token into an
/// ASTLiteral holding the decoded bytes. `word_size` is the number of digits per output byte and
/// is used only to size the scratch buffer.
inline static bool makeHexOrBinStringLiteral(IParser::Pos & pos, ASTPtr & node, bool hex, size_t word_size)
{
    /// The digits sit between the two-character prefix and the last character of the token.
    const char * const digits_begin = pos->begin + 2;
    const char * const digits_end = pos->end - 1;

    /// No digits at all: the literal is the empty string.
    if (digits_begin == digits_end)
        return makeStringLiteral(pos, node, "");

    /// Scratch buffer sized from the token length and the digits-per-byte ratio, plus one extra byte.
    PODArray<UInt8> decoded;
    decoded.resize((pos->size() + word_size) / word_size + 1);

    char * const decoded_begin = reinterpret_cast<char *>(decoded.data());
    char * write_pos = decoded_begin;

    if (hex)
        hexStringDecode(digits_begin, digits_end, write_pos);
    else
        binStringDecode(digits_begin, digits_end, write_pos);

    /// Both decoders finish by writing a zero byte; it is left out of the literal (hence the -1).
    return makeStringLiteral(pos, node, String(decoded_begin, (write_pos - decoded_begin - 1)));
}
bool ParserStringLiteral::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
@ -996,6 +1029,18 @@ bool ParserStringLiteral::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte
if (pos->type == TokenType::StringLiteral)
{
if (*pos->begin == 'x' || *pos->begin == 'X')
{
constexpr size_t word_size = 2;
return makeHexOrBinStringLiteral(pos, node, true, word_size);
}
if (*pos->begin == 'b' || *pos->begin == 'B')
{
constexpr size_t word_size = 8;
return makeHexOrBinStringLiteral(pos, node, false, word_size);
}
ReadBufferFromMemory in(pos->begin, pos->size());
try
@ -1022,11 +1067,7 @@ bool ParserStringLiteral::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte
s = String(pos->begin + heredoc_size, pos->size() - heredoc_size * 2);
}
auto literal = std::make_shared<ASTLiteral>(s);
literal->begin = pos;
literal->end = ++pos;
node = literal;
return true;
return makeStringLiteral(pos, node, s);
}
template <typename Collection>

View File

@ -1,3 +1,4 @@
#include <cassert>
#include <base/defines.h>
#include <Parsers/Lexer.h>
#include <Common/StringUtils/StringUtils.h>
@ -44,6 +45,36 @@ Token quotedString(const char *& pos, const char * const token_begin, const char
}
}
/// Lexes a hex or binary string literal. On entry `pos` points at the prefix letter and pos[1]
/// is the opening single quote. A prefix of 'x' / 'X' selects hex digits; any other prefix
/// selects the digits '0' and '1'.
/// Returns a StringLiteral token covering the prefix through the closing quote. If the digits are
/// not immediately followed by a single quote, `pos` is moved to `end` and an
/// ErrorSingleQuoteIsNotClosed token spanning the rest of the input is returned.
Token quotedHexOrBinString(const char *& pos, const char * const token_begin, const char * const end)
{
    constexpr char quote = '\'';

    assert(pos[1] == quote);

    const bool is_hex = *pos == 'x' || *pos == 'X';

    /// Step over the prefix letter and the opening quote.
    pos += 2;

    if (is_hex)
    {
        while (pos < end && isHexDigit(*pos))
            ++pos;
    }
    else
        pos = find_first_not_symbols<'0', '1'>(pos, end);

    /// The first character that is not a valid digit has to be the closing quote.
    const bool is_closed = pos < end && *pos == quote;
    if (is_closed)
        return Token(TokenType::StringLiteral, token_begin, ++pos);

    pos = end;
    return Token(TokenType::ErrorSingleQuoteIsNotClosed, token_begin, end);
}
}
@ -420,6 +451,12 @@ Token Lexer::nextTokenImpl()
return Token(TokenType::DollarSign, token_begin, ++pos);
}
}
if (pos + 2 < end && pos[1] == '\'' && (*pos == 'x' || *pos == 'b' || *pos == 'X' || *pos == 'B'))
{
return quotedHexOrBinString(pos, token_begin, end);
}
if (isWordCharASCII(*pos) || *pos == '$')
{
++pos;

View File

@ -0,0 +1,24 @@
1
0
10
1
1
0
10
1
1
0
10
1
1
0
10
1
1
1
1
1

View File

@ -0,0 +1,29 @@
-- Binary string literals, lowercase prefix.
select b'';
select b'0' == '\0';
select b'00110000'; -- 0
select b'0011000100110000'; -- 10
select b'111001101011010110001011111010001010111110010101' == '测试';
-- Binary string literals, uppercase prefix.
select B'';
select B'0' == '\0';
select B'00110000'; -- 0
select B'0011000100110000'; -- 10
select B'111001101011010110001011111010001010111110010101' == '测试';
-- Hex string literals, lowercase prefix.
select x'';
select x'0' == '\0';
select x'30'; -- 0
select x'3130'; -- 10
select x'e6b58be8af95' == '测试';
-- Hex string literals, uppercase prefix.
select X'';
select X'0' == '\0';
select X'30'; -- 0
select X'3130'; -- 10
select X'e6b58be8af95' == '测试';
-- Hex and binary forms of the same value must compare equal, for both prefix cases.
-- The uppercase pair compares X against B; comparing a literal with itself would always
-- be true and would not test anything.
select x'' == b'';
select x'0' == b'0';
select X'' == B'';
select X'0' == B'0';

View File

@ -0,0 +1,6 @@
Syntax error
Syntax error
Syntax error
Syntax error
Syntax error
Syntax error

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# Malformed hex/binary string literals. For every query the client's output (stderr included)
# is filtered down to the 'Syntax error' text, one query after another in the order listed.
queries=(
    "SELECT b '0';"  # whitespace between the prefix and the quote
    "SELECT x 'a'"   # whitespace between the prefix and the quote
    "SELECT b'3';"   # '3' is not a binary digit
    "SELECT x'k'"    # 'k' is not a hex digit
    "SELECT b'1"     # closing quote missing
    "SELECT x'a"     # closing quote missing
)

for query in "${queries[@]}"
do
    $CLICKHOUSE_CLIENT --query="$query" 2>&1 | grep -o 'Syntax error'
done