Allow var-int encoded 64-bit integers with MSB=1

Resolves: #51486

Until now, it was illegal to encode 64-bit (unsigned) integers with
MSB=1, i.e. values > (1ULL<<63) - 1, as var-int. In more detail, the
var-int code used by ClickHouse server and client spent at most 9 bytes
per value such that 9 * 7 = 63 bits could be encoded. Some 3rd party
clients (e.g. Rust clickhouse-rs) had the same limitation, whereas other
clients understand the full range (Python clickhouse-driver).

PRs #47608 and #48628 added sanity checks as asserts or exceptions
during var-int encoding on the server side. This was considered okay as
such huge integers so far occurred only during testing (usually fuzzing)
but not in practice.

Issue #51486 is a new fuzzing issue where the exception thrown from the
sanity check led to a half-baked progress packet and as a result, a
logical error / server crash.

The only fix which is not another bandaid is to allow the full range in
var-int coding. Clients will have to allow the full range too, a note
will be added to the changelog. (the alternative was to create another
protocol version but as var-int is used all over the place this was
considered infeasible)

Review note: this is the relevant commit.
This commit is contained in:
Robert Schulze 2023-07-06 14:56:05 +00:00
parent 3f744c1e14
commit 271297823a
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
5 changed files with 34 additions and 51 deletions

View File

@ -6,7 +6,6 @@ namespace DB
namespace ErrorCodes
{
extern const int ATTEMPT_TO_READ_AFTER_EOF;
extern const int BAD_ARGUMENTS;
}
void throwReadAfterEOF()
@ -14,12 +13,4 @@ void throwReadAfterEOF()
throw Exception(ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF, "Attempt to read after eof");
}
void throwValueTooLargeForVarIntEncoding(UInt64 x)
{
/// Under practical circumstances, we should virtually never end up here but AST Fuzzer manages to create superlarge input integers
/// which trigger this exception. Intentionally not throwing LOGICAL_ERROR or calling abort() or [ch]assert(false), so AST Fuzzer
/// can swallow the exception and continue to run.
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Value {} is too large for VarInt encoding", x);
}
}

View File

@ -13,73 +13,59 @@ namespace DB
/// Variable-Length Quantity (VLQ) Base-128 compression, also known as Variable Byte (VB) or Varint encoding.
[[noreturn]] void throwReadAfterEOF();
[[noreturn]] void throwValueTooLargeForVarIntEncoding(UInt64 x);
/// NOTE: Due to historical reasons, only values up to 1<<63-1 can be safely encoded/decoded (bigger values are not idempotent under
/// encoding/decoding). This cannot be changed without breaking backward compatibility (some drivers, e.g. clickhouse-rs (Rust), have the
/// same limitation, others support the full 1<<64 range, e.g. clickhouse-driver (Python))
constexpr UInt64 VAR_UINT_MAX = (1ULL<<63) - 1;
inline void writeVarUInt(UInt64 x, WriteBuffer & ostr)
{
if (x > VAR_UINT_MAX) [[unlikely]]
throwValueTooLargeForVarIntEncoding(x);
for (size_t i = 0; i < 9; ++i)
while (x > 0x7F)
{
uint8_t byte = x & 0x7F;
if (x > 0x7F)
byte |= 0x80;
uint8_t byte = 0x80 | (x & 0x7F);
ostr.nextIfAtEnd();
*ostr.position() = byte;
++ostr.position();
x >>= 7;
if (!x)
return;
}
uint8_t final_byte = static_cast<uint8_t>(x);
ostr.nextIfAtEnd();
*ostr.position() = final_byte;
++ostr.position();
}
inline void writeVarUInt(UInt64 x, std::ostream & ostr)
{
if (x > VAR_UINT_MAX) [[unlikely]]
throwValueTooLargeForVarIntEncoding(x);
for (size_t i = 0; i < 9; ++i)
while (x > 0x7F)
{
uint8_t byte = x & 0x7F;
if (x > 0x7F)
byte |= 0x80;
uint8_t byte = 0x80 | (x & 0x7F);
ostr.put(byte);
x >>= 7;
if (!x)
return;
}
uint8_t final_byte = static_cast<uint8_t>(x);
ostr.put(final_byte);
}
inline char * writeVarUInt(UInt64 x, char * ostr)
{
if (x > VAR_UINT_MAX) [[unlikely]]
throwValueTooLargeForVarIntEncoding(x);
for (size_t i = 0; i < 9; ++i)
while (x > 0x7F)
{
uint8_t byte = x & 0x7F;
if (x > 0x7F)
byte |= 0x80;
uint8_t byte = 0x80 | (x & 0x7F);
*ostr = byte;
++ostr;
x >>= 7;
if (!x)
return ostr;
}
uint8_t final_byte = static_cast<uint8_t>(x);
*ostr = final_byte;
++ostr;
return ostr;
}
@ -101,7 +87,7 @@ template <bool check_eof>
inline void readVarUInt(UInt64 & x, ReadBuffer & istr)
{
x = 0;
for (size_t i = 0; i < 9; ++i)
for (size_t i = 0; i < 10; ++i)
{
if constexpr (check_eof)
if (istr.eof()) [[unlikely]]
@ -120,7 +106,7 @@ inline void readVarUInt(UInt64 & x, ReadBuffer & istr)
inline void readVarUInt(UInt64 & x, ReadBuffer & istr)
{
if (istr.buffer().end() - istr.position() >= 9)
if (istr.buffer().end() - istr.position() >= 10)
return impl::readVarUInt<false>(x, istr);
return impl::readVarUInt<true>(x, istr);
}
@ -128,7 +114,7 @@ inline void readVarUInt(UInt64 & x, ReadBuffer & istr)
inline void readVarUInt(UInt64 & x, std::istream & istr)
{
x = 0;
for (size_t i = 0; i < 9; ++i)
for (size_t i = 0; i < 10; ++i)
{
UInt64 byte = istr.get();
x |= (byte & 0x7F) << (7 * i);
@ -143,7 +129,7 @@ inline const char * readVarUInt(UInt64 & x, const char * istr, size_t size)
const char * end = istr + size;
x = 0;
for (size_t i = 0; i < 9; ++i)
for (size_t i = 0; i < 10; ++i)
{
if (istr == end) [[unlikely]]
throwReadAfterEOF();
@ -220,7 +206,8 @@ inline size_t getLengthOfVarUInt(UInt64 x)
: (x < (1ULL << 42) ? 6
: (x < (1ULL << 49) ? 7
: (x < (1ULL << 56) ? 8
: 9)))))));
: (x < (1ULL << 63) ? 9
: 10))))))));
}

View File

@ -1905,17 +1905,18 @@ void TCPHandler::sendData(const Block & block)
{
initBlockOutput(block);
auto prev_bytes_written_out = out->count();
auto prev_bytes_written_compressed_out = state.maybe_compressed_out->count();
size_t prev_bytes_written_out = out->count();
size_t prev_bytes_written_compressed_out = state.maybe_compressed_out->count();
try
{
/// For testing hedged requests
if (unknown_packet_in_send_data)
{
constexpr UInt64 marker = (1ULL<<63) - 1;
--unknown_packet_in_send_data;
if (unknown_packet_in_send_data == 0)
writeVarUInt(VAR_UINT_MAX, *out);
writeVarUInt(marker, *out);
}
writeVarUInt(Protocol::Server::Data, *out);

View File

@ -0,0 +1,4 @@
-- 64-bit integers with MSB set (i.e. values > (1ULL<<63) - 1) could for historical/compat reasons not be serialized as var-ints (issue #51486).
-- These two queries internally produce such big values, run them to be sure no bad things happen.
SELECT topKWeightedState(65535)(now(), -2) FORMAT Null;
SELECT number FROM numbers(toUInt64(-1)) limit 10 Format Null;