Backport #66106 to 24.6: Fix buffer overflow in unbin

This commit is contained in:
robot-clickhouse 2024-07-05 17:15:43 +00:00
parent 0cfd9846ef
commit 2d5af08d36
5 changed files with 60 additions and 14 deletions

View File

@ -5,7 +5,7 @@
namespace DB
{
static void inline hexStringDecode(const char * pos, const char * end, char *& out, size_t word_size = 2)
static void inline hexStringDecode(const char * pos, const char * end, char *& out, size_t word_size)
{
if ((end - pos) & 1)
{
@ -23,7 +23,7 @@ static void inline hexStringDecode(const char * pos, const char * end, char *& o
++out;
}
static void inline binStringDecode(const char * pos, const char * end, char *& out)
static void inline binStringDecode(const char * pos, const char * end, char *& out, size_t word_size)
{
if (pos == end)
{
@ -53,7 +53,7 @@ static void inline binStringDecode(const char * pos, const char * end, char *& o
++out;
}
assert((end - pos) % 8 == 0);
chassert((end - pos) % word_size == 0);
while (end - pos != 0)
{

View File

@ -3,14 +3,14 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnsNumber.h>
#include <Common/BitHelpers.h>
#include <Common/BinStringDecodeHelper.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context_fwd.h>
#include <Interpreters/castColumn.h>
#include <Common/BinStringDecodeHelper.h>
#include <Common/BitHelpers.h>
namespace DB
{
@ -218,10 +218,7 @@ struct UnbinImpl
static constexpr auto name = "unbin";
static constexpr size_t word_size = 8;
static void decode(const char * pos, const char * end, char *& out)
{
binStringDecode(pos, end, out);
}
static void decode(const char * pos, const char * end, char *& out) { binStringDecode(pos, end, out, word_size); }
};
/// Encode number or string to string with binary or hexadecimal representation
@ -651,7 +648,15 @@ public:
size_t size = in_offsets.size();
out_offsets.resize(size);
out_vec.resize(in_vec.size() / word_size + size);
size_t max_out_len = 0;
for (size_t i = 0; i < in_offsets.size(); ++i)
{
const size_t len = in_offsets[i] - (i == 0 ? 0 : in_offsets[i - 1])
- /* trailing zero symbol that is always added in ColumnString and that is ignored while decoding */ 1;
max_out_len += (len + word_size - 1) / word_size + /* trailing zero symbol that is always added by Impl::decode */ 1;
}
out_vec.resize(max_out_len);
char * begin = reinterpret_cast<char *>(out_vec.data());
char * pos = begin;
@ -661,6 +666,7 @@ public:
{
size_t new_offset = in_offsets[i];
/// `new_offset - 1` because in ColumnString each string is stored with trailing zero byte
Impl::decode(reinterpret_cast<const char *>(&in_vec[prev_offset]), reinterpret_cast<const char *>(&in_vec[new_offset - 1]), pos);
out_offsets[i] = pos - begin;
@ -668,6 +674,9 @@ public:
prev_offset = new_offset;
}
chassert(
static_cast<size_t>(pos - begin) <= out_vec.size(),
fmt::format("too small amount of memory was preallocated: needed {}, but have only {}", pos - begin, out_vec.size()));
out_vec.resize(pos - begin);
return col_res;
@ -680,11 +689,11 @@ public:
ColumnString::Offsets & out_offsets = col_res->getOffsets();
const ColumnString::Chars & in_vec = col_fix_string->getChars();
size_t n = col_fix_string->getN();
const size_t n = col_fix_string->getN();
size_t size = col_fix_string->size();
out_offsets.resize(size);
out_vec.resize(in_vec.size() / word_size + size);
out_vec.resize(((n + word_size - 1) / word_size + /* trailing zero symbol that is always added by Impl::decode */ 1) * size);
char * begin = reinterpret_cast<char *>(out_vec.data());
char * pos = begin;
@ -694,6 +703,7 @@ public:
{
size_t new_offset = prev_offset + n;
/// here we don't subtract 1 from `new_offset` because in ColumnFixedString strings are stored without trailing zero byte
Impl::decode(reinterpret_cast<const char *>(&in_vec[prev_offset]), reinterpret_cast<const char *>(&in_vec[new_offset]), pos);
out_offsets[i] = pos - begin;
@ -701,6 +711,9 @@ public:
prev_offset = new_offset;
}
chassert(
static_cast<size_t>(pos - begin) <= out_vec.size(),
fmt::format("too small amount of memory was preallocated: needed {}, but have only {}", pos - begin, out_vec.size()));
out_vec.resize(pos - begin);
return col_res;

View File

@ -1129,11 +1129,11 @@ inline static bool makeHexOrBinStringLiteral(IParser::Pos & pos, ASTPtr & node,
if (hex)
{
hexStringDecode(str_begin, str_end, res_pos);
hexStringDecode(str_begin, str_end, res_pos, word_size);
}
else
{
binStringDecode(str_begin, str_end, res_pos);
binStringDecode(str_begin, str_end, res_pos, word_size);
}
return makeStringLiteral(pos, node, String(reinterpret_cast<char *>(res.data()), (res_pos - res_begin - 1)));

View File

@ -0,0 +1,33 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# check for buffer overflow in unbin (due to not enough memory preallocate for output buffer)
# we iterate over all remainders of input string length modulo word_size and check that no assertions are triggered
word_size=8
for i in $(seq 1 $((word_size+1))); do
str=$(printf "%${i}s" | tr ' ' 'x')
$CLICKHOUSE_CLIENT -q "SELECT count() FROM numbers(99) GROUP BY unbin(toFixedString(materialize('$str'), $i)) WITH ROLLUP WITH TOTALS FORMAT NULL"
done
word_size=8
for i in $(seq 1 $((word_size+1))); do
str=$(printf "%${i}s" | tr ' ' 'x')
$CLICKHOUSE_CLIENT -q "SELECT count() FROM numbers(99) GROUP BY unbin(materialize('$str')) WITH ROLLUP WITH TOTALS FORMAT NULL"
done
word_size=2
for i in $(seq 1 $((word_size+1))); do
str=$(printf "%${i}s" | tr ' ' 'x')
$CLICKHOUSE_CLIENT -q "SELECT count() FROM numbers(99) GROUP BY unhex(toFixedString(materialize('$str'), $i)) WITH ROLLUP WITH TOTALS FORMAT NULL"
done
word_size=2
for i in $(seq 1 $((word_size+1))); do
str=$(printf "%${i}s" | tr ' ' 'x')
$CLICKHOUSE_CLIENT -q "SELECT count() FROM numbers(99) GROUP BY unhex(materialize('$str')) WITH ROLLUP WITH TOTALS FORMAT NULL"
done