mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 23:52:03 +00:00
Merge pull request #66151 from ClickHouse/backport/24.3/66106
Backport #66106 to 24.3: Fix buffer overflow in `unbin`
This commit is contained in:
commit
2153da8ea6
@ -5,7 +5,7 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
static void inline hexStringDecode(const char * pos, const char * end, char *& out, size_t word_size = 2)
|
||||
static void inline hexStringDecode(const char * pos, const char * end, char *& out, size_t word_size)
|
||||
{
|
||||
if ((end - pos) & 1)
|
||||
{
|
||||
@ -23,7 +23,7 @@ static void inline hexStringDecode(const char * pos, const char * end, char *& o
|
||||
++out;
|
||||
}
|
||||
|
||||
static void inline binStringDecode(const char * pos, const char * end, char *& out)
|
||||
static void inline binStringDecode(const char * pos, const char * end, char *& out, size_t word_size)
|
||||
{
|
||||
if (pos == end)
|
||||
{
|
||||
@ -53,7 +53,7 @@ static void inline binStringDecode(const char * pos, const char * end, char *& o
|
||||
++out;
|
||||
}
|
||||
|
||||
assert((end - pos) % 8 == 0);
|
||||
chassert((end - pos) % word_size == 0);
|
||||
|
||||
while (end - pos != 0)
|
||||
{
|
||||
|
@ -3,14 +3,14 @@
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnVector.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Common/BitHelpers.h>
|
||||
#include <Common/BinStringDecodeHelper.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/IFunction.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Interpreters/castColumn.h>
|
||||
#include <Common/BinStringDecodeHelper.h>
|
||||
#include <Common/BitHelpers.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -218,10 +218,7 @@ struct UnbinImpl
|
||||
static constexpr auto name = "unbin";
|
||||
static constexpr size_t word_size = 8;
|
||||
|
||||
static void decode(const char * pos, const char * end, char *& out)
|
||||
{
|
||||
binStringDecode(pos, end, out);
|
||||
}
|
||||
static void decode(const char * pos, const char * end, char *& out) { binStringDecode(pos, end, out, word_size); }
|
||||
};
|
||||
|
||||
/// Encode number or string to string with binary or hexadecimal representation
|
||||
@ -651,7 +648,15 @@ public:
|
||||
|
||||
size_t size = in_offsets.size();
|
||||
out_offsets.resize(size);
|
||||
out_vec.resize(in_vec.size() / word_size + size);
|
||||
|
||||
size_t max_out_len = 0;
|
||||
for (size_t i = 0; i < in_offsets.size(); ++i)
|
||||
{
|
||||
const size_t len = in_offsets[i] - (i == 0 ? 0 : in_offsets[i - 1])
|
||||
- /* trailing zero symbol that is always added in ColumnString and that is ignored while decoding */ 1;
|
||||
max_out_len += (len + word_size - 1) / word_size + /* trailing zero symbol that is always added by Impl::decode */ 1;
|
||||
}
|
||||
out_vec.resize(max_out_len);
|
||||
|
||||
char * begin = reinterpret_cast<char *>(out_vec.data());
|
||||
char * pos = begin;
|
||||
@ -661,6 +666,7 @@ public:
|
||||
{
|
||||
size_t new_offset = in_offsets[i];
|
||||
|
||||
/// `new_offset - 1` because in ColumnString each string is stored with trailing zero byte
|
||||
Impl::decode(reinterpret_cast<const char *>(&in_vec[prev_offset]), reinterpret_cast<const char *>(&in_vec[new_offset - 1]), pos);
|
||||
|
||||
out_offsets[i] = pos - begin;
|
||||
@ -668,6 +674,9 @@ public:
|
||||
prev_offset = new_offset;
|
||||
}
|
||||
|
||||
chassert(
|
||||
static_cast<size_t>(pos - begin) <= out_vec.size(),
|
||||
fmt::format("too small amount of memory was preallocated: needed {}, but have only {}", pos - begin, out_vec.size()));
|
||||
out_vec.resize(pos - begin);
|
||||
|
||||
return col_res;
|
||||
@ -680,11 +689,11 @@ public:
|
||||
ColumnString::Offsets & out_offsets = col_res->getOffsets();
|
||||
|
||||
const ColumnString::Chars & in_vec = col_fix_string->getChars();
|
||||
size_t n = col_fix_string->getN();
|
||||
const size_t n = col_fix_string->getN();
|
||||
|
||||
size_t size = col_fix_string->size();
|
||||
out_offsets.resize(size);
|
||||
out_vec.resize(in_vec.size() / word_size + size);
|
||||
out_vec.resize(((n + word_size - 1) / word_size + /* trailing zero symbol that is always added by Impl::decode */ 1) * size);
|
||||
|
||||
char * begin = reinterpret_cast<char *>(out_vec.data());
|
||||
char * pos = begin;
|
||||
@ -694,6 +703,7 @@ public:
|
||||
{
|
||||
size_t new_offset = prev_offset + n;
|
||||
|
||||
/// here we don't subtract 1 from `new_offset` because in ColumnFixedString strings are stored without trailing zero byte
|
||||
Impl::decode(reinterpret_cast<const char *>(&in_vec[prev_offset]), reinterpret_cast<const char *>(&in_vec[new_offset]), pos);
|
||||
|
||||
out_offsets[i] = pos - begin;
|
||||
@ -701,6 +711,9 @@ public:
|
||||
prev_offset = new_offset;
|
||||
}
|
||||
|
||||
chassert(
|
||||
static_cast<size_t>(pos - begin) <= out_vec.size(),
|
||||
fmt::format("too small amount of memory was preallocated: needed {}, but have only {}", pos - begin, out_vec.size()));
|
||||
out_vec.resize(pos - begin);
|
||||
|
||||
return col_res;
|
||||
|
@ -1110,11 +1110,11 @@ inline static bool makeHexOrBinStringLiteral(IParser::Pos & pos, ASTPtr & node,
|
||||
|
||||
if (hex)
|
||||
{
|
||||
hexStringDecode(str_begin, str_end, res_pos);
|
||||
hexStringDecode(str_begin, str_end, res_pos, word_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
binStringDecode(str_begin, str_end, res_pos);
|
||||
binStringDecode(str_begin, str_end, res_pos, word_size);
|
||||
}
|
||||
|
||||
return makeStringLiteral(pos, node, String(reinterpret_cast<char *>(res.data()), (res_pos - res_begin - 1)));
|
||||
|
33
tests/queries/0_stateless/03199_unbin_buffer_overflow.sh
Executable file
33
tests/queries/0_stateless/03199_unbin_buffer_overflow.sh
Executable file
@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
|
||||
# check for buffer overflow in unbin (due to not enough memory preallocate for output buffer)
|
||||
# we iterate over all remainders of input string length modulo word_size and check that no assertions are triggered
|
||||
|
||||
word_size=8
|
||||
for i in $(seq 1 $((word_size+1))); do
|
||||
str=$(printf "%${i}s" | tr ' ' 'x')
|
||||
$CLICKHOUSE_CLIENT -q "SELECT count() FROM numbers(99) GROUP BY unbin(toFixedString(materialize('$str'), $i)) WITH ROLLUP WITH TOTALS FORMAT NULL"
|
||||
done
|
||||
|
||||
word_size=8
|
||||
for i in $(seq 1 $((word_size+1))); do
|
||||
str=$(printf "%${i}s" | tr ' ' 'x')
|
||||
$CLICKHOUSE_CLIENT -q "SELECT count() FROM numbers(99) GROUP BY unbin(materialize('$str')) WITH ROLLUP WITH TOTALS FORMAT NULL"
|
||||
done
|
||||
|
||||
word_size=2
|
||||
for i in $(seq 1 $((word_size+1))); do
|
||||
str=$(printf "%${i}s" | tr ' ' 'x')
|
||||
$CLICKHOUSE_CLIENT -q "SELECT count() FROM numbers(99) GROUP BY unhex(toFixedString(materialize('$str'), $i)) WITH ROLLUP WITH TOTALS FORMAT NULL"
|
||||
done
|
||||
|
||||
word_size=2
|
||||
for i in $(seq 1 $((word_size+1))); do
|
||||
str=$(printf "%${i}s" | tr ' ' 'x')
|
||||
$CLICKHOUSE_CLIENT -q "SELECT count() FROM numbers(99) GROUP BY unhex(materialize('$str')) WITH ROLLUP WITH TOTALS FORMAT NULL"
|
||||
done
|
Loading…
Reference in New Issue
Block a user