Fix Array and Map support with Keyed hashing

When working with materialized key columns and rows containing Arrays
or Maps (implemented as Arrays of Tuples) with multiple values,
the keyed hash functions erroneously refused to proceed, because
they misinterpreted the output vector size.

Close #61497

That issue was reported as a security issue, but it did not actually
have any security impact.
The usefulness of keyed hashing over Maps is also questionable, but
we support it for completeness.
This commit is contained in:
Salvatore Mesoraca 2024-05-10 15:25:21 +02:00
parent 2c6d0c69ab
commit 5004c22583
No known key found for this signature in database
GPG Key ID: 0567E50A25403074
3 changed files with 33 additions and 1 deletions

View File

@ -49,6 +49,8 @@
#include <base/bit_cast.h>
#include <base/unaligned.h>
#include <algorithm>
namespace DB
{
@ -75,17 +77,29 @@ namespace impl
/// First component of the key; getKey() reads it as a ColumnUInt64.
ColumnPtr key0;
/// Second component of the key; getKey() reads it as a ColumnUInt64.
ColumnPtr key1;
/// When true, getKey() resets the requested index to 0 before looking up the key.
bool is_const;
/// Optional array offsets (nullptr when unused). When set, size() reports
/// offsets->back() and getKey() translates its index through these offsets.
const ColumnArray::Offsets * offsets{};
/// Number of hash values these key columns are expected to cover.
/// Without offsets this is the number of rows in the key columns; with
/// non-empty offsets it is the last offset (offsets->back()).
size_t size() const
{
    assert(key0 && key1);
    assert(key0->size() == key1->size());
    assert(offsets == nullptr || offsets->size() == key0->size());
    /// back() on an empty offsets container is undefined behaviour (this happens
    /// when there are no rows at all), so only use it when there is at least one
    /// offset and otherwise fall back to the key column size.
    if (offsets != nullptr && offsets->size() != 0)
        return offsets->back();
    return key0->size();
}
/// Returns the key {key0[i], key1[i]} to use for position `i`.
/// If `is_const` is set, `i` is first reset to 0. If `offsets` is set, `i` is
/// then replaced by the position of the first offset strictly greater than it.
/// Throws Exception with ErrorCodes::LOGICAL_ERROR when no such offset exists.
SipHashKey getKey(size_t i) const
{
if (is_const)
i = 0;
/// NOTE(review): this branch also runs when is_const is set, after i was reset
/// to 0. If the leading offsets are 0, the lookup yields an index > 0 - confirm
/// that the key columns hold that many rows in the constant case.
if (offsets != nullptr)
{
const auto begin = offsets->begin();
/// First offset strictly greater than i; its position becomes the key index.
auto upper = std::upper_bound(begin, offsets->end(), i);
/// No offset is greater than i, i.e. i is not below the last offset.
if (upper == offsets->end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "offset {} not found in function SipHashKeyColumns::getKey", i);
i = upper - begin;
}
/// Both key components are read as UInt64 columns.
const auto & key0data = assert_cast<const ColumnUInt64 &>(*key0).getData();
const auto & key1data = assert_cast<const ColumnUInt64 &>(*key1).getData();
return {key0data[i], key1data[i]};
@ -1112,6 +1126,14 @@ private:
typename ColumnVector<ToType>::Container vec_temp(nested_size);
bool nested_is_first = true;
/// For keyed hashing, pass the nested call a temporary key-columns object built
/// from key_cols with its `offsets` pointing at this array's offsets, so that
/// size()/getKey() on it go through the offsets. The non-keyed path passes
/// key_cols through unchanged.
if constexpr (Keyed)
{
KeyColumnsType key_cols_tmp{key_cols};
key_cols_tmp.offsets = &offsets;
executeForArgument(key_cols_tmp, nested_type, nested_column, vec_temp, nested_is_first);
}
else
executeForArgument(key_cols, nested_type, nested_column, vec_temp, nested_is_first);
const size_t size = offsets.size();

View File

@ -236,3 +236,6 @@ Check asan bug
0
Check bug found fuzzing
9042C6691B1A75F0EA3314B6F55728BB
Check bug 2 found fuzzing
608E1FF030C9E206185B112C2A25F1A7
ABB65AE97711A2E053E324ED88B1D08B

View File

@ -338,3 +338,10 @@ SELECT sipHash128((toUInt64(9223372036854775806), 1)) = sipHash128(1) GROUP BY s
SELECT 'Check bug found fuzzing';
SELECT [(255, 1048575)], sipHash128ReferenceKeyed((toUInt64(2147483646), toUInt64(9223372036854775807)), ([(NULL, 100), (NULL, NULL), (1024, 10)], toUInt64(2), toUInt64(1024)), ''), hex(sipHash128ReferenceKeyed((-9223372036854775807, 1.), '-1', NULL)), ('', toUInt64(65535), [(9223372036854775807, 9223372036854775806)], toUInt64(65536)), arrayJoin((NULL, 65537, 255), [(NULL, NULL)]) GROUP BY tupleElement((NULL, NULL, NULL, -1), toUInt64(2), 2) = NULL; -- { serverError NOT_IMPLEMENTED }
SELECT hex(sipHash128ReferenceKeyed((0::UInt64, 0::UInt64), ([1, 1])));
SELECT 'Check bug 2 found fuzzing';
-- Keyed hashing over a Map column with several entries per row, using a key
-- tuple whose second component is materialized (a non-constant key column).
-- The final SELECT is expected to return one hash per table row.
DROP TABLE IF EXISTS sipHashKeyed_keys;
CREATE TABLE sipHashKeyed_keys (`a` Map(String, String)) ENGINE = Memory;
INSERT INTO sipHashKeyed_keys FORMAT VALUES ({'a':'b', 'c':'d'}), ({'e':'f', 'g':'h'});
SELECT hex(sipHash128ReferenceKeyed((0::UInt64, materialize(0::UInt64)), a)) FROM sipHashKeyed_keys ORDER BY a;
DROP TABLE sipHashKeyed_keys;