Merge pull request #66952 from ClickHouse/backport/24.6/66579

Backport #66579 to 24.6: Fix weak hash for sparse
This commit is contained in:
Dmitry Novik 2024-07-29 13:55:25 +02:00 committed by GitHub
commit dddd8e2695
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
41 changed files with 197 additions and 189 deletions

View File

@ -362,13 +362,10 @@ void ColumnAggregateFunction::updateHashWithValue(size_t n, SipHash & hash) cons
hash.update(wbuf.str().c_str(), wbuf.str().size());
}
void ColumnAggregateFunction::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnAggregateFunction::getWeakHash32() const
{
auto s = data.size();
if (hash.getData().size() != data.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), hash.getData().size());
WeakHash32 hash(s);
auto & hash_data = hash.getData();
std::vector<UInt8> v;
@ -379,6 +376,8 @@ void ColumnAggregateFunction::updateWeakHash32(WeakHash32 & hash) const
wbuf.finalize();
hash_data[i] = ::updateWeakHash32(v.data(), v.size(), hash_data[i]);
}
return hash;
}
void ColumnAggregateFunction::updateHashFast(SipHash & hash) const

View File

@ -170,7 +170,7 @@ public:
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -271,15 +271,12 @@ void ColumnArray::updateHashWithValue(size_t n, SipHash & hash) const
getData().updateHashWithValue(offset + i, hash);
}
void ColumnArray::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnArray::getWeakHash32() const
{
auto s = offsets->size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", s, hash.getData().size());
WeakHash32 hash(s);
WeakHash32 internal_hash(data->size());
data->updateWeakHash32(internal_hash);
WeakHash32 internal_hash = data->getWeakHash32();
Offset prev_offset = 0;
const auto & offsets_data = getOffsets();
@ -300,6 +297,8 @@ void ColumnArray::updateWeakHash32(WeakHash32 & hash) const
prev_offset = offsets_data[i];
}
return hash;
}
void ColumnArray::updateHashFast(SipHash & hash) const

View File

@ -82,7 +82,7 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
void insert(const Field & x) override;

View File

@ -3,6 +3,7 @@
#include <optional>
#include <Core/Field.h>
#include <Columns/IColumn.h>
#include <Common/WeakHash.h>
#include <IO/BufferWithOwnMemory.h>
@ -94,7 +95,7 @@ public:
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeDecompressed(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeDecompressed(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeDecompressed(); }
void updateWeakHash32(WeakHash32 &) const override { throwMustBeDecompressed(); }
WeakHash32 getWeakHash32() const override { throwMustBeDecompressed(); }
void updateHashFast(SipHash &) const override { throwMustBeDecompressed(); }
ColumnPtr filter(const Filter &, ssize_t) const override { throwMustBeDecompressed(); }
void expand(const Filter &, bool) override { throwMustBeDecompressed(); }

View File

@ -137,18 +137,10 @@ void ColumnConst::updatePermutation(PermutationSortDirection /*direction*/, Perm
{
}
void ColumnConst::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnConst::getWeakHash32() const
{
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 element_hash(1);
data->updateWeakHash32(element_hash);
size_t data_hash = element_hash.getData()[0];
for (auto & value : hash.getData())
value = static_cast<UInt32>(intHashCRC32(data_hash, value));
WeakHash32 element_hash = data->getWeakHash32();
return WeakHash32(s, element_hash.getData()[0]);
}
void ColumnConst::compareColumn(

View File

@ -190,7 +190,7 @@ public:
data->updateHashWithValue(0, hash);
}
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override
{

View File

@ -28,7 +28,6 @@ namespace ErrorCodes
extern const int PARAMETER_OUT_OF_BOUND;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int NOT_IMPLEMENTED;
extern const int LOGICAL_ERROR;
}
template <is_decimal T>
@ -72,13 +71,10 @@ void ColumnDecimal<T>::updateHashWithValue(size_t n, SipHash & hash) const
}
template <is_decimal T>
void ColumnDecimal<T>::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnDecimal<T>::getWeakHash32() const
{
auto s = data.size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
const T * begin = data.data();
const T * end = begin + s;
@ -90,6 +86,8 @@ void ColumnDecimal<T>::updateWeakHash32(WeakHash32 & hash) const
++begin;
++hash_data;
}
return hash;
}
template <is_decimal T>

View File

@ -90,7 +90,7 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
int compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint) const override;
void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,

View File

@ -4,6 +4,7 @@
#include <Columns/ColumnVector.h>
#include <Columns/ColumnVariant.h>
#include <DataTypes/IDataType.h>
#include <Common/WeakHash.h>
namespace DB
@ -167,9 +168,9 @@ public:
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override
WeakHash32 getWeakHash32() const override
{
variant_column->updateWeakHash32(hash);
return variant_column->getWeakHash32();
}
void updateHashFast(SipHash & hash) const override

View File

@ -129,14 +129,10 @@ void ColumnFixedString::updateHashWithValue(size_t index, SipHash & hash) const
hash.update(reinterpret_cast<const char *>(&chars[n * index]), n);
}
void ColumnFixedString::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnFixedString::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, "
"hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
const UInt8 * pos = chars.data();
UInt32 * hash_data = hash.getData().data();
@ -148,6 +144,8 @@ void ColumnFixedString::updateWeakHash32(WeakHash32 & hash) const
pos += n;
++hash_data;
}
return hash;
}
void ColumnFixedString::updateHashFast(SipHash & hash) const

View File

@ -125,7 +125,7 @@ public:
void updateHashWithValue(size_t index, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -4,6 +4,7 @@
#include <Core/NamesAndTypes.h>
#include <Core/ColumnsWithTypeAndName.h>
#include <Columns/IColumn.h>
#include <Common/WeakHash.h>
namespace DB
@ -122,9 +123,9 @@ public:
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "updateHashWithValue is not implemented for {}", getName());
}
void updateWeakHash32(WeakHash32 &) const override
WeakHash32 getWeakHash32() const override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "updateWeakHash32 is not implemented for {}", getName());
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "getWeakHash32 is not implemented for {}", getName());
}
void updateHashFast(SipHash &) const override

View File

@ -7,8 +7,7 @@
#include <Common/HashTable/HashMap.h>
#include <Common/WeakHash.h>
#include <Common/assert_cast.h>
#include "Storages/IndicesDescription.h"
#include "base/types.h"
#include <base/types.h>
#include <base/sort.h>
#include <base/scope_guard.h>
@ -312,19 +311,10 @@ const char * ColumnLowCardinality::skipSerializedInArena(const char * pos) const
return getDictionary().skipSerializedInArena(pos);
}
void ColumnLowCardinality::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnLowCardinality::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
const auto & dict = getDictionary().getNestedColumn();
WeakHash32 dict_hash(dict->size());
dict->updateWeakHash32(dict_hash);
idx.updateWeakHash(hash, dict_hash);
WeakHash32 dict_hash = getDictionary().getNestedColumn()->getWeakHash32();
return idx.getWeakHash(dict_hash);
}
void ColumnLowCardinality::updateHashFast(SipHash & hash) const
@ -820,10 +810,11 @@ bool ColumnLowCardinality::Index::containsDefault() const
return contains;
}
void ColumnLowCardinality::Index::updateWeakHash(WeakHash32 & hash, WeakHash32 & dict_hash) const
WeakHash32 ColumnLowCardinality::Index::getWeakHash(const WeakHash32 & dict_hash) const
{
WeakHash32 hash(positions->size());
auto & hash_data = hash.getData();
auto & dict_hash_data = dict_hash.getData();
const auto & dict_hash_data = dict_hash.getData();
auto update_weak_hash = [&](auto x)
{
@ -832,10 +823,11 @@ void ColumnLowCardinality::Index::updateWeakHash(WeakHash32 & hash, WeakHash32 &
auto size = data.size();
for (size_t i = 0; i < size; ++i)
hash_data[i] = static_cast<UInt32>(intHashCRC32(dict_hash_data[data[i]], hash_data[i]));
hash_data[i] = dict_hash_data[data[i]];
};
callForType(std::move(update_weak_hash), size_of_type);
return hash;
}
void ColumnLowCardinality::Index::collectSerializedValueSizes(

View File

@ -103,7 +103,7 @@ public:
getDictionary().updateHashWithValue(getIndexes().getUInt(n), hash);
}
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash &) const override;
@ -313,7 +313,7 @@ public:
bool containsDefault() const;
void updateWeakHash(WeakHash32 & hash, WeakHash32 & dict_hash) const;
WeakHash32 getWeakHash(const WeakHash32 & dict_hash) const;
void collectSerializedValueSizes(PaddedPODArray<UInt64> & sizes, const PaddedPODArray<UInt64> & dict_sizes) const;

View File

@ -143,9 +143,9 @@ void ColumnMap::updateHashWithValue(size_t n, SipHash & hash) const
nested->updateHashWithValue(n, hash);
}
void ColumnMap::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnMap::getWeakHash32() const
{
nested->updateWeakHash32(hash);
return nested->getWeakHash32();
}
void ColumnMap::updateHashFast(SipHash & hash) const

View File

@ -64,7 +64,7 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
void insertFrom(const IColumn & src_, size_t n) override;
void insertManyFrom(const IColumn & src, size_t position, size_t length) override;

View File

@ -56,25 +56,21 @@ void ColumnNullable::updateHashWithValue(size_t n, SipHash & hash) const
getNestedColumn().updateHashWithValue(n, hash);
}
void ColumnNullable::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnNullable::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 old_hash = hash;
nested_column->updateWeakHash32(hash);
WeakHash32 hash = nested_column->getWeakHash32();
const auto & null_map_data = getNullMapData();
auto & hash_data = hash.getData();
auto & old_hash_data = old_hash.getData();
/// Use old data for nulls.
/// Use default for nulls.
for (size_t row = 0; row < s; ++row)
if (null_map_data[row])
hash_data[row] = old_hash_data[row];
hash_data[row] = WeakHash32::kDefaultInitialValue;
return hash;
}
void ColumnNullable::updateHashFast(SipHash & hash) const

View File

@ -119,7 +119,7 @@ public:
void protect() override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
void getExtremes(Field & min, Field & max) const override;
// Special function for nullable minmax index

View File

@ -5,6 +5,7 @@
#include <Core/Names.h>
#include <DataTypes/Serializations/SubcolumnsTree.h>
#include <Common/PODArray.h>
#include <Common/WeakHash.h>
#include <DataTypes/IDataType.h>
@ -241,7 +242,7 @@ public:
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
void updateWeakHash32(WeakHash32 &) const override { throwMustBeConcrete(); }
WeakHash32 getWeakHash32() const override { throwMustBeConcrete(); }
void updateHashFast(SipHash &) const override { throwMustBeConcrete(); }
void expand(const Filter &, bool) override { throwMustBeConcrete(); }
bool hasEqualValues() const override { throwMustBeConcrete(); }

View File

@ -666,20 +666,22 @@ void ColumnSparse::updateHashWithValue(size_t n, SipHash & hash) const
values->updateHashWithValue(getValueIndex(n), hash);
}
void ColumnSparse::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnSparse::getWeakHash32() const
{
if (hash.getData().size() != _size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", _size, hash.getData().size());
WeakHash32 values_hash = values->getWeakHash32();
WeakHash32 hash(size());
auto & hash_data = hash.getData();
auto & values_hash_data = values_hash.getData();
auto offset_it = begin();
auto & hash_data = hash.getData();
for (size_t i = 0; i < _size; ++i, ++offset_it)
{
size_t value_index = offset_it.getValueIndex();
auto data_ref = values->getDataAt(value_index);
hash_data[i] = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(data_ref.data), data_ref.size, hash_data[i]);
hash_data[i] = values_hash_data[value_index];
}
return hash;
}
void ColumnSparse::updateHashFast(SipHash & hash) const

View File

@ -127,7 +127,7 @@ public:
void protect() override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
void getExtremes(Field & min, Field & max) const override;

View File

@ -104,13 +104,10 @@ MutableColumnPtr ColumnString::cloneResized(size_t to_size) const
return res;
}
void ColumnString::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnString::getWeakHash32() const
{
auto s = offsets.size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
const UInt8 * pos = chars.data();
UInt32 * hash_data = hash.getData().data();
@ -126,6 +123,8 @@ void ColumnString::updateWeakHash32(WeakHash32 & hash) const
prev_offset = offset;
++hash_data;
}
return hash;
}

View File

@ -204,7 +204,7 @@ public:
hash.update(reinterpret_cast<const char *>(&chars[offset]), string_size);
}
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override
{

View File

@ -300,16 +300,15 @@ void ColumnTuple::updateHashWithValue(size_t n, SipHash & hash) const
column->updateHashWithValue(n, hash);
}
void ColumnTuple::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnTuple::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
for (const auto & column : columns)
column->updateWeakHash32(hash);
hash.update(column->getWeakHash32());
return hash;
}
void ColumnTuple::updateHashFast(SipHash & hash) const

View File

@ -74,7 +74,7 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;

View File

@ -777,36 +777,26 @@ void ColumnVariant::updateHashWithValue(size_t n, SipHash & hash) const
variants[localDiscriminatorByGlobal(global_discr)]->updateHashWithValue(offsetAt(n), hash);
}
void ColumnVariant::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnVariant::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
/// If we have only NULLs, keep hash unchanged.
if (hasOnlyNulls())
return;
return WeakHash32(s);
/// Optimization for case when there is only 1 non-empty variant and no NULLs.
/// In this case we can just calculate weak hash for this variant.
if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
{
variants[*non_empty_local_discr]->updateWeakHash32(hash);
return;
}
return variants[*non_empty_local_discr]->getWeakHash32();
/// Calculate weak hash for all variants.
std::vector<WeakHash32> nested_hashes;
for (const auto & variant : variants)
{
WeakHash32 nested_hash(variant->size());
variant->updateWeakHash32(nested_hash);
nested_hashes.emplace_back(std::move(nested_hash));
}
nested_hashes.emplace_back(variant->getWeakHash32());
/// For each row hash is a hash of corresponding row from corresponding variant.
WeakHash32 hash(s);
auto & hash_data = hash.getData();
const auto & local_discriminators_data = getLocalDiscriminators();
const auto & offsets_data = getOffsets();
@ -815,11 +805,10 @@ void ColumnVariant::updateWeakHash32(WeakHash32 & hash) const
Discriminator discr = local_discriminators_data[i];
/// Update hash only for non-NULL values
if (discr != NULL_DISCRIMINATOR)
{
auto nested_hash = nested_hashes[local_discriminators_data[i]].getData()[offsets_data[i]];
hash_data[i] = static_cast<UInt32>(hashCRC32(nested_hash, hash_data[i]));
}
hash_data[i] = nested_hashes[discr].getData()[offsets_data[i]];
}
return hash;
}
void ColumnVariant::updateHashFast(SipHash & hash) const

View File

@ -203,7 +203,7 @@ public:
const char * deserializeVariantAndInsertFromArena(Discriminator global_discr, const char * pos);
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;

View File

@ -73,13 +73,10 @@ void ColumnVector<T>::updateHashWithValue(size_t n, SipHash & hash) const
}
template <typename T>
void ColumnVector<T>::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnVector<T>::getWeakHash32() const
{
auto s = data.size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
const T * begin = data.data();
const T * end = begin + s;
@ -91,6 +88,8 @@ void ColumnVector<T>::updateWeakHash32(WeakHash32 & hash) const
++begin;
++hash_data;
}
return hash;
}
template <typename T>

View File

@ -106,7 +106,7 @@ public:
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -277,10 +277,10 @@ public:
/// passed bytes to hash must identify sequence of values unambiguously.
virtual void updateHashWithValue(size_t n, SipHash & hash) const = 0;
/// Update hash function value. Hash is calculated for each element.
/// Get hash function value. Hash is calculated for each element.
/// It's a fast weak hash function. It is mainly needed to scatter data between threads.
/// WeakHash32 must have the same size as column.
virtual void updateWeakHash32(WeakHash32 & hash) const = 0;
virtual WeakHash32 getWeakHash32() const = 0;
/// Update state of hash with all column.
virtual void updateHashFast(SipHash & hash) const = 0;

View File

@ -1,6 +1,7 @@
#pragma once
#include <Columns/IColumn.h>
#include <Common/WeakHash.h>
namespace DB
@ -59,8 +60,9 @@ public:
{
}
void updateWeakHash32(WeakHash32 & /*hash*/) const override
WeakHash32 getWeakHash32() const override
{
return WeakHash32(s);
}
void updateHashFast(SipHash & /*hash*/) const override

View File

@ -1,6 +1,7 @@
#pragma once
#include <optional>
#include <Columns/IColumn.h>
#include <Common/WeakHash.h>
namespace DB
{
@ -162,9 +163,9 @@ public:
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method scatter is not supported for ColumnUnique.");
}
void updateWeakHash32(WeakHash32 &) const override
WeakHash32 getWeakHash32() const override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method updateWeakHash32 is not supported for ColumnUnique.");
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getWeakHash32 is not supported for ColumnUnique.");
}
void updateHashFast(SipHash &) const override

View File

@ -60,8 +60,7 @@ TEST(WeakHash32, ColumnVectorU8)
data.push_back(i);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -77,8 +76,7 @@ TEST(WeakHash32, ColumnVectorI8)
data.push_back(i);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -94,8 +92,7 @@ TEST(WeakHash32, ColumnVectorU16)
data.push_back(i);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -111,8 +108,7 @@ TEST(WeakHash32, ColumnVectorI16)
data.push_back(i);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -128,8 +124,7 @@ TEST(WeakHash32, ColumnVectorU32)
data.push_back(i << 16u);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -145,8 +140,7 @@ TEST(WeakHash32, ColumnVectorI32)
data.push_back(i << 16);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -162,8 +156,7 @@ TEST(WeakHash32, ColumnVectorU64)
data.push_back(i << 32u);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -179,8 +172,7 @@ TEST(WeakHash32, ColumnVectorI64)
data.push_back(i << 32);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -204,8 +196,7 @@ TEST(WeakHash32, ColumnVectorU128)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}
@ -221,8 +212,7 @@ TEST(WeakHash32, ColumnVectorI128)
data.push_back(i << 32);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -238,8 +228,7 @@ TEST(WeakHash32, ColumnDecimal32)
data.push_back(i << 16);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -255,8 +244,7 @@ TEST(WeakHash32, ColumnDecimal64)
data.push_back(i << 32);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -272,8 +260,7 @@ TEST(WeakHash32, ColumnDecimal128)
data.push_back(i << 32);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -294,8 +281,7 @@ TEST(WeakHash32, ColumnString1)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -331,8 +317,7 @@ TEST(WeakHash32, ColumnString2)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -369,8 +354,7 @@ TEST(WeakHash32, ColumnString3)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -397,8 +381,7 @@ TEST(WeakHash32, ColumnFixedString)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -444,8 +427,7 @@ TEST(WeakHash32, ColumnArray)
auto col_arr = ColumnArray::create(std::move(val), std::move(off));
WeakHash32 hash(col_arr->size());
col_arr->updateWeakHash32(hash);
WeakHash32 hash = col_arr->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}
@ -479,8 +461,7 @@ TEST(WeakHash32, ColumnArray2)
auto col_arr = ColumnArray::create(std::move(val), std::move(off));
WeakHash32 hash(col_arr->size());
col_arr->updateWeakHash32(hash);
WeakHash32 hash = col_arr->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}
@ -536,8 +517,7 @@ TEST(WeakHash32, ColumnArrayArray)
auto col_arr = ColumnArray::create(std::move(val), std::move(off));
auto col_arr_arr = ColumnArray::create(std::move(col_arr), std::move(off2));
WeakHash32 hash(col_arr_arr->size());
col_arr_arr->updateWeakHash32(hash);
WeakHash32 hash = col_arr_arr->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}
@ -555,8 +535,7 @@ TEST(WeakHash32, ColumnConst)
auto col_const = ColumnConst::create(std::move(inner_col), 256);
WeakHash32 hash(col_const->size());
col_const->updateWeakHash32(hash);
WeakHash32 hash = col_const->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -576,8 +555,7 @@ TEST(WeakHash32, ColumnLowcardinality)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -602,8 +580,7 @@ TEST(WeakHash32, ColumnNullable)
auto col_null = ColumnNullable::create(std::move(col), std::move(mask));
WeakHash32 hash(col_null->size());
col_null->updateWeakHash32(hash);
WeakHash32 hash = col_null->getWeakHash32();
checkColumn(hash.getData(), eq);
}
@ -633,8 +610,7 @@ TEST(WeakHash32, ColumnTupleUInt64UInt64)
columns.emplace_back(std::move(col2));
auto col_tuple = ColumnTuple::create(std::move(columns));
WeakHash32 hash(col_tuple->size());
col_tuple->updateWeakHash32(hash);
WeakHash32 hash = col_tuple->getWeakHash32();
checkColumn(hash.getData(), eq);
}
@ -671,8 +647,7 @@ TEST(WeakHash32, ColumnTupleUInt64String)
columns.emplace_back(std::move(col2));
auto col_tuple = ColumnTuple::create(std::move(columns));
WeakHash32 hash(col_tuple->size());
col_tuple->updateWeakHash32(hash);
WeakHash32 hash = col_tuple->getWeakHash32();
checkColumn(hash.getData(), eq);
}
@ -709,8 +684,7 @@ TEST(WeakHash32, ColumnTupleUInt64FixedString)
columns.emplace_back(std::move(col2));
auto col_tuple = ColumnTuple::create(std::move(columns));
WeakHash32 hash(col_tuple->size());
col_tuple->updateWeakHash32(hash);
WeakHash32 hash = col_tuple->getWeakHash32();
checkColumn(hash.getData(), eq);
}
@ -756,8 +730,7 @@ TEST(WeakHash32, ColumnTupleUInt64Array)
columns.emplace_back(ColumnArray::create(std::move(val), std::move(off)));
auto col_tuple = ColumnTuple::create(std::move(columns));
WeakHash32 hash(col_tuple->size());
col_tuple->updateWeakHash32(hash);
WeakHash32 hash = col_tuple->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}

View File

@ -1,2 +1,24 @@
#include <Common/WeakHash.h>
#include <Common/Exception.h>
#include <Common/HashTable/Hash.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// Combines `other` into this hash element-wise: every data[i] is replaced by
/// intHashCRC32(other.data[i], data[i]).
/// Throws an Exception with ErrorCodes::LOGICAL_ERROR when the two hashes differ in size.
void WeakHash32::update(const WeakHash32 & other)
{
    const size_t size = data.size();

    /// Adjacent string literals are concatenated verbatim, so the space after the colon
    /// has to be written explicitly to keep the message readable.
    if (size != other.data.size())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match: "
            "left size is {}, right size is {}", size, other.data.size());

    for (size_t i = 0; i < size; ++i)
        data[i] = static_cast<UInt32>(intHashCRC32(other.data[i], data[i]));
}
}

View File

@ -11,9 +11,8 @@ namespace DB
/// The main reason this class is needed is to support data initialization. Initially, every bit is 1.
class WeakHash32
{
static constexpr UInt32 kDefaultInitialValue = ~UInt32(0);
public:
static constexpr UInt32 kDefaultInitialValue = ~UInt32(0);
using Container = PaddedPODArray<UInt32>;
@ -22,6 +21,8 @@ public:
void reset(size_t size, UInt32 initial_value = kDefaultInitialValue) { data.assign(size, initial_value); }
void update(const WeakHash32 & other);
const Container & getData() const { return data; }
Container & getData() { return data; }

View File

@ -271,7 +271,7 @@ IColumn::Selector ConcurrentHashJoin::selectDispatchBlock(const Strings & key_co
{
const auto & key_col = from_block.getByName(key_name).column->convertToFullColumnIfConst();
const auto & key_col_no_lc = recursiveRemoveLowCardinality(recursiveRemoveSparse(key_col));
key_col_no_lc->updateWeakHash32(hash);
hash.update(key_col_no_lc->getWeakHash32());
}
return hashToSelector(hash, num_shards);
}

View File

@ -554,7 +554,7 @@ static Blocks scatterBlockByHashImpl(const Strings & key_columns_names, const Bl
for (const auto & key_name : key_columns_names)
{
ColumnPtr key_col = materializeColumn(block, key_name);
key_col->updateWeakHash32(hash);
hash.update(key_col->getWeakHash32());
}
auto selector = hashToSelector(hash, sharder);

View File

@ -109,7 +109,7 @@ void ScatterByPartitionTransform::generateOutputChunks()
hash.reset(num_rows);
for (const auto & column_number : key_columns)
columns[column_number]->updateWeakHash32(hash);
hash.update(columns[column_number]->getWeakHash32());
const auto & hash_data = hash.getData();
IColumn::Selector selector(num_rows);

View File

@ -0,0 +1,10 @@
false 1 1
true 1 1
---
false 1 1
false 1 2
false 1 3
true 1 1
true 1 2
---
-755809149 0

View File

@ -0,0 +1,33 @@
-- Regression test for window functions that PARTITION BY a Bool column whose rows
-- were written by several separate INSERTs (see the commit title: "Fix weak hash for sparse").
-- NOTE(review): presumably the Bool column ends up with sparse serialization in some parts,
-- which is what exercised the broken weak hash — confirm against the original issue.
create table t(c Int32, d Bool) Engine=MergeTree order by c;
-- Merges are stopped before inserting; NOTE(review): presumably so that every INSERT below stays a separate part — confirm.
system stop merges t;
insert into t values (1, 0);
insert into t values (1, 0);
insert into t values (1, 1);
insert into t values (1, 0)(1, 1);
-- Case 1: keep only the first row of each partition; the reference expects exactly one row per value of d.
SELECT d, c, row_number() over (partition by d order by c) as c8 FROM t qualify c8=1 order by d settings max_threads=2, allow_experimental_analyzer = 1;
SELECT '---';
-- Case 2: all rows with their row numbers; the reference expects 1..3 for d = false and 1..2 for d = true.
SELECT d, c, row_number() over (partition by d order by c) as c8 FROM t order by d, c8 settings max_threads=2;
SELECT '---';
drop table t;
-- Case 3: same shape of problem with an aggregate window function (min) partitioned by the Bool column s.
create table t (
c Int32 primary key ,
s Bool ,
w Float64
);
system stop merges t;
insert into t values(439499072,true,0),(1393290072,true,0);
insert into t values(-1317193174,false,0),(1929066636,false,0);
insert into t values(-2,false,0),(1962246186,true,0),(2054878592,false,0);
insert into t values(-1893563136,true,41.55);
insert into t values(-1338380855,true,-0.7),(-991301833,true,0),(-755809149,false,43.18),(-41,true,0),(3,false,0),(255,false,0),(255,false,0),(189195893,false,0),(195550885,false,9223372036854776000);
-- The reference expects the single row "-755809149 0": the running minimum of w within the s = false partition.
SELECT * FROM (
SELECT c, min(w) OVER (PARTITION BY s ORDER BY c ASC, s ASC, w ASC)
FROM t limit toUInt64(-1))
WHERE c = -755809149;