ClickHouse/src/Interpreters/BloomFilter.cpp
2023-10-18 14:40:34 +08:00

158 lines
5.1 KiB
C++

#include <Interpreters/BloomFilter.h>
#include <city.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnLowCardinality.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeLowCardinality.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
static constexpr UInt64 SEED_GEN_A = 845897321;
static constexpr UInt64 SEED_GEN_B = 217728422;
static constexpr UInt64 MAX_BLOOM_FILTER_SIZE = 1 << 30;
BloomFilterParameters::BloomFilterParameters(size_t filter_size_, size_t filter_hashes_, size_t seed_)
: filter_size(filter_size_), filter_hashes(filter_hashes_), seed(seed_)
{
if (filter_size == 0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of bloom filter cannot be zero");
if (filter_hashes == 0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The number of hash functions for bloom filter cannot be zero");
if (filter_size > MAX_BLOOM_FILTER_SIZE)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of bloom filter cannot be more than {}", MAX_BLOOM_FILTER_SIZE);
}
BloomFilter::BloomFilter(const BloomFilterParameters & params)
: BloomFilter(params.filter_size, params.filter_hashes, params.seed)
{
}
BloomFilter::BloomFilter(size_t size_, size_t hashes_, size_t seed_)
: size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), filter(words, 0)
{
chassert(size != 0);
chassert(hashes != 0);
}
void BloomFilter::resize(size_t size_)
{
size = size_;
words = ((size + sizeof(UnderType) - 1) / sizeof(UnderType));
filter.resize(words);
}
bool BloomFilter::find(const char * data, size_t len)
{
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
for (size_t i = 0; i < hashes; ++i)
{
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))))
return false;
}
return true;
}
void BloomFilter::add(const char * data, size_t len)
{
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
for (size_t i = 0; i < hashes; ++i)
{
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
}
}
void BloomFilter::clear()
{
filter.assign(words, 0);
}
bool BloomFilter::contains(const BloomFilter & bf)
{
for (size_t i = 0; i < words; ++i)
{
if ((filter[i] & bf.filter[i]) != bf.filter[i])
return false;
}
return true;
}
UInt64 BloomFilter::isEmpty() const
{
for (size_t i = 0; i < words; ++i)
if (filter[i] != 0)
return false;
return true;
}
bool operator== (const BloomFilter & a, const BloomFilter & b)
{
for (size_t i = 0; i < a.words; ++i)
if (a.filter[i] != b.filter[i])
return false;
return true;
}
void BloomFilter::addHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
{
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
}
bool BloomFilter::findHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
{
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
return bool(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType)))));
}
DataTypePtr BloomFilter::getPrimitiveType(const DataTypePtr & data_type)
{
if (const auto * array_type = typeid_cast<const DataTypeArray *>(data_type.get()))
{
if (!typeid_cast<const DataTypeArray *>(array_type->getNestedType().get()))
return getPrimitiveType(array_type->getNestedType());
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type {} of bloom filter index.", data_type->getName());
}
if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(data_type.get()))
return getPrimitiveType(nullable_type->getNestedType());
if (const auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(data_type.get()))
return getPrimitiveType(low_cardinality_type->getDictionaryType());
return data_type;
}
ColumnPtr BloomFilter::getPrimitiveColumn(const ColumnPtr & column)
{
if (const auto * array_col = typeid_cast<const ColumnArray *>(column.get()))
return getPrimitiveColumn(array_col->getDataPtr());
if (const auto * nullable_col = typeid_cast<const ColumnNullable *>(column.get()))
return getPrimitiveColumn(nullable_col->getNestedColumnPtr());
if (const auto * low_cardinality_col = typeid_cast<const ColumnLowCardinality *>(column.get()))
return getPrimitiveColumn(low_cardinality_col->convertToFullColumnIfLowCardinality());
return column;
}
}