ClickHouse/src/Interpreters/BloomFilter.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

151 lines
4.9 KiB
C++
Raw Normal View History

2019-02-20 11:22:07 +00:00
#include <Interpreters/BloomFilter.h>
2019-02-20 09:02:19 +00:00
#include <city.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnLowCardinality.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeLowCardinality.h>
2019-02-20 09:02:19 +00:00
namespace DB
{
2020-02-25 18:10:48 +00:00
/// Error codes used in this file. They are only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
2019-02-20 09:02:19 +00:00
2019-02-23 14:46:40 +00:00
/// Multiplier and increment used by find() and add() to derive the seed of the second hash
/// from the filter seed: SEED_GEN_A * seed + SEED_GEN_B.
static constexpr UInt64 SEED_GEN_A = 845897321;
static constexpr UInt64 SEED_GEN_B = 217728422;
2020-07-10 08:21:40 +00:00
/// Largest filter_size accepted by the BloomFilterParameters constructor (1 << 30).
static constexpr UInt64 MAX_BLOOM_FILTER_SIZE = 1 << 30;
2020-07-10 08:13:21 +00:00
/// Stores the bloom filter parameters and validates them.
/// Throws BAD_ARGUMENTS when the size is zero, when the number of hash functions is zero,
/// or when the size is greater than MAX_BLOOM_FILTER_SIZE (checked in that order).
BloomFilterParameters::BloomFilterParameters(size_t filter_size_, size_t filter_hashes_, size_t seed_)
    : filter_size(filter_size_)
    , filter_hashes(filter_hashes_)
    , seed(seed_)
{
    if (!filter_size_)
    {
        throw Exception("The size of bloom filter cannot be zero", ErrorCodes::BAD_ARGUMENTS);
    }

    if (!filter_hashes_)
    {
        throw Exception("The number of hash functions for bloom filter cannot be zero", ErrorCodes::BAD_ARGUMENTS);
    }

    if (MAX_BLOOM_FILTER_SIZE < filter_size_)
    {
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of bloom filter cannot be more than {}", MAX_BLOOM_FILTER_SIZE);
    }
}
2020-05-28 12:37:05 +00:00
/// Builds a filter from a parameter bundle by delegating to the (size, hashes, seed) constructor.
BloomFilter::BloomFilter(const BloomFilterParameters & params)
    : BloomFilter(params.filter_size, params.filter_hashes, params.seed) {}
2019-05-10 03:42:28 +00:00
/// Creates a zero-filled filter of `size_` bytes that probes `hashes_` bits per element.
/// The storage is rounded up to a whole number of UnderType words.
/// Both `size_` and `hashes_` are asserted to be non-zero.
BloomFilter::BloomFilter(size_t size_, size_t hashes_, size_t seed_)
    : size(size_)
    , hashes(hashes_)
    , seed(seed_)
    , words((size_ + sizeof(UnderType) - 1) / sizeof(UnderType))
    , filter(words, 0)
{
    assert(size != 0);
    assert(hashes != 0);
}
2019-02-20 09:02:19 +00:00
2019-05-10 03:42:28 +00:00
bool BloomFilter::find(const char * data, size_t len)
2019-02-20 09:02:19 +00:00
{
2019-02-23 14:46:40 +00:00
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
2019-02-20 09:02:19 +00:00
for (size_t i = 0; i < hashes; ++i)
{
2019-02-23 14:46:40 +00:00
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
2019-02-23 15:56:48 +00:00
if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))))
2019-02-20 09:02:19 +00:00
return false;
}
return true;
}
2019-05-10 03:42:28 +00:00
void BloomFilter::add(const char * data, size_t len)
2019-02-20 09:02:19 +00:00
{
2019-02-23 14:46:40 +00:00
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
2019-02-20 09:02:19 +00:00
for (size_t i = 0; i < hashes; ++i)
{
2019-02-23 14:46:40 +00:00
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
2019-02-23 15:56:48 +00:00
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
2019-02-20 09:02:19 +00:00
}
}
2019-05-10 03:42:28 +00:00
void BloomFilter::clear()
2019-02-20 12:48:50 +00:00
{
2019-02-23 15:56:48 +00:00
filter.assign(words, 0);
2019-02-20 12:48:50 +00:00
}
2019-05-10 03:42:28 +00:00
bool BloomFilter::contains(const BloomFilter & bf)
2019-02-20 09:02:19 +00:00
{
2019-02-23 15:56:48 +00:00
for (size_t i = 0; i < words; ++i)
2019-02-20 09:02:19 +00:00
{
if ((filter[i] & bf.filter[i]) != bf.filter[i])
return false;
}
return true;
}
2019-05-10 03:42:28 +00:00
/// Returns 1 when none of the `words` words of the filter has a bit set, 0 otherwise.
UInt64 BloomFilter::isEmpty() const
{
    /// Advance past leading zero words; the filter is empty only if the scan reaches the end.
    size_t idx = 0;
    while (idx < words && filter[idx] == 0)
        ++idx;

    return idx == words;
}
2019-05-10 03:42:28 +00:00
bool operator== (const BloomFilter & a, const BloomFilter & b)
2019-02-20 19:27:23 +00:00
{
2019-02-23 15:56:48 +00:00
for (size_t i = 0; i < a.words; ++i)
2019-02-20 19:27:23 +00:00
if (a.filter[i] != b.filter[i])
return false;
return true;
}
2019-06-19 15:09:07 +00:00
void BloomFilter::addHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
2019-05-10 03:42:28 +00:00
{
2019-06-19 15:09:07 +00:00
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
2019-05-10 03:42:28 +00:00
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
}
2019-06-19 15:09:07 +00:00
bool BloomFilter::findHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
2019-05-10 03:42:28 +00:00
{
2019-06-19 15:09:07 +00:00
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
2019-05-10 03:42:28 +00:00
return bool(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType)))));
}
2019-11-01 15:31:02 +00:00
/// Unwraps Array, Nullable and LowCardinality wrappers recursively and returns the remaining type.
/// Throws BAD_ARGUMENTS when an Array is nested directly inside an Array.
/// A type that is none of these wrappers is returned unchanged.
DataTypePtr BloomFilter::getPrimitiveType(const DataTypePtr & data_type)
{
    const auto * raw_type = data_type.get();

    if (const auto * as_array = typeid_cast<const DataTypeArray *>(raw_type))
    {
        const auto & element_type = as_array->getNestedType();
        if (typeid_cast<const DataTypeArray *>(element_type.get()))
            throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index.", ErrorCodes::BAD_ARGUMENTS);

        return getPrimitiveType(element_type);
    }

    if (const auto * as_nullable = typeid_cast<const DataTypeNullable *>(raw_type))
        return getPrimitiveType(as_nullable->getNestedType());

    if (const auto * as_low_cardinality = typeid_cast<const DataTypeLowCardinality *>(raw_type))
        return getPrimitiveType(as_low_cardinality->getDictionaryType());

    return data_type;
}
2019-11-01 15:31:02 +00:00
/// Unwraps Array, Nullable and LowCardinality columns recursively and returns the remaining column:
/// the data column of an Array, the nested column of a Nullable, and the result of
/// convertToFullColumnIfLowCardinality() for a LowCardinality column.
/// A column that is none of these wrappers is returned unchanged.
ColumnPtr BloomFilter::getPrimitiveColumn(const ColumnPtr & column)
{
    const auto * raw_column = column.get();

    if (const auto * as_array = typeid_cast<const ColumnArray *>(raw_column))
        return getPrimitiveColumn(as_array->getDataPtr());

    if (const auto * as_nullable = typeid_cast<const ColumnNullable *>(raw_column))
        return getPrimitiveColumn(as_nullable->getNestedColumnPtr());

    if (const auto * as_low_cardinality = typeid_cast<const ColumnLowCardinality *>(raw_column))
        return getPrimitiveColumn(as_low_cardinality->convertToFullColumnIfLowCardinality());

    return column;
}
2019-02-26 19:37:07 +00:00
}