2019-02-20 11:22:07 +00:00
|
|
|
#include <Interpreters/BloomFilter.h>
|
2019-02-20 09:02:19 +00:00
|
|
|
#include <city.h>
|
2019-10-15 04:22:51 +00:00
|
|
|
#include <Columns/ColumnArray.h>
|
|
|
|
#include <Columns/ColumnNullable.h>
|
|
|
|
#include <Columns/ColumnLowCardinality.h>
|
|
|
|
#include <DataTypes/DataTypeArray.h>
|
|
|
|
#include <DataTypes/DataTypeNullable.h>
|
|
|
|
#include <DataTypes/DataTypeLowCardinality.h>
|
2019-02-20 09:02:19 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2020-02-25 18:10:48 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
/// Used by BloomFilter::getPrimitiveType when rejecting an array nested directly inside an array.
extern const int BAD_ARGUMENTS;
|
|
|
|
}
|
2019-02-20 09:02:19 +00:00
|
|
|
|
2019-02-23 14:46:40 +00:00
|
|
|
/// Multiplier used to derive the seed of the second hash in find()/add():
/// second_seed = SEED_GEN_A * seed + SEED_GEN_B.
static constexpr UInt64 SEED_GEN_A = 845897321;
|
|
|
|
/// Additive term used to derive the seed of the second hash in find()/add():
/// second_seed = SEED_GEN_A * seed + SEED_GEN_B.
static constexpr UInt64 SEED_GEN_B = 217728422;
|
|
|
|
|
2020-05-28 12:37:05 +00:00
|
|
|
/// Builds a filter from a parameter bundle by delegating to the
/// (size, hashes, seed) constructor with the bundle's three fields.
BloomFilter::BloomFilter(const BloomFilterParameters & params)
    : BloomFilter(params.filter_size, params.filter_hashes, params.seed)
{
}
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
/// Creates a filter with every bit cleared.
///  size_   - filter size; bit positions elsewhere in this file are taken modulo 8 * size.
///  hashes_ - number of bit positions probed per element by find()/add().
///  seed_   - seed passed to the hash functions.
/// Storage is ceil(size / sizeof(UnderType)) words, all zero-initialised.
/// NOTE(review): `words` and `filter` are computed from the members `size` and `words`,
/// so this presumably relies on the declaration order in the header - confirm.
BloomFilter::BloomFilter(size_t size_, size_t hashes_, size_t seed_)
    : size(size_)
    , hashes(hashes_)
    , seed(seed_)
    , words((size + sizeof(UnderType) - 1) / sizeof(UnderType))
    , filter(words, 0)
{
}
|
2019-02-20 09:02:19 +00:00
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
bool BloomFilter::find(const char * data, size_t len)
|
2019-02-20 09:02:19 +00:00
|
|
|
{
|
2019-02-23 14:46:40 +00:00
|
|
|
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
|
|
|
|
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
|
|
|
|
|
2019-02-20 09:02:19 +00:00
|
|
|
for (size_t i = 0; i < hashes; ++i)
|
|
|
|
{
|
2019-02-23 14:46:40 +00:00
|
|
|
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
|
2019-02-23 15:56:48 +00:00
|
|
|
if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))))
|
2019-02-20 09:02:19 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
void BloomFilter::add(const char * data, size_t len)
|
2019-02-20 09:02:19 +00:00
|
|
|
{
|
2019-02-23 14:46:40 +00:00
|
|
|
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
|
|
|
|
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
|
|
|
|
|
2019-02-20 09:02:19 +00:00
|
|
|
for (size_t i = 0; i < hashes; ++i)
|
|
|
|
{
|
2019-02-23 14:46:40 +00:00
|
|
|
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
|
2019-02-23 15:56:48 +00:00
|
|
|
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
|
2019-02-20 09:02:19 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
void BloomFilter::clear()
|
2019-02-20 12:48:50 +00:00
|
|
|
{
|
2019-02-23 15:56:48 +00:00
|
|
|
filter.assign(words, 0);
|
2019-02-20 12:48:50 +00:00
|
|
|
}
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
bool BloomFilter::contains(const BloomFilter & bf)
|
2019-02-20 09:02:19 +00:00
|
|
|
{
|
2019-02-23 15:56:48 +00:00
|
|
|
for (size_t i = 0; i < words; ++i)
|
2019-02-20 09:02:19 +00:00
|
|
|
{
|
|
|
|
if ((filter[i] & bf.filter[i]) != bf.filter[i])
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
/// Returns 1 if no bit is set in any of the `words` storage words, 0 otherwise.
UInt64 BloomFilter::isEmpty() const
{
    size_t word_idx = 0;

    /// Skip over the leading run of all-zero words.
    while (word_idx < words && filter[word_idx] == 0)
        ++word_idx;

    /// The filter is empty exactly when the scan reached the end.
    return word_idx == words;
}
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
bool operator== (const BloomFilter & a, const BloomFilter & b)
|
2019-02-20 19:27:23 +00:00
|
|
|
{
|
2019-02-23 15:56:48 +00:00
|
|
|
for (size_t i = 0; i < a.words; ++i)
|
2019-02-20 19:27:23 +00:00
|
|
|
if (a.filter[i] != b.filter[i])
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-06-19 15:09:07 +00:00
|
|
|
void BloomFilter::addHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
|
2019-05-10 03:42:28 +00:00
|
|
|
{
|
2019-06-19 15:09:07 +00:00
|
|
|
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
|
2019-05-10 03:42:28 +00:00
|
|
|
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
|
|
|
|
}
|
|
|
|
|
2019-06-19 15:09:07 +00:00
|
|
|
bool BloomFilter::findHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
|
2019-05-10 03:42:28 +00:00
|
|
|
{
|
2019-06-19 15:09:07 +00:00
|
|
|
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
|
2019-05-10 03:42:28 +00:00
|
|
|
return bool(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType)))));
|
|
|
|
}
|
|
|
|
|
2019-11-01 15:31:02 +00:00
|
|
|
/// Strips Array, Nullable and LowCardinality wrappers from `data_type` and returns the
/// innermost type. A type matching none of these wrappers is returned unchanged.
/// Throws BAD_ARGUMENTS when an Array's nested type is itself an Array.
DataTypePtr BloomFilter::getPrimitiveType(const DataTypePtr & data_type)
{
    const auto * raw_type = data_type.get();

    if (const auto * as_array = typeid_cast<const DataTypeArray *>(raw_type))
    {
        const auto & element_type = as_array->getNestedType();

        /// Only one level of array nesting is accepted.
        if (typeid_cast<const DataTypeArray *>(element_type.get()))
            throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index.", ErrorCodes::BAD_ARGUMENTS);

        return getPrimitiveType(element_type);
    }

    if (const auto * as_nullable = typeid_cast<const DataTypeNullable *>(raw_type))
        return getPrimitiveType(as_nullable->getNestedType());

    if (const auto * as_low_cardinality = typeid_cast<const DataTypeLowCardinality *>(raw_type))
        return getPrimitiveType(as_low_cardinality->getDictionaryType());

    return data_type;
}
|
|
|
|
|
2019-11-01 15:31:02 +00:00
|
|
|
/// Strips Array, Nullable and LowCardinality wrappers from `column` and returns the innermost
/// column: an array yields its data column, a nullable yields its nested column, and a
/// low-cardinality column is converted to a full column. Any other column is returned unchanged.
ColumnPtr BloomFilter::getPrimitiveColumn(const ColumnPtr & column)
{
    const auto * raw_column = column.get();

    if (const auto * as_array = typeid_cast<const ColumnArray *>(raw_column))
        return getPrimitiveColumn(as_array->getDataPtr());

    if (const auto * as_nullable = typeid_cast<const ColumnNullable *>(raw_column))
        return getPrimitiveColumn(as_nullable->getNestedColumnPtr());

    if (const auto * as_low_cardinality = typeid_cast<const ColumnLowCardinality *>(raw_column))
        return getPrimitiveColumn(as_low_cardinality->convertToFullColumnIfLowCardinality());

    return column;
}
|
|
|
|
|
2019-02-26 19:37:07 +00:00
|
|
|
}
|