ClickHouse/dbms/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp

#include <Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <Common/HashTable/Hash.h>
#include <ext/bit_cast.h>
#include <Interpreters/BloomFilterHash.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}

MergeTreeIndexGranuleBloomFilter::MergeTreeIndexGranuleBloomFilter(size_t bits_per_row_, size_t hash_functions_, size_t index_columns_)
    : bits_per_row(bits_per_row_), hash_functions(hash_functions_)
{
    total_rows = 0;
    bloom_filters.resize(index_columns_);
}
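
/// Build the granule from blocks of precomputed per-column hash values (ColumnUInt64, see BloomFilterHash).
/// One BloomFilter is allocated per indexed column and sized to bits_per_row * total_rows bits, rounded up
/// to whole bytes: e.g. bits_per_row = 10 and total_rows = 8192 gives (10 * 8192 + 7) / 8 = 10240 bytes.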
MergeTreeIndexGranuleBloomFilter::MergeTreeIndexGranuleBloomFilter(
    size_t bits_per_row_, size_t hash_functions_, size_t total_rows_, const Blocks & granule_index_blocks_)
    : total_rows(total_rows_), bits_per_row(bits_per_row_), hash_functions(hash_functions_)
{
    if (granule_index_blocks_.empty() || !total_rows)
        throw Exception("LOGICAL ERROR: granule_index_blocks empty or total_rows is zero.", ErrorCodes::LOGICAL_ERROR);

    assertGranuleBlocksStructure(granule_index_blocks_);

    for (size_t index = 0; index < granule_index_blocks_.size(); ++index)
    {
        Block granule_index_block = granule_index_blocks_[index];

        if (unlikely(!granule_index_block || !granule_index_block.rows()))
            throw Exception("LOGICAL ERROR: granule_index_block is empty.", ErrorCodes::LOGICAL_ERROR);

        if (index == 0)
        {
            static size_t atom_size = 8;
            size_t bytes_size = (bits_per_row * total_rows + atom_size - 1) / atom_size;

            for (size_t column = 0, columns = granule_index_block.columns(); column < columns; ++column)
                bloom_filters.emplace_back(std::make_shared<BloomFilter>(bytes_size, hash_functions, 0));
        }

        for (size_t column = 0, columns = granule_index_block.columns(); column < columns; ++column)
            fillingBloomFilter(bloom_filters[column], granule_index_block, column);
    }
}
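
/// The granule is empty until rows have been accumulated or deserialized.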
bool MergeTreeIndexGranuleBloomFilter::empty() const
{
    return !total_rows;
}
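
/// Read the granule back: total_rows as a varint, then the raw filter bytes for each column,
/// each filter being reallocated to the size implied by bits_per_row and total_rows.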
void MergeTreeIndexGranuleBloomFilter::deserializeBinary(ReadBuffer & istr)
{
    if (!empty())
        throw Exception("Cannot read data to a non-empty bloom filter index.", ErrorCodes::LOGICAL_ERROR);

    readVarUInt(total_rows, istr);
    for (size_t index = 0; index < bloom_filters.size(); ++index)
    {
        static size_t atom_size = 8;
        size_t bytes_size = (bits_per_row * total_rows + atom_size - 1) / atom_size;
        bloom_filters[index] = std::make_shared<BloomFilter>(bytes_size, hash_functions, 0);
        istr.read(reinterpret_cast<char *>(bloom_filters[index]->getFilter().data()), bytes_size);
    }
}
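
/// Write the granule in the same layout: total_rows as a varint, then the raw bytes of every filter.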
void MergeTreeIndexGranuleBloomFilter::serializeBinary(WriteBuffer & ostr) const
{
    if (empty())
        throw Exception("Attempt to write empty bloom filter index.", ErrorCodes::LOGICAL_ERROR);

    static size_t atom_size = 8;
    writeVarUInt(total_rows, ostr);
    size_t bytes_size = (bits_per_row * total_rows + atom_size - 1) / atom_size;
    for (const auto & bloom_filter : bloom_filters)
        ostr.write(reinterpret_cast<const char *>(bloom_filter->getFilter().data()), bytes_size);
}
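
/// All granule blocks must have the same structure, since the column position is used to pick the target filter.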
void MergeTreeIndexGranuleBloomFilter::assertGranuleBlocksStructure(const Blocks & granule_index_blocks) const
{
    Block prev_block;
    for (size_t index = 0; index < granule_index_blocks.size(); ++index)
    {
        Block granule_index_block = granule_index_blocks[index];

        if (index != 0)
            assertBlocksHaveEqualStructure(prev_block, granule_index_block, "Granule blocks of bloom filter index have different structure.");

        prev_block = granule_index_block;
    }
}
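
/// Add every base hash from the column to the filter, combining it with hash_functions per-function seeds
/// (BloomFilterHash::bf_hash_seed) via addHashWithSeed.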
void MergeTreeIndexGranuleBloomFilter::fillingBloomFilter(BloomFilterPtr & bf, const Block & granule_index_block, size_t index_hash_column)
{
    const auto & column = granule_index_block.getByPosition(index_hash_column);

    if (const auto hash_column = typeid_cast<const ColumnUInt64 *>(column.column.get()))
    {
        const auto & hash_column_vec = hash_column->getData();

        for (size_t index = 0, size = hash_column_vec.size(); index < size; ++index)
        {
            const UInt64 & bf_base_hash = hash_column_vec[index];

            for (size_t i = 0; i < hash_functions; ++i)
                bf->addHashWithSeed(bf_base_hash, BloomFilterHash::bf_hash_seed[i]);
        }
    }
}

}