ClickHouse/dbms/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp

#include <Storages/MergeTree/MergeTreeIndexBloomFilter.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Interpreters/SyntaxAnalyzer.h>
#include <Interpreters/ExpressionAnalyzer.h>
#include <Core/Types.h>
#include <ext/bit_cast.h>
#include <Parsers/ASTLiteral.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h>
#include <Parsers/queryToString.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnLowCardinality.h>
#include <Interpreters/BloomFilterHash.h>


namespace DB
{

/// Error codes referenced by this translation unit. These are declarations only (extern, no initializer);
/// the definitions are provided elsewhere.
namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int INCORRECT_QUERY;
    /// Used by assertIndexColumnsType() when an index column has an unsupported type.
    /// Declared explicitly so this file does not depend on another header happening to declare it.
    extern const int ILLEGAL_COLUMN;
}

/// Constructs a bloom filter index.
/// name_, expr_, columns_, data_types_, header_ and granularity_ are forwarded unchanged to IMergeTreeIndex;
/// bits_per_row_ and hash_functions_ are stored in the corresponding members of this class.
MergeTreeIndexBloomFilter::MergeTreeIndexBloomFilter(
    const String & name_,
    const ExpressionActionsPtr & expr_,
    const Names & columns_,
    const DataTypes & data_types_,
    const Block & header_,
    size_t granularity_,
    size_t bits_per_row_,
    size_t hash_functions_)
    : IMergeTreeIndex(name_, expr_, columns_, data_types_, header_, granularity_)
    , bits_per_row(bits_per_row_)
    , hash_functions(hash_functions_)
{
}

/// Creates a MergeTreeIndexGranuleBloomFilter from this index's bits_per_row, hash_functions
/// and the number of index columns.
MergeTreeIndexGranulePtr MergeTreeIndexBloomFilter::createIndexGranule() const
{
    const auto index_columns_count = columns.size();
    return std::make_shared<MergeTreeIndexGranuleBloomFilter>(bits_per_row, hash_functions, index_columns_count);
}

bool MergeTreeIndexBloomFilter::mayBenefitFromIndexForIn(const ASTPtr & node) const
{
    const String & column_name = node->getColumnName();

    for (const auto & cname : columns)
        if (column_name == cname)
            return true;

    if (const auto * func = typeid_cast<const ASTFunction *>(node.get()))
    {
        for (const auto & children : func->arguments->children)
            if (mayBenefitFromIndexForIn(children))
                return true;
    }

    return false;
}

/// Creates a MergeTreeIndexAggregatorBloomFilter from this index's bits_per_row, hash_functions
/// and the index column names.
MergeTreeIndexAggregatorPtr MergeTreeIndexBloomFilter::createIndexAggregator() const
{
    MergeTreeIndexAggregatorPtr aggregator
        = std::make_shared<MergeTreeIndexAggregatorBloomFilter>(bits_per_row, hash_functions, columns);
    return aggregator;
}

/// Creates a MergeTreeIndexConditionBloomFilter for the given query, passing along the index header
/// and the number of hash functions.
MergeTreeIndexConditionPtr MergeTreeIndexBloomFilter::createIndexCondition(
    const SelectQueryInfo & query_info, const Context & context) const
{
    MergeTreeIndexConditionPtr condition
        = std::make_shared<MergeTreeIndexConditionBloomFilter>(query_info, context, header, hash_functions);
    return condition;
}

static void assertIndexColumnsType(const Block & header)
{
    if (!header || !header.columns())
        throw Exception("Index must have columns.", ErrorCodes::INCORRECT_QUERY);

    const DataTypes & columns_data_types = header.getDataTypes();

    for (auto & type : columns_data_types)
    {
        const IDataType * actual_type = BloomFilter::getPrimitiveType(type).get();
        WhichDataType which(actual_type);

        if (!which.isUInt() && !which.isInt() && !which.isString() && !which.isFixedString() && !which.isFloat() &&
            !which.isDateOrDateTime() && !which.isEnum())
            throw Exception("Unexpected type " + type->getName() + " of bloom filter index.",
                            ErrorCodes::ILLEGAL_COLUMN);
    }
}

std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreatorNew(
    const NamesAndTypesList & columns, std::shared_ptr<ASTIndexDeclaration> node, const Context & context)
{
    if (node->name.empty())
        throw Exception("Index must have unique name.", ErrorCodes::INCORRECT_QUERY);

    ASTPtr expr_list = MergeTreeData::extractKeyExpressionList(node->expr->clone());

    auto syntax = SyntaxAnalyzer(context, {}).analyze(expr_list, columns);
    auto index_expr = ExpressionAnalyzer(expr_list, syntax, context).getActions(false);
    auto index_sample = ExpressionAnalyzer(expr_list, syntax, context).getActions(true)->getSampleBlock();

    assertIndexColumnsType(index_sample);

    double max_conflict_probability = 0.025;
    if (node->type->arguments && !node->type->arguments->children.empty())
        max_conflict_probability = typeid_cast<const ASTLiteral &>(*node->type->arguments->children[0]).value.get<Float64>();

    const auto & bits_per_row_and_size_of_hash_functions = BloomFilterHash::calculationBestPractices(max_conflict_probability);

    return std::make_unique<MergeTreeIndexBloomFilter>(
        node->name, std::move(index_expr), index_sample.getNames(), index_sample.getDataTypes(), index_sample, node->granularity,
        bits_per_row_and_size_of_hash_functions.first, bits_per_row_and_size_of_hash_functions.second);
}

}
support bloom filter for any type 2019-05-10 03:42:28 +00:00			`#include <Storages/MergeTree/MergeTreeIndexBloomFilter.h>`
			`#include <Storages/MergeTree/MergeTreeData.h>`
			`#include <Interpreters/SyntaxAnalyzer.h>`
			`#include <Interpreters/ExpressionAnalyzer.h>`
			`#include <Core/Types.h>`
			`#include <ext/bit_cast.h>`
			`#include <Parsers/ASTLiteral.h>`
			`#include <IO/ReadHelpers.h>`
			`#include <IO/WriteHelpers.h>`
Make bloom_filter-index support some types of Arrays. 2019-09-17 11:18:04 +00:00			`#include <DataTypes/DataTypeArray.h>`
support bloom filter for any type 2019-05-10 03:42:28 +00:00			`#include <DataTypes/DataTypeNullable.h>`
			`#include <Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h>`
			`#include <Parsers/queryToString.h>`
			`#include <Columns/ColumnConst.h>`
Made bloom_filter type of index supporting LowCardinality and Nullable 2019-10-15 04:22:51 +00:00			`#include <Columns/ColumnLowCardinality.h>`
fix float in bloom filter 2019-06-19 15:09:07 +00:00			`#include <Interpreters/BloomFilterHash.h>`
support bloom filter for any type 2019-05-10 03:42:28 +00:00

			`namespace DB`
			`{`

			`namespace ErrorCodes`
			`{`
			`extern const int LOGICAL_ERROR;`
			`extern const int INCORRECT_QUERY;`
			`}`

			`MergeTreeIndexBloomFilter::MergeTreeIndexBloomFilter(`
fix build and test failure 2019-06-20 00:33:37 +00:00			`const String & name_, const ExpressionActionsPtr & expr_, const Names & columns_, const DataTypes & data_types_, const Block & header_,`
			`size_t granularity_, size_t bits_per_row_, size_t hash_functions_)`
			`: IMergeTreeIndex(name_, expr_, columns_, data_types_, header_, granularity_), bits_per_row(bits_per_row_),`
			`hash_functions(hash_functions_)`
support bloom filter for any type 2019-05-10 03:42:28 +00:00			`{`
			`}`

			`MergeTreeIndexGranulePtr MergeTreeIndexBloomFilter::createIndexGranule() const`
			`{`
			`return std::make_shared<MergeTreeIndexGranuleBloomFilter>(bits_per_row, hash_functions, columns.size());`
			`}`

			`bool MergeTreeIndexBloomFilter::mayBenefitFromIndexForIn(const ASTPtr & node) const`
			`{`
convert type with condition 2019-06-19 10:50:37 +00:00			`const String & column_name = node->getColumnName();`
support bloom filter for any type 2019-05-10 03:42:28 +00:00
adding -Wshadow for GCC 2019-08-03 11:02:40 +00:00			`for (const auto & cname : columns)`
			`if (column_name == cname)`
support bloom filter for any type 2019-05-10 03:42:28 +00:00			`return true;`

			`if (const auto * func = typeid_cast<const ASTFunction *>(node.get()))`
convert type with condition 2019-06-19 10:50:37 +00:00			`{`
			`for (const auto & children : func->arguments->children)`
			`if (mayBenefitFromIndexForIn(children))`
			`return true;`
			`}`
support bloom filter for any type 2019-05-10 03:42:28 +00:00
			`return false;`
			`}`

			`MergeTreeIndexAggregatorPtr MergeTreeIndexBloomFilter::createIndexAggregator() const`
			`{`
			`return std::make_shared<MergeTreeIndexAggregatorBloomFilter>(bits_per_row, hash_functions, columns);`
			`}`

fix code style & rename minmax, set 2019-06-19 15:30:48 +00:00			`MergeTreeIndexConditionPtr MergeTreeIndexBloomFilter::createIndexCondition(const SelectQueryInfo & query_info, const Context & context) const`
support bloom filter for any type 2019-05-10 03:42:28 +00:00			`{`
			`return std::make_shared<MergeTreeIndexConditionBloomFilter>(query_info, context, header, hash_functions);`
			`}`

fix code style & rename minmax, set 2019-06-19 15:30:48 +00:00			`static void assertIndexColumnsType(const Block & header)`
convert type with condition 2019-06-19 10:50:37 +00:00			`{`
			`if (!header \|\| !header.columns())`
			`throw Exception("Index must have columns.", ErrorCodes::INCORRECT_QUERY);`

			`const DataTypes & columns_data_types = header.getDataTypes();`

Code cleanup. 2019-11-01 15:31:02 +00:00			`for (auto & type : columns_data_types)`
convert type with condition 2019-06-19 10:50:37 +00:00			`{`
Code cleanup. 2019-11-01 15:31:02 +00:00			`const IDataType * actual_type = BloomFilter::getPrimitiveType(type).get();`
Made bloom_filter type of index supporting LowCardinality and Nullable 2019-10-15 04:22:51 +00:00			`WhichDataType which(actual_type);`
Make bloom_filter-index support some types of Arrays. 2019-09-17 11:18:04 +00:00
convert type with condition 2019-06-19 10:50:37 +00:00			`if (!which.isUInt() && !which.isInt() && !which.isString() && !which.isFixedString() && !which.isFloat() &&`
Removed useless code and improve type checking for bloom_filter index 2019-09-23 03:40:48 +00:00			`!which.isDateOrDateTime() && !which.isEnum())`
Code cleanup. 2019-11-01 15:31:02 +00:00			`throw Exception("Unexpected type " + type->getName() + " of bloom filter index.",`
convert type with condition 2019-06-19 10:50:37 +00:00			`ErrorCodes::ILLEGAL_COLUMN);`
			`}`
			`}`

fix code style & rename minmax, set 2019-06-19 15:30:48 +00:00			`std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreatorNew(`
			`const NamesAndTypesList & columns, std::shared_ptr<ASTIndexDeclaration> node, const Context & context)`
support bloom filter for any type 2019-05-10 03:42:28 +00:00			`{`
			`if (node->name.empty())`
			`throw Exception("Index must have unique name.", ErrorCodes::INCORRECT_QUERY);`

			`ASTPtr expr_list = MergeTreeData::extractKeyExpressionList(node->expr->clone());`

			`auto syntax = SyntaxAnalyzer(context, {}).analyze(expr_list, columns);`
			`auto index_expr = ExpressionAnalyzer(expr_list, syntax, context).getActions(false);`
			`auto index_sample = ExpressionAnalyzer(expr_list, syntax, context).getActions(true)->getSampleBlock();`

convert type with condition 2019-06-19 10:50:37 +00:00			`assertIndexColumnsType(index_sample);`
support bloom filter for any type 2019-05-10 03:42:28 +00:00
			`double max_conflict_probability = 0.025;`
			`if (node->type->arguments && !node->type->arguments->children.empty())`
			`max_conflict_probability = typeid_cast<const ASTLiteral &>(*node->type->arguments->children[0]).value.get<Float64>();`

fix float in bloom filter 2019-06-19 15:09:07 +00:00			`const auto & bits_per_row_and_size_of_hash_functions = BloomFilterHash::calculationBestPractices(max_conflict_probability);`
support bloom filter for any type 2019-05-10 03:42:28 +00:00
			`return std::make_unique<MergeTreeIndexBloomFilter>(`
			`node->name, std::move(index_expr), index_sample.getNames(), index_sample.getDataTypes(), index_sample, node->granularity,`
			`bits_per_row_and_size_of_hash_functions.first, bits_per_row_and_size_of_hash_functions.second);`
			`}`

			`}`