ClickHouse/src/Storages/MergeTree/MergeTreeIndices.h

230 lines
7.5 KiB
C++
Raw Normal View History

2018-12-26 12:19:24 +00:00
#pragma once
#include <string>
#include <unordered_map>
#include <vector>
2018-12-26 17:34:44 +00:00
#include <memory>
#include <utility>
2018-12-26 17:34:44 +00:00
#include <Core/Block.h>
2020-05-27 18:38:34 +00:00
#include <Storages/StorageInMemoryMetadata.h>
2019-01-02 14:24:26 +00:00
#include <Storages/MergeTree/MergeTreeDataPartChecksum.h>
2018-12-29 11:12:41 +00:00
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/MarkRange.h>
2018-12-26 17:34:44 +00:00
#include <Interpreters/ExpressionActions.h>
#include <DataTypes/DataTypeLowCardinality.h>
2018-12-26 12:19:24 +00:00
2019-01-05 09:26:02 +00:00
/// Prefix of the file names of data skipping indices:
/// IMergeTreeIndex::getFileName() prepends it to the index name.
constexpr auto INDEX_FILE_PREFIX = "skp_idx_";
2018-12-26 12:19:24 +00:00
namespace DB
{
2021-11-21 20:02:33 +00:00
/// Error codes referenced by this header. They are only declared here (extern);
/// the definitions live in another translation unit.
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Version number of the serialized format of an index. Zero is used as the "unknown" marker.
using MergeTreeIndexVersion = uint8_t;

/// A serialized index format: the format version together with the file extension it is stored under.
struct MergeTreeIndexFormat
{
    /// Format version; 0 means the format is unknown.
    MergeTreeIndexVersion version;

    /// File extension that corresponds to `version`.
    const char * extension;

    /// True if the format is known (version != 0).
    /// Explicit: usable in conditions (`if (format)`, `!format`), but the struct
    /// never converts silently to bool and, through it, to an integer.
    explicit operator bool() const { return version != 0; }
};
2019-02-06 07:49:18 +00:00
/// Stores some info about a single block of data.
struct IMergeTreeIndexGranule
2019-01-03 16:47:42 +00:00
{
2019-02-06 07:49:18 +00:00
virtual ~IMergeTreeIndexGranule() = default;
2019-01-03 16:47:42 +00:00
/// Serialize always last version.
2019-01-04 14:33:38 +00:00
virtual void serializeBinary(WriteBuffer & ostr) const = 0;
/// Version of the index to deserialize:
///
/// - 2 -- minmax index for proper Nullable support,
/// - 1 -- everything else.
///
/// Implementation is responsible for version check,
/// and throw LOGICAL_ERROR in case of unsupported version.
///
/// See also:
/// - IMergeTreeIndex::getSerializedFileExtension()
/// - IMergeTreeIndex::getDeserializedFormat()
/// - MergeTreeDataMergerMutator::collectFilesToSkip()
/// - MergeTreeDataMergerMutator::collectFilesForRenames()
virtual void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) = 0;
2019-01-03 16:47:42 +00:00
2019-01-04 14:33:38 +00:00
virtual bool empty() const = 0;
2019-03-08 15:36:39 +00:00
};
using MergeTreeIndexGranulePtr = std::shared_ptr<IMergeTreeIndexGranule>;
using MergeTreeIndexGranules = std::vector<MergeTreeIndexGranulePtr>;
/// Aggregates info about a single block of data.
struct IMergeTreeIndexAggregator
{
virtual ~IMergeTreeIndexAggregator() = default;
virtual bool empty() const = 0;
2019-03-08 19:52:21 +00:00
virtual MergeTreeIndexGranulePtr getGranuleAndReset() = 0;
2019-02-06 07:49:18 +00:00
/// Updates the stored info using rows of the specified block.
/// Reads no more than `limit` rows.
/// After finishing updating `pos` will store the position of the first row which was not read.
virtual void update(const Block & block, size_t * pos, size_t limit) = 0;
2019-01-03 16:47:42 +00:00
};
2019-03-08 15:36:39 +00:00
using MergeTreeIndexAggregatorPtr = std::shared_ptr<IMergeTreeIndexAggregator>;
using MergeTreeIndexAggregators = std::vector<MergeTreeIndexAggregatorPtr>;
2019-01-03 16:47:42 +00:00
2019-02-06 07:49:18 +00:00
2019-01-07 12:51:14 +00:00
/// Condition on the index.
2019-06-19 15:30:48 +00:00
class IMergeTreeIndexCondition
2019-01-26 06:26:49 +00:00
{
2019-01-07 12:51:14 +00:00
public:
2019-06-19 15:30:48 +00:00
virtual ~IMergeTreeIndexCondition() = default;
2019-01-07 12:51:14 +00:00
/// Checks if this index is useful for query.
virtual bool alwaysUnknownOrTrue() const = 0;
2019-01-08 17:27:44 +00:00
virtual bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const = 0;
2019-01-06 15:22:04 +00:00
};
2019-06-19 15:30:48 +00:00
using MergeTreeIndexConditionPtr = std::shared_ptr<IMergeTreeIndexCondition>;
2021-05-02 19:16:40 +00:00
using MergeTreeIndexConditions = std::vector<MergeTreeIndexConditionPtr>;
2019-01-07 12:51:14 +00:00
2021-11-21 19:14:20 +00:00
/// Forward declaration: the merged condition below accepts indices through MergeTreeIndexPtr.
struct IMergeTreeIndex;
using MergeTreeIndexPtr = std::shared_ptr<const IMergeTreeIndex>;

/// IndexCondition that checks several indexes at the same time.
class IMergeTreeIndexMergedCondition
{
public:
    /// Remembers the granularity; it stays constant for the lifetime of the object.
    explicit IMergeTreeIndexMergedCondition(size_t granularity_) : granularity{granularity_} {}

    virtual ~IMergeTreeIndexMergedCondition() = default;

    /// Adds one more index to the set checked by this condition.
    virtual void addIndex(const MergeTreeIndexPtr & index) = 0;

    virtual bool alwaysUnknownOrTrue() const = 0;

    /// Evaluates the condition against a list of granules.
    /// NOTE(review): presumably one granule per added index -- confirm against implementations.
    virtual bool mayBeTrueOnGranule(const MergeTreeIndexGranules & granules) const = 0;

protected:
    const size_t granularity;
};
using MergeTreeIndexMergedConditionPtr = std::shared_ptr<IMergeTreeIndexMergedCondition>;
using MergeTreeIndexMergedConditions = std::vector<IMergeTreeIndexMergedCondition>;
2019-01-06 15:22:04 +00:00
2020-05-28 13:45:08 +00:00
struct IMergeTreeIndex
2018-12-26 12:19:24 +00:00
{
2020-05-28 13:09:03 +00:00
IMergeTreeIndex(const IndexDescription & index_)
2020-05-27 18:38:34 +00:00
: index(index_)
{
}
2018-12-26 12:19:24 +00:00
2019-02-06 07:49:18 +00:00
virtual ~IMergeTreeIndex() = default;
2018-12-26 12:19:24 +00:00
/// Returns filename without extension.
2020-05-27 18:38:34 +00:00
String getFileName() const { return INDEX_FILE_PREFIX + index.name; }
2021-05-02 19:16:40 +00:00
size_t getGranularity() const { return index.granularity; }
virtual bool isMergeable() const { return false; }
2019-01-05 09:26:02 +00:00
/// Returns extension for serialization.
/// Reimplement if you want new index format.
///
/// NOTE: In case getSerializedFileExtension() is reimplemented,
/// getDeserializedFormat() should be reimplemented too,
/// and check all previous extensions too
/// (to avoid breaking backward compatibility).
virtual const char* getSerializedFileExtension() const { return ".idx"; }
/// Returns extension for deserialization.
///
/// Return pair<extension, version>.
virtual MergeTreeIndexFormat getDeserializedFormat(const DiskPtr disk, const std::string & relative_path_prefix) const
{
if (disk->exists(relative_path_prefix + ".idx"))
return {1, ".idx"};
return {0 /*unknown*/, ""};
}
2019-02-25 08:43:19 +00:00
/// Checks whether the column is in data skipping index.
virtual bool mayBenefitFromIndexForIn(const ASTPtr & node) const = 0;
2019-01-04 14:33:38 +00:00
virtual MergeTreeIndexGranulePtr createIndexGranule() const = 0;
2020-05-27 18:38:34 +00:00
2019-03-08 19:52:21 +00:00
virtual MergeTreeIndexAggregatorPtr createIndexAggregator() const = 0;
2019-01-04 14:33:38 +00:00
2019-06-19 15:30:48 +00:00
virtual MergeTreeIndexConditionPtr createIndexCondition(
2021-11-21 19:14:20 +00:00
const SelectQueryInfo & query_info, ContextPtr context) const = 0;
2022-03-01 21:42:27 +00:00
virtual MergeTreeIndexMergedConditionPtr createIndexMergedCondition(
2021-11-21 19:14:20 +00:00
const SelectQueryInfo & /*query_info*/, StorageMetadataPtr /*storage_metadata*/) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"MergedCondition is not implemented for index of type {}", index.type);
}
2019-01-04 15:54:34 +00:00
2020-05-27 18:38:34 +00:00
Names getColumnsRequiredForIndexCalc() const { return index.expression->getRequiredColumns(); }
2019-08-28 18:23:20 +00:00
2020-05-28 13:09:03 +00:00
const IndexDescription & index;
2018-12-26 12:19:24 +00:00
};
2020-05-28 13:45:08 +00:00
using MergeTreeIndexPtr = std::shared_ptr<const IMergeTreeIndex>;
using MergeTreeIndices = std::vector<MergeTreeIndexPtr>;
2019-01-05 18:33:30 +00:00
class MergeTreeIndexFactory : private boost::noncopyable
2018-12-26 12:19:24 +00:00
{
public:
static MergeTreeIndexFactory & instance();
2020-05-28 13:45:08 +00:00
using Creator = std::function<MergeTreeIndexPtr(const IndexDescription & index)>;
2019-01-07 18:53:51 +00:00
2020-05-28 13:09:03 +00:00
using Validator = std::function<void(const IndexDescription & index, bool attach)>;
2018-12-26 12:19:24 +00:00
2020-05-28 13:09:03 +00:00
void validate(const IndexDescription & index, bool attach) const;
2018-12-26 12:19:24 +00:00
2020-05-28 13:45:08 +00:00
MergeTreeIndexPtr get(const IndexDescription & index) const;
2020-05-28 12:37:05 +00:00
2020-05-28 13:09:03 +00:00
MergeTreeIndices getMany(const std::vector<IndexDescription> & indices) const;
2020-05-28 12:37:05 +00:00
void registerCreator(const std::string & index_type, Creator creator);
2020-05-29 14:10:09 +00:00
void registerValidator(const std::string & index_type, Validator validator);
2018-12-26 12:19:24 +00:00
2018-12-26 17:34:44 +00:00
protected:
2019-02-06 09:05:05 +00:00
MergeTreeIndexFactory();
2018-12-26 17:34:44 +00:00
2018-12-26 12:19:24 +00:00
private:
2020-05-28 12:37:05 +00:00
using Creators = std::unordered_map<std::string, Creator>;
using Validators = std::unordered_map<std::string, Validator>;
Creators creators;
Validators validators;
2018-12-26 12:19:24 +00:00
};
2020-05-28 13:45:08 +00:00
/// Creator / validator pairs of the individual index types.
/// Their signatures match MergeTreeIndexFactory::Creator and MergeTreeIndexFactory::Validator.

MergeTreeIndexPtr minmaxIndexCreator(const IndexDescription & index);
void minmaxIndexValidator(const IndexDescription & index, bool attach);

MergeTreeIndexPtr setIndexCreator(const IndexDescription & index);
void setIndexValidator(const IndexDescription & index, bool attach);

MergeTreeIndexPtr bloomFilterIndexCreator(const IndexDescription & index);
void bloomFilterIndexValidator(const IndexDescription & index, bool attach);

MergeTreeIndexPtr bloomFilterIndexCreatorNew(const IndexDescription & index);
void bloomFilterIndexValidatorNew(const IndexDescription & index, bool attach);

MergeTreeIndexPtr hypothesisIndexCreator(const IndexDescription & index);
void hypothesisIndexValidator(const IndexDescription & index, bool attach);

MergeTreeIndexPtr AnnoyIndexCreator(const IndexDescription & index);
void AnnoyIndexValidator(const IndexDescription & index, bool attach);
2019-01-26 06:26:49 +00:00
}