Instant count() for MergeTree

Use (Replicated)MergeTree's metadata to do trivial count()
2024-11-21 15:12:02 +00:00 · 2019-10-29 01:27:43 +08:00 · 2019-10-29 01:27:43 +08:00 · 2c75a51d4f
commit 2c75a51d4f
parent a17cefb9c6
12 changed files with 188 additions and 57 deletions
--- a/dbms/src/Core/Settings.h
+++ b/dbms/src/Core/Settings.h
@ -380,6 +380,7 @@ struct Settings : public SettingsCollection<Settings>
    M(SettingUInt64, min_free_disk_space_for_temporary_data, 0, "The minimum disk space to keep while writing temporary data used in external sorting and aggregation.") \
    \
    M(SettingBool, enable_scalar_subquery_optimization, true, "If it is set to true, prevent scalar subqueries from (de)serializing large scalar values and possibly avoid running the same subquery more than once.") \
    M(SettingBool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.") \
    \
    /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \
    \
--- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp
+++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp
@ -26,6 +26,7 @@
 #include <DataStreams/ReverseBlockInputStream.h>
 #include <DataStreams/FillingBlockInputStream.h>
 #include <DataStreams/SquashingBlockInputStream.h>
 #include <DataStreams/OneBlockInputStream.h>
 #include <Parsers/ASTFunction.h>
 #include <Parsers/ASTIdentifier.h>
@ -65,6 +66,7 @@
 #include <Common/checkStackSize.h>
 #include <Parsers/queryToString.h>
 #include <ext/map.h>
 #include <ext/scope_guard.h>
 #include <memory>
 #include <Processors/Sources/NullSource.h>
@ -90,6 +92,7 @@
 #include <Processors/Transforms/FinishSortingTransform.h>
 #include <DataTypes/DataTypeAggregateFunction.h>
 #include <DataStreams/materializeBlock.h>
 #include <IO/MemoryReadWriteBuffer.h>
 namespace DB
@ -1273,6 +1276,65 @@ void InterpreterSelectQuery::executeFetchColumns(
    auto & query = getSelectQuery();
    const Settings & settings = context.getSettingsRef();
    /// Optimization for trivial query like SELECT count() FROM table.
    auto check_trivial_count_query = [&]() -> std::optional<AggregateDescription>
    {
        if (!settings.optimize_trivial_count_query || !syntax_analyzer_result->maybe_optimize_trivial_count || !storage
            || query.sample_size() || query.sample_offset() || query.final() || query.prewhere() || query.where()
            || !query_analyzer->hasAggregation() || processing_stage != QueryProcessingStage::FetchColumns)
            return {};
        Names key_names;
        AggregateDescriptions aggregates;
        query_analyzer->getAggregateInfo(key_names, aggregates);
        if (aggregates.size() != 1)
            return {};
        const AggregateDescription & desc = aggregates[0];
        if (typeid_cast<AggregateFunctionCount *>(desc.function.get()))
            return desc;
        return {};
    };
    if (auto desc = check_trivial_count_query())
    {
        auto func = desc->function;
        std::optional<UInt64> num_rows = storage->totalRows();
        if (num_rows)
        {
            AggregateFunctionCount & agg_count = static_cast<AggregateFunctionCount &>(*func);
            /// We will process it up to "WithMergeableState".
            std::vector<char> state(agg_count.sizeOfData());
            AggregateDataPtr place = state.data();
            agg_count.create(place);
            SCOPE_EXIT(agg_count.destroy(place));
            MemoryWriteBuffer out;
            writeVarUInt(*num_rows, out);
            auto in = out.tryGetReadBuffer();
            agg_count.deserialize(place, *in, nullptr);
            auto column = ColumnAggregateFunction::create(func);
            column->insertFrom(place);
            Block block_with_count{
                {std::move(column), std::make_shared<DataTypeAggregateFunction>(func, DataTypes(), Array()), desc->column_name}};
            auto istream = std::make_shared<OneBlockInputStream>(block_with_count);
            if constexpr (pipeline_with_processors)
                pipeline.init({std::make_shared<SourceFromInputStream>(istream)});
            else
                pipeline.streams.emplace_back(istream);
            from_stage = QueryProcessingStage::WithMergeableState;
            analysis_result.first_stage = false;
            return;
        }
    }
    /// Actions to calculate ALIAS if required.
    ExpressionActionsPtr alias_actions;
--- a/dbms/src/Interpreters/SyntaxAnalyzer.cpp
+++ b/dbms/src/Interpreters/SyntaxAnalyzer.cpp
@ -727,6 +727,7 @@ void SyntaxAnalyzerResult::collectUsedColumns(const ASTPtr & query, const NamesA
    /// You need to read at least one column to find the number of rows.
    if (select_query && required.empty())
    {
        maybe_optimize_trivial_count = true;
        /// We will find a column with minimum <compressed_size, type_size, uncompressed_size>.
        /// Because it is the column that is cheapest to read.
        struct ColumnSizeTuple
--- a/dbms/src/Interpreters/SyntaxAnalyzer.h
+++ b/dbms/src/Interpreters/SyntaxAnalyzer.h
@ -48,6 +48,8 @@ struct SyntaxAnalyzerResult
    /// Results of scalar sub queries
    Scalars scalars;
    bool maybe_optimize_trivial_count = false;
    void collectUsedColumns(const ASTPtr & query, const NamesAndTypesList & additional_source_columns);
    Names requiredSourceColumns() const { return required_source_columns.getNames(); }
    const Scalars & getScalars() const { return scalars; }
--- a/dbms/src/Storages/IStorage.h
+++ b/dbms/src/Storages/IStorage.h
@ -406,6 +406,13 @@ public:
    /// Returns storage policy if storage supports it
    virtual DiskSpace::StoragePolicyPtr getStoragePolicy() const { return {}; }
    /** If it is possible to quickly determine exact number of rows in the table at this moment of time, then return it.
     */
    virtual std::optional<UInt64> totalRows() const
    {
        return {};
    }
 private:
    /// You always need to take the next three locks in this order.
--- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp
@ -2374,6 +2374,20 @@ size_t MergeTreeData::getTotalActiveSizeInBytes() const
 }
 size_t MergeTreeData::getTotalActiveSizeInRows() const
 {
    size_t res = 0;
    {
        auto lock = lockParts();
        for (auto & part : getDataPartsStateRange(DataPartState::Committed))
            res += part->rows_count;
    }
    return res;
 }
 size_t MergeTreeData::getPartsCount() const
 {
    auto lock = lockParts();
@ -2486,7 +2500,7 @@ void MergeTreeData::throwInsertIfNeeded() const
 }
 MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(
-    const MergeTreePartInfo & part_info, MergeTreeData::DataPartState state, DataPartsLock & /*lock*/)
+    const MergeTreePartInfo & part_info, MergeTreeData::DataPartState state, DataPartsLock & /*lock*/) const
 {
    auto current_state_parts_range = getDataPartsStateRange(state);
@ -2534,13 +2548,13 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy)
 }
-MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const MergeTreePartInfo & part_info)
+MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const MergeTreePartInfo & part_info) const
 {
    auto lock = lockParts();
    return getActiveContainingPart(part_info, DataPartState::Committed, lock);
 }
-MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const String & part_name)
+MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const String & part_name) const
 {
    auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);
    return getActiveContainingPart(part_info);
--- a/dbms/src/Storages/MergeTree/MergeTreeData.h
+++ b/dbms/src/Storages/MergeTree/MergeTreeData.h
@ -435,9 +435,9 @@ public:
    DataPartsVector getDataPartsVector() const;
    /// Returns a committed part with the given name or a part containing it. If there is no such part, returns nullptr.
-    DataPartPtr getActiveContainingPart(const String & part_name);
+    DataPartPtr getActiveContainingPart(const String & part_name) const;
-    DataPartPtr getActiveContainingPart(const MergeTreePartInfo & part_info);
+    DataPartPtr getActiveContainingPart(const MergeTreePartInfo & part_info) const;
-    DataPartPtr getActiveContainingPart(const MergeTreePartInfo & part_info, DataPartState state, DataPartsLock & lock);
+    DataPartPtr getActiveContainingPart(const MergeTreePartInfo & part_info, DataPartState state, DataPartsLock & lock) const;
    /// Swap part with it's identical copy (possible with another path on another disk).
    /// If original part is not active or doesn't exist exception will be thrown.
@ -453,6 +453,8 @@ public:
    /// Total size of active parts in bytes.
    size_t getTotalActiveSizeInBytes() const;
    size_t getTotalActiveSizeInRows() const;
    size_t getPartsCount() const;
    size_t getMaxPartsCountForPartition() const;
--- a/dbms/src/Storages/StorageMergeTree.cpp
+++ b/dbms/src/Storages/StorageMergeTree.cpp
@ -134,6 +134,11 @@ BlockInputStreams StorageMergeTree::read(
    return reader.read(column_names, query_info, context, max_block_size, num_streams);
 }
 std::optional<UInt64> StorageMergeTree::totalRows() const
 {
    return getTotalActiveSizeInRows();
 }
 BlockOutputStreamPtr StorageMergeTree::write(const ASTPtr & /*query*/, const Context & context)
 {
    return std::make_shared<MergeTreeBlockOutputStream>(*this, context.getSettingsRef().max_partitions_per_insert_block);
--- a/dbms/src/Storages/StorageMergeTree.h
+++ b/dbms/src/Storages/StorageMergeTree.h
@ -45,6 +45,8 @@ public:
        size_t max_block_size,
        unsigned num_streams) override;
    std::optional<UInt64> totalRows() const override;
    BlockOutputStreamPtr write(const ASTPtr & query, const Context & context) override;
    /** Perform the next step in combining the parts.
--- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp
@ -171,13 +171,13 @@ void StorageReplicatedMergeTree::setZooKeeper(zkutil::ZooKeeperPtr zookeeper)
    current_zookeeper = zookeeper;
 }
-zkutil::ZooKeeperPtr StorageReplicatedMergeTree::tryGetZooKeeper()
+zkutil::ZooKeeperPtr StorageReplicatedMergeTree::tryGetZooKeeper() const
 {
    std::lock_guard lock(current_zookeeper_mutex);
    return current_zookeeper;
 }
-zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeper()
+zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeper() const
 {
    auto res = tryGetZooKeeper();
    if (!res)
@ -2920,28 +2920,14 @@ StorageReplicatedMergeTree::~StorageReplicatedMergeTree()
 }
-BlockInputStreams StorageReplicatedMergeTree::read(
+ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock StorageReplicatedMergeTree::getMaxAddedBlocks() const
    const Names & column_names,
    const SelectQueryInfo & query_info,
    const Context & context,
    QueryProcessingStage::Enum /*processed_stage*/,
    const size_t max_block_size,
    const unsigned num_streams)
 {
    const Settings & settings_ = context.getSettingsRef();
    /** The `select_sequential_consistency` setting has two meanings:
    * 1. To throw an exception if on a replica there are not all parts which have been written down on quorum of remaining replicas.
    * 2. Do not read parts that have not yet been written to the quorum of the replicas.
    * For this you have to synchronously go to ZooKeeper.
    */
    if (settings_.select_sequential_consistency)
 {
    ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock max_added_blocks;
    for (const auto & data_part : getDataParts())
    {
-            max_added_blocks[data_part->info.partition_id] = std::max(max_added_blocks[data_part->info.partition_id], data_part->info.max_block);
+        max_added_blocks[data_part->info.partition_id]
            = std::max(max_added_blocks[data_part->info.partition_id], data_part->info.max_block);
    }
    auto zookeeper = getZooKeeper();
@ -2973,14 +2959,37 @@ BlockInputStreams StorageReplicatedMergeTree::read(
            for (const auto & added_part : added_parts)
                if (!getActiveContainingPart(added_part.second))
-                        throw Exception("Replica doesn't have part " + added_part.second + " which was successfully written to quorum of other replicas."
+                    throw Exception(
-                            " Send query to another replica or disable 'select_sequential_consistency' setting.", ErrorCodes::REPLICA_IS_NOT_IN_QUORUM);
+                        "Replica doesn't have part " + added_part.second
                            + " which was successfully written to quorum of other replicas."
                              " Send query to another replica or disable 'select_sequential_consistency' setting.",
                        ErrorCodes::REPLICA_IS_NOT_IN_QUORUM);
            for (const auto & max_block : part_with_quorum.getMaxInsertedBlocks())
                max_added_blocks[max_block.first] = max_block.second;
        }
    }
    return max_added_blocks;
 }
 BlockInputStreams StorageReplicatedMergeTree::read(
    const Names & column_names,
    const SelectQueryInfo & query_info,
    const Context & context,
    QueryProcessingStage::Enum /*processed_stage*/,
    const size_t max_block_size,
    const unsigned num_streams)
 {
    const Settings & settings_ = context.getSettingsRef();
    /** The `select_sequential_consistency` setting has two meanings:
    * 1. To throw an exception if on a replica there are not all parts which have been written down on quorum of remaining replicas.
    * 2. Do not read parts that have not yet been written to the quorum of the replicas.
    * For this you have to synchronously go to ZooKeeper.
    */
    if (settings_.select_sequential_consistency)
    {
        auto max_added_blocks = getMaxAddedBlocks();
        return reader.read(column_names, query_info, context, max_block_size, num_streams, &max_added_blocks);
    }
@ -2988,6 +2997,26 @@ BlockInputStreams StorageReplicatedMergeTree::read(
 }
 std::optional<UInt64> StorageReplicatedMergeTree::totalRows() const
 {
    size_t res = 0;
    auto max_added_blocks = getMaxAddedBlocks();
    auto lock = lockParts();
    for (auto & part : getDataPartsStateRange(DataPartState::Committed))
    {
        if (part->isEmpty())
            continue;
        auto blocks_iterator = max_added_blocks.find(part->info.partition_id);
        if (blocks_iterator == max_added_blocks.end() || part->info.max_block > blocks_iterator->second)
            continue;
        res += part->rows_count;
    }
    return res;
 }
 void StorageReplicatedMergeTree::assertNotReadonly() const
 {
    if (is_readonly)
--- a/dbms/src/Storages/StorageReplicatedMergeTree.h
+++ b/dbms/src/Storages/StorageReplicatedMergeTree.h
@ -96,6 +96,8 @@ public:
        size_t max_block_size,
        unsigned num_streams) override;
    std::optional<UInt64> totalRows() const override;
    BlockOutputStreamPtr write(const ASTPtr & query, const Context & context) override;
    bool optimize(const ASTPtr & query, const ASTPtr & partition, bool final, bool deduplicate, const Context & query_context) override;
@ -174,6 +176,10 @@ public:
    bool canUseAdaptiveGranularity() const override;
 private:
    /// Get a sequential consistent view of current parts.
    ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock getMaxAddedBlocks() const;
    /// Delete old parts from disk and from ZooKeeper.
    void clearOldPartsAndRemoveFromZK();
@ -191,10 +197,10 @@ private:
    using LogEntryPtr = LogEntry::Ptr;
    zkutil::ZooKeeperPtr current_zookeeper;        /// Use only the methods below.
-    std::mutex current_zookeeper_mutex;            /// To recreate the session in the background thread.
+    mutable std::mutex current_zookeeper_mutex;    /// To recreate the session in the background thread.
-    zkutil::ZooKeeperPtr tryGetZooKeeper();
+    zkutil::ZooKeeperPtr tryGetZooKeeper() const;
-    zkutil::ZooKeeperPtr getZooKeeper();
+    zkutil::ZooKeeperPtr getZooKeeper() const;
    void setZooKeeper(zkutil::ZooKeeperPtr zookeeper);
    /// If true, the table is offline and can not be written to it.
--- a/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql
+++ b/dbms/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql
@ -14,9 +14,9 @@ SELECT count() FROM merge_tree;
 SET max_rows_to_read = 900000;
 SET merge_tree_uniform_read_distribution = 1;
-SELECT count() FROM merge_tree; -- { serverError 158 }
+SELECT count() FROM merge_tree WHERE not ignore(); -- { serverError 158 }
 SET merge_tree_uniform_read_distribution = 0;
-SELECT count() FROM merge_tree; -- { serverError 158 }
+SELECT count() FROM merge_tree WHERE not ignore(); -- { serverError 158 }
 DROP TABLE merge_tree;