Delay reading from StorageKafka by creating a query plan step
commit 81b5d8fddd
parent 570d1c013b
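As an illustration only (simplified stand-in types, not the real ClickHouse QueryPlan/ISourceStep/Pipe API), the shape of the change is: StorageKafka::read() no longer builds and returns a Pipe eagerly; it records a ReadFromStorageKafkaStep on the query plan, and the Kafka sources are only created later, when the step's initializePipeline() runs while the plan is turned into a pipeline.

// Toy sketch of the "delay reading by creating a query plan step" pattern.
// All types here are simplified stand-ins, not ClickHouse classes.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Pipe { std::vector<std::string> sources; };       // stand-in for a pipe of sources
struct QueryPipelineBuilder { Pipe pipe; };               // stand-in for the pipeline builder

struct ISourceStep                                        // stand-in for a source plan step
{
    virtual ~ISourceStep() = default;
    virtual void initializePipeline(QueryPipelineBuilder & pipeline) = 0;
};

struct QueryPlan                                          // stand-in for the query plan
{
    std::vector<std::unique_ptr<ISourceStep>> steps;
    void addStep(std::unique_ptr<ISourceStep> step) { steps.push_back(std::move(step)); }
};

struct ReadFromToyKafkaStep : ISourceStep
{
    void initializePipeline(QueryPipelineBuilder & pipeline) override
    {
        // The expensive part (claiming consumers, building sources) happens here,
        // not when read() was called.
        pipeline.pipe.sources.push_back("kafka consumer source");
    }
};

void read(QueryPlan & plan)                               // analogue of the new void read(QueryPlan &, ...)
{
    plan.addStep(std::make_unique<ReadFromToyKafkaStep>());   // only records the step
}

int main()
{
    QueryPlan plan;
    read(plan);                                           // cheap: nothing is read yet
    QueryPipelineBuilder builder;
    for (auto & step : plan.steps)
        step->initializePipeline(builder);                // sources are created here
    std::cout << builder.pipe.sources.size() << " source(s) created\n";
}

A visible consequence in the diff below is that the guards formerly in read() (detached table, stream_like_engine_allow_direct_select, attached materialized views) now live in the step's makePipe() and therefore fire when the pipeline is built.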
@@ -8,8 +8,10 @@
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <Formats/FormatFactory.h>
#include <Interpreters/Context.h>
#include <Interpreters/InterpreterInsertQuery.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Interpreters/evaluateConstantExpression.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTExpressionList.h>
@@ -17,18 +19,19 @@
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ASTLiteral.h>
#include <Processors/Executors/CompletedPipelineExecutor.h>
#include <QueryPipeline/QueryPipeline.h>
#include <Processors/QueryPlan/ISourceStep.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <QueryPipeline/Pipe.h>
#include <Storages/MessageQueueSink.h>
#include <QueryPipeline/QueryPipeline.h>
#include <QueryPipeline/QueryPipelineBuilder.h>
#include <Storages/Kafka/KafkaProducer.h>
#include <Storages/Kafka/KafkaSettings.h>
#include <Storages/Kafka/KafkaSource.h>
#include <Storages/MessageQueueSink.h>
#include <Storages/NamedCollectionsHelpers.h>
#include <Storages/StorageFactory.h>
#include <Storages/StorageMaterializedView.h>
#include <Storages/NamedCollectionsHelpers.h>
#include <base/getFQDNOrHostName.h>
#include <Common/Stopwatch.h>
#include <Common/logger_useful.h>
#include <boost/algorithm/string/replace.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/trim.hpp>
@@ -37,11 +40,12 @@
#include <Poco/Util/AbstractConfiguration.h>
#include <Common/Exception.h>
#include <Common/Macros.h>
#include <Common/Stopwatch.h>
#include <Common/formatReadable.h>
#include <Common/getNumberOfPhysicalCPUCores.h>
#include <Common/logger_useful.h>
#include <Common/quoteString.h>
#include <Common/setThreadName.h>
#include <Formats/FormatFactory.h>

#include <Storages/ColumnDefault.h>
#include <Common/config_version.h>
@@ -174,6 +178,92 @@ struct StorageKafkaInterceptors
    }
};

class ReadFromStorageKafkaStep final : public ISourceStep
{
public:
    ReadFromStorageKafkaStep(
        const Names & column_names_,
        StoragePtr storage_,
        const StorageSnapshotPtr & storage_snapshot_,
        SelectQueryInfo & query_info,
        ContextPtr context_)
        : ISourceStep{DataStream{.header = storage_snapshot_->getSampleBlockForColumns(column_names_)}}
        , column_names{column_names_}
        , storage{storage_}
        , storage_snapshot{storage_snapshot_}
        , storage_limits{query_info.storage_limits}
        , context{context_}
    {
    }

    String getName() const override { return "ReadFromStorageKafka"; }

    void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override
    {
        auto pipe = makePipe();

        /// Add storage limits.
        for (const auto & processor : pipe.getProcessors())
            processor->setStorageLimits(storage_limits);

        /// Add to processors to get processor info through explain pipeline statement.
        for (const auto & processor : pipe.getProcessors())
            processors.emplace_back(processor);

        pipeline.init(std::move(pipe));
    }

private:
    Pipe makePipe()
    {
        auto & kafka_storage = storage->as<StorageKafka &>();
        if (kafka_storage.shutdown_called)
            throw Exception(ErrorCodes::ABORTED, "Table is detached");

        if (!context->getSettingsRef().stream_like_engine_allow_direct_select)
            throw Exception(
                ErrorCodes::QUERY_NOT_ALLOWED,
                "Direct select is not allowed. To enable use setting `stream_like_engine_allow_direct_select`");

        if (kafka_storage.mv_attached)
            throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "Cannot read from StorageKafka with attached materialized views");

        ProfileEvents::increment(ProfileEvents::KafkaDirectReads);

        /// Always use all consumers at once, otherwise SELECT may not read messages from all partitions.
        Pipes pipes;
        pipes.reserve(kafka_storage.num_consumers);
        auto modified_context = Context::createCopy(context);
        modified_context->applySettingsChanges(kafka_storage.settings_adjustments);

        // Claim as many consumers as requested, but don't block
        for (size_t i = 0; i < kafka_storage.num_consumers; ++i)
        {
            /// Use block size of 1, otherwise LIMIT won't work properly as it will buffer excess messages in the last block
            /// TODO: probably that leads to awful performance.
            /// FIXME: seems that doesn't help with extra reading and committing unprocessed messages.
            pipes.emplace_back(std::make_shared<KafkaSource>(
                kafka_storage,
                storage_snapshot,
                modified_context,
                column_names,
                kafka_storage.log,
                1,
                kafka_storage.kafka_settings->kafka_commit_on_select));
        }

        LOG_DEBUG(kafka_storage.log, "Starting reading {} streams", pipes.size());
        return Pipe::unitePipes(std::move(pipes));
    }
    ActionsDAGPtr buildFilterDAG();

    const Names column_names;
    StoragePtr storage;
    StorageSnapshotPtr storage_snapshot;
    std::shared_ptr<const StorageLimitsList> storage_limits;
    ContextPtr context;
};

namespace
{
    const String CONFIG_KAFKA_TAG = "kafka";
@@ -347,45 +437,18 @@ String StorageKafka::getDefaultClientId(const StorageID & table_id_)
    return fmt::format("{}-{}-{}-{}", VERSION_NAME, getFQDNOrHostName(), table_id_.database_name, table_id_.table_name);
}


Pipe StorageKafka::read(
void StorageKafka::read(
    QueryPlan & query_plan,
    const Names & column_names,
    const StorageSnapshotPtr & storage_snapshot,
    SelectQueryInfo & /* query_info */,
    ContextPtr local_context,
    SelectQueryInfo & query_info,
    ContextPtr query_context,
    QueryProcessingStage::Enum /* processed_stage */,
    size_t /* max_block_size */,
    size_t /* num_streams */)
{
    if (shutdown_called)
        throw Exception(ErrorCodes::ABORTED, "Table is detached");

    if (!local_context->getSettingsRef().stream_like_engine_allow_direct_select)
        throw Exception(ErrorCodes::QUERY_NOT_ALLOWED,
            "Direct select is not allowed. To enable use setting `stream_like_engine_allow_direct_select`");

    if (mv_attached)
        throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "Cannot read from StorageKafka with attached materialized views");

    ProfileEvents::increment(ProfileEvents::KafkaDirectReads);

    /// Always use all consumers at once, otherwise SELECT may not read messages from all partitions.
    Pipes pipes;
    pipes.reserve(num_consumers);
    auto modified_context = Context::createCopy(local_context);
    modified_context->applySettingsChanges(settings_adjustments);

    // Claim as many consumers as requested, but don't block
    for (size_t i = 0; i < num_consumers; ++i)
    {
        /// Use block size of 1, otherwise LIMIT won't work properly as it will buffer excess messages in the last block
        /// TODO: probably that leads to awful performance.
        /// FIXME: seems that doesn't help with extra reading and committing unprocessed messages.
        pipes.emplace_back(std::make_shared<KafkaSource>(*this, storage_snapshot, modified_context, column_names, log, 1, kafka_settings->kafka_commit_on_select));
    }

    LOG_DEBUG(log, "Starting reading {} streams", pipes.size());
    return Pipe::unitePipes(std::move(pipes));
    query_plan.addStep(std::make_unique<ReadFromStorageKafkaStep>(
        column_names, shared_from_this(), storage_snapshot, query_info, std::move(query_context)));
}
@@ -20,6 +20,7 @@ namespace DB
{

class StorageSystemKafkaConsumers;
class ReadFromStorageKafkaStep;

struct StorageKafkaInterceptors;

@@ -48,7 +49,8 @@ public:
    void startup() override;
    void shutdown(bool is_drop) override;

    Pipe read(
    void read(
        QueryPlan & query_plan,
        const Names & column_names,
        const StorageSnapshotPtr & storage_snapshot,
        SelectQueryInfo & query_info,
@@ -86,6 +88,8 @@ public:
    SafeConsumers getSafeConsumers() { return {shared_from_this(), std::unique_lock(mutex), consumers}; }

private:
    friend class ReadFromStorageKafkaStep;

    // Configuration and state
    std::unique_ptr<KafkaSettings> kafka_settings;
    Macros::MacroExpansionInfo macros_info;
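The `friend class ReadFromStorageKafkaStep;` added above exists because the new step reaches into the storage's private state (shutdown_called, mv_attached, num_consumers, settings_adjustments, kafka_settings) through `kafka_storage.`. A minimal, self-contained sketch of that C++ friendship pattern, using hypothetical toy names rather than the real classes:

// Illustrative only: why the header adds a forward declaration and a friend declaration.
#include <cstddef>
#include <iostream>

class Step;                 // forward declaration, like `class ReadFromStorageKafkaStep;` above

class Storage
{
public:
    explicit Storage(size_t consumers) : num_consumers(consumers) {}

private:
    friend class Step;      // grants the plan step access to the private state below
    size_t num_consumers;   // analogous to StorageKafka::num_consumers
};

class Step
{
public:
    static void plan(const Storage & storage)
    {
        // Allowed only because Step is declared a friend of Storage.
        std::cout << "would create " << storage.num_consumers << " consumer source(s)\n";
    }
};

int main()
{
    Storage storage{3};
    Step::plan(storage);
}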