Don't use a lot of stack for pipeline traversal.

2024-11-17 13:13:36 +00:00 · 2021-10-26 21:50:13 +03:00 · 2021-10-26 21:50:13 +03:00 · c7a07bafe0
commit c7a07bafe0
parent 05f42e2d07
5 changed files with 129 additions and 137 deletions
--- a/src/Processors/Executors/ExecutionThreadContext.h
+++ b/src/Processors/Executors/ExecutionThreadContext.h
@@ -45,7 +45,7 @@ class ExecutionThreadContext
 {
 private:
    /// Will store context for all expand pipeline tasks (it's easy and we don't expect many).
-    /// This can be solved by using atomic shard ptr.
+    /// This can be solved by using atomic shared ptr.
    std::list<StoppingPipelineTask> task_list;

    /// A queue of async tasks. Task is added to queue when waited.
--- a/src/Processors/Executors/PipelineExecutor.cpp
+++ b/src/Processors/Executors/PipelineExecutor.cpp
@@ -125,51 +125,65 @@ bool PipelineExecutor::expandPipeline(Stack & stack, UInt64 pid)
    return true;
 }

-bool PipelineExecutor::tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, Queue & async_queue, ExecutionThreadContext & thread_context)
+bool PipelineExecutor::prepareProcessor(UInt64 pid, ExecutionThreadContext & thread_context, Queue & queue, Queue & async_queue)
 {
-    /// In this method we have ownership on edge, but node can be concurrently accessed.
+    std::stack<ExecutingGraph::Edge *> updated_edges;
+    Stack updated_processors;
+    updated_processors.push(pid);

-    auto & node = *graph->nodes[edge.to];
+    while (!updated_processors.empty() || !updated_edges.empty())
+    {
+        std::optional<std::unique_lock<std::mutex>> stack_top_lock;
+
+        if (updated_processors.empty())
+        {
+            auto * edge = updated_edges.top();
+            updated_edges.pop();
+
+            /// Here we have ownership on edge, but node can be concurrently accessed.
+
+            auto & node = *graph->nodes[edge->to];

            std::unique_lock lock(node.status_mutex);

            ExecutingGraph::ExecStatus status = node.status;

-    if (status == ExecutingGraph::ExecStatus::Finished)
-        return true;
-
-    if (edge.backward)
-        node.updated_output_ports.push_back(edge.output_port_number);
+            if (status != ExecutingGraph::ExecStatus::Finished)
+            {
+                if (edge->backward)
+                    node.updated_output_ports.push_back(edge->output_port_number);
                else
-        node.updated_input_ports.push_back(edge.input_port_number);
+                    node.updated_input_ports.push_back(edge->input_port_number);

                if (status == ExecutingGraph::ExecStatus::Idle)
                {
                    node.status = ExecutingGraph::ExecStatus::Preparing;
-        return prepareProcessor(edge.to, thread_context, queue, async_queue, std::move(lock));
+                    updated_processors.push(edge->to);
+                    stack_top_lock = std::move(lock);
                }
                else
-        graph->nodes[edge.to]->processor->onUpdatePorts();
+                    graph->nodes[edge->to]->processor->onUpdatePorts();
+            }
+        }
+        else
+        {
+            pid = updated_processors.top();
+            updated_processors.pop();

-    return true;
-}
-
-bool PipelineExecutor::prepareProcessor(UInt64 pid, ExecutionThreadContext & thread_context, Queue & queue, Queue & async_queue, std::unique_lock<std::mutex> node_lock)
-{
            /// In this method we have ownership on node.
            auto & node = *graph->nodes[pid];

            bool need_expand_pipeline = false;

-    std::vector<ExecutingGraph::Edge *> updated_back_edges;
-    std::vector<ExecutingGraph::Edge *> updated_direct_edges;
+            if (!stack_top_lock)
+                stack_top_lock.emplace(node.status_mutex);

            {
 #ifndef NDEBUG
                Stopwatch watch;
 #endif

-        std::unique_lock<std::mutex> lock(std::move(node_lock));
+                std::unique_lock<std::mutex> lock(std::move(*stack_top_lock));

                try
                {
@@ -220,18 +234,19 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, ExecutionThreadContext & thr
                    }
                }

+                if (!need_expand_pipeline)
                {
                    for (auto & edge_id : node.post_updated_input_ports)
                    {
                        auto * edge = static_cast<ExecutingGraph::Edge *>(edge_id);
-                updated_back_edges.emplace_back(edge);
+                        updated_edges.push(edge);
                        edge->update_info.trigger();
                    }

                    for (auto & edge_id : node.post_updated_output_ports)
                    {
                        auto * edge = static_cast<ExecutingGraph::Edge *>(edge_id);
-                updated_direct_edges.emplace_back(edge);
+                        updated_edges.push(edge);
                        edge->update_info.trigger();
                    }

@@ -240,40 +255,19 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, ExecutionThreadContext & thr
                }
            }

-    {
-        for (auto & edge : updated_direct_edges)
-        {
-            if (!tryAddProcessorToStackIfUpdated(*edge, queue, async_queue, thread_context))
-                return false;
-        }
-
-        for (auto & edge : updated_back_edges)
-        {
-            if (!tryAddProcessorToStackIfUpdated(*edge, queue, async_queue, thread_context))
-                return false;
-        }
-    }
-
            if (need_expand_pipeline)
            {
-        Stack stack;
-
-        auto callback = [this, &stack, pid = node.processors_id]() { return expandPipeline(stack, pid); };
+                auto callback = [this, &updated_processors, pid = node.processors_id]()
+                {
+                    return expandPipeline(updated_processors, pid);
+                };

                if (!tasks.executeStoppingTask(thread_context, std::move(callback)))
                    return false;

                /// Add itself back to be prepared again.
-        stack.push(pid);
-
-        while (!stack.empty())
-        {
-            auto item = stack.top();
-            auto lock = std::unique_lock<std::mutex>(graph->nodes[item]->status_mutex);
-            if (!prepareProcessor(item, thread_context, queue, async_queue, std::move(lock)))
-                return false;
-
-            stack.pop();
+                updated_processors.push(pid);
+            }
        }
    }

@@ -426,11 +420,8 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie
                tasks.enterConcurrentReadSection();

                /// Prepare processor after execution.
-                {
-                    auto lock = context.lockStatus();
-                    if (!prepareProcessor(context.getProcessorID(), context, queue, async_queue, std::move(lock)))
+                if (!prepareProcessor(context.getProcessorID(), context, queue, async_queue))
                    finish();
-                }

                tasks.exitConcurrentReadSection();

@@ -473,7 +464,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads)
        UInt64 proc = stack.top();
        stack.pop();

-        prepareProcessor(proc, context, queue, async_queue, std::unique_lock<std::mutex>(graph->nodes[proc]->status_mutex));
+        prepareProcessor(proc, context, queue, async_queue);

        if (!async_queue.empty())
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Async is only possible after work() call. Processor {}",
--- a/src/Processors/Executors/PipelineExecutor.h
+++ b/src/Processors/Executors/PipelineExecutor.h
@@ -70,12 +70,11 @@ private:

    /// Pipeline execution related methods.
    void addChildlessProcessorsToStack(Stack & stack);
-    bool tryAddProcessorToStackIfUpdated(ExecutingGraph::Edge & edge, Queue & queue, Queue & async_queue, ExecutionThreadContext & thread_context);

    /// Prepare processor with pid number.
    /// Check parents and children of current processor and push them to stacks if they also need to be prepared.
    /// If processor wants to be expanded, ExpandPipelineTask from thread_number's execution context will be used.
-    bool prepareProcessor(UInt64 pid, ExecutionThreadContext & thread_context, Queue & queue, Queue & async_queue, std::unique_lock<std::mutex> node_lock);
+    bool prepareProcessor(UInt64 pid, ExecutionThreadContext & thread_context, Queue & queue, Queue & async_queue);

    void initializeExecution(size_t num_threads); /// Initialize executor contexts and task_queue.
    void finalizeExecution(); /// Check all processors are finished.
--- a/src/Processors/Executors/TasksQueue.h
+++ b/src/Processors/Executors/TasksQueue.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <vector>
 #include <queue>
+#include <Common/Exception.h>

 namespace DB
 {
--- a/src/Processors/Executors/ThreadsQueue.h
+++ b/src/Processors/Executors/ThreadsQueue.h
@@ -1,5 +1,6 @@
 #pragma once
-
+#include <Common/Exception.h>
+#include <base/defines.h>
 namespace DB
 {
 namespace ErrorCodes