#include #include #include #include #include #include #include #include #include #include #include #if !defined(__APPLE__) && !defined(__FreeBSD__) #include #endif namespace DB { namespace ErrorCodes { extern const int TOO_MANY_ROWS_OR_BYTES; extern const int QUOTA_EXPIRED; extern const int QUERY_WAS_CANCELLED; } static bool checkCanAddAdditionalInfoToException(const DB::Exception & exception) { /// Don't add additional info to limits and quota exceptions, and in case of kill query (to pass tests). return exception.code() != ErrorCodes::TOO_MANY_ROWS_OR_BYTES && exception.code() != ErrorCodes::QUOTA_EXPIRED && exception.code() != ErrorCodes::QUERY_WAS_CANCELLED; } PipelineExecutor::PipelineExecutor(Processors & processors_) : processors(processors_) , cancelled(false) , finished(false) , num_processing_executors(0) , expand_pipeline_task(nullptr) { buildGraph(); } bool PipelineExecutor::addEdges(UInt64 node) { auto throwUnknownProcessor = [](const IProcessor * proc, const IProcessor * parent, bool from_input_port) { String msg = "Processor " + proc->getName() + " was found as " + (from_input_port ? "input" : "output") + " for processor " + parent->getName() + ", but not found in list of processors."; throw Exception(msg, ErrorCodes::LOGICAL_ERROR); }; const IProcessor * cur = graph[node].processor; auto add_edge = [&](auto & from_port, const IProcessor * to_proc, Edges & edges, bool is_backward, UInt64 input_port_number, UInt64 output_port_number, std::vector * update_list) { auto it = processors_map.find(to_proc); if (it == processors_map.end()) throwUnknownProcessor(to_proc, cur, true); UInt64 proc_num = it->second; auto & edge = edges.emplace_back(proc_num, is_backward, input_port_number, output_port_number, update_list); from_port.setUpdateInfo(&edge.update_info); }; bool was_edge_added = false; auto & inputs = processors[node]->getInputs(); auto from_input = graph[node].backEdges.size(); if (from_input < inputs.size()) { was_edge_added = true; for (auto it = std::next(inputs.begin(), from_input); it != inputs.end(); ++it, ++from_input) { const IProcessor * proc = &it->getOutputPort().getProcessor(); auto output_port_number = proc->getOutputPortNumber(&it->getOutputPort()); add_edge(*it, proc, graph[node].backEdges, true, from_input, output_port_number, &graph[node].post_updated_input_ports); } } auto & outputs = processors[node]->getOutputs(); auto from_output = graph[node].directEdges.size(); if (from_output < outputs.size()) { was_edge_added = true; for (auto it = std::next(outputs.begin(), from_output); it != outputs.end(); ++it, ++from_output) { const IProcessor * proc = &it->getInputPort().getProcessor(); auto input_port_number = proc->getInputPortNumber(&it->getInputPort()); add_edge(*it, proc, graph[node].directEdges, false, input_port_number, from_output, &graph[node].post_updated_output_ports); } } return was_edge_added; } void PipelineExecutor::buildGraph() { UInt64 num_processors = processors.size(); graph.reserve(num_processors); for (UInt64 node = 0; node < num_processors; ++node) { IProcessor * proc = processors[node].get(); processors_map[proc] = node; graph.emplace_back(proc, node); } for (UInt64 node = 0; node < num_processors; ++node) addEdges(node); } void PipelineExecutor::addChildlessProcessorsToStack(Stack & stack) { UInt64 num_processors = processors.size(); for (UInt64 proc = 0; proc < num_processors; ++proc) { if (graph[proc].directEdges.empty()) { stack.push(proc); /// do not lock mutex, as this function is executedin single thread graph[proc].status = ExecStatus::Preparing; } } } static void executeJob(IProcessor * processor) { try { processor->work(); } catch (Exception & exception) { if (checkCanAddAdditionalInfoToException(exception)) exception.addMessage("While executing " + processor->getName() + " (" + toString(reinterpret_cast(processor)) + ") "); throw; } } void PipelineExecutor::addJob(ExecutionState * execution_state) { auto job = [execution_state]() { try { // Stopwatch watch; executeJob(execution_state->processor); // execution_state->execution_time_ns += watch.elapsed(); ++execution_state->num_executed_jobs; } catch (...) { execution_state->exception = std::current_exception(); } }; execution_state->job = std::move(job); } bool PipelineExecutor::expandPipeline(Stack & stack, UInt64 pid) { auto & cur_node = graph[pid]; Processors new_processors; try { new_processors = cur_node.processor->expandPipeline(); } catch (...) { cur_node.execution_state->exception = std::current_exception(); return false; } for (const auto & processor : new_processors) { if (processors_map.count(processor.get())) throw Exception("Processor " + processor->getName() + " was already added to pipeline.", ErrorCodes::LOGICAL_ERROR); processors_map[processor.get()] = graph.size(); graph.emplace_back(processor.get(), graph.size()); } { std::lock_guard guard(processors_mutex); processors.insert(processors.end(), new_processors.begin(), new_processors.end()); } UInt64 num_processors = processors.size(); for (UInt64 node = 0; node < num_processors; ++node) { size_t num_direct_edges = graph[node].directEdges.size(); size_t num_back_edges = graph[node].backEdges.size(); if (addEdges(node)) { std::lock_guard guard(graph[node].status_mutex); for (; num_back_edges < graph[node].backEdges.size(); ++num_back_edges) graph[node].updated_input_ports.emplace_back(num_back_edges); for (; num_direct_edges < graph[node].directEdges.size(); ++num_direct_edges) graph[node].updated_output_ports.emplace_back(num_direct_edges); if (graph[node].status == ExecStatus::Idle) { graph[node].status = ExecStatus::Preparing; stack.push(node); } } } return true; } bool PipelineExecutor::tryAddProcessorToStackIfUpdated(Edge & edge, Queue & queue, size_t thread_number) { /// In this method we have ownership on edge, but node can be concurrently accessed. auto & node = graph[edge.to]; std::unique_lock lock(node.status_mutex); ExecStatus status = node.status; if (status == ExecStatus::Finished) return true; if (edge.backward) node.updated_output_ports.push_back(edge.output_port_number); else node.updated_input_ports.push_back(edge.input_port_number); if (status == ExecStatus::Idle) { node.status = ExecStatus::Preparing; return prepareProcessor(edge.to, thread_number, queue, std::move(lock)); } return true; } bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue & queue, std::unique_lock node_lock) { /// In this method we have ownership on node. auto & node = graph[pid]; bool need_traverse = false; bool need_expand_pipeline = false; std::vector updated_back_edges; std::vector updated_direct_edges; { /// Stopwatch watch; std::unique_lock lock(std::move(node_lock)); try { node.last_processor_status = node.processor->prepare(node.updated_input_ports, node.updated_output_ports); } catch (...) { node.execution_state->exception = std::current_exception(); return false; } /// node.execution_state->preparation_time_ns += watch.elapsed(); node.updated_input_ports.clear(); node.updated_output_ports.clear(); switch (node.last_processor_status) { case IProcessor::Status::NeedData: case IProcessor::Status::PortFull: { need_traverse = true; node.status = ExecStatus::Idle; break; } case IProcessor::Status::Finished: { need_traverse = true; node.status = ExecStatus::Finished; break; } case IProcessor::Status::Ready: { node.status = ExecStatus::Executing; queue.push(node.execution_state.get()); break; } case IProcessor::Status::Async: { throw Exception("Async is temporary not supported.", ErrorCodes::LOGICAL_ERROR); // node.status = ExecStatus::Executing; // addAsyncJob(pid); // break; } case IProcessor::Status::Wait: { throw Exception("Wait is temporary not supported.", ErrorCodes::LOGICAL_ERROR); } case IProcessor::Status::ExpandPipeline: { need_expand_pipeline = true; break; } } if (need_traverse) { for (auto & edge_id : node.post_updated_input_ports) { auto edge = static_cast(edge_id); updated_back_edges.emplace_back(edge); edge->update_info.trigger(); } for (auto & edge_id : node.post_updated_output_ports) { auto edge = static_cast(edge_id); updated_direct_edges.emplace_back(edge); edge->update_info.trigger(); } node.post_updated_input_ports.clear(); node.post_updated_output_ports.clear(); } } if (need_traverse) { for (auto & edge : updated_direct_edges) { if (!tryAddProcessorToStackIfUpdated(*edge, queue, thread_number)) return false; } for (auto & edge : updated_back_edges) { if (!tryAddProcessorToStackIfUpdated(*edge, queue, thread_number)) return false; } } if (need_expand_pipeline) { Stack stack; executor_contexts[thread_number]->task_list.emplace_back( node.execution_state.get(), &stack ); ExpandPipelineTask * desired = &executor_contexts[thread_number]->task_list.back(); ExpandPipelineTask * expected = nullptr; while (!expand_pipeline_task.compare_exchange_strong(expected, desired)) { if (!doExpandPipeline(expected, true)) return false; expected = nullptr; } if (!doExpandPipeline(desired, true)) return false; /// Add itself back to be prepared again. stack.push(pid); while (!stack.empty()) { auto item = stack.top(); if (!prepareProcessor(item, thread_number, queue, std::unique_lock(graph[item].status_mutex))) return false; stack.pop(); } } return true; } bool PipelineExecutor::doExpandPipeline(ExpandPipelineTask * task, bool processing) { std::unique_lock lock(task->mutex); if (processing) ++task->num_waiting_processing_threads; task->condvar.wait(lock, [&]() { return task->num_waiting_processing_threads >= num_processing_executors || expand_pipeline_task != task; }); bool result = true; /// After condvar.wait() task may point to trash. Can change it only if it is still in expand_pipeline_task. if (expand_pipeline_task == task) { result = expandPipeline(*task->stack, task->node_to_expand->processors_id); expand_pipeline_task = nullptr; lock.unlock(); task->condvar.notify_all(); } return result; } void PipelineExecutor::cancel() { cancelled = true; finish(); std::lock_guard guard(processors_mutex); for (auto & processor : processors) processor->cancel(); } void PipelineExecutor::finish() { { std::lock_guard lock(task_queue_mutex); finished = true; } std::lock_guard guard(executor_contexts_mutex); for (auto & context : executor_contexts) { { std::lock_guard lock(context->mutex); context->wake_flag = true; } context->condvar.notify_one(); } } void PipelineExecutor::execute(size_t num_threads) { try { executeImpl(num_threads); /// Execution can be stopped because of exception. Check and rethrow if any. for (auto & node : graph) if (node.execution_state->exception) std::rethrow_exception(node.execution_state->exception); } catch (Exception & exception) { if (checkCanAddAdditionalInfoToException(exception)) exception.addMessage("\nCurrent state:\n" + dumpPipeline()); throw; } if (cancelled) return; bool all_processors_finished = true; for (auto & node : graph) if (node.status != ExecStatus::Finished) /// Single thread, do not hold mutex all_processors_finished = false; if (!all_processors_finished) throw Exception("Pipeline stuck. Current state:\n" + dumpPipeline(), ErrorCodes::LOGICAL_ERROR); } void PipelineExecutor::executeSingleThread(size_t thread_num, size_t num_threads) { #if !defined(__APPLE__) && !defined(__FreeBSD__) /// Specify CPU core for thread if can. /// It may reduce the number of context swithches. /* if (num_threads > 1) { cpu_set_t cpu_set; CPU_ZERO(&cpu_set); CPU_SET(thread_num, &cpu_set); if (sched_setaffinity(0, sizeof(cpu_set_t), &cpu_set) == -1) LOG_TRACE(log, "Cannot set affinity for thread " << num_threads); } */ #endif // UInt64 total_time_ns = 0; // UInt64 execution_time_ns = 0; // UInt64 processing_time_ns = 0; // UInt64 wait_time_ns = 0; // Stopwatch total_time_watch; ExecutionState * state = nullptr; auto prepare_processor = [&](UInt64 pid, Queue & queue) { if (!prepareProcessor(pid, thread_num, queue, std::unique_lock(graph[pid].status_mutex))) finish(); }; auto wake_up_executor = [&](size_t executor) { std::lock_guard guard(executor_contexts[executor]->mutex); executor_contexts[executor]->wake_flag = true; executor_contexts[executor]->condvar.notify_one(); }; while (!finished) { /// First, find any processor to execute. /// Just travers graph and prepare any processor. while (!finished) { { std::unique_lock lock(task_queue_mutex); if (!task_queue.empty()) { state = task_queue.pop(thread_num); if (!task_queue.empty() && !threads_queue.empty() /*&& task_queue.quota() > threads_queue.size()*/) { auto thread_to_wake = threads_queue.pop_any(); lock.unlock(); wake_up_executor(thread_to_wake); } break; } if (threads_queue.size() + 1 == num_threads) { lock.unlock(); finish(); break; } threads_queue.push(thread_num); } { std::unique_lock lock(executor_contexts[thread_num]->mutex); executor_contexts[thread_num]->condvar.wait(lock, [&] { return finished || executor_contexts[thread_num]->wake_flag; }); executor_contexts[thread_num]->wake_flag = false; } } if (finished) break; while (state) { if (finished) break; addJob(state); { // Stopwatch execution_time_watch; state->job(); // execution_time_ns += execution_time_watch.elapsed(); } if (state->exception) finish(); if (finished) break; // Stopwatch processing_time_watch; /// Try to execute neighbour processor. { Queue queue; ++num_processing_executors; while (auto task = expand_pipeline_task.load()) doExpandPipeline(task, true); /// Execute again if can. prepare_processor(state->processors_id, queue); state = nullptr; /// Take local task from queue if has one. if (!queue.empty()) { state = queue.front(); queue.pop(); } /// Push other tasks to global queue. if (!queue.empty()) { std::unique_lock lock(task_queue_mutex); while (!queue.empty() && !finished) { task_queue.push(queue.front(), thread_num); queue.pop(); } if (!threads_queue.empty() /* && task_queue.quota() > threads_queue.size()*/) { auto thread_to_wake = threads_queue.pop_any(); lock.unlock(); wake_up_executor(thread_to_wake); } } --num_processing_executors; while (auto task = expand_pipeline_task.load()) doExpandPipeline(task, false); } // processing_time_ns += processing_time_watch.elapsed(); } } // total_time_ns = total_time_watch.elapsed(); // wait_time_ns = total_time_ns - execution_time_ns - processing_time_ns; /* LOG_TRACE(log, "Thread finished." << " Total time: " << (total_time_ns / 1e9) << " sec." << " Execution time: " << (execution_time_ns / 1e9) << " sec." << " Processing time: " << (processing_time_ns / 1e9) << " sec." << " Wait time: " << (wait_time_ns / 1e9) << "sec."); */ } void PipelineExecutor::executeImpl(size_t num_threads) { Stack stack; threads_queue.init(num_threads); task_queue.init(num_threads); { std::lock_guard guard(executor_contexts_mutex); executor_contexts.reserve(num_threads); for (size_t i = 0; i < num_threads; ++i) executor_contexts.emplace_back(std::make_unique()); } auto thread_group = CurrentThread::getGroup(); using ThreadsData = std::vector; ThreadsData threads; threads.reserve(num_threads); bool finished_flag = false; SCOPE_EXIT( if (!finished_flag) { finish(); for (auto & thread : threads) if (thread.joinable()) thread.join(); } ); addChildlessProcessorsToStack(stack); { std::lock_guard lock(task_queue_mutex); Queue queue; size_t next_thread = 0; while (!stack.empty()) { UInt64 proc = stack.top(); stack.pop(); prepareProcessor(proc, 0, queue, std::unique_lock(graph[proc].status_mutex)); while (!queue.empty()) { task_queue.push(queue.front(), next_thread); queue.pop(); ++next_thread; if (next_thread >= num_threads) next_thread = 0; } } } if (num_threads > 1) { for (size_t i = 0; i < num_threads; ++i) { threads.emplace_back([this, thread_group, thread_num = i, num_threads] { /// ThreadStatus thread_status; setThreadName("QueryPipelineEx"); if (thread_group) CurrentThread::attachTo(thread_group); SCOPE_EXIT( if (thread_group) CurrentThread::detachQueryIfNotDetached(); ); executeSingleThread(thread_num, num_threads); }); } for (auto & thread : threads) if (thread.joinable()) thread.join(); } else executeSingleThread(0, num_threads); finished_flag = true; } String PipelineExecutor::dumpPipeline() const { for (auto & node : graph) { if (node.execution_state) node.processor->setDescription( "(" + std::to_string(node.execution_state->num_executed_jobs) + " jobs, execution time: " + std::to_string(node.execution_state->execution_time_ns / 1e9) + " sec., preparation time: " + std::to_string(node.execution_state->preparation_time_ns / 1e9) + " sec.)"); } std::vector statuses; std::vector proc_list; statuses.reserve(graph.size()); proc_list.reserve(graph.size()); for (auto & proc : graph) { proc_list.emplace_back(proc.processor); statuses.emplace_back(proc.last_processor_status); } WriteBufferFromOwnString out; printPipeline(processors, statuses, out); out.finish(); return out.str(); } }