ClickHouse/src/Interpreters/AsynchronousInsertQueue.cpp

476 lines
15 KiB
C++
Raw Normal View History

2021-09-01 23:18:09 +00:00
#include <Interpreters/AsynchronousInsertQueue.h>
2021-03-04 11:10:21 +00:00
#include <Core/Settings.h>
2021-10-15 20:18:20 +00:00
#include <QueryPipeline/BlockIO.h>
#include <Interpreters/InterpreterInsertQuery.h>
#include <Interpreters/Context.h>
#include <Processors/Transforms/getSourceFromASTInsertQuery.h>
#include <Processors/Sources/SourceFromSingleChunk.h>
#include <Processors/Executors/StreamingFormatExecutor.h>
2021-09-17 17:52:26 +00:00
#include <Processors/Executors/CompletedPipelineExecutor.h>
#include <Processors/Transforms/AddingDefaultsTransform.h>
2021-10-16 14:03:50 +00:00
#include <QueryPipeline/QueryPipeline.h>
2021-03-04 11:10:21 +00:00
#include <IO/ConcatReadBuffer.h>
#include <IO/ReadBufferFromMemory.h>
2021-03-17 14:11:47 +00:00
#include <IO/ReadBufferFromString.h>
2021-03-04 11:10:21 +00:00
#include <IO/copyData.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/queryToString.h>
#include <Storages/IStorage.h>
2021-08-31 02:16:02 +00:00
#include <Common/SipHash.h>
#include <Common/FieldVisitorHash.h>
#include <Access/Common/AccessFlags.h>
2021-09-16 17:18:34 +00:00
#include <Formats/FormatFactory.h>
2021-11-11 11:41:15 +00:00
#include <base/logger_useful.h>
2021-03-04 11:10:21 +00:00
namespace DB
{
2021-08-31 02:16:02 +00:00
/// Error codes referenced in this file. They are only declared here ('extern'),
/// so their definitions live in another translation unit.
namespace ErrorCodes
2021-03-04 11:10:21 +00:00
{
2021-08-31 02:16:02 +00:00
/// Thrown when waiting for an async insert times out, and used to fail pending entries in the destructor.
extern const int TIMEOUT_EXCEEDED;
/// Used by processData() when something that is not derived from std::exception is caught.
extern const int UNKNOWN_EXCEPTION;
2021-09-16 17:18:34 +00:00
/// Thrown by push() when the format of the INSERT query is not an input format.
extern const int UNKNOWN_FORMAT;
2021-08-31 02:16:02 +00:00
}
2021-09-04 00:57:05 +00:00
/// Builds a queue key from an INSERT query AST and the settings it is executed with.
/// The AST is cloned, so the key owns its own copy of the tree.
AsynchronousInsertQueue::InsertQuery::InsertQuery(const ASTPtr & query_, const Settings & settings_)
    : query(query_->clone())
    , settings(settings_)
{
}
/// Copy constructor: clones the AST of 'other' instead of sharing the pointer,
/// and copies its settings.
AsynchronousInsertQueue::InsertQuery::InsertQuery(const InsertQuery & other)
    : query(other.query->clone())
    , settings(other.settings)
{
}
/// Copy assignment: replaces the stored AST with a clone of the AST of 'other'
/// and copies its settings. Self-assignment leaves the object untouched.
AsynchronousInsertQueue::InsertQuery &
AsynchronousInsertQueue::InsertQuery::operator=(const InsertQuery & other)
{
    if (this == &other)
        return *this;

    query = other.query->clone();
    settings = other.settings;
    return *this;
}
2021-08-31 02:16:02 +00:00
/// Hash of a queue key: combines the tree hash of the query AST with
/// the name and value of every changed setting.
UInt64 AsynchronousInsertQueue::InsertQuery::Hash::operator()(const InsertQuery & insert_query) const
{
    SipHash state;

    /// Query part of the key.
    insert_query.query->updateTreeHash(state);

    /// Settings part of the key: only settings reported by allChanged() take part.
    for (const auto & changed : insert_query.settings.allChanged())
    {
        state.update(changed.getName());
        applyVisitor(FieldVisitorHash(state), changed.getValue());
    }

    return state.get64();
}
2021-03-04 11:10:21 +00:00
2021-08-31 02:16:02 +00:00
/// Two keys are equal when their queries serialize to the same text
/// and their settings compare equal.
bool AsynchronousInsertQueue::InsertQuery::operator==(const InsertQuery & other) const
{
    const bool same_query_text = queryToString(query) == queryToString(other.query);
    /// Settings are compared only if the query texts match (same short-circuit as '&&' on one line).
    return same_query_text && settings == other.settings;
}
2021-03-04 11:10:21 +00:00
2021-09-10 10:24:09 +00:00
/// One pending insert: the raw data bytes of the query and the id of the query that sent them.
/// Both strings are moved into the entry.
AsynchronousInsertQueue::InsertData::Entry::Entry(String && bytes_, String && query_id_)
    : bytes(std::move(bytes_))
    , query_id(std::move(query_id_))
{
}
2021-03-04 11:10:21 +00:00
2021-08-31 02:16:02 +00:00
/// Marks the entry as finished, stores the exception it finished with,
/// and wakes up every thread blocked in wait().
void AsynchronousInsertQueue::InsertData::Entry::finish(std::exception_ptr exception_)
{
    std::lock_guard guard(mutex);

    /// Both fields are written under the same lock, so readers see them together.
    exception = exception_;
    finished = true;

    cv.notify_all();
}
2021-09-01 23:18:09 +00:00
bool AsynchronousInsertQueue::InsertData::Entry::wait(const Milliseconds & timeout) const
2021-03-04 11:10:21 +00:00
{
2021-08-31 02:16:02 +00:00
std::unique_lock lock(mutex);
return cv.wait_for(lock, timeout, [&] { return finished; });
2021-03-04 11:10:21 +00:00
}
2021-09-01 23:18:09 +00:00
/// Returns whether finish() has been called; the flag is read under the entry mutex.
bool AsynchronousInsertQueue::InsertData::Entry::isFinished() const
{
    std::lock_guard guard(mutex);
    const bool done = finished;
    return done;
}
/// Returns the exception stored by finish(); the pointer is read under the entry mutex.
std::exception_ptr AsynchronousInsertQueue::InsertData::Entry::getException() const
{
    std::lock_guard guard(mutex);
    std::exception_ptr stored = exception;
    return stored;
}
2021-08-31 02:16:02 +00:00
/// Creates the queue and starts its background threads.
///  - context_: context stored via WithContext and later passed to the processing jobs.
///  - pool_size: number of threads in the processing pool, must be non-zero.
///  - max_data_size_: size threshold above which pushImpl() schedules a batch immediately.
///  - timeouts: 'busy' and 'stale' timeouts used by busyCheck() and staleCheck().
/// The busy-check and cleanup threads are started from the initializer list;
/// the stale-check thread is started only when the stale timeout is positive.
AsynchronousInsertQueue::AsynchronousInsertQueue(ContextPtr context_, size_t pool_size, size_t max_data_size_, const Timeout & timeouts)
    : WithContext(context_)
    , max_data_size(max_data_size_)
    , busy_timeout(timeouts.busy)
    , stale_timeout(timeouts.stale)
    , pool(pool_size)
    , dump_by_first_update_thread(&AsynchronousInsertQueue::busyCheck, this)
    , cleanup_thread(&AsynchronousInsertQueue::cleanup, this)
{
    assert(pool_size);

    const bool stale_check_enabled = stale_timeout.count() > 0;
    if (stale_check_enabled)
        dump_by_last_update_thread = ThreadFromGlobalPool(&AsynchronousInsertQueue::staleCheck, this);
}
2021-04-19 14:51:26 +00:00
/// Stops the background threads, waits for already scheduled processing jobs,
/// and then fails every entry that is still not finished, so that threads
/// blocked in waitForProcessingQuery() are woken up with an exception.
AsynchronousInsertQueue::~AsynchronousInsertQueue()
{
    /// TODO: add a setting for graceful shutdown.

    /// The background loops check this flag at the start of each iteration.
    shutdown = true;

    assert(dump_by_first_update_thread.joinable());
    dump_by_first_update_thread.join();

    assert(cleanup_thread.joinable());
    cleanup_thread.join();

    /// This thread exists only when the stale timeout is positive.
    if (dump_by_last_update_thread.joinable())
        dump_by_last_update_thread.join();

    /// Let the jobs that were already scheduled run to completion.
    pool.wait();

    std::lock_guard lock(currently_processing_mutex);
    for (const auto & [_, entry] : currently_processing_queries)
    {
        if (entry->isFinished())
            continue;

        /// The message previously ended with a stray ')'.
        entry->finish(std::make_exception_ptr(Exception(
            ErrorCodes::TIMEOUT_EXCEEDED,
            "Wait for async insert timeout exceeded")));
    }
}
void AsynchronousInsertQueue::scheduleDataProcessingJob(const InsertQuery & key, InsertDataPtr data, ContextPtr global_context)
2021-08-31 02:35:36 +00:00
{
/// Wrap 'unique_ptr' with 'shared_ptr' to make this
/// lambda copyable and allow to save it to the thread pool.
pool.scheduleOrThrowOnError([=, data = std::make_shared<InsertDataPtr>(std::move(data))]
{
processData(std::move(key), std::move(*data), std::move(global_context));
});
}
2021-09-01 23:18:09 +00:00
/// Adds the data of one INSERT query to the queue.
/// Validates the query first (table and sample block resolution, input format,
/// INSERT access), then copies the query data into memory and appends it to the batch
/// that has the same query and settings, creating that batch slot if needed.
/// Throws UNKNOWN_FORMAT if the format of the query is not an input format.
void AsynchronousInsertQueue::push(ASTPtr query, ContextPtr query_context)
{
    /// Work on a private copy of the AST.
    query = query->clone();
    const auto & settings = query_context->getSettingsRef();
    auto & insert_query = query->as<ASTInsertQuery &>();

    InterpreterInsertQuery interpreter(query, query_context, settings.insert_allow_materialized_columns);
    auto table = interpreter.getTable(insert_query);
    auto sample_block = interpreter.getSampleBlock(insert_query, table, table->getInMemoryMetadataPtr());

    const bool known_input_format = FormatFactory::instance().isInputFormat(insert_query.format);
    if (!known_input_format)
        throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown input format {}", insert_query.format);

    query_context->checkAccess(AccessType::INSERT, insert_query.table_id, sample_block.getNames());

    /// Read the whole data of the query into a string.
    /// The write buffer is scoped so that it is destroyed before 'bytes' is used.
    String bytes;
    {
        auto source_buf = getReadBufferFromASTInsertQuery(query);
        WriteBufferFromString sink_buf(bytes);
        copyData(*source_buf, sink_buf);
    }

    auto entry = std::make_shared<InsertData::Entry>(std::move(bytes), query_context->getCurrentQueryId());
    InsertQuery key{query, settings};

    {
        /// Firstly try to get entry from queue without exclusive lock.
        std::shared_lock read_lock(rwlock);
        auto existing = queue.find(key);
        if (existing != queue.end())
        {
            pushImpl(std::move(entry), existing);
            return;
        }
    }

    /// No slot was found under the shared lock: take the exclusive lock and insert one.
    /// 'emplace' returns the existing element if another thread inserted the key in between.
    std::unique_lock write_lock(rwlock);
    const auto emplaced = queue.emplace(key, std::make_shared<Container>());
    pushImpl(std::move(entry), emplaced.first);
}
2021-03-04 11:10:21 +00:00
/// Appends 'entry' to the batch stored in the queue slot 'it'.
/// Creates the batch if the slot is empty, registers the entry by its query id
/// so it can be waited for, and schedules the batch for processing
/// once its accumulated size exceeds 'max_data_size'.
void AsynchronousInsertQueue::pushImpl(InsertData::EntryPtr entry, QueueIterator it)
{
    auto & container = *it->second;
    std::lock_guard data_lock(container.mutex);
    auto & data = container.data;

    /// The slot is empty if it is new or its previous batch was taken for processing.
    if (!data)
        data = std::make_unique<InsertData>();

    data->size += entry->bytes.size();
    data->last_update = std::chrono::steady_clock::now();
    data->entries.emplace_back(entry);

    {
        std::lock_guard currently_processing_lock(currently_processing_mutex);
        currently_processing_queries.emplace(entry->query_id, entry);
    }

    LOG_TRACE(log, "Have {} pending inserts with total {} bytes of data for query '{}'",
        data->entries.size(), data->size, queryToString(it->first.query));

    const bool size_limit_exceeded = data->size > max_data_size;
    if (size_limit_exceeded)
        scheduleDataProcessingJob(it->first, std::move(data), getContext());
}
/// Waits until the insert registered under 'query_id' has been processed.
///  - Returns immediately if no entry is registered for 'query_id'.
///  - Throws TIMEOUT_EXCEEDED if the entry is not finished within 'timeout'.
///  - Rethrows the exception the entry was finished with, if any.
void AsynchronousInsertQueue::waitForProcessingQuery(const String & query_id, const Milliseconds & timeout)
{
    InsertData::EntryPtr entry;

    {
        /// Hold the registry lock only for the lookup, not while waiting.
        std::lock_guard lock(currently_processing_mutex);
        auto it = currently_processing_queries.find(query_id);
        if (it == currently_processing_queries.end())
            return;

        entry = it->second;
    }

    const bool finished = entry->wait(timeout);

    /// The message previously ended with a stray ')'.
    if (!finished)
        throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Wait for async insert timeout ({} ms) exceeded", timeout.count());

    if (auto exception = entry->getException())
        std::rethrow_exception(exception);
}
2021-04-19 14:51:26 +00:00
void AsynchronousInsertQueue::busyCheck()
{
auto timeout = busy_timeout;
while (!shutdown)
{
std::this_thread::sleep_for(timeout);
/// TODO: use priority queue instead of raw unsorted queue.
timeout = busy_timeout;
2021-08-31 02:16:02 +00:00
std::shared_lock read_lock(rwlock);
2021-08-31 02:16:02 +00:00
for (auto & [key, elem] : queue)
2021-04-19 14:51:26 +00:00
{
2021-08-31 02:16:02 +00:00
std::lock_guard data_lock(elem->mutex);
if (!elem->data)
continue;
2021-04-19 14:51:26 +00:00
2021-08-31 02:16:02 +00:00
auto lag = std::chrono::steady_clock::now() - elem->data->first_update;
if (lag >= busy_timeout)
scheduleDataProcessingJob(key, std::move(elem->data), getContext());
2021-08-31 02:16:02 +00:00
else
timeout = std::min(timeout, std::chrono::ceil<std::chrono::milliseconds>(busy_timeout - lag));
}
2021-08-31 02:16:02 +00:00
}
}
2021-08-31 02:16:02 +00:00
/// Body of the background thread that flushes batches by inactivity.
/// Every 'stale_timeout' it schedules for processing each batch
/// whose 'last_update' is at least 'stale_timeout' in the past.
void AsynchronousInsertQueue::staleCheck()
{
    while (!shutdown)
    {
        std::this_thread::sleep_for(stale_timeout);

        std::shared_lock read_lock(rwlock);
        for (auto & [key, elem] : queue)
        {
            std::lock_guard data_lock(elem->mutex);

            /// Only slots that currently hold a batch are considered.
            if (elem->data)
            {
                const auto idle_time = std::chrono::steady_clock::now() - elem->data->last_update;
                if (idle_time >= stale_timeout)
                    scheduleDataProcessingJob(key, std::move(elem->data), getContext());
            }
        }
    }
}
2021-08-31 02:16:02 +00:00
void AsynchronousInsertQueue::cleanup()
2021-04-19 14:51:26 +00:00
{
/// Do not run cleanup too often,
/// because it holds exclusive lock.
2021-09-01 23:18:09 +00:00
auto timeout = busy_timeout * 5;
2021-08-31 02:16:02 +00:00
2021-04-21 16:56:06 +00:00
while (!shutdown)
2021-04-19 14:51:26 +00:00
{
2021-08-31 02:16:02 +00:00
std::this_thread::sleep_for(timeout);
std::vector<InsertQuery> keys_to_remove;
2021-04-19 14:51:26 +00:00
2021-08-31 02:16:02 +00:00
{
std::shared_lock read_lock(rwlock);
for (auto & [key, elem] : queue)
{
std::lock_guard data_lock(elem->mutex);
if (!elem->data)
keys_to_remove.push_back(key);
}
}
2021-04-19 14:51:26 +00:00
2021-08-31 02:16:02 +00:00
if (!keys_to_remove.empty())
2021-04-19 14:51:26 +00:00
{
2021-08-31 02:16:02 +00:00
std::unique_lock write_lock(rwlock);
2021-09-01 23:18:09 +00:00
size_t total_removed = 0;
2021-04-19 14:51:26 +00:00
2021-08-31 02:16:02 +00:00
for (const auto & key : keys_to_remove)
{
auto it = queue.find(key);
2021-09-01 23:18:09 +00:00
if (it != queue.end() && !it->second->data)
{
2021-08-31 02:16:02 +00:00
queue.erase(it);
2021-09-01 23:18:09 +00:00
++total_removed;
}
2021-08-31 02:16:02 +00:00
}
2021-04-19 14:51:26 +00:00
2021-09-01 23:18:09 +00:00
if (total_removed)
2021-09-16 17:18:34 +00:00
LOG_TRACE(log, "Removed stale entries for {} queries from asynchronous insertion queue", total_removed);
2021-09-01 23:18:09 +00:00
}
{
std::vector<String> ids_to_remove;
std::lock_guard lock(currently_processing_mutex);
for (const auto & [query_id, entry] : currently_processing_queries)
if (entry->isFinished())
ids_to_remove.push_back(query_id);
if (!ids_to_remove.empty())
{
for (const auto & id : ids_to_remove)
currently_processing_queries.erase(id);
LOG_TRACE(log, "Removed {} finished entries from asynchronous insertion queue", ids_to_remove.size());
2021-09-01 23:18:09 +00:00
}
2021-04-19 14:51:26 +00:00
}
}
}
// static
2021-08-31 02:16:02 +00:00
/// Parses all entries accumulated for one insert query and pushes the parsed rows
/// through the insert pipeline as a single chunk.
///  - key: the INSERT query AST and the settings to run it with (taken by value, the settings are modified locally).
///  - data: the batch of pending entries; a null pointer makes the call a no-op.
///  - global_context: context that is copied to create the context of the insert.
/// Entries whose data failed to parse are finished with the parsing exception in 'on_error';
/// the remaining entries are finished after the pipeline has been executed.
/// The whole body is a function-try-block: if anything throws, the handlers at the end
/// finish every entry that is not finished yet with that exception.
void AsynchronousInsertQueue::processData(InsertQuery key, InsertDataPtr data, ContextPtr global_context)
try
2021-03-04 11:10:21 +00:00
{
2021-08-31 02:16:02 +00:00
if (!data)
return;
2021-03-04 11:10:21 +00:00
const auto * log = &Poco::Logger::get("AsynchronousInsertQueue");
const auto & insert_query = assert_cast<const ASTInsertQuery &>(*key.query);
auto insert_context = Context::createCopy(global_context);
/// 'resetParser' doesn't work for parallel parsing.
2021-08-31 02:16:02 +00:00
key.settings.set("input_format_parallel_parsing", false);
insert_context->makeQueryContext();
2021-08-31 02:16:02 +00:00
insert_context->setSettings(key.settings);
2021-03-04 11:10:21 +00:00
/// Set initial_query_id, because it's used in InterpreterInsertQuery for table lock.
insert_context->getClientInfo().query_kind = ClientInfo::QueryKind::INITIAL_QUERY;
insert_context->setCurrentQueryId("");
2021-09-19 20:15:10 +00:00
InterpreterInsertQuery interpreter(key.query, insert_context, key.settings.insert_allow_materialized_columns, false, false, true);
2021-09-17 17:52:26 +00:00
auto pipeline = interpreter.execute().pipeline;
assert(pipeline.pushing());
2021-09-17 17:52:26 +00:00
auto header = pipeline.getHeader();
2021-08-31 02:16:02 +00:00
auto format = getInputFormatFromASTInsertQuery(key.query, false, header, insert_context, nullptr);
2021-03-04 11:10:21 +00:00
/// Number of rows parsed from the entries that were executed completely so far.
size_t total_rows = 0;
2021-08-31 02:16:02 +00:00
/// Entry whose bytes are being parsed right now; 'on_error' reports failures against it.
InsertData::EntryPtr current_entry;
/// Error callback handed to the executor below. It captures locals by reference.
/// NOTE(review): it uses std::current_exception(), so it presumably must be invoked
/// while the exception is being handled - confirm in StreamingFormatExecutor.
auto on_error = [&](const MutableColumns & result_columns, Exception & e)
{
2021-08-31 02:16:02 +00:00
LOG_ERROR(log, "Failed parsing for query '{}' with query id {}. {}",
queryToString(key.query), current_entry->query_id, e.displayText());
/// Cut every column back to 'total_rows' rows, i.e. drop the rows appended for the failed entry.
for (const auto & column : result_columns)
if (column->size() > total_rows)
column->popBack(column->size() - total_rows);
2021-08-31 02:16:02 +00:00
/// Only this entry fails; the loop over the other entries goes on.
current_entry->finish(std::current_exception());
return 0;
};
/// Optional transform, created only if the setting is enabled and the table has columns with defaults.
std::shared_ptr<ISimpleTransform> adding_defaults_transform;
if (insert_context->getSettingsRef().input_format_defaults_for_omitted_fields)
{
StoragePtr storage = DatabaseCatalog::instance().getTable(insert_query.table_id, insert_context);
auto metadata_snapshot = storage->getInMemoryMetadataPtr();
const auto & columns = metadata_snapshot->getColumns();
if (columns.hasDefaults())
adding_defaults_transform = std::make_shared<AddingDefaultsTransform>(header, columns, *format, insert_context);
}
StreamingFormatExecutor executor(header, format, std::move(on_error), std::move(adding_defaults_transform));
2021-10-24 19:32:28 +00:00
std::unique_ptr<ReadBuffer> last_buffer;
2021-08-31 02:16:02 +00:00
/// Parse the entries one by one with the same executor.
for (const auto & entry : data->entries)
{
2021-10-24 19:32:28 +00:00
auto buffer = std::make_unique<ReadBufferFromString>(entry->bytes);
2021-08-31 02:16:02 +00:00
current_entry = entry;
total_rows += executor.execute(*buffer);
2021-10-24 19:32:28 +00:00
/// Keep buffer, because it still can be used
/// in destructor, while resetting buffer at next iteration.
last_buffer = std::move(buffer);
}
2021-03-04 11:10:21 +00:00
2021-10-24 19:32:28 +00:00
/// Ownership of the last buffer is handed to the format object.
format->addBuffer(std::move(last_buffer));
/// All parsed rows of the batch form one chunk, which becomes the only input of the insert pipeline.
auto chunk = Chunk(executor.getResultColumns(), total_rows);
size_t total_bytes = chunk.bytes();
auto source = std::make_shared<SourceFromSingleChunk>(header, std::move(chunk));
2021-09-17 17:52:26 +00:00
pipeline.complete(Pipe(std::move(source)));
2021-09-17 17:52:26 +00:00
CompletedPipelineExecutor completed_executor(pipeline);
completed_executor.execute();
2021-09-01 15:06:11 +00:00
LOG_INFO(log, "Flushed {} rows, {} bytes for query '{}'",
2021-08-31 02:16:02 +00:00
total_rows, total_bytes, queryToString(key.query));
2021-08-27 21:29:10 +00:00
2021-08-31 02:16:02 +00:00
/// Entries that failed parsing were already finished in 'on_error' and are skipped here.
for (const auto & entry : data->entries)
2021-09-01 23:18:09 +00:00
if (!entry->isFinished())
2021-08-31 02:16:02 +00:00
entry->finish();
2021-03-04 11:10:21 +00:00
}
/// The handlers go from the most specific type to the most generic one;
/// each passes the caught exception to finishWithException().
catch (const Exception & e)
{
finishWithException(key.query, data->entries, e);
}
catch (const Poco::Exception & e)
{
finishWithException(key.query, data->entries, e);
}
catch (const std::exception & e)
{
finishWithException(key.query, data->entries, e);
}
catch (...)
2021-03-04 11:10:21 +00:00
{
/// Nothing is known about the thrown object, so a generic exception is reported instead.
finishWithException(key.query, data->entries, Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Unknown exception"));
}
/// Logs the exception that is currently being handled and finishes every
/// entry of 'entries' that is not finished yet with a copy of 'exception'.
///  - query: the INSERT query, used only for the log message.
///  - entries: entries of the batch that failed.
///  - exception: the exception to report; it is copied as type E for each entry.
template <typename E>
void AsynchronousInsertQueue::finishWithException(
    const ASTPtr & query, const std::list<InsertData::EntryPtr> & entries, const E & exception)
{
    tryLogCurrentException("AsynchronousInsertQueue", fmt::format("Failed insertion for query '{}'", queryToString(query)));

    for (const auto & entry : entries)
    {
        /// Entries that were already finished keep their result.
        if (entry->isFinished())
            continue;

        /// Make a copy of exception to avoid concurrent usage of
        /// one exception object from several threads.
        entry->finish(std::make_exception_ptr(exception));
    }
}
}