ClickHouse/src/Interpreters/InterpreterInsertQuery.cpp

449 lines
19 KiB
C++
Raw Normal View History

#include <Interpreters/InterpreterInsertQuery.h>
2017-07-13 20:58:19 +00:00
#include <Access/AccessFlags.h>
#include <DataStreams/AddingDefaultBlockOutputStream.h>
#include <DataStreams/AddingDefaultsBlockInputStream.h>
2019-05-19 05:27:00 +00:00
#include <DataStreams/CheckConstraintsBlockOutputStream.h>
#include <DataStreams/CountingBlockOutputStream.h>
#include <DataStreams/InputStreamFromASTInsertQuery.h>
#include <DataStreams/NullAndDoCopyBlockInputStream.h>
#include <DataStreams/NullBlockOutputStream.h>
#include <DataStreams/PushingToViewsBlockOutputStream.h>
#include <DataStreams/RemoteBlockInputStream.h>
#include <DataStreams/SquashingBlockOutputStream.h>
#include <DataStreams/copyData.h>
#include <IO/ConcatReadBuffer.h>
#include <IO/ConnectionTimeoutsContext.h>
#include <Interpreters/InterpreterSelectWithUnionQuery.h>
#include <Interpreters/InterpreterWatchQuery.h>
#include <Interpreters/JoinedTables.h>
#include <Parsers/ASTFunction.h>
2020-09-03 17:51:16 +00:00
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ASTSelectQuery.h>
2018-02-25 06:34:20 +00:00
#include <Parsers/ASTSelectWithUnionQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/queryToString.h>
#include <Processors/NullSink.h>
#include <Processors/Sources/SinkToOutputStream.h>
#include <Processors/Sources/SourceFromInputStream.h>
2020-11-17 17:16:55 +00:00
#include <Processors/Transforms/ExpressionTransform.h>
#include <Storages/StorageDistributed.h>
#include <TableFunctions/TableFunctionFactory.h>
#include <Common/checkStackSize.h>
2020-09-03 17:51:16 +00:00
#include <Interpreters/TranslateQualifiedNamesVisitor.h>
#include <Interpreters/getTableExpressions.h>
namespace
{
/// Value of the `parallel_distributed_insert_select` setting at which the INSERT part of a
/// distributed INSERT SELECT is also redirected to the destination's underlying remote table
/// (see its use in InterpreterInsertQuery::execute()).
constexpr UInt64 PARALLEL_DISTRIBUTED_INSERT_SELECT_ALL = 2;
}
2011-10-30 11:30:52 +00:00
namespace DB
{
/// Error codes thrown from this file. They are only declared here (`extern`);
/// the definitions are provided elsewhere.
namespace ErrorCodes
{
    extern const int NO_SUCH_COLUMN_IN_TABLE;
    extern const int ILLEGAL_COLUMN;
    extern const int DUPLICATE_COLUMN;
    extern const int LOGICAL_ERROR;
}
2011-10-30 11:30:52 +00:00
2018-01-12 13:03:19 +00:00
/** Stores the query and the flags that tune how the insertion is performed.
  *
  * allow_materialized_ - when false, naming a MATERIALIZED column as an insert target
  *                       is rejected with ILLEGAL_COLUMN.
  * no_squash_          - when true, execute() does not add SquashingBlockOutputStream.
  * no_destination_     - forwarded to PushingToViewsBlockOutputStream; it also makes
  *                       getSampleBlock() include virtual columns when the query has no column list.
  */
InterpreterInsertQuery::InterpreterInsertQuery(
    const ASTPtr & query_ptr_, const Context & context_, bool allow_materialized_, bool no_squash_, bool no_destination_)
    : query_ptr(query_ptr_)
    , context(context_)
    , allow_materialized(allow_materialized_)
    , no_squash(no_squash_)
    , no_destination(no_destination_)
{
    /// NOTE(review): presumably guards against unbounded recursion, since execute()
    /// may construct nested interpreters - confirm against checkStackSize().
    checkStackSize();
}
2020-01-24 16:20:36 +00:00
/** Returns the storage that the INSERT writes into.
  *
  * If the target is a table function, the function is looked up in TableFunctionFactory and executed.
  * Otherwise the table identifier is resolved through the context, the resolved identifier is
  * written back into `query.table_id`, and the table is fetched from DatabaseCatalog.
  */
StoragePtr InterpreterInsertQuery::getTable(ASTInsertQuery & query)
{
    if (query.table_function)
    {
        const auto & factory = TableFunctionFactory::instance();
        TableFunctionPtr table_function_ptr = factory.get(query.table_function, context);
        return table_function_ptr->execute(query.table_function, context, table_function_ptr->getName());
    }

    /// The AST is updated in place, so later readers of `table_id` see the resolved identifier.
    query.table_id = context.resolveStorageID(query.table_id);
    return DatabaseCatalog::instance().getTable(query.table_id, context);
}
/** Builds the header (column names and types) of the blocks the INSERT is going to accept.
  *
  * Without an explicit column list the result is the table's non-materialized sample block or,
  * when `no_destination` is set, the sample block extended with the table's virtual columns.
  * With a column list, the list is first passed through TranslateQualifiedNamesVisitor and every
  * resulting name is then validated against the table.
  *
  * Throws NO_SUCH_COLUMN_IN_TABLE for a name the table does not have, ILLEGAL_COLUMN for
  * a MATERIALIZED column when `allow_materialized` is false, and DUPLICATE_COLUMN for a repeated name.
  */
Block InterpreterInsertQuery::getSampleBlock(
    const ASTInsertQuery & query,
    const StoragePtr & table,
    const StorageMetadataPtr & metadata_snapshot) const
{
    Block table_sample_non_materialized = metadata_snapshot->getSampleBlockNonMaterialized();

    /// If the query does not include information about columns
    if (!query.columns)
    {
        if (no_destination)
            return metadata_snapshot->getSampleBlockWithVirtuals(table->getVirtuals());
        else
            return table_sample_non_materialized;
    }

    Block table_sample = metadata_snapshot->getSampleBlock();

    /// Process column transformers (e.g. * EXCEPT(a)), asterisks and qualified columns.
    const auto & columns = metadata_snapshot->getColumns();
    auto names_and_types = columns.getOrdinary();
    removeDuplicateColumns(names_and_types);

    /// Describe the destination table as a single-table "FROM" so the visitor can resolve names against it.
    auto table_expr = std::make_shared<ASTTableExpression>();
    table_expr->database_and_table_name = createTableIdentifier(table->getStorageID());
    table_expr->children.push_back(table_expr->database_and_table_name);

    TablesWithColumns tables_with_columns;
    tables_with_columns.emplace_back(DatabaseAndTableWithAlias(*table_expr, context.getCurrentDatabase()), names_and_types);

    /// Materialized, alias and virtual columns are registered as hidden columns of that table.
    tables_with_columns[0].addHiddenColumns(columns.getMaterialized());
    tables_with_columns[0].addHiddenColumns(columns.getAliases());
    tables_with_columns[0].addHiddenColumns(table->getVirtuals());

    /// Names exactly as they were written in the query's column list.
    NameSet source_columns_set;
    for (const auto & identifier : query.columns->children)
        source_columns_set.insert(identifier->getColumnName());

    TranslateQualifiedNamesVisitor::Data visitor_data(source_columns_set, tables_with_columns);
    TranslateQualifiedNamesVisitor visitor(visitor_data);

    /// The visitor works on a clone, so the query AST itself is left untouched.
    auto columns_ast = query.columns->clone();
    visitor.visit(columns_ast);

    /// Form the block based on the column names from the query
    Block res;
    for (const auto & identifier : columns_ast->children)
    {
        std::string current_name = identifier->getColumnName();

        /// The table does not have a column with that name
        if (!table_sample.has(current_name))
            throw Exception("No such column " + current_name + " in table " + query.table_id.getNameForLogs(),
                ErrorCodes::NO_SUCH_COLUMN_IN_TABLE);

        /// Present in the full sample but absent from the non-materialized one: a MATERIALIZED column.
        if (!allow_materialized && !table_sample_non_materialized.has(current_name))
            throw Exception("Cannot insert column " + current_name + ", because it is MATERIALIZED column.", ErrorCodes::ILLEGAL_COLUMN);

        if (res.has(current_name))
            throw Exception("Column " + current_name + " specified more than once", ErrorCodes::DUPLICATE_COLUMN);

        res.insert(ColumnWithTypeAndName(table_sample.getByName(current_name).type, current_name));
    }

    return res;
}
2012-03-19 12:57:56 +00:00
2011-10-30 11:30:52 +00:00
2020-08-24 14:29:31 +00:00
/** A query that just reads all data without any complex computations or filetering.
* If we just pipe the result to INSERT, we don't have to use too many threads for read.
*/
2020-11-02 05:28:37 +00:00
static bool isTrivialSelect(const ASTPtr & select)
2020-08-24 14:29:31 +00:00
{
2020-11-02 08:02:35 +00:00
if (auto * select_query = select->as<ASTSelectQuery>())
2020-11-02 05:28:37 +00:00
{
const auto & tables = select_query->tables();
2020-08-24 14:29:31 +00:00
2020-11-02 05:28:37 +00:00
if (!tables)
return false;
2020-08-24 14:29:31 +00:00
2020-11-02 05:28:37 +00:00
const auto & tables_in_select_query = tables->as<ASTTablesInSelectQuery &>();
2020-08-24 14:29:31 +00:00
2020-11-02 05:28:37 +00:00
if (tables_in_select_query.children.size() != 1)
return false;
2020-08-24 14:29:31 +00:00
2020-11-02 05:28:37 +00:00
const auto & child = tables_in_select_query.children.front();
const auto & table_element = child->as<ASTTablesInSelectQueryElement &>();
const auto & table_expr = table_element.table_expression->as<ASTTableExpression &>();
2020-08-24 14:29:31 +00:00
2020-11-02 05:28:37 +00:00
if (table_expr.subquery)
return false;
2020-08-24 14:29:31 +00:00
2020-11-02 05:28:37 +00:00
/// Note: how to write it in more generic way?
return (!select_query->distinct
&& !select_query->limit_with_ties
&& !select_query->prewhere()
&& !select_query->where()
&& !select_query->groupBy()
&& !select_query->having()
&& !select_query->orderBy()
&& !select_query->limitBy());
}
/// This query is ASTSelectWithUnionQuery subquery
return false;
2020-08-24 14:29:31 +00:00
};
/** Builds the BlockIO that performs the INSERT.
  *
  * Depending on the query the result is filled differently:
  *  - INSERT SELECT between two Distributed tables of the same cluster (with
  *    `parallel_distributed_insert_select` enabled): `res.pipeline` unites one pipeline per shard,
  *    each of which runs the rewritten INSERT SELECT on that shard;
  *  - INSERT SELECT / INSERT WATCH: `res.pipeline` reads from the source, converts columns by
  *    position to the destination header and ends in sinks that write to the table;
  *  - INSERT with inline data and no tail: `res.in` is a stream that copies the parsed data into the table;
  *  - otherwise: `res.out` is the output stream the caller has to write blocks into.
  *
  * Throws LOGICAL_ERROR if no connection could be obtained for a remote shard, and ILLEGAL_COLUMN if
  * a MATERIALIZED column is an insert target of INSERT SELECT / WATCH while `allow_materialized` is false.
  * Access rights (AccessType::INSERT) are checked unless the target is a table function.
  */
BlockIO InterpreterInsertQuery::execute()
{
    const Settings & settings = context.getSettingsRef();
    auto & query = query_ptr->as<ASTInsertQuery &>();

    BlockIO res;

    StoragePtr table = getTable(query);
    /// The lock object is a local, so it is kept alive for the duration of this function.
    auto table_lock = table->lockForShare(context.getInitialQueryId(), settings.lock_acquire_timeout);
    auto metadata_snapshot = table->getInMemoryMetadataPtr();

    auto query_sample_block = getSampleBlock(query, table, metadata_snapshot);
    if (!query.table_function)
        context.checkAccess(AccessType::INSERT, query.table_id, query_sample_block.getNames());

    bool is_distributed_insert_select = false;

    if (query.select && table->isRemote() && settings.parallel_distributed_insert_select)
    {
        // Distributed INSERT SELECT
        std::shared_ptr<StorageDistributed> storage_src;
        auto & select = query.select->as<ASTSelectWithUnionQuery &>();
        auto new_query = std::dynamic_pointer_cast<ASTInsertQuery>(query.clone());

        /// Only a single SELECT (no UNION) from a single Distributed table can be pushed down.
        if (select.list_of_selects->children.size() == 1)
        {
            if (auto * select_query = select.list_of_selects->children.at(0)->as<ASTSelectQuery>())
            {
                JoinedTables joined_tables(Context(context), *select_query);

                if (joined_tables.tablesCount() == 1)
                {
                    storage_src = std::dynamic_pointer_cast<StorageDistributed>(joined_tables.getLeftTableStorage());
                    if (storage_src)
                    {
                        const auto select_with_union_query = std::make_shared<ASTSelectWithUnionQuery>();
                        select_with_union_query->list_of_selects = std::make_shared<ASTExpressionList>();

                        auto new_select_query = std::dynamic_pointer_cast<ASTSelectQuery>(select_query->clone());
                        select_with_union_query->list_of_selects->children.push_back(new_select_query);

                        /// The shard-side query reads from the table underlying the source Distributed table.
                        new_select_query->replaceDatabaseAndTable(storage_src->getRemoteDatabaseName(), storage_src->getRemoteTableName());

                        new_query->select = select_with_union_query;
                    }
                }
            }
        }

        auto storage_dst = std::dynamic_pointer_cast<StorageDistributed>(table);

        if (storage_src && storage_dst && storage_src->cluster_name == storage_dst->cluster_name)
        {
            is_distributed_insert_select = true;

            if (settings.parallel_distributed_insert_select == PARALLEL_DISTRIBUTED_INSERT_SELECT_ALL)
            {
                /// Also write directly into the table underlying the destination Distributed table.
                new_query->table_id = StorageID(storage_dst->getRemoteDatabaseName(), storage_dst->getRemoteTableName());
            }

            const auto & cluster = storage_src->getCluster();
            const auto & shards_info = cluster->getShardsInfo();

            std::vector<std::unique_ptr<QueryPipeline>> pipelines;

            String new_query_str = queryToString(new_query);
            for (size_t shard_index : ext::range(0, shards_info.size()))
            {
                const auto & shard_info = shards_info[shard_index];
                if (shard_info.isLocal())
                {
                    /// Local shard: run the rewritten query with a nested interpreter.
                    InterpreterInsertQuery interpreter(new_query, context);
                    pipelines.emplace_back(std::make_unique<QueryPipeline>(interpreter.execute().pipeline));
                }
                else
                {
                    auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(settings);
                    auto connections = shard_info.pool->getMany(timeouts, &settings, PoolMode::GET_ONE);
                    if (connections.empty() || connections.front().isNull())
                        throw Exception(
                            "Expected exactly one connection for shard " + toString(shard_info.shard_num), ErrorCodes::LOGICAL_ERROR);

                    /// INSERT SELECT query returns empty block
                    auto in_stream = std::make_shared<RemoteBlockInputStream>(std::move(connections), new_query_str, Block{}, context);
                    pipelines.emplace_back(std::make_unique<QueryPipeline>());
                    pipelines.back()->init(Pipe(std::make_shared<SourceFromInputStream>(std::move(in_stream))));
                    pipelines.back()->setSinks([](const Block & header, QueryPipeline::StreamType) -> ProcessorPtr
                    {
                        return std::make_shared<EmptySink>(header);
                    });
                }
            }

            res.pipeline = QueryPipeline::unitePipelines(std::move(pipelines), {});
        }
    }

    BlockOutputStreams out_streams;
    if (!is_distributed_insert_select || query.watch)
    {
        size_t out_streams_size = 1;
        if (query.select)
        {
            bool is_trivial_insert_select = false;

            if (settings.optimize_trivial_insert_select)
            {
                const auto & select_query = query.select->as<ASTSelectWithUnionQuery &>();
                const auto & selects = select_query.list_of_selects->children;
                const auto & union_modes = select_query.list_of_modes;

                /// ASTSelectWithUnionQuery is not normalized now, so it may pass some queries which can be Trivial select queries
                is_trivial_insert_select
                    = std::all_of(
                          union_modes.begin(),
                          union_modes.end(),
                          [](const ASTSelectWithUnionQuery::Mode & mode) { return mode == ASTSelectWithUnionQuery::Mode::ALL; })
                    && std::all_of(selects.begin(), selects.end(), [](const ASTPtr & select) { return isTrivialSelect(select); });
            }

            if (is_trivial_insert_select)
            {
                /** When doing trivial INSERT INTO ... SELECT ... FROM table,
                  * don't need to process SELECT with more than max_insert_threads
                  * and it's reasonable to set block size for SELECT to the desired block size for INSERT
                  * to avoid unnecessary squashing.
                  */

                Settings new_settings = context.getSettings();

                new_settings.max_threads = std::max<UInt64>(1, settings.max_insert_threads);

                if (settings.min_insert_block_size_rows)
                    new_settings.max_block_size = settings.min_insert_block_size_rows;

                Context new_context = context;
                new_context.setSettings(new_settings);

                InterpreterSelectWithUnionQuery interpreter_select{
                    query.select, new_context, SelectQueryOptions(QueryProcessingStage::Complete, 1)};
                res = interpreter_select.execute();
            }
            else
            {
                /// Passing 1 as subquery_depth will disable limiting size of intermediate result.
                InterpreterSelectWithUnionQuery interpreter_select{
                    query.select, context, SelectQueryOptions(QueryProcessingStage::Complete, 1)};
                res = interpreter_select.execute();
            }

            if (table->supportsParallelInsert() && settings.max_insert_threads > 1)
                out_streams_size = std::min(size_t(settings.max_insert_threads), res.pipeline.getNumStreams());

            /// The number of pipeline streams must match the number of output streams created below.
            res.pipeline.resize(out_streams_size);
        }
        else if (query.watch)
        {
            InterpreterWatchQuery interpreter_watch{ query.watch, context };
            res = interpreter_watch.execute();
            res.pipeline.init(Pipe(std::make_shared<SourceFromInputStream>(std::move(res.in))));
        }

        for (size_t i = 0; i < out_streams_size; i++)
        {
            /// We create a pipeline of several streams, into which we will write data.
            BlockOutputStreamPtr out;

            /// NOTE: we explicitly ignore bound materialized views when inserting into Kafka Storage.
            ///       Otherwise we'll get duplicates when MV reads same rows again from Kafka.
            if (table->noPushingToViews() && !no_destination)
                out = table->write(query_ptr, metadata_snapshot, context);
            else
                out = std::make_shared<PushingToViewsBlockOutputStream>(table, metadata_snapshot, context, query_ptr, no_destination);

            /// Note that we wrap transforms one on top of another, so we write them in reverse of data processing order.

            /// Checking constraints. It must be done after calculation of all defaults, so we can check them on calculated columns.
            if (const auto & constraints = metadata_snapshot->getConstraints(); !constraints.empty())
                out = std::make_shared<CheckConstraintsBlockOutputStream>(
                    query.table_id, out, out->getHeader(), constraints, context);

            /// Actually we don't know structure of input blocks from query/table,
            /// because some clients break insertion protocol (columns != header)
            out = std::make_shared<AddingDefaultBlockOutputStream>(
                out, query_sample_block, out->getHeader(), metadata_snapshot->getColumns(), context);

            /// It's important to squash blocks as early as possible (before other transforms),
            ///  because other transforms may work inefficient if block size is small.

            /// Do not squash blocks if it is a sync INSERT into Distributed, since it lead to double bufferization on client and server side.
            /// Client-side bufferization might cause excessive timeouts (especially in case of big blocks).
            if (!(settings.insert_distributed_sync && table->isRemote()) && !no_squash && !query.watch)
            {
                out = std::make_shared<SquashingBlockOutputStream>(
                    out,
                    out->getHeader(),
                    settings.min_insert_block_size_rows,
                    settings.min_insert_block_size_bytes);
            }

            auto out_wrapper = std::make_shared<CountingBlockOutputStream>(out);
            out_wrapper->setProcessListElement(context.getProcessListElement());
            out = std::move(out_wrapper);
            out_streams.emplace_back(std::move(out));
        }
    }

    /// What type of query: INSERT or INSERT SELECT or INSERT WATCH?
    if (is_distributed_insert_select)
    {
        /// Pipeline was already built.
    }
    else if (query.select || query.watch)
    {
        const auto & header = out_streams.at(0)->getHeader();

        /// Source columns are matched to destination columns by position, not by name.
        auto actions_dag = ActionsDAG::makeConvertingActions(
                res.pipeline.getHeader().getColumnsWithTypeAndName(),
                header.getColumnsWithTypeAndName(),
                ActionsDAG::MatchColumnsMode::Position);
        auto actions = std::make_shared<ExpressionActions>(actions_dag);

        res.pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
        {
            return std::make_shared<ExpressionTransform>(in_header, actions);
        });

        res.pipeline.setSinks([&](const Block &, QueryPipeline::StreamType type) -> ProcessorPtr
        {
            if (type != QueryPipeline::StreamType::Main)
                return nullptr;

            /// Each main stream takes ownership of one of the prepared output streams.
            auto stream = std::move(out_streams.back());
            out_streams.pop_back();

            return std::make_shared<SinkToOutputStream>(std::move(stream));
        });

        if (!allow_materialized)
        {
            for (const auto & column : metadata_snapshot->getColumns())
                if (column.default_desc.kind == ColumnDefaultKind::Materialized && header.has(column.name))
                    throw Exception("Cannot insert column " + column.name + ", because it is MATERIALIZED column.", ErrorCodes::ILLEGAL_COLUMN);
        }
    }
    else if (query.data && !query.has_tail) /// can execute without additional data
    {
        res.in = std::make_shared<InputStreamFromASTInsertQuery>(query_ptr, nullptr, query_sample_block, context, nullptr);
        res.in = std::make_shared<NullAndDoCopyBlockInputStream>(res.in, out_streams.at(0));
    }
    else
        res.out = std::move(out_streams.at(0));

    /// Keep the storage (and the target table of a materialized view, if any) referenced by the pipeline.
    res.pipeline.addStorageHolder(table);
    if (const auto * mv = dynamic_cast<const StorageMaterializedView *>(table.get()))
    {
        if (auto inner_table = mv->tryGetTargetTable())
            res.pipeline.addStorageHolder(inner_table);
    }

    return res;
}
2020-03-02 20:23:58 +00:00
/// Returns the table identifier stored in the INSERT query AST.
/// getTable() overwrites that field with the resolved identifier, so the value
/// returned here depends on whether getTable() has already run for this query.
StorageID InterpreterInsertQuery::getDatabaseTable() const
{
    return query_ptr->as<ASTInsertQuery &>().table_id;
}
2011-10-30 11:30:52 +00:00
}