update MATERIALIZED columns when their dependencies are updated [#CLICKHOUSE-13]

This commit is contained in:
Alexey Zatelepin 2018-09-07 18:44:51 +03:00
parent 8cf666e1b5
commit 1064d1d26b
8 changed files with 204 additions and 133 deletions

View File

@ -1,5 +1,7 @@
#include <Interpreters/MutationsInterpreter.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Storages/StorageMergeTree.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <DataStreams/FilterBlockInputStream.h>
#include <DataStreams/ExpressionBlockInputStream.h>
#include <DataStreams/CreatingSetsBlockInputStream.h>
@ -84,36 +86,80 @@ bool MutationsInterpreter::isStorageTouchedByMutations() const
}
static void validateUpdateColumns(const StoragePtr & storage, const std::vector<MutationCommand> & commands)
static NameSet getKeyColumns(const StoragePtr & storage)
{
for (const MutationCommand & command : commands)
const MergeTreeData * merge_tree_data = nullptr;
if (auto merge_tree = dynamic_cast<StorageMergeTree *>(storage.get()))
merge_tree_data = &merge_tree->getData();
else if (auto replicated_merge_tree = dynamic_cast<StorageReplicatedMergeTree *>(storage.get()))
merge_tree_data = &replicated_merge_tree->getData();
else
return {};
NameSet key_columns;
if (merge_tree_data->partition_expr)
for (const String & col : merge_tree_data->partition_expr->getRequiredColumns())
key_columns.insert(col);
auto primary_expr = merge_tree_data->getPrimaryExpression();
if (primary_expr)
for (const String & col : primary_expr->getRequiredColumns())
key_columns.insert(col);
/// We don't process sampling_expression separately because it must be among the primary key columns.
auto secondary_sort_expr = merge_tree_data->getSecondarySortExpression();
if (secondary_sort_expr)
for (const String & col : secondary_sort_expr->getRequiredColumns())
key_columns.insert(col);
if (!merge_tree_data->merging_params.sign_column.empty())
key_columns.insert(merge_tree_data->merging_params.sign_column);
return key_columns;
}
static void validateUpdateColumns(
const StoragePtr & storage, const NameSet & updated_columns,
const std::unordered_map<String, Names> & column_to_affected_materialized)
{
NameSet key_columns = getKeyColumns(storage);
for (const String & column_name : updated_columns)
{
if (command.type != MutationCommand::UPDATE)
continue;
for (const auto & kv : command.column_to_update_expression)
auto found = false;
for (const auto & col : storage->getColumns().ordinary)
{
const String & column_name = kv.first;
if (col.name == column_name)
{
found = true;
break;
}
}
auto found = false;
for (const auto & col : storage->getColumns().ordinary)
if (!found)
{
for (const auto & col : storage->getColumns().materialized)
{
if (col.name == column_name)
{
found = true;
break;
}
throw Exception("Cannot UPDATE materialized column `" + column_name + "`", ErrorCodes::CANNOT_UPDATE_COLUMN);
}
if (!found)
{
for (const auto & col : storage->getColumns().materialized)
{
if (col.name == column_name)
throw Exception("Cannot UPDATE materialized column " + column_name, ErrorCodes::CANNOT_UPDATE_COLUMN);
}
throw Exception("There is no column `" + column_name + "` in table", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE);
}
throw Exception("There is no column " + column_name + " in table", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE);
if (key_columns.count(column_name))
throw Exception("Cannot UPDATE key column `" + column_name + "`", ErrorCodes::CANNOT_UPDATE_COLUMN);
auto materialized_it = column_to_affected_materialized.find(column_name);
if (materialized_it != column_to_affected_materialized.end())
{
for (const String & materialized : materialized_it->second)
{
if (key_columns.count(materialized))
throw Exception("Updated column `" + column_name + "` affects MATERIALIZED column `"
+ materialized + "`, which is a key column. Cannot UPDATE it.",
ErrorCodes::CANNOT_UPDATE_COLUMN);
}
}
}
@ -128,25 +174,90 @@ void MutationsInterpreter::prepare(bool dry_run)
if (commands.empty())
throw Exception("Empty mutation commands list", ErrorCodes::LOGICAL_ERROR);
validateUpdateColumns(storage, commands);
const ColumnsDescription & columns_desc = storage->getColumns();
NamesAndTypesList all_columns = columns_desc.getAllPhysical();
NamesAndTypesList all_columns = storage->getColumns().getAllPhysical();
NameSet updated_columns;
for (const MutationCommand & command : commands)
{
for (const auto & kv : command.column_to_update_expression)
updated_columns.insert(kv.first);
}
/// We need to know which columns affect which MATERIALIZED columns to recalculate them if dependencies
/// are updated.
std::unordered_map<String, Names> column_to_affected_materialized;
if (!updated_columns.empty())
{
for (const auto & kv : columns_desc.defaults)
{
const String & column = kv.first;
const ColumnDefault & col_default = kv.second;
if (col_default.kind == ColumnDefaultKind::Materialized)
{
ExpressionAnalyzer analyzer(col_default.expression->clone(), context, nullptr, all_columns);
for (const String & dependency : analyzer.getRequiredSourceColumns())
{
if (updated_columns.count(dependency))
column_to_affected_materialized[dependency].push_back(column);
}
}
}
}
if (!updated_columns.empty())
validateUpdateColumns(storage, updated_columns, column_to_affected_materialized);
/// First, break a sequence of commands into stages.
stages.emplace_back();
for (const auto & command : commands)
{
if (stages.back().update)
if (!stages.back().column_to_updated.empty())
stages.emplace_back();
if (command.type == MutationCommand::DELETE)
stages.back().deletes.push_back(command);
{
auto negated_predicate = makeASTFunction("not", command.predicate->clone());
stages.back().filters.push_back(negated_predicate);
}
else if (command.type == MutationCommand::UPDATE)
{
if (stages.size() == 1) /// First stage only supports DELETEs.
if (stages.size() == 1) /// First stage only supports filtering and can't update columns.
stages.emplace_back();
stages.back().update = command;
NameSet affected_materialized;
for (const auto & kv : command.column_to_update_expression)
{
const String & column = kv.first;
auto materialized_it = column_to_affected_materialized.find(column);
if (materialized_it != column_to_affected_materialized.end())
{
for (const String & mat_column : materialized_it->second)
affected_materialized.emplace(mat_column);
}
const auto & update_expr = kv.second;
auto updated_column = makeASTFunction("CAST",
makeASTFunction("if",
command.predicate->clone(),
update_expr->clone(),
std::make_shared<ASTIdentifier>(column)),
std::make_shared<ASTLiteral>(columns_desc.getPhysical(column).type->getName()));
stages.back().column_to_updated.emplace(column, updated_column);
}
if (!affected_materialized.empty())
{
stages.emplace_back();
for (const auto & column : columns_desc.materialized)
{
stages.back().column_to_updated.emplace(
column.name,
columns_desc.defaults.at(column.name).expression->clone());
}
}
}
else
throw Exception("Unknown mutation command type: " + DB::toString<int>(command.type), ErrorCodes::UNKNOWN_MUTATION_COMMAND);
@ -155,7 +266,7 @@ void MutationsInterpreter::prepare(bool dry_run)
/// Next, for each stage calculate columns changed by this and previous stages.
for (size_t i = 0; i < stages.size(); ++i)
{
if (!stages[i].deletes.empty())
if (!stages[i].filters.empty())
{
for (const auto & column : all_columns)
stages[i].output_columns.insert(column.name);
@ -165,9 +276,9 @@ void MutationsInterpreter::prepare(bool dry_run)
if (i > 0)
stages[i].output_columns = stages[i - 1].output_columns;
if (stages[i].update && stages[i].output_columns.size() < all_columns.size())
if (stages[i].output_columns.size() < all_columns.size())
{
for (const auto & kv : (*stages[i].update).column_to_update_expression)
for (const auto & kv : stages[i].column_to_updated)
stages[i].output_columns.insert(kv.first);
}
}
@ -179,32 +290,12 @@ void MutationsInterpreter::prepare(bool dry_run)
auto & stage = stages[i];
ASTPtr all_asts = std::make_shared<ASTExpressionList>();
ASTs delete_filter_columns;
std::unordered_map<String, ASTPtr> column_to_updated;
for (const auto & command : stage.deletes)
{
auto negated_predicate = makeASTFunction("not", command.predicate->clone());
all_asts->children.push_back(negated_predicate);
delete_filter_columns.push_back(negated_predicate);
}
for (const auto & ast : stage.filters)
all_asts->children.push_back(ast);
if (stage.update)
{
for (const auto & kv : stage.update->column_to_update_expression)
{
const String & column = kv.first;
const auto & update_expr = kv.second;
auto updated_column = makeASTFunction("CAST",
makeASTFunction("if",
stage.update->predicate->clone(),
update_expr->clone(),
std::make_shared<ASTIdentifier>(column)),
std::make_shared<ASTLiteral>(storage->getColumn(column).type->getName()));
column_to_updated.emplace(column, updated_column);
all_asts->children.push_back(updated_column);
}
}
for (const auto & kv : stage.column_to_updated)
all_asts->children.push_back(kv.second);
/// Add all output columns to prevent ExpressionAnalyzer from deleting them from source columns.
for (const String & column : stage.output_columns)
@ -214,23 +305,23 @@ void MutationsInterpreter::prepare(bool dry_run)
ExpressionActionsChain & actions_chain = stage.expressions_chain;
for (const auto & ast : delete_filter_columns)
for (const auto & ast : stage.filters)
{
if (!actions_chain.steps.empty())
actions_chain.addStep();
stage.analyzer->appendExpression(actions_chain, ast, dry_run);
stage.delete_filter_column_names.push_back(ast->getColumnName());
stage.filter_column_names.push_back(ast->getColumnName());
}
if (stage.update)
if (!stage.column_to_updated.empty())
{
if (!actions_chain.steps.empty())
actions_chain.addStep();
for (const auto & kv : column_to_updated)
for (const auto & kv : stage.column_to_updated)
stage.analyzer->appendExpression(actions_chain, kv.second, dry_run);
for (const auto & kv : column_to_updated)
for (const auto & kv : stage.column_to_updated)
{
actions_chain.getLastActions()->add(ExpressionAction::copyColumn(
kv.second->getColumnName(), kv.first, /* can_replace = */ true));
@ -257,22 +348,18 @@ void MutationsInterpreter::prepare(bool dry_run)
for (const auto & column_name : stages[0].output_columns)
select->select_expression_list->children.push_back(std::make_shared<ASTIdentifier>(column_name));
if (!stages[0].deletes.empty())
if (!stages[0].filters.empty())
{
ASTs delete_filters;
for (const auto & delete_ : stages[0].deletes)
delete_filters.push_back(makeASTFunction("not", delete_.predicate->clone()));
ASTPtr where_expression;
if (stages[0].deletes.size() == 1)
where_expression = delete_filters[0];
if (stages[0].filters.size() == 1)
where_expression = stages[0].filters[0];
else
{
auto coalesced_predicates = std::make_shared<ASTFunction>();
coalesced_predicates->name = "and";
coalesced_predicates->arguments = std::make_shared<ASTExpressionList>();
coalesced_predicates->children.push_back(coalesced_predicates->arguments);
coalesced_predicates->arguments->children = delete_filters;
coalesced_predicates->arguments->children = stages[0].filters;
where_expression = std::move(coalesced_predicates);
}
select->where_expression = where_expression;
@ -293,10 +380,10 @@ BlockInputStreamPtr MutationsInterpreter::addStreamsForLaterStages(BlockInputStr
for (size_t i = 0; i < stage.expressions_chain.steps.size(); ++i)
{
const auto & step = stage.expressions_chain.steps[i];
if (i < stage.delete_filter_column_names.size())
if (i < stage.filter_column_names.size())
{
/// Execute DELETE.
in = std::make_shared<FilterBlockInputStream>(in, step.actions, stage.delete_filter_column_names[i]);
/// Execute DELETEs.
in = std::make_shared<FilterBlockInputStream>(in, step.actions, stage.filter_column_names[i]);
}
else
{

View File

@ -44,19 +44,23 @@ private:
const Context & context;
/// A sequence of mutation commands is executed as a sequence of stages. Each stage consists of several
/// DELETEs, possibly followed by an UPDATE. Commands can reuse expressions calculated by the previous
/// commands in the same stage, but at the end of each stage intermediate columns are thrown away
/// (they may contain wrong values because of an UPDATE).
/// filters, followed by updating values of some columns. Commands can reuse expressions calculated by the
/// previous commands in the same stage, but at the end of each stage intermediate columns are thrown away
/// (they may contain wrong values because the column values have been updated).
///
/// If an UPDATE command changes some columns that some MATERIALIZED columns depend on, a stage to
/// recalculate these columns is added.
///
/// Each stage has output_columns that contain columns that are changed at the end of that stage
/// plus columns needed for the next mutations.
///
/// First stage is special: it can contain only DELETEs and is executed using InterpreterSelectQuery
/// to take advantage of table indexes (if there are any).
struct Stage
{
std::vector<MutationCommand> deletes;
std::optional<MutationCommand> update;
ASTs filters;
std::unordered_map<String, ASTPtr> column_to_updated;
/// Contains columns that are changed by this stage,
/// columns changed by the previous stages and also columns needed by the next stages.
@ -65,10 +69,10 @@ private:
std::unique_ptr<ExpressionAnalyzer> analyzer;
/// A chain of actions needed to execute this stage.
/// First steps calculate filter columns for DELETEs (in the same order as in `delete_filter_column_names`),
/// First steps calculate filter columns for DELETEs (in the same order as in `filter_column_names`),
/// then there is (possibly) an UPDATE stage, and finally a projection stage.
ExpressionActionsChain expressions_chain;
Names delete_filter_column_names;
Names filter_column_names;
};
std::unique_ptr<InterpreterSelectQuery> interpreter_select;

View File

@ -1364,49 +1364,6 @@ MergeTreeData::AlterDataPartTransaction::~AlterDataPartTransaction()
}
void MergeTreeData::checkMutations(const MutationCommands & commands)
{
NameSet key_columns;
auto fill_key_columns = [&]()
{
if (!key_columns.empty())
return;
if (partition_expr)
for (const String & col : partition_expr->getRequiredColumns())
key_columns.insert(col);
if (primary_expr)
for (const String & col : primary_expr->getRequiredColumns())
key_columns.insert(col);
/// We don't process sampling_expression separately because it must be among the primary key columns.
if (secondary_sort_expr)
for (const String & col : secondary_sort_expr->getRequiredColumns())
key_columns.insert(col);
if (!merging_params.sign_column.empty())
key_columns.insert(merging_params.sign_column);
};
for (const MutationCommand & command : commands)
{
if (command.type != MutationCommand::UPDATE)
continue;
fill_key_columns();
for (const auto & kv : command.column_to_update_expression)
{
const String & column_name = kv.first;
if (key_columns.count(column_name))
throw Exception("Cannot UPDATE key column " + column_name, ErrorCodes::CANNOT_UPDATE_COLUMN);
}
}
}
MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace(
const MergeTreePartInfo & new_part_info,
const String & new_part_name,

View File

@ -37,8 +37,6 @@ namespace ErrorCodes
extern const int TABLE_DIFFERS_TOO_MUCH;
}
class MutationCommands;
/// Data structure for *MergeTree engines.
/// Merge tree is used for incremental sorting of data.
@ -469,11 +467,6 @@ public:
/// If something is wrong, throws an exception.
void checkAlter(const AlterCommands & commands);
/// MergeTree-specific checks for mutations:
/// - columns corresponding to primary key, sign, sampling expression and date are not affected.
/// If something is wrong, throws an exception.
void checkMutations(const MutationCommands & commands);
/// Performs ALTER of the data part, writes the result to temporary files.
/// Returns an object allowing to rename temporary files to permanent files.
/// If the number of affected columns is suspiciously high and skip_sanity_checks is false, throws an exception.

View File

@ -304,7 +304,6 @@ struct CurrentlyMergingPartsTagger
void StorageMergeTree::mutate(const MutationCommands & commands, const Context &)
{
data.checkMutations(commands);
MergeTreeMutationEntry entry(commands, full_path, data.insert_increment.get());
{
std::lock_guard lock(currently_merging_mutex);

View File

@ -4117,8 +4117,6 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, const
/// After all needed parts are mutated (i.e. all active parts have the mutation version greater than
/// the version of this mutation), the mutation is considered done and can be deleted.
data.checkMutations(commands);
ReplicatedMergeTreeMutationEntry entry;
entry.source_replica = replica_name;
entry.commands = commands;

View File

@ -15,4 +15,10 @@ Updating with non-UInt8 predicate should fail
*** Test UPDATE of columns that DELETE depends on ***
2000-01-01 234 cde 30
*** Test complex mixture of UPDATEs and DELETEs ***
2000-01-01 456 ijk_materialized_40 40
2000-01-01 456 ijk_materialized_47 40
*** Test updating columns that MATERIALIZED columns depend on ***
Updating column that MATERIALIZED key column depends on should fail
17 materialized_24
27 materialized_34
30 materialized_37
40 materialized_47

View File

@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.alter_update"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.alter_update \
(d Date, key UInt32, value1 String, value2 UInt64, materialized_value String MATERIALIZED concat('materialized_', toString(value2))) \
(d Date, key UInt32, value1 String, value2 UInt64, materialized_value String MATERIALIZED concat('materialized_', toString(value2 + 7))) \
ENGINE MergeTree ORDER BY key PARTITION BY toYYYYMM(d)"
@ -95,7 +95,7 @@ ${CLICKHOUSE_CLIENT} --query="ALTER TABLE test.alter_update \
DELETE WHERE value2 IN (8, 9, 10), \
UPDATE value2 = value2 + 10 WHERE value2 <= 10, \
DELETE WHERE length(value1) + value2 = 23, \
DELETE WHERE materialized_value = 'materialized_30', \
DELETE WHERE materialized_value = 'materialized_37', \
UPDATE value1 = concat(value1, '_', materialized_value) WHERE key = 456"
wait_for_mutation "alter_update" "mutation_10.txt"
@ -104,4 +104,31 @@ ${CLICKHOUSE_CLIENT} --query="SELECT * FROM test.alter_update ORDER BY key"
${CLICKHOUSE_CLIENT} --query="ALTER TABLE test.alter_update DROP PARTITION 200001"
${CLICKHOUSE_CLIENT} --query="SELECT '*** Test updating columns that MATERIALIZED columns depend on ***'"
${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS test.materialized_key"
${CLICKHOUSE_CLIENT} --query="CREATE TABLE test.materialized_key \
(key UInt32 MATERIALIZED value + 1, value UInt32) \
ENGINE MergeTree ORDER BY key"
${CLICKHOUSE_CLIENT} --query="ALTER TABLE test.materialized_key UPDATE value = 1 WHERE 1" 2>/dev/null || echo "Updating column that MATERIALIZED key column depends on should fail"
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.materialized_key"
${CLICKHOUSE_CLIENT} --query="INSERT INTO test.alter_update VALUES \
('2000-01-01', 123, 'abc', 10), \
('2000-01-01', 234, 'cde', 20), \
('2000-01-01', 345, 'fgh', 30), \
('2000-01-01', 456, 'ijk', 40)"
${CLICKHOUSE_CLIENT} --query="ALTER TABLE test.alter_update \
UPDATE value2 = value2 + 7 WHERE value2 <= 20"
wait_for_mutation "alter_update" "mutation_12.txt"
${CLICKHOUSE_CLIENT} --query="SELECT value2, materialized_value FROM test.alter_update ORDER BY key"
${CLICKHOUSE_CLIENT} --query="ALTER TABLE test.alter_update DROP PARTITION 200001"
${CLICKHOUSE_CLIENT} --query="DROP TABLE test.alter_update"