Checking that columns from PARTITION BY are present in DEDUPLICATE BY

This commit is contained in:
Vasily Nemkov 2020-12-15 13:41:00 +03:00
parent a2f85a03f3
commit bf8c7cd685
3 changed files with 47 additions and 39 deletions

View File

@ -46,19 +46,25 @@ BlockIO InterpreterOptimizeQuery::execute()
}
metadata_snapshot->check(column_names, NamesAndTypesList{}, table_id);
const auto & sorting_keys = metadata_snapshot->getColumnsRequiredForSortingKey();
for (const auto & sorting_key : sorting_keys)
Names required_columns;
{
required_columns = metadata_snapshot->getColumnsRequiredForSortingKey();
const auto partitioning_cols = metadata_snapshot->getColumnsRequiredForPartitionKey();
required_columns.reserve(required_columns.size() + partitioning_cols.size());
required_columns.insert(required_columns.end(), partitioning_cols.begin(), partitioning_cols.end());
}
for (const auto & required_col : required_columns)
{
// Deduplication is performed only for adjacent rows in a block,
// and all rows in block are in the sorting key order,
// hence deduplication always implicitly takes sorting keys in account.
// and all rows in block are in the sorting key order within a single partition,
// hence deduplication always implicitly takes sorting keys and parition keys in account.
// So we just explicitly state that limitation in order to avoid confusion.
if (std::find(column_names.begin(), column_names.end(), sorting_key) == column_names.end())
if (std::find(column_names.begin(), column_names.end(), required_col) == column_names.end())
throw Exception(ErrorCodes::THERE_IS_NO_COLUMN,
"DEDUPLICATE BY expression must include all columns used in table's ORDER BY or PRIMARY KEY,"
" but '{}' is missing."
" Expanded deduplicate columns expression: ['{}']",
sorting_key, fmt::join(column_names, "', '"));
"DEDUPLICATE BY expression must include all columns used in table's"
" ORDER BY, PRIMARY KEY, or PARTITION BY but '{}' is missing."
" Expanded DEDUPLICATE BY columns expression: ['{}']",
required_col, fmt::join(column_names, "', '"));
}
}

View File

@ -1,20 +1,20 @@
TOTAL rows 4
OLD DEDUPLICATE
0 0 0
1 1 2
1 1 3
0 0 0 1
1 1 2 1
1 1 3 1
DEDUPLICATE BY *
0 0 0
1 1 2
1 1 3
0 0 0 1
1 1 2 1
1 1 3 1
DEDUPLICATE BY * EXCEPT mat
0 0 0
1 1 2
1 1 3
DEDUPLICATE BY pk,sk,val,mat
0 0 0
1 1 2
1 1 3
0 0 0 1
1 1 2 1
1 1 3 1
DEDUPLICATE BY pk,sk,val,mat,partition_key
0 0 0 1
1 1 2 1
1 1 3 1
Can not remove full duplicates
OLD DEDUPLICATE
4
@ -24,16 +24,16 @@ Remove partial duplicates
DEDUPLICATE BY *
3
DEDUPLICATE BY * EXCEPT mat
0 0 0
1 1 2
1 1 3
0 0 0 1
1 1 2 1
1 1 3 1
DEDUPLICATE BY COLUMNS("*") EXCEPT mat
0 0 0
1 1 2
1 1 3
0 0 0 1
1 1 2 1
1 1 3 1
DEDUPLICATE BY pk,sk
0 0 0
1 1 2
0 0 0 1
1 1 2 1
DEDUPLICATE BY COLUMNS(".*k")
0 0 0
1 1 2
0 0 0 1
1 1 2 1

View File

@ -6,29 +6,31 @@
DROP TABLE IF EXISTS source_data;
CREATE TABLE source_data (
pk Int32, sk Int32, val UInt32,
pk Int32, sk Int32, val UInt32, partition_key UInt32 DEFAULT 1,
PRIMARY KEY (pk)
) ENGINE=MergeTree
ORDER BY (pk, sk);
INSERT INTO source_data VALUES (0, 0, 0), (0, 0, 0), (1, 1, 2), (1, 1, 3);
INSERT INTO source_data (pk, sk, val) VALUES (0, 0, 0), (0, 0, 0), (1, 1, 2), (1, 1, 3);
SELECT 'TOTAL rows', count() FROM source_data;
DROP TABLE IF EXISTS full_duplicates;
-- table with duplicates on MATERIALIZED columns
CREATE TABLE full_duplicates (
pk Int32, sk Int32, val UInt32, mat UInt32 MATERIALIZED 12345, alias UInt32 ALIAS 2,
pk Int32, sk Int32, val UInt32, partition_key UInt32, mat UInt32 MATERIALIZED 12345, alias UInt32 ALIAS 2,
PRIMARY KEY (pk)
) ENGINE=MergeTree
PARTITION BY (partition_key + 1) -- ensure that column in expression is properly handled when deduplicating. See [1] below.
ORDER BY (pk, toString(sk * 10)); -- silly order key to ensure that key column is checked even when it is a part of expression. See [1] below.
-- ERROR cases
OPTIMIZE TABLE full_duplicates DEDUPLICATE BY pk, sk, val, mat, alias; -- { serverError 16 } -- alias column is present
OPTIMIZE TABLE full_duplicates DEDUPLICATE BY sk, val; -- { serverError 8 } -- primary key column is missing
OPTIMIZE TABLE full_duplicates DEDUPLICATE BY * EXCEPT(pk, sk, val, mat, alias); -- { serverError 51 } -- list is empty
OPTIMIZE TABLE full_duplicates DEDUPLICATE BY * EXCEPT(pk, sk, val, mat, alias, partition_key); -- { serverError 51 } -- list is empty
OPTIMIZE TABLE full_duplicates DEDUPLICATE BY * EXCEPT(pk); -- { serverError 8 } -- primary key column is missing [1]
OPTIMIZE TABLE full_duplicates DEDUPLICATE BY * EXCEPT(sk); -- { serverError 8 } -- sorting key column is missing [1]
OPTIMIZE TABLE full_duplicates DEDUPLICATE BY * EXCEPT(partition_key); -- { serverError 8 } -- partitioning column is missing [1]
OPTIMIZE TABLE full_duplicates DEDUPLICATE BY; -- { clientError 62 } -- empty list is a syntax error
OPTIMIZE TABLE partial_duplicates DEDUPLICATE BY pk,sk,val,mat EXCEPT mat; -- { clientError 62 } -- invalid syntax
@ -56,9 +58,9 @@ OPTIMIZE TABLE full_duplicates FINAL DEDUPLICATE BY * EXCEPT mat;
SELECT * FROM full_duplicates;
TRUNCATE full_duplicates;
SELECT 'DEDUPLICATE BY pk,sk,val,mat';
SELECT 'DEDUPLICATE BY pk,sk,val,mat,partition_key';
INSERT INTO full_duplicates SELECT * FROM source_data;
OPTIMIZE TABLE full_duplicates FINAL DEDUPLICATE BY pk,sk,val,mat;
OPTIMIZE TABLE full_duplicates FINAL DEDUPLICATE BY pk,sk,val,mat,partition_key;
SELECT * FROM full_duplicates;
TRUNCATE full_duplicates;
@ -67,7 +69,7 @@ TRUNCATE full_duplicates;
-- Now to the partial duplicates when MATERIALIZED column alway has unique value.
DROP TABLE IF EXISTS partial_duplicates;
CREATE TABLE partial_duplicates (
pk Int32, sk Int32, val UInt32, mat UInt32 MATERIALIZED rand(), alias UInt32 ALIAS 2,
pk Int32, sk Int32, val UInt32, partition_key UInt32 DEFAULT 1, mat UInt32 MATERIALIZED rand(), alias UInt32 ALIAS 2,
PRIMARY KEY (pk)
) ENGINE=MergeTree
ORDER BY (pk, sk);