Enforcing all sorting keys to be present in DEDUPLICATE BY columns

Updated test and minor cleanup
2024-11-24 08:32:02 +00:00 · 2020-12-09 18:08:37 +03:00 · 2020-12-09 18:08:37 +03:00 · a2f85a03f3
commit a2f85a03f3
parent 59fc301344
2 changed files with 24 additions and 14 deletions
--- a/src/Interpreters/InterpreterOptimizeQuery.cpp
+++ b/src/Interpreters/InterpreterOptimizeQuery.cpp
@ -39,17 +39,26 @@ BlockIO InterpreterOptimizeQuery::execute()
    {
        // User requested custom set of columns for deduplication, possibly with Column Transformer expression.
        {
+            // Expand asterisk, column transformers, etc into list of column names.
            const auto cols = processColumnTransformers(context.getCurrentDatabase(), table, metadata_snapshot, ast.deduplicate_by_columns);
            for (const auto & col : cols->children)
                column_names.emplace_back(col->getColumnName());
        }

        metadata_snapshot->check(column_names, NamesAndTypesList{}, table_id);
-        for (const auto & primary_key : metadata_snapshot->getPrimaryKeyColumns())
+        const auto & sorting_keys = metadata_snapshot->getColumnsRequiredForSortingKey();
+        for (const auto & sorting_key : sorting_keys)
        {
-            if (std::find(column_names.begin(), column_names.end(), primary_key) == column_names.end())
-                throw Exception("Deduplicate expression doesn't contain primary key column: " + primary_key,
-                        ErrorCodes::THERE_IS_NO_COLUMN);
+            // Deduplication is performed only for adjacent rows in a block,
+            // and all rows in block are in the sorting key order,
+            // hence deduplication always implicitly takes sorting keys in account.
+            // So we just explicitly state that limitation in order to avoid confusion.
+            if (std::find(column_names.begin(), column_names.end(), sorting_key) == column_names.end())
+                throw Exception(ErrorCodes::THERE_IS_NO_COLUMN,
+                        "DEDUPLICATE BY expression must include all columns used in table's ORDER BY or PRIMARY KEY,"
+                        " but '{}' is missing."
+                        " Expanded deduplicate columns expression: ['{}']",
+                        sorting_key, fmt::join(column_names, "', '"));
        }
    }

--- a/tests/queries/0_stateless/01581_deduplicate_by_columns_local.sql
+++ b/tests/queries/0_stateless/01581_deduplicate_by_columns_local.sql
@ -6,8 +6,8 @@
 DROP TABLE IF EXISTS source_data;

 CREATE TABLE source_data (
-	pk Int32, sk Int32, val UInt32,
-	PRIMARY KEY (pk)
+    pk Int32, sk Int32, val UInt32,
+    PRIMARY KEY (pk)
 ) ENGINE=MergeTree
 ORDER BY (pk, sk);

@ -18,21 +18,22 @@ SELECT 'TOTAL rows', count() FROM source_data;
 DROP TABLE IF EXISTS full_duplicates;
 -- table with duplicates on MATERIALIZED columns
 CREATE TABLE full_duplicates  (
-	pk Int32, sk Int32, val UInt32, mat UInt32 MATERIALIZED 12345, alias UInt32 ALIAS 2,
-	PRIMARY KEY (pk)
+    pk Int32, sk Int32, val UInt32, mat UInt32 MATERIALIZED 12345, alias UInt32 ALIAS 2,
+    PRIMARY KEY (pk)
 ) ENGINE=MergeTree
-ORDER BY (pk, sk);
+ORDER BY (pk, toString(sk * 10)); -- silly order key to ensure that key column is checked even when it is a part of expression. See [1] below.

 -- ERROR cases
 OPTIMIZE TABLE full_duplicates DEDUPLICATE BY pk, sk, val, mat, alias; -- { serverError 16 } -- alias column is present
 OPTIMIZE TABLE full_duplicates DEDUPLICATE BY sk, val; -- { serverError 8 } -- primary key column is missing
 OPTIMIZE TABLE full_duplicates DEDUPLICATE BY * EXCEPT(pk, sk, val, mat, alias); -- { serverError 51 } -- list is empty
-OPTIMIZE TABLE full_duplicates DEDUPLICATE BY * EXCEPT(pk); -- { serverError 8 } -- primary key column is missing
+OPTIMIZE TABLE full_duplicates DEDUPLICATE BY * EXCEPT(pk); -- { serverError 8 } -- primary key column is missing [1]
+OPTIMIZE TABLE full_duplicates DEDUPLICATE BY * EXCEPT(sk); -- { serverError 8 } -- sorting key column is missing [1]

 OPTIMIZE TABLE full_duplicates DEDUPLICATE BY; -- { clientError 62 } -- empty list is a syntax error
 OPTIMIZE TABLE partial_duplicates DEDUPLICATE BY pk,sk,val,mat EXCEPT mat; -- { clientError 62 } -- invalid syntax
-OPTIMIZE TABLE partial_duplicates DEDUPLICATE BY pk APPLY(pk+1); -- { clientError 62 } -- APPLY column transformer is not supported
-OPTIMIZE TABLE partial_duplicates DEDUPLICATE BY pk REPLACE(pk+1); -- { clientError 62 } -- REPLACE column transformer is not supported
+OPTIMIZE TABLE partial_duplicates DEDUPLICATE BY pk APPLY(pk + 1); -- { clientError 62 } -- APPLY column transformer is not supported
+OPTIMIZE TABLE partial_duplicates DEDUPLICATE BY pk REPLACE(pk + 1); -- { clientError 62 } -- REPLACE column transformer is not supported

 -- Valid cases
 -- NOTE: here and below we need FINAL to force deduplication in such a small set of data in only 1 part.
@ -66,8 +67,8 @@ TRUNCATE full_duplicates;
 -- Now to the partial duplicates when MATERIALIZED column alway has unique value.
 DROP TABLE IF EXISTS partial_duplicates;
 CREATE TABLE partial_duplicates  (
-	pk Int32, sk Int32, val UInt32, mat UInt32 MATERIALIZED rand(), alias UInt32 ALIAS 2,
-	PRIMARY KEY (pk)
+    pk Int32, sk Int32, val UInt32, mat UInt32 MATERIALIZED rand(), alias UInt32 ALIAS 2,
+    PRIMARY KEY (pk)
 ) ENGINE=MergeTree
 ORDER BY (pk, sk);