From fd60954081e548daf157152021f2c3710744f3e3 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 27 Jan 2023 13:01:36 +0800 Subject: [PATCH 1/6] Fix key description on duplicate primary keys This can happen in projections. See https://github.com/ClickHouse/ClickHouse/issues/45590 for details. --- src/Storages/MergeTree/KeyCondition.cpp | 13 +-- .../02540_duplicate_primary_key.reference | 100 +++++++++++++++++ .../02540_duplicate_primary_key.sql | 106 ++++++++++++++++++ 3 files changed, 210 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/02540_duplicate_primary_key.reference create mode 100644 tests/queries/0_stateless/02540_duplicate_primary_key.sql diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 1fcf564693f..72deae9d7e6 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1848,23 +1848,19 @@ KeyCondition::Description KeyCondition::getDescription() const if (rpn_stack.size() != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::checkInRange"); - std::vector key_names(key_columns.size()); - std::vector is_key_used(key_columns.size(), false); - + std::map key_names; for (const auto & key : key_columns) key_names[key.second] = key.first; WriteBufferFromOwnString buf; std::function describe; - describe = [&describe, &key_names, &is_key_used, &buf](const Node * node) + describe = [&describe, &key_names, &buf](const Node * node) { switch (node->type) { case Node::Type::Leaf: { - is_key_used[node->element->key_column] = true; - /// Note: for condition with double negation, like `not(x not in set)`, /// we can replace it to `x in set` here. /// But I won't do it, because `cloneASTWithInversionPushDown` already push down `not`. @@ -1902,9 +1898,8 @@ KeyCondition::Description KeyCondition::getDescription() const describe(rpn_stack.front().can_be_true.get()); description.condition = std::move(buf.str()); - for (size_t i = 0; i < key_names.size(); ++i) - if (is_key_used[i]) - description.used_keys.emplace_back(key_names[i]); + for (auto && [_, name] : key_names) + description.used_keys.emplace_back(name); return description; } diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key.reference b/tests/queries/0_stateless/02540_duplicate_primary_key.reference new file mode 100644 index 00000000000..00ebc45a83b --- /dev/null +++ b/tests/queries/0_stateless/02540_duplicate_primary_key.reference @@ -0,0 +1,100 @@ +2023-01-26 0 +2023-01-27 0 +2023-01-28 0 +2023-01-29 0 +2023-01-30 0 +2023-01-31 0 +2023-02-01 0 +2023-02-02 0 +2023-02-03 0 +2023-02-04 0 +2023-02-05 0 +2023-02-06 0 +2023-02-07 0 +2023-02-08 0 +2023-02-09 0 +2023-02-10 0 +2023-02-11 0 +2023-02-12 0 +2023-02-13 0 +2023-02-14 0 +2023-02-15 0 +2023-02-16 0 +2023-02-17 0 +2023-02-18 0 +2023-02-19 0 +2023-02-20 0 +2023-02-21 0 +2023-02-22 0 +2023-02-23 0 +2023-02-24 0 +2023-02-25 0 +2023-02-26 0 +2023-02-27 0 +2023-02-28 0 +2023-03-01 0 +2023-03-02 0 +2023-03-03 0 +2023-03-04 0 +2023-03-05 0 +2023-03-06 0 +2023-03-07 0 +2023-03-08 0 +2023-03-09 0 +2023-03-10 0 +2023-03-11 0 +2023-03-12 0 +2023-03-13 0 +2023-03-14 0 +2023-03-15 0 +2023-03-16 0 +2023-03-17 0 +2023-03-18 0 +2023-03-19 0 +2023-03-20 0 +2023-03-21 0 +2023-03-22 0 +2023-03-23 0 +2023-03-24 0 +2023-03-25 0 +2023-03-26 0 +2023-03-27 0 +2023-03-28 0 +2023-03-29 0 +2023-03-30 0 +2023-03-31 0 +2023-04-01 0 +2023-04-02 0 +2023-04-03 0 +2023-04-04 0 +2023-04-05 0 +2023-04-06 0 +2023-04-07 0 +2023-04-08 0 +2023-04-09 0 +2023-04-10 0 +2023-04-11 0 +2023-04-12 0 +2023-04-13 0 +2023-04-14 0 +2023-04-15 0 +2023-04-16 0 +2023-04-17 0 +2023-04-18 0 +2023-04-19 0 +2023-04-20 0 +2023-04-21 0 +2023-04-22 0 +2023-04-23 0 +2023-04-24 0 +2023-04-25 0 +2023-04-26 0 +2023-04-27 0 +2023-04-28 0 +2023-04-29 0 +2023-04-30 0 +2023-05-01 0 +2023-05-02 0 +2023-05-03 0 +2023-05-04 0 +2023-05-05 0 diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key.sql b/tests/queries/0_stateless/02540_duplicate_primary_key.sql new file mode 100644 index 00000000000..0a356ad2e6f --- /dev/null +++ b/tests/queries/0_stateless/02540_duplicate_primary_key.sql @@ -0,0 +1,106 @@ +drop table if exists test; + +set allow_suspicious_low_cardinality_types = 1; + +CREATE TABLE test +( + `timestamp` DateTime, + `latitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)), + `longitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)), + `m_registered` LowCardinality(UInt8), + `m_mcc` LowCardinality(Nullable(Int16)), + `m_mnc` LowCardinality(Nullable(Int16)), + `m_ci` Nullable(Int32), + `m_tac` LowCardinality(Nullable(Int32)), + `enb_id` Nullable(Int32), + `ci` Nullable(Int32), + `m_earfcn` LowCardinality(Int32), + `rsrp` LowCardinality(Nullable(Int16)), + `rsrq` LowCardinality(Nullable(Int16)), + `cqi` LowCardinality(Nullable(Int16)), + `source` LowCardinality(String), + `gps_accuracy` Nullable(Float32), + `operator_name` LowCardinality(String), + `band` LowCardinality(Nullable(String)), + `NAME_2` LowCardinality(String), + `NAME_1` LowCardinality(String), + `quadkey_19_key` FixedString(19), + `quadkey_17_key` FixedString(17), + `manipulation` LowCardinality(UInt8), + `ss_rsrp` LowCardinality(Nullable(Int16)), + `ss_rsrq` LowCardinality(Nullable(Int16)), + `ss_sinr` LowCardinality(Nullable(Int16)), + `csi_rsrp` LowCardinality(Nullable(Int16)), + `csi_rsrq` LowCardinality(Nullable(Int16)), + `csi_sinr` LowCardinality(Nullable(Int16)), + `altitude` Nullable(Float32), + `access_technology` LowCardinality(Nullable(String)), + `buildingtype` LowCardinality(String), + `LocationType` LowCardinality(String), + `carrier_name` LowCardinality(Nullable(String)), + `CustomPolygonName` LowCardinality(String), + `h3_10_pixel` UInt64, + `stc_cluster` LowCardinality(Nullable(String)), + PROJECTION cumsum_projection_simple + ( + SELECT + m_registered, + toStartOfInterval(timestamp, toIntervalMonth(1)), + toStartOfWeek(timestamp, 8), + toStartOfInterval(timestamp, toIntervalDay(1)), + NAME_1, + NAME_2, + operator_name, + rsrp, + rsrq, + ss_rsrp, + ss_rsrq, + cqi, + sum(multiIf(ss_rsrp IS NULL, 0, 1)), + sum(multiIf(ss_rsrq IS NULL, 0, 1)), + sum(multiIf(ss_sinr IS NULL, 0, 1)), + max(toStartOfInterval(timestamp, toIntervalDay(1))), + max(CAST(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'), 'Nullable(TIMESTAMP)')), + min(toStartOfInterval(timestamp, toIntervalDay(1))), + min(CAST(CAST(toStartOfInterval(timestamp, toIntervalDay(1)), 'Nullable(DATE)'), 'Nullable(TIMESTAMP)')), + count(), + sum(1) + GROUP BY + m_registered, + toStartOfInterval(timestamp, toIntervalMonth(1)), + toStartOfWeek(timestamp, 8), + toStartOfInterval(timestamp, toIntervalDay(1)), + m_registered, + toStartOfInterval(timestamp, toIntervalMonth(1)), + toStartOfWeek(timestamp, 8), + toStartOfInterval(timestamp, toIntervalDay(1)), + NAME_1, + NAME_2, + operator_name, + rsrp, + rsrq, + ss_rsrp, + ss_rsrq, + cqi + ) +) +ENGINE = MergeTree +PARTITION BY toYYYYMM(timestamp) +ORDER BY (timestamp, operator_name, NAME_1, NAME_2) +SETTINGS index_granularity = 8192; + +insert into test select * from generateRandom() limit 10; + +with tt as ( + select cast(toStartOfInterval(timestamp, INTERVAL 1 day) as Date) as dd, count() as samples + from test + group by dd having dd >= toDate(now())-100 + ), +tt2 as ( + select dd, samples from tt + union distinct + select toDate(now())-1, ifnull((select samples from tt where dd = toDate(now())-1),0) as samples +) +select dd, samples from tt2 order by dd with fill step 1 limit 100; + +drop table test; From 716516f84ad9a79f31a36ff71770e34e0fcd0409 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 27 Jan 2023 16:58:41 +0800 Subject: [PATCH 2/6] Fix test --- src/Storages/MergeTree/KeyCondition.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 72deae9d7e6..d191bde15ac 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1848,9 +1848,10 @@ KeyCondition::Description KeyCondition::getDescription() const if (rpn_stack.size() != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::checkInRange"); - std::map key_names; + /// column_index -> {column_name, is_used_flag} + std::map> key_names; for (const auto & key : key_columns) - key_names[key.second] = key.first; + key_names[key.second] = {key.first, false}; WriteBufferFromOwnString buf; @@ -1861,13 +1862,15 @@ KeyCondition::Description KeyCondition::getDescription() const { case Node::Type::Leaf: { + key_names[node->element->key_column].second = true; // key is used. + /// Note: for condition with double negation, like `not(x not in set)`, /// we can replace it to `x in set` here. /// But I won't do it, because `cloneASTWithInversionPushDown` already push down `not`. /// So, this seem to be impossible for `can_be_true` tree. if (node->negate) buf << "not("; - buf << node->element->toString(key_names[node->element->key_column], true); + buf << node->element->toString(key_names[node->element->key_column].first, true); if (node->negate) buf << ")"; break; @@ -1898,8 +1901,11 @@ KeyCondition::Description KeyCondition::getDescription() const describe(rpn_stack.front().can_be_true.get()); description.condition = std::move(buf.str()); - for (auto && [_, name] : key_names) - description.used_keys.emplace_back(name); + for (auto && [_, v] : key_names) + { + if (v.second) + description.used_keys.emplace_back(v.first); + } return description; } From deacfadc894528709d7730c9438da368e71127fb Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 27 Jan 2023 18:00:52 +0800 Subject: [PATCH 3/6] Fix another test --- .../02540_duplicate_primary_key.reference | 100 ------------------ .../02540_duplicate_primary_key.sql | 2 +- 2 files changed, 1 insertion(+), 101 deletions(-) diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key.reference b/tests/queries/0_stateless/02540_duplicate_primary_key.reference index 00ebc45a83b..e69de29bb2d 100644 --- a/tests/queries/0_stateless/02540_duplicate_primary_key.reference +++ b/tests/queries/0_stateless/02540_duplicate_primary_key.reference @@ -1,100 +0,0 @@ -2023-01-26 0 -2023-01-27 0 -2023-01-28 0 -2023-01-29 0 -2023-01-30 0 -2023-01-31 0 -2023-02-01 0 -2023-02-02 0 -2023-02-03 0 -2023-02-04 0 -2023-02-05 0 -2023-02-06 0 -2023-02-07 0 -2023-02-08 0 -2023-02-09 0 -2023-02-10 0 -2023-02-11 0 -2023-02-12 0 -2023-02-13 0 -2023-02-14 0 -2023-02-15 0 -2023-02-16 0 -2023-02-17 0 -2023-02-18 0 -2023-02-19 0 -2023-02-20 0 -2023-02-21 0 -2023-02-22 0 -2023-02-23 0 -2023-02-24 0 -2023-02-25 0 -2023-02-26 0 -2023-02-27 0 -2023-02-28 0 -2023-03-01 0 -2023-03-02 0 -2023-03-03 0 -2023-03-04 0 -2023-03-05 0 -2023-03-06 0 -2023-03-07 0 -2023-03-08 0 -2023-03-09 0 -2023-03-10 0 -2023-03-11 0 -2023-03-12 0 -2023-03-13 0 -2023-03-14 0 -2023-03-15 0 -2023-03-16 0 -2023-03-17 0 -2023-03-18 0 -2023-03-19 0 -2023-03-20 0 -2023-03-21 0 -2023-03-22 0 -2023-03-23 0 -2023-03-24 0 -2023-03-25 0 -2023-03-26 0 -2023-03-27 0 -2023-03-28 0 -2023-03-29 0 -2023-03-30 0 -2023-03-31 0 -2023-04-01 0 -2023-04-02 0 -2023-04-03 0 -2023-04-04 0 -2023-04-05 0 -2023-04-06 0 -2023-04-07 0 -2023-04-08 0 -2023-04-09 0 -2023-04-10 0 -2023-04-11 0 -2023-04-12 0 -2023-04-13 0 -2023-04-14 0 -2023-04-15 0 -2023-04-16 0 -2023-04-17 0 -2023-04-18 0 -2023-04-19 0 -2023-04-20 0 -2023-04-21 0 -2023-04-22 0 -2023-04-23 0 -2023-04-24 0 -2023-04-25 0 -2023-04-26 0 -2023-04-27 0 -2023-04-28 0 -2023-04-29 0 -2023-04-30 0 -2023-05-01 0 -2023-05-02 0 -2023-05-03 0 -2023-05-04 0 -2023-05-05 0 diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key.sql b/tests/queries/0_stateless/02540_duplicate_primary_key.sql index 0a356ad2e6f..c69a5408b15 100644 --- a/tests/queries/0_stateless/02540_duplicate_primary_key.sql +++ b/tests/queries/0_stateless/02540_duplicate_primary_key.sql @@ -101,6 +101,6 @@ tt2 as ( union distinct select toDate(now())-1, ifnull((select samples from tt where dd = toDate(now())-1),0) as samples ) -select dd, samples from tt2 order by dd with fill step 1 limit 100; +select dd, samples from tt2 order by dd with fill step 1 limit 100 format Null; drop table test; From b05838cc321f134958a522dd77ae7f4e4cd08377 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 27 Jan 2023 12:18:00 +0100 Subject: [PATCH 4/6] Update 02540_duplicate_primary_key.sql Remove LowCardinality --- .../02540_duplicate_primary_key.sql | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/queries/0_stateless/02540_duplicate_primary_key.sql b/tests/queries/0_stateless/02540_duplicate_primary_key.sql index c69a5408b15..5934f597334 100644 --- a/tests/queries/0_stateless/02540_duplicate_primary_key.sql +++ b/tests/queries/0_stateless/02540_duplicate_primary_key.sql @@ -7,40 +7,40 @@ CREATE TABLE test `timestamp` DateTime, `latitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)), `longitude` Nullable(Float32) CODEC(Gorilla, ZSTD(1)), - `m_registered` LowCardinality(UInt8), - `m_mcc` LowCardinality(Nullable(Int16)), - `m_mnc` LowCardinality(Nullable(Int16)), + `m_registered` UInt8, + `m_mcc` Nullable(Int16), + `m_mnc` Nullable(Int16), `m_ci` Nullable(Int32), - `m_tac` LowCardinality(Nullable(Int32)), + `m_tac` Nullable(Int32), `enb_id` Nullable(Int32), `ci` Nullable(Int32), - `m_earfcn` LowCardinality(Int32), - `rsrp` LowCardinality(Nullable(Int16)), - `rsrq` LowCardinality(Nullable(Int16)), - `cqi` LowCardinality(Nullable(Int16)), - `source` LowCardinality(String), + `m_earfcn` Int32, + `rsrp` Nullable(Int16), + `rsrq` Nullable(Int16), + `cqi` Nullable(Int16), + `source` String, `gps_accuracy` Nullable(Float32), - `operator_name` LowCardinality(String), - `band` LowCardinality(Nullable(String)), - `NAME_2` LowCardinality(String), - `NAME_1` LowCardinality(String), + `operator_name` String, + `band` Nullable(String), + `NAME_2` String, + `NAME_1` String, `quadkey_19_key` FixedString(19), `quadkey_17_key` FixedString(17), - `manipulation` LowCardinality(UInt8), - `ss_rsrp` LowCardinality(Nullable(Int16)), - `ss_rsrq` LowCardinality(Nullable(Int16)), - `ss_sinr` LowCardinality(Nullable(Int16)), - `csi_rsrp` LowCardinality(Nullable(Int16)), - `csi_rsrq` LowCardinality(Nullable(Int16)), - `csi_sinr` LowCardinality(Nullable(Int16)), + `manipulation` UInt8, + `ss_rsrp` Nullable(Int16), + `ss_rsrq` Nullable(Int16), + `ss_sinr` Nullable(Int16), + `csi_rsrp` Nullable(Int16), + `csi_rsrq` Nullable(Int16), + `csi_sinr` Nullable(Int16), `altitude` Nullable(Float32), - `access_technology` LowCardinality(Nullable(String)), - `buildingtype` LowCardinality(String), - `LocationType` LowCardinality(String), - `carrier_name` LowCardinality(Nullable(String)), - `CustomPolygonName` LowCardinality(String), + `access_technology` Nullable(String), + `buildingtype` String, + `LocationType` String, + `carrier_name` Nullable(String), + `CustomPolygonName` String, `h3_10_pixel` UInt64, - `stc_cluster` LowCardinality(Nullable(String)), + `stc_cluster` Nullable(String), PROJECTION cumsum_projection_simple ( SELECT From b716e2d754927f24d88b7a735392392043e9d5dd Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 27 Jan 2023 12:37:23 +0000 Subject: [PATCH 5/6] A bit cleaner fix. --- src/Storages/MergeTree/KeyCondition.cpp | 30 ++++++++----------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index d191bde15ac..9c7cfc9a13f 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -739,12 +739,9 @@ KeyCondition::KeyCondition( , single_point(single_point_) , strict(strict_) { - for (size_t i = 0, size = key_column_names.size(); i < size; ++i) - { - const auto & name = key_column_names[i]; + for (const auto & name : key_column_names) if (!key_columns.contains(name)) - key_columns[name] = i; - } + key_columns[name] = key_columns.size(); auto filter_node = buildFilterNode(query, additional_filter_asts); @@ -807,12 +804,9 @@ KeyCondition::KeyCondition( , single_point(single_point_) , strict(strict_) { - for (size_t i = 0, size = key_column_names.size(); i < size; ++i) - { - const auto & name = key_column_names[i]; + for (const auto & name : key_column_names) if (!key_columns.contains(name)) - key_columns[name] = i; - } + key_columns[name] = key_columns.size(); if (!filter_dag) { @@ -1848,10 +1842,9 @@ KeyCondition::Description KeyCondition::getDescription() const if (rpn_stack.size() != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::checkInRange"); - /// column_index -> {column_name, is_used_flag} - std::map> key_names; + std::map key_names; for (const auto & key : key_columns) - key_names[key.second] = {key.first, false}; + key_names[key.second] = key.first; WriteBufferFromOwnString buf; @@ -1862,15 +1855,13 @@ KeyCondition::Description KeyCondition::getDescription() const { case Node::Type::Leaf: { - key_names[node->element->key_column].second = true; // key is used. - /// Note: for condition with double negation, like `not(x not in set)`, /// we can replace it to `x in set` here. /// But I won't do it, because `cloneASTWithInversionPushDown` already push down `not`. /// So, this seem to be impossible for `can_be_true` tree. if (node->negate) buf << "not("; - buf << node->element->toString(key_names[node->element->key_column].first, true); + buf << node->element->toString(key_names[node->element->key_column], true); if (node->negate) buf << ")"; break; @@ -1901,11 +1892,8 @@ KeyCondition::Description KeyCondition::getDescription() const describe(rpn_stack.front().can_be_true.get()); description.condition = std::move(buf.str()); - for (auto && [_, v] : key_names) - { - if (v.second) - description.used_keys.emplace_back(v.first); - } + for (auto && [_, name] : key_names) + description.used_keys.emplace_back(name); return description; } From f6ad6296ba56036c89074b7c97753fbeb125c9bb Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 27 Jan 2023 12:45:41 +0000 Subject: [PATCH 6/6] Restore. --- src/Storages/MergeTree/KeyCondition.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 9c7cfc9a13f..fda1daec3a3 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1842,19 +1842,23 @@ KeyCondition::Description KeyCondition::getDescription() const if (rpn_stack.size() != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::checkInRange"); - std::map key_names; + std::vector key_names(key_columns.size()); + std::vector is_key_used(key_columns.size(), false); + for (const auto & key : key_columns) key_names[key.second] = key.first; WriteBufferFromOwnString buf; std::function describe; - describe = [&describe, &key_names, &buf](const Node * node) + describe = [&describe, &key_names, &is_key_used, &buf](const Node * node) { switch (node->type) { case Node::Type::Leaf: { + is_key_used[node->element->key_column] = true; + /// Note: for condition with double negation, like `not(x not in set)`, /// we can replace it to `x in set` here. /// But I won't do it, because `cloneASTWithInversionPushDown` already push down `not`. @@ -1892,8 +1896,9 @@ KeyCondition::Description KeyCondition::getDescription() const describe(rpn_stack.front().can_be_true.get()); description.condition = std::move(buf.str()); - for (auto && [_, name] : key_names) - description.used_keys.emplace_back(name); + for (size_t i = 0; i < key_names.size(); ++i) + if (is_key_used[i]) + description.used_keys.emplace_back(key_names[i]); return description; }