Merge pull request #62268 from nickitat/test_for_bug

Fix PartsSplitter
This commit is contained in:
Nikita Taranov 2024-04-08 15:11:25 +00:00 committed by GitHub
commit 5b4f67cc39
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 32 additions and 4 deletions

View File

@ -1,4 +1,5 @@
#include <algorithm>
#include <limits>
#include <memory>
#include <numeric>
#include <queue>
@ -125,14 +126,20 @@ int compareValues(const Values & lhs, const Values & rhs)
class IndexAccess
{
public:
explicit IndexAccess(const RangesInDataParts & parts_) : parts(parts_) { }
explicit IndexAccess(const RangesInDataParts & parts_) : parts(parts_)
{
/// Some suffix of index columns might not be loaded (see `primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns`)
/// and we need to use the same set of index columns across all parts.
for (const auto & part : parts)
loaded_columns = std::min(loaded_columns, part.data_part->getIndex().size());
}
Values getValue(size_t part_idx, size_t mark) const
{
const auto & index = parts[part_idx].data_part->getIndex();
size_t size = index.size();
Values values(size);
for (size_t i = 0; i < size; ++i)
chassert(index.size() >= loaded_columns);
Values values(loaded_columns);
for (size_t i = 0; i < loaded_columns; ++i)
{
index[i]->get(mark, values[i]);
if (values[i].isNull())
@ -199,6 +206,7 @@ public:
}
private:
const RangesInDataParts & parts;
size_t loaded_columns = std::numeric_limits<size_t>::max();
};
class RangesInDataPartsBuilder

View File

@ -0,0 +1,19 @@
-- Regression test for a PartsSplitter bug: different parts can have a different number of
-- loaded primary-key index columns, and the splitter must compare index values using the
-- same (common-prefix) set of columns across all parts — assumes the
-- `primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns` behavior; see the
-- accompanying PartsSplitter change.
create table t(a UInt32, b UInt32) engine=MergeTree order by (a, b) settings index_granularity=1;
system stop merges t;
-- for this part the first column alone is useless (constant), so both index columns are kept
insert into t select 42, number from numbers_mt(100);
-- for this part the first column is enough
insert into t select number, number from numbers_mt(100);
-- force reloading index
detach table t;
attach table t;
set merge_tree_min_bytes_for_concurrent_read=1, merge_tree_min_rows_for_concurrent_read=1, merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability=1.0, max_threads=4;
-- the bug happened when we used (a, b) index values for one part and only (a) for another in PartsSplitter. even a simple count query is enough,
-- because some granules were assigned to wrong layers and hence not returned from the reading step (because they were filtered out by `FilterSortedStreamByRange`)
select count() from t where not ignore(*);