Fix bad size of marks

alesapin 2019-08-30 17:29:08 +03:00
parent 5851316742
commit f6120558df
6 changed files with 47 additions and 9 deletions

View File

@@ -333,7 +333,7 @@ void IMergedBlockOutputStream::calculateAndSerializeSkipIndices(
{
/// Creating block for update
Block indices_update_block(skip_indexes_columns);
- size_t skip_index_current_mark = 0;
+ size_t skip_index_current_data_mark = 0;
/// Filling and writing skip indices like in IMergedBlockOutputStream::writeColumn
for (size_t i = 0; i < skip_indices.size(); ++i)
@@ -341,7 +341,7 @@ void IMergedBlockOutputStream::calculateAndSerializeSkipIndices(
const auto index = skip_indices[i];
auto & stream = *skip_indices_streams[i];
size_t prev_pos = 0;
- skip_index_current_mark = skip_index_mark;
+ skip_index_current_data_mark = skip_index_data_mark;
while (prev_pos < rows)
{
UInt64 limit = 0;
@@ -351,7 +351,7 @@ void IMergedBlockOutputStream::calculateAndSerializeSkipIndices(
}
else
{
- limit = index_granularity.getMarkRows(skip_index_current_mark);
+ limit = index_granularity.getMarkRows(skip_index_current_data_mark);
if (skip_indices_aggregators[i]->empty())
{
skip_indices_aggregators[i] = index->createIndexAggregator();
@@ -366,9 +366,9 @@ void IMergedBlockOutputStream::calculateAndSerializeSkipIndices(
/// to be compatible with normal .mrk2 file format
if (can_use_adaptive_granularity)
writeIntBinary(1UL, stream.marks);
- ++skip_index_current_mark;
}
+ /// this mark is aggregated, go to the next one
+ skip_index_current_data_mark++;
}
size_t pos = prev_pos;
@@ -388,7 +388,7 @@ void IMergedBlockOutputStream::calculateAndSerializeSkipIndices(
prev_pos = pos;
}
}
- skip_index_mark = skip_index_current_mark;
+ skip_index_data_mark = skip_index_current_data_mark;
}
void IMergedBlockOutputStream::finishSkipIndicesSerialization(
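Why the rename matters: the aggregation loop above consumes the block granule by granule, and the row limit for each step comes from index_granularity.getMarkRows(), which is keyed by the *data* mark number. A skip index declared with GRANULARITY > 1 emits one index mark per several data marks, so reusing the skip-index mark counter as the lookup key returned row counts for the wrong granules. Below is a minimal self-contained sketch of the corrected loop shape (illustrative names, not the ClickHouse sources):

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    /// Adaptive granularity: each data mark may hold a different number of rows.
    std::vector<size_t> rows_per_data_mark = {7, 7, 7, 3, 3, 3};
    const size_t rows = 30;   /// rows in the block being serialized
    size_t data_mark = 0;     /// plays the role of skip_index_current_data_mark
    size_t pos = 0;

    while (pos < rows)
    {
        /// The limit for this step is the size of the current *data* granule,
        /// so the counter that indexes getMarkRows() must track data marks.
        const size_t limit = rows_per_data_mark[data_mark];
        std::cout << "aggregate rows [" << pos << ", " << pos + limit << ")\n";
        pos += limit;
        ++data_mark; /// this granule is aggregated, go to the next one
    }
    return 0;
}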

View File

@@ -141,7 +141,10 @@ protected:
size_t aio_threshold;
size_t current_mark = 0;
size_t skip_index_mark = 0;
+ /// Number of the data mark from which skip indices have to start
+ /// aggregation, i.e. it's a data mark number, not a skip index mark.
+ size_t skip_index_data_mark = 0;
const bool can_use_adaptive_granularity;
const std::string marks_file_extension;
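The new comment draws the distinction this commit hinges on: with an index declared as GRANULARITY g, one skip-index mark aggregates g data granules, so the two counters advance at different rates and cannot substitute for each other. A tiny sketch of the relation (hypothetical helper name, for illustration only):

#include <cstddef>

/// Hypothetical helper: maps a data mark to the skip-index mark covering it,
/// assuming the index was declared with GRANULARITY g.
size_t indexMarkFor(size_t data_mark, size_t g)
{
    return data_mark / g; /// e.g. g = 4: data marks 0..3 all belong to index mark 0
}

int main()
{
    return indexMarkFor(5, 4) == 1 ? 0 : 1; /// data mark 5 falls into index mark 1
}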

View File

@@ -332,7 +332,7 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm
else if (skip_indexes_column_name_to_position.end() != skip_index_column_it)
{
const auto & index_column = *skip_indexes_columns[skip_index_column_it->second].column;
- writeColumn(column.name, *column.type, index_column, offset_columns, false, serialization_states[i], current_mark);
+ std::tie(std::ignore, new_index_offset) = writeColumn(column.name, *column.type, index_column, offset_columns, false, serialization_states[i], current_mark);
}
else
{
@@ -349,6 +349,8 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm
rows_count += rows;
+ /// Should be written before the index offset update, because we calculate
+ /// indices of currently written granules
calculateAndSerializeSkipIndices(skip_indexes_columns, rows);
{
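The one-line change in the first hunk is the substance of the fix in this file: previously the pair returned by writeColumn() was discarded for skip-index columns, leaving new_index_offset stale when such a column happened to be written last. A simplified sketch of the capture pattern (stub signature assumed for illustration):

#include <cstddef>
#include <tuple>
#include <utility>

/// Stub with the same result shape as writeColumn:
/// {new_current_mark, new_index_offset}.
std::pair<size_t, size_t> writeColumnStub()
{
    return {5, 42};
}

int main()
{
    size_t new_index_offset = 0;
    /// Keep the offset, ignore the mark number - exactly what the fixed line does.
    std::tie(std::ignore, new_index_offset) = writeColumnStub();
    return new_index_offset == 42 ? 0 : 1;
}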

View File

@@ -68,7 +68,6 @@ void MergedColumnOnlyOutputStream::write(const Block & block)
if (!rows)
return;
- calculateAndSerializeSkipIndices(skip_indexes_columns, rows);
size_t new_index_offset = 0;
size_t new_current_mark = 0;
@@ -79,6 +78,10 @@ void MergedColumnOnlyOutputStream::write(const Block & block)
std::tie(new_current_mark, new_index_offset) = writeColumn(column.name, *column.type, *column.column, offset_columns, skip_offsets, serialization_states[i], current_mark);
}
+ /// Should be written before the index offset update, because we calculate
+ /// indices of currently written granules
+ calculateAndSerializeSkipIndices(skip_indexes_columns, rows);
index_offset = new_index_offset;
current_mark = new_current_mark;
}
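Here the call is moved rather than added: skip indices used to be serialized before the columns were written; now the call sits after the writeColumn() loop but before index_offset and current_mark are updated, so it still observes the mark state describing the granules this write() produced. An ordering sketch (stand-in type, not the real classes):

#include <cstddef>
#include <iostream>

struct OutputStreamSketch
{
    size_t current_mark = 0;
    size_t index_offset = 0;

    void write(size_t new_current_mark, size_t new_index_offset)
    {
        /// 1. Write the column data (elided).
        /// 2. Serialize skip indices while current_mark / index_offset
        ///    still describe the granules just written.
        std::cout << "skip indices serialized at mark " << current_mark << "\n";
        /// 3. Only now advance the bookkeeping.
        current_mark = new_current_mark;
        index_offset = new_index_offset;
    }
};

int main()
{
    OutputStreamSketch s;
    s.write(2, 10);
    return 0;
}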

View File

@@ -0,0 +1,28 @@
+ SET allow_experimental_data_skipping_indices=1;
+ DROP TABLE IF EXISTS bad_skip_idx;
+ CREATE TABLE bad_skip_idx
+ (
+     id UInt64,
+     value String
+ ) ENGINE MergeTree()
+ ORDER BY id SETTINGS index_granularity_bytes = 64, vertical_merge_algorithm_min_rows_to_activate = 0, vertical_merge_algorithm_min_columns_to_activate = 0; -- actually vertical merge is not a required condition for this bug, but it makes the bug easier to reproduce (because we don't recalculate granularities)
+ -- 7 rows per granule
+ INSERT INTO bad_skip_idx SELECT number, concat('x', toString(number)) FROM numbers(1000);
+ -- 3 rows per granule
+ INSERT INTO bad_skip_idx SELECT number, concat('xxxxxxxxxx', toString(number)) FROM numbers(1000,1000);
+ SELECT COUNT(*) FROM bad_skip_idx WHERE value = 'xxxxxxxxxx1015'; -- check no exception
+ INSERT INTO bad_skip_idx SELECT number, concat('x', toString(number)) FROM numbers(1000);
+ ALTER TABLE bad_skip_idx ADD INDEX idx value TYPE bloom_filter(0.01) GRANULARITY 4;
+ OPTIMIZE TABLE bad_skip_idx FINAL;
+ SELECT COUNT(*) FROM bad_skip_idx WHERE value = 'xxxxxxxxxx1015'; -- check no exception
+ DROP TABLE IF EXISTS bad_skip_idx;
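The regression test builds one table from parts with different adaptive granularities: under index_granularity_bytes = 64, the short 'x<N>' rows pack about seven to a granule while the longer 'xxxxxxxxxx<N>' rows pack about three. OPTIMIZE ... FINAL then merges these parts (vertically, thanks to the settings) while materializing the bloom_filter index added by ALTER, and before the fix the skip-index marks file came out with the wrong size. The SELECTs only assert that reading no longer throws. A conceptual sketch of the kind of consistency check that failed (assumed form and message, not the exact ClickHouse code):

#include <cstddef>
#include <stdexcept>
#include <string>

/// Hypothetical check: reject a part whose skip-index marks file size
/// disagrees with the number of marks implied by the data.
void checkMarksSize(size_t marks_file_bytes, size_t expected_marks, size_t bytes_per_mark)
{
    if (marks_file_bytes != expected_marks * bytes_per_mark)
        throw std::runtime_error(
            "Bad size of marks file: " + std::to_string(marks_file_bytes)
            + " bytes, expected " + std::to_string(expected_marks * bytes_per_mark));
}

int main()
{
    checkMarksSize(24, 3, 8); /// consistent sizes: no throw
    return 0;
}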