Implement force_data_skipping_indices setting

2024-11-21 15:12:02 +00:00 · 2020-10-05 23:50:03 +03:00 · 2020-10-05 23:50:03 +03:00 · ef6d12967f
commit ef6d12967f
parent df7c788346
6 changed files with 63 additions and 0 deletions
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -70,6 +70,12 @@ Works with tables in the MergeTree family.

 If `force_primary_key=1`, ClickHouse checks to see if the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For more information about data ranges in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md).

+## force\_data\_skipping\_indices {#settings-force_data_skipping_indices}
+
+Disables query execution if any of the data skipping indices passed in this setting (a comma-separated list of index names) was not used.
+
+Works with tables in the MergeTree family.
+
 ## format\_schema {#format-schema}

 This parameter is useful when you are using formats that require a schema definition, such as [Cap’n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format.
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -135,6 +135,7 @@ class IColumn;
    \
    M(Bool, force_index_by_date, 0, "Throw an exception if there is a partition key in a table, and it is not used.", 0) \
    M(Bool, force_primary_key, 0, "Throw an exception if there is primary key in a table, and it is not used.", 0) \
+    M(String, force_data_skipping_indices, "", "Comma separated list of data skipping indices that should be used, otherwise an exception will be thrown.", 0) \
    \
    M(Float, max_streams_to_max_threads_ratio, 1, "Allows you to use more sources than the number of threads - to more evenly distribute work across threads. It is assumed that this is a temporary solution, since it will be possible in the future to make the number of sources equal to the number of threads, but for each source to dynamically select available work for itself.", 0) \
    M(Float, max_streams_multiplier_for_merge_tables, 5, "Ask more streams when reading from Merge table. Streams will be spread across tables that Merge table will use. This allows more even distribution of work across threads and especially helpful when merged tables differ in size.", 0) \
--- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@ -1,6 +1,8 @@
 #include <boost/rational.hpp>   /// For calculations related to sampling coefficients.
+#include <boost/algorithm/string/split.hpp>
 #include <ext/scope_guard.h>
 #include <optional>
+#include <unordered_set>

 #include <Poco/File.h>

@ -552,6 +554,31 @@ Pipe MergeTreeDataSelectExecutor::readFromParts(
            useful_indices.emplace_back(index_helper, condition);
    }

+    if (settings.force_data_skipping_indices.changed)
+    {
+        std::unordered_set<std::string> useful_indices_names;
+        for (const auto & useful_index : useful_indices)
+            useful_indices_names.insert(useful_index.first->index.name);
+
+        std::vector<std::string> forced_indices;
+        boost::split(forced_indices,
+            settings.force_data_skipping_indices.toString(),
+            [](char c){ return c == ','; });
+
+        for (const auto & index_name : forced_indices)
+        {
+            if (index_name.empty())
+                continue;
+
+            if (!useful_indices_names.count(index_name))
+            {
+                throw Exception(ErrorCodes::INDEX_NOT_USED,
+                    "Index {} is not used and setting 'force_data_skipping_indices' contains it",
+                    backQuote(index_name));
+            }
+        }
+    }
+
    RangesInDataParts parts_with_ranges(parts.size());
    size_t sum_marks = 0;
    std::atomic<size_t> sum_marks_pk = 0;
--- a/tests/queries/0_stateless/01515_force_data_skipping_indices.reference
+++ b/tests/queries/0_stateless/01515_force_data_skipping_indices.reference
--- a/tests/queries/0_stateless/01515_force_data_skipping_indices.sql
+++ b/tests/queries/0_stateless/01515_force_data_skipping_indices.sql
@ -0,0 +1,28 @@
+DROP TABLE IF EXISTS data_01515;
+CREATE TABLE data_01515
+(
+    key Int,
+    d1 Int,
+    d1_null Nullable(Int),
+    INDEX d1_idx d1 TYPE minmax GRANULARITY 1,
+    INDEX d1_null_idx assumeNotNull(d1_null) TYPE minmax GRANULARITY 1
+)
+Engine=MergeTree()
+ORDER BY key;
+
+SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='';
+SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_idx'; -- { serverError 277 }
+SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_null_idx'; -- { serverError 277 }
+
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx';
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',d1_idx,';
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',,d1_idx,,';
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx,d1_null_idx'; -- { serverError 277 }
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_null_idx,d1_idx'; -- { serverError 277 }
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_null_idx,d1_idx,,'; -- { serverError 277 }
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',,d1_null_idx,d1_idx'; -- { serverError 277 }
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_null_idx'; -- { serverError 277 }
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',,d1_null_idx,,'; -- { serverError 277 }
+
+SELECT * FROM data_01515 WHERE d1_null = 0 SETTINGS force_data_skipping_indices='d1_null_idx'; -- { serverError 277 }
+SELECT * FROM data_01515 WHERE assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='d1_null_idx';
--- a/tests/queries/0_stateless/arcadia_skip_list.txt
+++ b/tests/queries/0_stateless/arcadia_skip_list.txt
@ -149,3 +149,4 @@
 00609_mv_index_in_in
 00510_materizlized_view_and_deduplication_zookeeper
 00738_lock_for_inner_table
+01515_force_data_skipping_indices