From b701a94750af09198fd492ac3ef2dc15695dcbd7 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 14 Oct 2024 07:07:11 +0000 Subject: [PATCH 01/11] Query-time ef_search --- .../mergetree-family/annindexes.md | 3 ++ docs/en/operations/settings/settings.md | 8 ++++ src/Core/Settings.cpp | 5 ++- .../MergeTreeIndexVectorSimilarity.cpp | 38 ++++++++++++++++ .../MergeTreeIndexVectorSimilarity.h | 1 + ...ctor_search_query_time_ef_search.reference | 1 + ...354_vector_search_query_time_ef_search.sql | 44 +++++++++++++++++++ 7 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02354_vector_search_query_time_ef_search.reference create mode 100644 tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index f507e2b9f86..a6e41fdc949 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -135,6 +135,9 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Dista ``` ::: +To use a different value for HNSW parameter `ef_search` in SELECT queries than the value specified when the index was created, run the query +with `SETTINGS ef_search = `. + **Restrictions**: Approximate algorithms used to determine the nearest neighbors require a limit, hence queries without `LIMIT` clause cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting `max_limit_for_ann_queries` (default: 1 million rows). 
This is a safeguard to prevent large memory allocations by external libraries for diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 53727bbc9b0..2764813794d 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -5333,6 +5333,14 @@ Default value: 1000000 SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes. +# ef_search {#ef_search} + +Type: UInt64 + +Default value: 0 + +The value of HNSW parameter `ef_search` in vector similarity searches. Overrides the value of `ef_search` specified at index construction time. + ## max_live_view_insert_blocks_before_refresh {#max_live_view_insert_blocks_before_refresh} Type: UInt64 diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 01d08fa4238..04028dd812d 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -5550,7 +5550,10 @@ If it is set to true, allow to specify experimental compression codecs (but we d Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin )", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, R"( -SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes. +SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes. +)", 0) \ + M(UInt64, ef_search, 0, R"( +The value of HNSW parameter 'ef_search' in vector similarity searches. Overrides the value of 'ef_search' specified at index construction time. 
)", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, R"( Throw exception if unsupported query is used inside transaction diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp index 9bea7f650c1..73ac14dbd20 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,11 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +namespace Setting +{ + extern const SettingsUInt64 ef_search; +} + namespace { @@ -395,6 +401,14 @@ MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity : vector_similarity_condition(query, context) , metric_kind(metric_kind_) { + const auto & settings = context->getSettingsRef(); + bool changed = settings.isChanged("ef_search"); + if (changed) + { + non_default_expansion_search = settings[Setting::ef_search]; + if (non_default_expansion_search == 0) + throw Exception(ErrorCodes::INCORRECT_DATA, "Setting 'ef_search' must not be 0"); + } } bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const @@ -431,6 +445,30 @@ std::vector MergeTreeIndexConditionVectorSimilarity::calculateApproximat const std::vector reference_vector = vector_similarity_condition.getReferenceVector(); + struct ExpansionSearchChangeScope + { + explicit ExpansionSearchChangeScope(std::optional expansion_search, USearchIndexWithSerializationPtr index_) + : index(index_) + { + if (expansion_search) + { + old_expansion_search = index_->expansion_search(); + index->change_expansion_search(*expansion_search); + } + } + + ~ExpansionSearchChangeScope() + { + if (old_expansion_search) + index->change_expansion_search(*old_expansion_search); + } + + USearchIndexWithSerializationPtr index; + std::optional old_expansion_search; + }; + + 
ExpansionSearchChangeScope expansion_search_change_scope(non_default_expansion_search, index); + auto search_result = index->search(reference_vector.data(), limit); if (!search_result) throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", String(search_result.error.release())); diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h index b77473e7c2b..9a4db31c990 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -142,6 +142,7 @@ public: private: const VectorSimilarityCondition vector_similarity_condition; const unum::usearch::metric_kind_t metric_kind; + std::optional non_default_expansion_search; }; diff --git a/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.reference b/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.reference new file mode 100644 index 00000000000..0cfbf08886f --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.reference @@ -0,0 +1 @@ +2 diff --git a/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql b/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql new file mode 100644 index 00000000000..c4263f33f15 --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql @@ -0,0 +1,44 @@ +-- Tags: no-fasttest + +-- Tests vector search with setting 'ef_search' + +SET allow_experimental_vector_similarity_index = 1; +SET enable_analyzer = 0; + +DROP TABLE IF EXISTS tab; + +-- Generate some data set that is large enough +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; +INSERT INTO tab SELECT number, [toFloat32(randCanonical(1)), toFloat32(randCanonical(2))] FROM numbers(500000); -- if the test 
fails sporadically, increase the table size + +-- Value = 0 is illegal. +WITH [0.5, 0.5] AS reference_vec +SELECT id, vec, L2Distance(vec, reference_vec) +FROM tab +ORDER BY L2Distance(vec, reference_vec) +LIMIT 3 +SETTINGS ef_search = 0; -- { serverError INCORRECT_DATA } + +DROP TABLE IF EXISTS results; +CREATE TABLE results(id Int32) ENGINE = Memory; + +-- Standard vector search, ef_search is by default 64 +INSERT INTO results + SELECT id + FROM tab + ORDER BY L2Distance(vec, [0.5, 0.5]) + LIMIT 1; + +-- Vector search with custom ef_search +INSERT INTO results + SELECT id + FROM tab + ORDER BY L2Distance(vec, [0.5, 0.5]) + LIMIT 1 + SETTINGS ef_search = 1; + +-- Expect that matches are different +SELECT count(distinct *) from results; + +DROP TABLE results; +DROP TABLE tab; From 317346b6ae91523de69c79014235de1591d78bca Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 14 Oct 2024 09:17:37 +0000 Subject: [PATCH 02/11] Minor fixes --- .../MergeTreeIndexVectorSimilarity.cpp | 19 ++++++++++--------- .../MergeTreeIndexVectorSimilarity.h | 2 +- ...354_vector_search_query_time_ef_search.sql | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp index 73ac14dbd20..6d8a4e437f1 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -405,8 +405,8 @@ MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity bool changed = settings.isChanged("ef_search"); if (changed) { - non_default_expansion_search = settings[Setting::ef_search]; - if (non_default_expansion_search == 0) + query_time_expansion_search = settings[Setting::ef_search]; + if (query_time_expansion_search == 0) throw Exception(ErrorCodes::INCORRECT_DATA, "Setting 'ef_search' must not be 0"); } } @@ -447,27 +447,28 @@ std::vector 
MergeTreeIndexConditionVectorSimilarity::calculateApproximat struct ExpansionSearchChangeScope { + USearchIndexWithSerializationPtr index; + std::optional last_expansion_search; + explicit ExpansionSearchChangeScope(std::optional expansion_search, USearchIndexWithSerializationPtr index_) : index(index_) { if (expansion_search) { - old_expansion_search = index_->expansion_search(); + last_expansion_search = index_->expansion_search(); index->change_expansion_search(*expansion_search); } } ~ExpansionSearchChangeScope() { - if (old_expansion_search) - index->change_expansion_search(*old_expansion_search); + if (last_expansion_search) + index->change_expansion_search(*last_expansion_search); } - - USearchIndexWithSerializationPtr index; - std::optional old_expansion_search; }; - ExpansionSearchChangeScope expansion_search_change_scope(non_default_expansion_search, index); + /// Apply query-time setting `ef_search` which overrides the corresponding HNSW parameter specified at index construction time. 
+ ExpansionSearchChangeScope expansion_search_change_scope(query_time_expansion_search, index); auto search_result = index->search(reference_vector.data(), limit); if (!search_result) diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h index 9a4db31c990..506e1fc8a5c 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -142,7 +142,7 @@ public: private: const VectorSimilarityCondition vector_similarity_condition; const unum::usearch::metric_kind_t metric_kind; - std::optional non_default_expansion_search; + std::optional query_time_expansion_search; }; diff --git a/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql b/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql index c4263f33f15..cc23821cf7a 100644 --- a/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql +++ b/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql @@ -9,7 +9,7 @@ DROP TABLE IF EXISTS tab; -- Generate some data set that is large enough CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; -INSERT INTO tab SELECT number, [toFloat32(randCanonical(1)), toFloat32(randCanonical(2))] FROM numbers(500000); -- if the test fails sporadically, increase the table size +INSERT INTO tab SELECT number, [toFloat32(randCanonical(1)), toFloat32(randCanonical(2))] FROM numbers(500000); -- if the test fails sporadically, increase the table size, HNSW is non-deterministic ... -- Value = 0 is illegal. 
WITH [0.5, 0.5] AS reference_vec From 395ad883afe7ab2f1530e697b5a3b77b8e9d365d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 14 Oct 2024 17:35:42 +0000 Subject: [PATCH 03/11] Better --- .../mergetree-family/annindexes.md | 17 ++-- docs/en/operations/settings/settings.md | 4 +- src/Core/Settings.cpp | 4 +- src/Core/SettingsChangesHistory.cpp | 1 + .../MergeTreeIndexVectorSimilarity.cpp | 80 ++++++------------- .../MergeTreeIndexVectorSimilarity.h | 7 +- .../0_stateless/02354_vector_search_bugs.sql | 2 +- ..._vector_search_expansion_search.reference} | 0 ... 02354_vector_search_expansion_search.sql} | 14 +--- ...r_search_index_creation_negative.reference | 2 +- ..._vector_search_index_creation_negative.sql | 10 +-- .../02354_vector_search_queries.reference | 2 +- .../02354_vector_search_queries.sql | 14 ++-- 13 files changed, 60 insertions(+), 97 deletions(-) rename tests/queries/0_stateless/{02354_vector_search_query_time_ef_search.reference => 02354_vector_search_expansion_search.reference} (100%) rename tests/queries/0_stateless/{02354_vector_search_query_time_ef_search.sql => 02354_vector_search_expansion_search.sql} (74%) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index a6e41fdc949..d81f019dd40 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -43,7 +43,7 @@ CREATE TABLE table ( id Int64, vectors Array(Float32), - INDEX index_name vectors TYPE vector_similarity(method, distance_function[, quantization, connectivity, expansion_add, expansion_search]) [GRANULARITY N] + INDEX index_name vectors TYPE vector_similarity(method, distance_function[, quantization, hnsw_max_connections_per_layer, hnsw_candidate_list_size_for_construction]) [GRANULARITY N] ) ENGINE = MergeTree ORDER BY id; @@ -55,11 +55,13 @@ Parameters: line between two points in Euclidean space), or 
`cosineDistance` (the [cosine distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance)- the angle between two non-zero vectors). - `quantization`: either `f64`, `f32`, `f16`, `bf16`, or `i8` for storing the vector with reduced precision (optional, default: `bf16`) -- `m`: the number of neighbors per graph node (optional, default: 16) -- `ef_construction`: (optional, default: 128) -- `ef_search`: (optional, default: 64) +- `hnsw_max_connections_per_layer`: the number of neighbors per HNSW graph node, also known as `M` in the [HNSW + paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 16) +- `hnsw_candidate_list_size_for_construction`: the size of the dynamic candidate list when constructing the HNSW graph, also known as + `ef_construction` in the original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 128) -Value 0 for parameters `m`, `ef_construction`, and `ef_search` refers to the default value. +Value 0 for parameters `hnsw_max_connections_per_layer` and `hnsw_candidate_list_size_for_construction` means using the default values of +these parameters. Example: @@ -124,6 +126,7 @@ FROM table WHERE ... -- WHERE clause is optional ORDER BY Distance(vectors, reference_vector) LIMIT N +SETTINGS enable_analyzer = 0; -- Temporary limitation, will be lifted ``` :::tip @@ -135,8 +138,8 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Dista ``` ::: -To use a different value for HNSW parameter `ef_search` in SELECT queries than the value specified when the index was created, run the query -with `SETTINGS ef_search = `. +To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 64), also known as `ef_search` in the +original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473), run the query with `SETTINGS hnsw_candidate_list_size_for_search = <value>`.
**Restrictions**: Approximate algorithms used to determine the nearest neighbors require a limit, hence queries without `LIMIT` clause cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 2764813794d..b889ff214b6 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -5333,13 +5333,13 @@ Default value: 1000000 SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes. -# ef_search {#ef_search} +## hnsw_candidate_list_size_for_search {#hnsw_candidate_list_size_for_search} Type: UInt64 Default value: 0 -The value of HNSW parameter `ef_search` in vector similarity searches. Overrides the value of `ef_search` specified at index construction time. +The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. 0 means USearch's default value (64). ## max_live_view_insert_blocks_before_refresh {#max_live_view_insert_blocks_before_refresh} diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 04028dd812d..d1af0bac7b3 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -5552,8 +5552,8 @@ Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin M(UInt64, max_limit_for_ann_queries, 1'000'000, R"( SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes. )", 0) \ - M(UInt64, ef_search, 0, R"( -The value of HNSW parameter 'ef_search' in vector similarity searches. Overrides the value of 'ef_search' specified at index construction time. + M(UInt64, hnsw_candidate_list_size_for_search, 0, R"( +The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'.
0 means USearch's default value (64). )", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, R"( Throw exception if unsupported query is used inside transaction diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 42c92481fce..b20ce9c7b3d 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -99,6 +99,7 @@ static std::initializer_listgetSettingsRef()[Setting::hnsw_candidate_list_size_for_search]) { - const auto & settings = context->getSettingsRef(); - bool changed = settings.isChanged("ef_search"); - if (changed) - { - query_time_expansion_search = settings[Setting::ef_search]; - if (query_time_expansion_search == 0) - throw Exception(ErrorCodes::INCORRECT_DATA, "Setting 'ef_search' must not be 0"); - } } bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const @@ -439,38 +432,17 @@ std::vector MergeTreeIndexConditionVectorSimilarity::calculateApproximat const USearchIndexWithSerializationPtr index = granule->index; if (vector_similarity_condition.getDimensions() != index->dimensions()) - throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) " - "does not match the dimension in the index ({})", + throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) does not match the dimension in the index ({})", vector_similarity_condition.getDimensions(), index->dimensions()); const std::vector reference_vector = vector_similarity_condition.getReferenceVector(); - struct ExpansionSearchChangeScope - { - USearchIndexWithSerializationPtr index; - std::optional last_expansion_search; + /// We want to run the search with the user-provided value for setting hnsw_candidate_list_size_for_search (aka. expansion_search). + /// The way to do this in USearch is to call index_dense_gt::change_expansion_search. 
Unfortunately, this introduces a need to + /// synchronize index access, see https://github.com/unum-cloud/usearch/issues/500. As a workaround, we extended USearch's search method + /// to accept a custom expansion_search setting. The config value is only used on the fly, i.e. not persisted in the index. - explicit ExpansionSearchChangeScope(std::optional expansion_search, USearchIndexWithSerializationPtr index_) - : index(index_) - { - if (expansion_search) - { - last_expansion_search = index_->expansion_search(); - index->change_expansion_search(*expansion_search); - } - } - - ~ExpansionSearchChangeScope() - { - if (last_expansion_search) - index->change_expansion_search(*last_expansion_search); - } - }; - - /// Apply query-time setting `ef_search` which overrides the corresponding HNSW parameter specified at index construction time. - ExpansionSearchChangeScope expansion_search_change_scope(query_time_expansion_search, index); - - auto search_result = index->search(reference_vector.data(), limit); + auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, (expansion_search == 0) ? unum::usearch::default_expansion_search() : expansion_search); if (!search_result) throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index.
Error: {}", String(search_result.error.release())); @@ -535,13 +507,12 @@ MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index) UsearchHnswParams usearch_hnsw_params; /// Optional parameters: - const bool has_six_args = (index.arguments.size() == 6); - if (has_six_args) + const bool has_five_args = (index.arguments.size() == 5); + if (has_five_args) { scalar_kind = quantizationToScalarKind.at(index.arguments[2].safeGet()); - usearch_hnsw_params = {.m = index.arguments[3].safeGet(), - .ef_construction = index.arguments[4].safeGet(), - .ef_search = index.arguments[5].safeGet()}; + usearch_hnsw_params = {.connectivity = index.arguments[3].safeGet(), + .expansion_add = index.arguments[4].safeGet()}; } return std::make_shared(index, metric_kind, scalar_kind, usearch_hnsw_params); @@ -550,25 +521,23 @@ MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index) void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* attach */) { const bool has_two_args = (index.arguments.size() == 2); - const bool has_six_args = (index.arguments.size() == 6); + const bool has_five_args = (index.arguments.size() == 5); /// Check number and type of arguments - if (!has_two_args && !has_six_args) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index must have two or six arguments"); + if (!has_two_args && !has_five_args) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index must have two or five arguments"); if (index.arguments[0].getType() != Field::Types::String) throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of vector similarity index (method) must be of type String"); if (index.arguments[1].getType() != Field::Types::String) throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of vector similarity index (metric) must be of type String"); - if (has_six_args) + if (has_five_args) { if (index.arguments[2].getType() != Field::Types::String) throw 
Exception(ErrorCodes::INCORRECT_QUERY, "Third argument of vector similarity index (quantization) must be of type String"); if (index.arguments[3].getType() != Field::Types::UInt64) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Fourth argument of vector similarity index (M) must be of type UInt64"); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Fourth argument of vector similarity index (hnsw_max_connections_per_layer) must be of type UInt64"); if (index.arguments[4].getType() != Field::Types::UInt64) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Fifth argument of vector similarity index (ef_construction) must be of type UInt64"); - if (index.arguments[5].getType() != Field::Types::UInt64) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Sixth argument of vector similarity index (ef_search) must be of type UInt64"); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Fifth argument of vector similarity index (hnsw_candidate_list_size_for_construction) must be of type UInt64"); } /// Check that passed arguments are supported @@ -576,18 +545,17 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta throw Exception(ErrorCodes::INCORRECT_DATA, "First argument (method) of vector similarity index is not supported. Supported methods are: {}", joinByComma(methods)); if (!distanceFunctionToMetricKind.contains(index.arguments[1].safeGet())) throw Exception(ErrorCodes::INCORRECT_DATA, "Second argument (distance function) of vector similarity index is not supported. Supported distance function are: {}", joinByComma(distanceFunctionToMetricKind)); - if (has_six_args) + if (has_five_args) { if (!quantizationToScalarKind.contains(index.arguments[2].safeGet())) throw Exception(ErrorCodes::INCORRECT_DATA, "Third argument (quantization) of vector similarity index is not supported. 
Supported quantizations are: {}", joinByComma(quantizationToScalarKind)); /// Call Usearch's own parameter validation method for HNSW-specific parameters - UInt64 m = index.arguments[3].safeGet(); - UInt64 ef_construction = index.arguments[4].safeGet(); - UInt64 ef_search = index.arguments[5].safeGet(); - - unum::usearch::index_dense_config_t config(m, ef_construction, ef_search); + UInt64 connectivity = index.arguments[3].safeGet(); + UInt64 expansion_add = index.arguments[4].safeGet(); + UInt64 expansion_search = unum::usearch::default_expansion_search(); + unum::usearch::index_dense_config_t config(connectivity, expansion_add, expansion_search); if (auto error = config.validate(); error) throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid parameters passed to vector similarity index. Error: {}", String(error.release())); } diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h index 506e1fc8a5c..fd1f7dc7669 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -13,9 +13,8 @@ namespace DB struct UsearchHnswParams { - size_t m = unum::usearch::default_connectivity(); - size_t ef_construction = unum::usearch::default_expansion_add(); - size_t ef_search = unum::usearch::default_expansion_search(); + size_t connectivity = unum::usearch::default_connectivity(); + size_t expansion_add = unum::usearch::default_expansion_add(); }; using USearchIndex = unum::usearch::index_dense_t; @@ -142,7 +141,7 @@ public: private: const VectorSimilarityCondition vector_similarity_condition; const unum::usearch::metric_kind_t metric_kind; - std::optional query_time_expansion_search; + size_t expansion_search; }; diff --git a/tests/queries/0_stateless/02354_vector_search_bugs.sql b/tests/queries/0_stateless/02354_vector_search_bugs.sql index e0015d04b7e..45487541d6e 100644 --- a/tests/queries/0_stateless/02354_vector_search_bugs.sql +++ 
b/tests/queries/0_stateless/02354_vector_search_bugs.sql @@ -56,7 +56,7 @@ DROP TABLE tab; SELECT 'Issue #69085: Reference vector computed by a subquery'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f16', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -- works diff --git a/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.reference b/tests/queries/0_stateless/02354_vector_search_expansion_search.reference similarity index 100% rename from tests/queries/0_stateless/02354_vector_search_query_time_ef_search.reference rename to tests/queries/0_stateless/02354_vector_search_expansion_search.reference diff --git a/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql similarity index 74% rename from tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql rename to tests/queries/0_stateless/02354_vector_search_expansion_search.sql index cc23821cf7a..4f8cff827d0 100644 --- a/tests/queries/0_stateless/02354_vector_search_query_time_ef_search.sql +++ b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql @@ -11,31 +11,23 @@ DROP TABLE IF EXISTS tab; CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; INSERT INTO tab SELECT number, [toFloat32(randCanonical(1)), toFloat32(randCanonical(2))] FROM 
numbers(500000); -- if the test fails sporadically, increase the table size, HNSW is non-deterministic ... --- Value = 0 is illegal. -WITH [0.5, 0.5] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab -ORDER BY L2Distance(vec, reference_vec) -LIMIT 3 -SETTINGS ef_search = 0; -- { serverError INCORRECT_DATA } - DROP TABLE IF EXISTS results; CREATE TABLE results(id Int32) ENGINE = Memory; --- Standard vector search, ef_search is by default 64 +-- Standard vector search with default hnsw_candidate_list_size_for_search = 64 INSERT INTO results SELECT id FROM tab ORDER BY L2Distance(vec, [0.5, 0.5]) LIMIT 1; --- Vector search with custom ef_search +-- Vector search with custom hnsw_candidate_list_size_for_search INSERT INTO results SELECT id FROM tab ORDER BY L2Distance(vec, [0.5, 0.5]) LIMIT 1 - SETTINGS ef_search = 1; + SETTINGS hnsw_candidate_list_size_for_search = 1; -- Expect that matches are different SELECT count(distinct *) from results; diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference index 5963f4b5834..9c6487f0669 100644 --- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference +++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference @@ -1,4 +1,4 @@ -Two or six index arguments +Two or five index arguments 1st argument (method) must be String and hnsw 2nd argument (distance function) must be String and L2Distance or cosineDistance 3nd argument (quantization), if given, must be String and f32, f16, ... 
diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql index e8e6aaee1b2..07d78bceeb0 100644 --- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql +++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql @@ -6,12 +6,12 @@ SET allow_experimental_vector_similarity_index = 1; DROP TABLE IF EXISTS tab; -SELECT 'Two or six index arguments'; +SELECT 'Two or five index arguments'; CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant_have_one_arg')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'three_args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'more', 'than', 'six', 'args', '!')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'more', 'than', 'five', 'args', '!')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } SELECT '1st argument (method) must be String and hnsw'; CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity(3, 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } @@ -22,11 +22,11 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE 
vector_similarity('hnsw', 'invalid_distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } SELECT '3nd argument (quantization), if given, must be String and f32, f16, ...'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 1, 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'invalid', 2, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'invalid', 2, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } SELECT '4nd argument (M), if given, must be UInt64 and > 1'; CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 'invalid', 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } SELECT 'Must be created on single column'; CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference index 34dcccc84c5..223a18b57bf 
100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.reference +++ b/tests/queries/0_stateless/02354_vector_search_queries.reference @@ -37,7 +37,7 @@ Expression (Projection) Parts: 1/1 Granules: 2/4 Special cases --- Non-default metric, M, ef_construction, ef_search +-- Non-default metric, hnsw_max_connections_per_layer, hnsw_candidate_list_size_for_construction 6 [1,9.3] 0.005731362878640178 4 [2.4,5.2] 0.09204062768384846 1 [2,3.2] 0.15200169244542905 diff --git a/tests/queries/0_stateless/02354_vector_search_queries.sql b/tests/queries/0_stateless/02354_vector_search_queries.sql index 8769e5c56bb..71b8a1e520a 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.sql +++ b/tests/queries/0_stateless/02354_vector_search_queries.sql @@ -53,8 +53,8 @@ DROP TABLE tab; SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen. -SELECT '-- Non-default metric, M, ef_construction, ef_search'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99, 66) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +SELECT '-- Non-default metric, hnsw_max_connections_per_layer, hnsw_candidate_list_size_for_construction'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); WITH [0.0, 2.0] AS reference_vec @@ -82,11 +82,11 @@ SETTINGS max_limit_for_ann_queries = 2; -- LIMIT 3 > 2 --> don't use the ann ind DROP TABLE tab; SELECT '-- Non-default quantization'; -CREATE TABLE tab_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 
'L2Distance', 'f64', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f64', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; INSERT INTO tab_f64 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), 
(2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); INSERT INTO tab_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); INSERT INTO tab_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); From d8ea2198b7b64b5de220986a28026c96c0834d02 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 14 Oct 2024 17:38:11 +0000 Subject: [PATCH 04/11] Better --- contrib/usearch | 2 +- docs/en/engines/table-engines/mergetree-family/annindexes.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/contrib/usearch b/contrib/usearch index d1d33eac94a..1706420acaf 160000 --- a/contrib/usearch +++ b/contrib/usearch @@ -1 +1 @@ -Subproject commit d1d33eac94acd3b628e0b446c927ec3295ef63c7 +Subproject commit 1706420acafbd83d852c512dcf343af0a4059e48 diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index d81f019dd40..636b379536e 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -139,7 +139,8 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Dista ::: To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 64), also known as `ef_search` in the -original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473), with `SETTINGS hnsw_candidate_list_size_for_search = `. 
+original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473), run the `SELECT` query with `SETTINGS hnsw_candidate_list_size_for_search += <value>`. **Restrictions**: Approximate algorithms used to determine the nearest neighbors require a limit, hence queries without `LIMIT` clause cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting From c5260bae0819fabd6a385533ed4f3183661a5520 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 14 Oct 2024 18:52:53 +0000 Subject: [PATCH 05/11] Fix spelling --- docs/en/operations/settings/settings.md | 2 +- .../aspell-ignore/en/aspell-dict.txt | 178 +++++++++--------- 2 files changed, 91 insertions(+), 89 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index b889ff214b6..5c7a72d6bcd 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -5339,7 +5339,7 @@ Type: UInt64 Default value: 0 -The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. 0 means USearch's default value (64). +The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. 0 means the default value of USearch (64). 
## max_live_view_insert_blocks_before_refresh {#max_live_view_insert_blocks_before_refresh} diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 616ad4a800c..f027a1138d5 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -31,6 +31,7 @@ AnyEvent AppleClang Approximative ArrayJoin +ArrowCompression ArrowStream AsyncInsertCacheSize AsynchronousHeavyMetricsCalculationTimeSpent @@ -123,6 +124,7 @@ CMPLNT CMake CMakeLists CODECS +CORS COVID CPUFrequencyMHz CPUs @@ -138,11 +140,13 @@ CacheDictionaryThreadsActive CacheDictionaryUpdateQueueBatches CacheDictionaryUpdateQueueKeys CacheFileSegments +CacheWarmer CamelCase Cap'n CapContains CapUnion CapnProto +CapnProtoEnumComparingMode CatBoost CellAreaM CellAreaRads @@ -206,6 +210,7 @@ DDLWORKER DDLWorker DDLWorkerThreads DDLWorkerThreadsActive +DDLs DECRYPT DELETEs DESC @@ -214,6 +219,7 @@ DOGEFI Damerau DataGrip DataLens +DataPacket DataTime DataTypes DatabaseCatalog @@ -225,11 +231,15 @@ DatabaseOnDiskThreadsActive DatabaseOrdinaryThreads DatabaseOrdinaryThreadsActive DateTime +DateTimeInputFormat +DateTimeOutputFormat +DateTimeOverflowBehavior DateTimes DbCL Decrypted Deduplicate Deduplication +DefaultTableEngine DelayedInserts DeliveryTag DeltaLake @@ -245,7 +255,11 @@ DiskSpaceReservedForMerge DiskTotal DiskUnreserved DiskUsed +DistributedCacheLogMode +DistributedCachePoolBehaviourOnLimit +DistributedDDLOutputMode DistributedFilesToInsert +DistributedProductMode DistributedSend DockerHub DoubleDelta @@ -254,6 +268,7 @@ Dresseler Durre ECMA ETag +EachRow Ecto EdgeAngle EdgeLengthKm @@ -266,6 +281,7 @@ Enum Enums Eoan EphemeralNode +EscapingRule Ethereum ExactEdgeLengthKm ExactEdgeLengthM @@ -370,6 +386,7 @@ IMDS INFILE INSERTed INSERTs +INVOKER IOPrefetchThreads IOPrefetchThreadsActive IOThreads @@ -381,7 +398,10 @@ IOWriterThreadsActive IPTrie IProcessor IPv +ITION Identifiant 
+IdentifierQuotingRule +IdentifierQuotingStyle Incrementing IndexesAreNeighbors InfluxDB @@ -400,6 +420,7 @@ IntervalMilliseconds IntervalMinute IntervalMonth IntervalNanosecond +IntervalOutputFormat IntervalQuarter IntervalSecond IntervalWeek @@ -461,6 +482,8 @@ Jepsen JetBrains Jitter Joda +JoinAlgorithm +JoinStrictness JumpConsistentHash Jupyter KDevelop @@ -509,10 +532,16 @@ LinfNorm LinfNormalize LinksDeployment Linq +ListObject +ListObjects LoadAverage +LoadBalancing +LocalFSReadMethod LocalThread LocalThreadActive LogQL +LogQueriesType +LogsLevel Logstash LookML LoongArch @@ -549,6 +578,7 @@ MaxDDLEntryID MaxMind MaxPartCountForPartition MaxPushedDDLEntryID +MaxThreads Mbps McNeal Memcheck @@ -556,6 +586,7 @@ MemoryCode MemoryDataAndStack MemoryResident MemoryResidentMax +MemorySample MemorySanitizer MemoryShared MemoryTracking @@ -591,6 +622,7 @@ Mongo Mongodb Monotonicity MsgPack +MsgPackUUIDRepresentation MultiLineString MultiPolygon Multiline @@ -599,6 +631,7 @@ Multithreading Multiword MurmurHash MySQLConnection +MySQLDataTypesSupport MySQLDump MySQLThreads NATS @@ -634,6 +667,7 @@ NetworkSendPackets Noaa NodeJs NonMonotonic +NonZeroUInt NuRaft NumHexagons NumPy @@ -646,6 +680,7 @@ NumberOfTables OFNS OLAP OLTP +ORCCompression OSContextSwitches OSGuestNiceTime OSGuestNiceTimeCPU @@ -712,6 +747,8 @@ OrDefault OrNull OrZero OvercommitTracker +OverflowMode +OverflowModeGroupBy PAAMAYIM PCRE PRCP @@ -725,8 +762,11 @@ ParallelFormattingOutputFormatThreadsActive ParallelParsingInputFormat ParallelParsingInputFormatThreads ParallelParsingInputFormatThreadsActive +ParallelReplicasMode Parametrized +ParquetCompression ParquetMetadata +ParquetVersion Parsers PartMutation Partitioner @@ -743,6 +783,7 @@ PartsWide PeerDB PendingAsyncInsert Percona +PerfEventInfo PhpStorm PlantUML Poess @@ -794,6 +835,8 @@ QueryCacheBytes QueryCacheEntries QueryCacheHits QueryCacheMisses +QueryCacheNondeterministicFunctionHandling +QueryCacheSystemTableHandling QueryPreempted 
QueryThread QuickAssist @@ -802,6 +845,7 @@ QuoteMeta RBAC RClickHouse RHEL +RIPEMD ROLLUP RWLock RWLockActiveReaders @@ -854,7 +898,7 @@ RestartReplicaThreads RestartReplicaThreadsActive RestoreThreads RestoreThreadsActive -RIPEMD +RetryStrategy RoaringBitmap RocksDB Rollup @@ -878,6 +922,7 @@ SQLAlchemy SQLConsoleDetail SQLInsert SQLSTATE +SQLSecurityType SSDCache SSDComplexKeyCache SSDs @@ -890,6 +935,7 @@ Sankey Scalable Scatterplot Schaefer +SchemaInferenceMode Schemas Schwartzian SeasClick @@ -898,8 +944,12 @@ SelfManaged Sematext SendExternalTables SendScalars +SetOperationMode ShareAlike +ShareSet +SharedJoin SharedMergeTree +ShortCircuitFunctionEvaluation Shortkeys Signup SimHash @@ -950,6 +1000,7 @@ SystemReplicasThreadsActive TABLUM TAVG TCPConnection +TCPHandler TCPThreads TDigest TINYINT @@ -1017,8 +1068,10 @@ TotalPrimaryKeyBytesInMemory TotalPrimaryKeyBytesInMemoryAllocated TotalRowsOfMergeTreeTables TotalTemporaryFiles +TotalsMode Tradeoff Transactional +TransactionsWaitCSNMode Tsai Tukey TwoColumnList @@ -1040,6 +1093,7 @@ URLHash URLHierarchy URLPathHierarchy USearch +USearch UTCTimestamp UUIDNumToString UUIDStringToNum @@ -1143,6 +1197,7 @@ aggregatio aggretate aggthrow aiochclient +alloc allocator alphaTokens amplab @@ -1425,6 +1480,7 @@ config configs conformant congruential +conjuctive connectionId const contrib @@ -1434,9 +1490,11 @@ corrMatrix corrStable corrmatrix corrstable +cors cosineDistance countDigits countEqual +countIf countMatches countMatchesCaseInsensitive countSubstrings @@ -1558,7 +1616,9 @@ denormalizing denormals dequeued dequeues +dereference deserialization +deserialize deserialized deserializing dest @@ -1601,6 +1661,7 @@ domainWithoutWWW domainWithoutWWWRFC dont dotProduct +dotall downsampling dplyr dragonbox @@ -1704,6 +1765,7 @@ formatReadableSize formatReadableTimeDelta formatRow formatRowNoNewline +formatdatetime formatschema formatter formatters @@ -1846,6 +1908,7 @@ heredocs hilbertDecode hilbertEncode hiveHash 
+hnsw holistics homebrew hopEnd @@ -1876,6 +1939,7 @@ ilike incrementing indexHint indexOf +inequal infi inflight infty @@ -1952,6 +2016,7 @@ kRing kafka kafkaMurmurHash kafkacat +keepalive keepermap kerberized kerberos @@ -2144,15 +2209,19 @@ multiSearchFirstPosition multiSearchFirstPositionCaseInsensitive multiSearchFirstPositionCaseInsensitiveUTF multiSearchFirstPositionUTF +multibuffer multibyte multidirectory +multiif multiline multilinestring multiplyDecimal multipolygon +multiread multisearchany multisets multithread +multithreading multiword munmap murmurHash @@ -2204,6 +2273,7 @@ ngrambf ngrams noaa nonNegativeDerivative +nonconst noop normalizeQuery normalizeQueryKeepNames @@ -2225,6 +2295,7 @@ nullIf nullability nullable nullables +nullptr num numerics nypd @@ -2254,6 +2325,7 @@ pageviews parallelization parallelize parallelized +param params parseDateTime parseDateTimeBestEffort @@ -2272,13 +2344,16 @@ parseReadableSizeOrNull parseReadableSizeOrZero parseTimeDelta parseable +parsedatetime parsers partitionID partitionId pathFull pclmulqdq pcre +perf performant +perkey perl persistency phpclickhouse @@ -2313,6 +2388,7 @@ positionUTF positiveModulo postfix postfixes +postgres postgresql pre pread @@ -2322,7 +2398,11 @@ prebuilt preemptable preferServerCiphers prefetch +prefetched +prefetches +prefetching prefetchsize +preimage preloaded prem prepend @@ -2476,6 +2556,7 @@ reinterpretAsInt reinterpretAsString reinterpretAsUInt reinterpretAsUUID +remerge remoteSecure repivot replaceAll @@ -2483,6 +2564,7 @@ replaceOne replaceRegexpAll replaceRegexpOne replacingmergetree +replcase replicatable replicatedmergetree replxx @@ -2490,6 +2572,7 @@ repo representable requestor requireTLSv +rerange resharding reshards resolvers @@ -2521,6 +2604,7 @@ rowbinary rowbinarywithdefaults rowbinarywithnames rowbinarywithnamesandtypes +rowlist rsync rsyslog runnable @@ -2709,6 +2793,7 @@ subtrees subtype sudo sumCount +sumIf sumKahan sumMap sumMapFiltered @@ -2754,6 +2839,7 
@@ theilsu themself threadpool throwIf +throwif timeDiff timeSeriesData timeSeriesMetrics @@ -2919,9 +3005,11 @@ typename ubuntu uint ulid +unacked unary unbin uncomment +undelete undrop unencoded unencrypted @@ -2950,6 +3038,7 @@ uniqthetasketch unix unixODBC unixodbc +unmerged unoptimized unparsed unpooled @@ -3056,90 +3145,3 @@ znode znodes zookeeperSessionUptime zstd -postgres -ArrowCompression -CapnProtoEnumComparingMode -DateTimeInputFormat -DateTimeOutputFormat -DateTimeOverflowBehavior -deserialize -dotall -EachRow -EscapingRule -IdentifierQuotingRule -IdentifierQuotingStyle -IntervalOutputFormat -MsgPackUUIDRepresentation -ORCCompression -ParquetCompression -ParquetVersion -SchemaInferenceMode -alloc -CacheWarmer -conjuctive -cors -CORS -countIf -DefaultTableEngine -dereference -DistributedDDLOutputMode -DistributedProductMode -formatdatetime -inequal -INVOKER -ITION -JoinAlgorithm -JoinStrictness -keepalive -ListObject -ListObjects -LoadBalancing -LocalFSReadMethod -LogQueriesType -LogsLevel -MaxThreads -MemorySample -multibuffer -multiif -multiread -multithreading -MySQLDataTypesSupport -nonconst -NonZeroUInt -nullptr -OverflowMode -OverflowModeGroupBy -ParallelReplicasMode -param -parsedatetime -perf -PerfEventInfo -perkey -prefetched -prefetches -prefetching -preimage -QueryCacheNondeterministicFunctionHandling -QueryCacheSystemTableHandling -remerge -replcase -rerange -RetryStrategy -rowlist -SetOperationMode -ShortCircuitFunctionEvaluation -SQLSecurityType -sumIf -TCPHandler -throwif -TotalsMode -TransactionsWaitCSNMode -undelete -unmerged -DataPacket -DDLs -DistributedCacheLogMode -DistributedCachePoolBehaviourOnLimit -SharedJoin -ShareSet -unacked From f23cf21ac50fd83b409804eaa33c61a5d01696b1 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 15 Oct 2024 07:07:14 +0000 Subject: [PATCH 06/11] Disable test in slow builds --- .../0_stateless/02354_vector_search_expansion_search.sql | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) 
diff --git a/tests/queries/0_stateless/02354_vector_search_expansion_search.sql b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql index 4f8cff827d0..bf6baaf3e84 100644 --- a/tests/queries/0_stateless/02354_vector_search_expansion_search.sql +++ b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql @@ -1,6 +1,8 @@ --- Tags: no-fasttest +-- Tags: no-fasttest, long, no-asan, no-asan, no-ubsan, no-debug +-- ^^ Disable test for slow builds: generating data takes time but a sufficiently large data set +-- is necessary for different hnsw_candidate_list_size_for_search settings to make a difference --- Tests vector search with setting 'ef_search' +-- Tests vector search with setting 'hnsw_candidate_list_size_for_search' SET allow_experimental_vector_similarity_index = 1; SET enable_analyzer = 0; @@ -30,7 +32,7 @@ INSERT INTO results SETTINGS hnsw_candidate_list_size_for_search = 1; -- Expect that matches are different -SELECT count(distinct *) from results; +SELECT count(distinct *) FROM results; DROP TABLE results; DROP TABLE tab; From a197f193964854cb91b8f3fd53a2d9e7464e1fe9 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 15 Oct 2024 16:57:31 +0000 Subject: [PATCH 07/11] Some unrelated minor fixes --- .../functions/random-functions.md | 42 +++++++++---------- .../sql-reference/table-functions/generate.md | 4 +- .../sql-reference/table-functions/generate.md | 4 +- .../sql-reference/table-functions/generate.md | 4 +- .../{canonicalRand.cpp => randCanonical.cpp} | 0 5 files changed, 27 insertions(+), 27 deletions(-) rename src/Functions/{canonicalRand.cpp => randCanonical.cpp} (100%) diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index a9b483aa0e5..6b7b45378a4 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -24,7 +24,7 @@ Returns a random UInt32 number with uniform distribution. 
Uses a linear congruential generator with an initial state obtained from the system, which means that while it appears random, it's not truly random and can be predictable if the initial state is known. For scenarios where true randomness is crucial, consider using alternative methods like system-level calls or integrating with external libraries. -### Syntax +**Syntax** ```sql rand() @@ -32,15 +32,15 @@ rand() Alias: `rand32` -### Arguments +**Arguments** None. -### Returned value +**Returned value** Returns a number of type UInt32. -### Example +**Example** ```sql SELECT rand(); @@ -54,23 +54,23 @@ SELECT rand(); Returns a random UInt64 integer (UInt64) number -### Syntax +**Syntax** ```sql rand64() ``` -### Arguments +**Arguments** None. -### Returned value +**Returned value** Returns a number UInt64 number with uniform distribution. Uses a linear congruential generator with an initial state obtained from the system, which means that while it appears random, it's not truly random and can be predictable if the initial state is known. For scenarios where true randomness is crucial, consider using alternative methods like system-level calls or integrating with external libraries. -### Example +**Example** ```sql SELECT rand64(); @@ -84,21 +84,21 @@ SELECT rand64(); Returns a random Float64 number. -### Syntax +**Syntax** ```sql randCanonical() ``` -### Arguments +**Arguments** None. -### Returned value +**Returned value** Returns a Float64 value between 0 (inclusive) and 1 (exclusive). -### Example +**Example** ```sql SELECT randCanonical(); @@ -112,25 +112,25 @@ SELECT randCanonical(); Generates a single constant column filled with a random value. Unlike `rand`, this function ensures the same random value appears in every row of the generated column, making it useful for scenarios requiring a consistent random seed across rows in a single query. 
-### Syntax +**Syntax** ```sql randConstant([x]); ``` -### Arguments +**Arguments** - **[x] (Optional):** An optional expression that influences the generated random value. Even if provided, the resulting value will still be constant within the same query execution. Different queries using the same expression will likely generate different constant values. -### Returned value +**Returned value** Returns a column of type UInt32 containing the same random value in each row. -### Implementation details +**Implementation details** The actual output will be different for each query execution, even with the same optional expression. The optional parameter may not significantly change the generated value compared to using `randConstant` alone. -### Examples +**Example** ```sql SELECT randConstant() AS random_value; @@ -156,22 +156,22 @@ SELECT randConstant(10) AS random_value; Returns a random Float64 drawn uniformly from interval [`min`, `max`]. -### Syntax +**Syntax** ```sql randUniform(min, max) ``` -### Arguments +**Arguments** - `min` - `Float64` - left boundary of the range, - `max` - `Float64` - right boundary of the range. -### Returned value +**Returned value** A random number of type [Float64](../data-types/float.md). -### Example +**Example** ```sql SELECT randUniform(5.5, 10) FROM numbers(5) diff --git a/docs/en/sql-reference/table-functions/generate.md b/docs/en/sql-reference/table-functions/generate.md index a78015e9830..e15da495991 100644 --- a/docs/en/sql-reference/table-functions/generate.md +++ b/docs/en/sql-reference/table-functions/generate.md @@ -18,9 +18,9 @@ generateRandom(['name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_str - `name` — Name of corresponding column. - `TypeName` — Type of corresponding column. -- `max_array_length` — Maximum elements for all generated arrays or maps. Defaults to `10`. -- `max_string_length` — Maximum string length for all generated strings. Defaults to `10`. 
- `random_seed` — Specify random seed manually to produce stable results. If NULL — seed is randomly generated. +- `max_string_length` — Maximum string length for all generated strings. Defaults to `10`. +- `max_array_length` — Maximum elements for all generated arrays or maps. Defaults to `10`. **Returned Value** diff --git a/docs/ru/sql-reference/table-functions/generate.md b/docs/ru/sql-reference/table-functions/generate.md index f0dd595d436..fdaf2210c3b 100644 --- a/docs/ru/sql-reference/table-functions/generate.md +++ b/docs/ru/sql-reference/table-functions/generate.md @@ -18,9 +18,9 @@ generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_stri - `name` — название соответствующего столбца. - `TypeName` — тип соответствующего столбца. -- `max_array_length` — максимальная длина массива для всех сгенерированных массивов. По умолчанию `10`. -- `max_string_length` — максимальная длина строки для всех генерируемых строк. По умолчанию `10`. - `random_seed` — укажите состояние генератора случайных чисел вручную, чтобы получить стабильные результаты. Если значение равно `NULL` - генератор инициализируется случайным состоянием. +- `max_string_length` — максимальная длина строки для всех генерируемых строк. По умолчанию `10`. +- `max_array_length` — максимальная длина массива для всех сгенерированных массивов. По умолчанию `10`. 
**Возвращаемое значение** diff --git a/docs/zh/sql-reference/table-functions/generate.md b/docs/zh/sql-reference/table-functions/generate.md index 3735b22b439..5b790c4cec9 100644 --- a/docs/zh/sql-reference/table-functions/generate.md +++ b/docs/zh/sql-reference/table-functions/generate.md @@ -18,9 +18,9 @@ generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_stri - `name` — 对应列的名称。 - `TypeName` — 对应列的类型。 -- `max_array_length` — 生成数组的最大长度。 默认为10。 -- `max_string_length` — 生成字符串的最大长度。 默认为10。 - `random_seed` — 手动指定随机种子以产生稳定的结果。 如果为NULL-种子是随机生成的。 +- `max_string_length` — 生成字符串的最大长度。 默认为10。 +- `max_array_length` — 生成数组的最大长度。 默认为10。 **返回值** diff --git a/src/Functions/canonicalRand.cpp b/src/Functions/randCanonical.cpp similarity index 100% rename from src/Functions/canonicalRand.cpp rename to src/Functions/randCanonical.cpp From 63ea58d77486761a4bc63a1bfb9f976b3774eecd Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 15 Oct 2024 17:17:11 +0000 Subject: [PATCH 08/11] Incorporate review feedback --- src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h | 2 +- .../0_stateless/02354_vector_search_expansion_search.sql | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h index fd1f7dc7669..ecaa331466f 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -141,7 +141,7 @@ public: private: const VectorSimilarityCondition vector_similarity_condition; const unum::usearch::metric_kind_t metric_kind; - size_t expansion_search; + const size_t expansion_search; }; diff --git a/tests/queries/0_stateless/02354_vector_search_expansion_search.sql b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql index bf6baaf3e84..8b0d4470e20 100644 --- a/tests/queries/0_stateless/02354_vector_search_expansion_search.sql +++ 
b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql @@ -9,9 +9,12 @@ SET enable_analyzer = 0; DROP TABLE IF EXISTS tab; --- Generate some data set that is large enough CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; -INSERT INTO tab SELECT number, [toFloat32(randCanonical(1)), toFloat32(randCanonical(2))] FROM numbers(500000); -- if the test fails sporadically, increase the table size, HNSW is non-deterministic ... + +-- Generate random values but with a fixed seed (conceptually), so that the data is deterministic. +-- Unfortunately, no random functions in ClickHouse accepts a seed. Instead, abuse the numbers table + hash functions to provide +-- deterministic randomness. +INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); DROP TABLE IF EXISTS results; CREATE TABLE results(id Int32) ENGINE = Memory; From 0caae391420f45b30cacae7d526cb2bd28256f20 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 16 Oct 2024 09:40:41 +0000 Subject: [PATCH 09/11] Disable vector index use with index_granularity_bytes = 0 --- .../settings/merge-tree-settings.md | 2 +- src/Storages/IndicesDescription.cpp | 8 +++++++ src/Storages/IndicesDescription.h | 2 ++ src/Storages/MergeTree/MergeTreeData.cpp | 21 +++++++++++++++++++ .../02354_vector_search_bugs.reference | 1 + .../0_stateless/02354_vector_search_bugs.sql | 19 ++++++++++++++++- .../02354_vector_search_expansion_search.sql | 2 +- 7 files changed, 52 insertions(+), 3 deletions(-) diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index 4863858358d..2fd34c4067c 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -49,7 +49,7 @@ Default value: 8192. 
Maximum size of data granules in bytes. -Default value: 10Mb. +Default value: 10485760 (ca. 10 MiB). To restrict the granule size only by number of rows, set to 0 (not recommended). diff --git a/src/Storages/IndicesDescription.cpp b/src/Storages/IndicesDescription.cpp index 753fbf1d635..cd2f88c6751 100644 --- a/src/Storages/IndicesDescription.cpp +++ b/src/Storages/IndicesDescription.cpp @@ -159,6 +159,14 @@ bool IndicesDescription::has(const String & name) const return false; } +bool IndicesDescription::hasType(const String & type) const +{ + for (const auto & index : *this) + if (index.type == type) + return true; + return false; +} + String IndicesDescription::toString() const { if (empty()) diff --git a/src/Storages/IndicesDescription.h b/src/Storages/IndicesDescription.h index 21ba5fb632e..4981eafa941 100644 --- a/src/Storages/IndicesDescription.h +++ b/src/Storages/IndicesDescription.h @@ -65,6 +65,8 @@ struct IndicesDescription : public std::vector, IHints<> { /// Index with name exists bool has(const String & name) const; + /// Index with type exists + bool hasType(const String & type) const; /// Convert description to string String toString() const; /// Parse description from string diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 9d1fcb91236..a4361e1df9c 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -259,6 +259,7 @@ namespace ErrorCodes extern const int SUPPORT_IS_DISABLED; extern const int TOO_MANY_SIMULTANEOUS_QUERIES; extern const int INCORRECT_QUERY; + extern const int INVALID_SETTING_VALUE; extern const int CANNOT_RESTORE_TABLE; extern const int ZERO_COPY_REPLICATION_ERROR; extern const int NOT_INITIALIZED; @@ -756,6 +757,16 @@ void MergeTreeData::checkProperties( } } + /// If adaptive index granularity is disabled, certain vector search queries with PREWHERE run into LOGICAL_ERRORs. 
+ /// SET allow_experimental_vector_similarity_index = 1; + /// CREATE TABLE tab (`id` Int32, `vec` Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 100000000) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0; + /// INSERT INTO tab SELECT number, [toFloat32(number), 0.] FROM numbers(10000); + /// WITH [1., 0.] AS reference_vec SELECT id, L2Distance(vec, reference_vec) FROM tab PREWHERE toLowCardinality(10) ORDER BY L2Distance(vec, reference_vec) ASC LIMIT 100; + /// As a workaround, require adaptive index granularity to be enabled for now (it is the default anyway). + if (new_metadata.secondary_indices.hasType("vector_similarity") && (*getSettings())[MergeTreeSetting::index_granularity_bytes] == 0) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, + "Experimental vector similarity index can only be used with MergeTree setting 'index_granularity_bytes' != 0"); + if (!new_metadata.projections.empty()) { std::unordered_set<String> projections_names; @@ -3320,6 +3331,16 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Experimental vector similarity index is disabled (turn on setting 'allow_experimental_vector_similarity_index')"); + /// If adaptive index granularity is disabled, certain vector search queries with PREWHERE run into LOGICAL_ERRORs. + /// SET allow_experimental_vector_similarity_index = 1; + /// CREATE TABLE tab (`id` Int32, `vec` Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 100000000) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0; + /// INSERT INTO tab SELECT number, [toFloat32(number), 0.] FROM numbers(10000); + /// WITH [1., 0.]
AS reference_vec SELECT id, L2Distance(vec, reference_vec) FROM tab PREWHERE toLowCardinality(10) ORDER BY L2Distance(vec, reference_vec) ASC LIMIT 100; + /// As a workaround, require adaptive index granularity to be enabled for now (it is the default anyway). + if (AlterCommands::hasVectorSimilarityIndex(new_metadata) && (*getSettings())[MergeTreeSetting::index_granularity_bytes] == 0) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, + "Experimental vector similarity index can only be used with MergeTree setting 'index_granularity_bytes' != 0"); + for (const auto & disk : getDisks()) if (!disk->supportsHardLinks() && !commands.isSettingsAlter() && !commands.isCommentAlter()) throw Exception( diff --git a/tests/queries/0_stateless/02354_vector_search_bugs.reference b/tests/queries/0_stateless/02354_vector_search_bugs.reference index ce006359f5c..9b610cf543a 100644 --- a/tests/queries/0_stateless/02354_vector_search_bugs.reference +++ b/tests/queries/0_stateless/02354_vector_search_bugs.reference @@ -40,3 +40,4 @@ Expression (Projection) Condition: true Parts: 1/1 Granules: 4/4 +index_granularity_bytes = 0 is disallowed diff --git a/tests/queries/0_stateless/02354_vector_search_bugs.sql b/tests/queries/0_stateless/02354_vector_search_bugs.sql index 45487541d6e..d55bdb88a76 100644 --- a/tests/queries/0_stateless/02354_vector_search_bugs.sql +++ b/tests/queries/0_stateless/02354_vector_search_bugs.sql @@ -37,7 +37,7 @@ DROP TABLE tab; SELECT 'Correctness of index with > 1 mark'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity = 8192; -- disable adaptive granularity due to bug +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; INSERT INTO tab SELECT
number, [toFloat32(number), 0.0] from numbers(10000); WITH [1.0, 0.0] AS reference_vec @@ -100,3 +100,20 @@ FROM tab ORDER BY distance LIMIT 1 SETTINGS enable_analyzer = 0; + +DROP TABLE tab; + +SELECT 'index_granularity_bytes = 0 is disallowed'; + +-- If adaptive index granularity is disabled, certain vector search queries with PREWHERE run into LOGICAL_ERRORs. +-- SET allow_experimental_vector_similarity_index = 1; +-- CREATE TABLE tab (`id` Int32, `vec` Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 100000000) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0; +-- INSERT INTO tab SELECT number, [toFloat32(number), 0.] FROM numbers(10000); +-- WITH [1., 0.] AS reference_vec SELECT id, L2Distance(vec, reference_vec) FROM tab PREWHERE toLowCardinality(10) ORDER BY L2Distance(vec, reference_vec) ASC LIMIT 100; +-- As a workaround, require adaptive index granularity to be enabled for now (it is the default anyway). +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0; -- { serverError INVALID_SETTING_VALUE } + +CREATE TABLE tab(id Int32, vec Array(Float32)) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0; +ALTER TABLE tab ADD INDEX vec_idx1(vec) TYPE vector_similarity('hnsw', 'cosineDistance'); -- { serverError INVALID_SETTING_VALUE } + +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_expansion_search.sql b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql index 8b0d4470e20..c4cf12ed995 100644 --- a/tests/queries/0_stateless/02354_vector_search_expansion_search.sql +++ b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql @@ -14,7 +14,7 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar -- Generate random values but with a fixed seed (conceptually), so that the data is deterministic.
-- Unfortunately, no random functions in ClickHouse accepts a seed. Instead, abuse the numbers table + hash functions to provide -- deterministic randomness. -INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); +INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); -- 18446744073709551615 is the biggest UInt64 DROP TABLE IF EXISTS results; CREATE TABLE results(id Int32) ENGINE = Memory; From 4c3de5ceb41ba86c57720a08906675b07f78559c Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 17 Oct 2024 12:15:14 +0000 Subject: [PATCH 10/11] Update docs --- .../en/engines/table-engines/mergetree-family/annindexes.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 636b379536e..3a60445276b 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -117,6 +117,12 @@ ANN indexes are built during column insertion and merge. As a result, `INSERT` a tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively when are far more read requests than write requests. + +:::tip +To reduce the cost of building vector similarity indexes, consider setting `materialize_skip_indexes_on_insert` which disables the +construction of skipping indexes on newly inserted parts. Search would fall back to exact search but as inserted parts are typically small +compared to the total table size, the performance impact of that would be negligible. 
+ ANN indexes support this type of query: ``` sql From c69cd58ec8974408e8fe69223bdc218080eaa8dc Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 17 Oct 2024 12:16:46 +0000 Subject: [PATCH 11/11] Docs cosmetics --- docs/en/engines/table-engines/mergetree-family/annindexes.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 3a60445276b..614f0a72c13 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -117,7 +117,6 @@ ANN indexes are built during column insertion and merge. As a result, `INSERT` a tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively when are far more read requests than write requests. - :::tip To reduce the cost of building vector similarity indexes, consider setting `materialize_skip_indexes_on_insert` which disables the construction of skipping indexes on newly inserted parts. Search would fall back to exact search but as inserted parts are typically small