Update default HNSW parameter settings

2024-11-15 12:14:18 +00:00 · 2024-10-21 07:19:29 +00:00 · 2024-10-21 07:19:29 +00:00 · 5f94239f99
commit 5f94239f99
parent d02a31da44
5 changed files with 30 additions and 10 deletions
--- a/docs/en/engines/table-engines/mergetree-family/annindexes.md
+++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md
@ -56,7 +56,7 @@ Parameters:
  distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance)- the angle between two non-zero vectors).
 - `quantization`: either `f64`, `f32`, `f16`, `bf16`, or `i8` for storing the vector with reduced precision (optional, default: `bf16`)
 - `hnsw_max_connections_per_layer`: the number of neighbors per HNSW graph node, also known as `M` in the [HNSW
-  paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 16)
+  paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 32)
 - `hnsw_candidate_list_size_for_construction`: the size of the dynamic candidate list when constructing the HNSW graph, also known as
  `ef_construction` in the original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 128)

@ -143,7 +143,7 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Dista
 ```
 :::

-To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 64), also known as `ef_search` in the
+To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 256), also known as `ef_search` in the
 original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473), run the `SELECT` query with `SETTINGS hnsw_candidate_list_size_for_search
 = <value>`.

--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@ -5558,8 +5558,8 @@ Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin
    M(UInt64, max_limit_for_ann_queries, 1'000'000, R"(
 SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes.
 )", 0) \
-    M(UInt64, hnsw_candidate_list_size_for_search, 0, R"(
-The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. 0 means USearch's default value (64).
+    M(UInt64, hnsw_candidate_list_size_for_search, 256, R"(
+The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'.
 )", 0) \
    M(Bool, throw_on_unsupported_query_inside_transaction, true, R"(
 Throw exception if unsupported query is used inside transaction
--- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
@ -42,6 +42,7 @@ namespace ErrorCodes
    extern const int INCORRECT_DATA;
    extern const int INCORRECT_NUMBER_OF_COLUMNS;
    extern const int INCORRECT_QUERY;
+    extern const int INVALID_SETTING_VALUE;
    extern const int LOGICAL_ERROR;
    extern const int NOT_IMPLEMENTED;
 }
@ -110,7 +111,7 @@ USearchIndexWithSerialization::USearchIndexWithSerialization(
 {
    USearchIndex::metric_t metric(dimensions, metric_kind, scalar_kind);

-    unum::usearch::index_dense_config_t config(usearch_hnsw_params.connectivity, usearch_hnsw_params.expansion_add, unum::usearch::default_expansion_search());
+    unum::usearch::index_dense_config_t config(usearch_hnsw_params.connectivity, usearch_hnsw_params.expansion_add, default_expansion_search);
    config.enable_key_lookups = false; /// we don't do row-to-vector lookups

    auto result = USearchIndex::make(metric, config);
@ -407,6 +408,9 @@ MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity
    , metric_kind(metric_kind_)
    , expansion_search(context->getSettingsRef()[Setting::hnsw_candidate_list_size_for_search])
 {
+    if (expansion_search == 0)
+        throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting 'hnsw_candidate_list_size_for_search' must not be 0");
+
 }

 bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const
@ -447,7 +451,7 @@ std::vector<UInt64> MergeTreeIndexConditionVectorSimilarity::calculateApproximat
    /// synchronize index access, see https://github.com/unum-cloud/usearch/issues/500. As a workaround, we extended USearch' search method
    /// to accept a custom expansion_add setting. The config value is only used on the fly, i.e. not persisted in the index.

-    auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, (expansion_search == 0) ? unum::usearch::default_expansion_search() : expansion_search);
+    auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search);
    if (!search_result)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", String(search_result.error.release()));

@ -558,7 +562,7 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta
        /// Call Usearch's own parameter validation method for HNSW-specific parameters
        UInt64 connectivity = index.arguments[3].safeGet<UInt64>();
        UInt64 expansion_add = index.arguments[4].safeGet<UInt64>();
-        UInt64 expansion_search = unum::usearch::default_expansion_search();
+        UInt64 expansion_search = default_expansion_search;

        unum::usearch::index_dense_config_t config(connectivity, expansion_add, expansion_search);
        if (auto error = config.validate(); error)
--- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h
+++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h
@ -11,10 +11,18 @@
 namespace DB
 {

+/// Defaults for HNSW parameters. Instead of using the default parameters provided by USearch (default_connectivity(),
+/// default_expansion_add(), default_expansion_search()), we experimentally came up with our own default parameters. They provide better
+/// trade-offs with regard to index construction time, search precision and queries-per-second (speed).
+static constexpr size_t default_connectivity = 32;
+static constexpr size_t default_expansion_add = 128;
+static constexpr size_t default_expansion_search = 256;
+
+/// Parameters for HNSW index construction.
 struct UsearchHnswParams
 {
-    size_t connectivity = unum::usearch::default_connectivity();
-    size_t expansion_add = unum::usearch::default_expansion_add();
+    size_t connectivity = default_connectivity;
+    size_t expansion_add = default_expansion_add;
 };

 using USearchIndex = unum::usearch::index_dense_t;
--- a/tests/queries/0_stateless/02354_vector_search_expansion_search.sql
+++ b/tests/queries/0_stateless/02354_vector_search_expansion_search.sql
@ -14,7 +14,15 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar
 -- Generate random values but with a fixed seed (conceptually), so that the data is deterministic.
 -- Unfortunately, no random functions in ClickHouse accepts a seed. Instead, abuse the numbers table + hash functions to provide
 -- deterministic randomness.
-INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); -- 18446744073709551615 is the biggest UInt64
+INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); -- 18446744073709551615 is the biggest UInt64
+
+-- hnsw_candidate_list_size_for_search = 0 is illegal
+WITH [0.5, 0.5] AS reference_vec
+SELECT id, vec, L2Distance(vec, reference_vec)
+FROM tab
+ORDER BY L2Distance(vec, reference_vec)
+LIMIT 3
+SETTINGS hnsw_candidate_list_size_for_search = 0; -- { serverError INVALID_SETTING_VALUE }

 DROP TABLE IF EXISTS results;
 CREATE TABLE results(id Int32) ENGINE = Memory;