Update default HNSW parameter settings

This commit is contained in:
Robert Schulze 2024-10-21 07:19:29 +00:00
parent d02a31da44
commit 5f94239f99
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
5 changed files with 30 additions and 10 deletions

View File

@ -56,7 +56,7 @@ Parameters:
distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance) - the angle between two non-zero vectors).
- `quantization`: either `f64`, `f32`, `f16`, `bf16`, or `i8` for storing the vector with reduced precision (optional, default: `bf16`)
- `hnsw_max_connections_per_layer`: the number of neighbors per HNSW graph node, also known as `M` in the [HNSW
paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 16)
paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 32)
- `hnsw_candidate_list_size_for_construction`: the size of the dynamic candidate list when constructing the HNSW graph, also known as
`ef_construction` in the original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 128)
@ -143,7 +143,7 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Dista
```
:::
To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 64), also known as `ef_search` in the
To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 256), also known as `ef_search` in the
original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473), run the `SELECT` query with `SETTINGS hnsw_candidate_list_size_for_search
= <value>`.

View File

@ -5558,8 +5558,8 @@ Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin
M(UInt64, max_limit_for_ann_queries, 1'000'000, R"(
SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes.
)", 0) \
M(UInt64, hnsw_candidate_list_size_for_search, 0, R"(
The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. 0 means USearch's default value (64).
M(UInt64, hnsw_candidate_list_size_for_search, 256, R"(
The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'.
)", 0) \
M(Bool, throw_on_unsupported_query_inside_transaction, true, R"(
Throw exception if unsupported query is used inside transaction

View File

@ -42,6 +42,7 @@ namespace ErrorCodes
extern const int INCORRECT_DATA;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int INCORRECT_QUERY;
extern const int INVALID_SETTING_VALUE;
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
}
@ -110,7 +111,7 @@ USearchIndexWithSerialization::USearchIndexWithSerialization(
{
USearchIndex::metric_t metric(dimensions, metric_kind, scalar_kind);
unum::usearch::index_dense_config_t config(usearch_hnsw_params.connectivity, usearch_hnsw_params.expansion_add, unum::usearch::default_expansion_search());
unum::usearch::index_dense_config_t config(usearch_hnsw_params.connectivity, usearch_hnsw_params.expansion_add, default_expansion_search);
config.enable_key_lookups = false; /// we don't do row-to-vector lookups
auto result = USearchIndex::make(metric, config);
@ -407,6 +408,9 @@ MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity
, metric_kind(metric_kind_)
, expansion_search(context->getSettingsRef()[Setting::hnsw_candidate_list_size_for_search])
{
if (expansion_search == 0)
throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting 'hnsw_candidate_list_size_for_search' must not be 0");
}
bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const
@ -447,7 +451,7 @@ std::vector<UInt64> MergeTreeIndexConditionVectorSimilarity::calculateApproximat
/// synchronize index access, see https://github.com/unum-cloud/usearch/issues/500. As a workaround, we extended USearch' search method
/// to accept a custom expansion_search setting. The config value is only used on the fly, i.e. not persisted in the index.
auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, (expansion_search == 0) ? unum::usearch::default_expansion_search() : expansion_search);
auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search);
if (!search_result)
throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", String(search_result.error.release()));
@ -558,7 +562,7 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta
/// Call Usearch's own parameter validation method for HNSW-specific parameters
UInt64 connectivity = index.arguments[3].safeGet<UInt64>();
UInt64 expansion_add = index.arguments[4].safeGet<UInt64>();
UInt64 expansion_search = unum::usearch::default_expansion_search();
UInt64 expansion_search = default_expansion_search;
unum::usearch::index_dense_config_t config(connectivity, expansion_add, expansion_search);
if (auto error = config.validate(); error)

View File

@ -11,10 +11,18 @@
namespace DB
{
/// Defaults for HNSW parameters. Instead of using the default parameters provided by USearch (default_connectivity(),
/// default_expansion_add(), default_expansion_search()), we experimentally came up with our own default parameters. They provide better
/// trade-offs with regard to index construction time, search precision and queries-per-second (speed).
static constexpr size_t default_connectivity = 32;
static constexpr size_t default_expansion_add = 128;
static constexpr size_t default_expansion_search = 256;
/// Parameters for HNSW index construction.
struct UsearchHnswParams
{
size_t connectivity = unum::usearch::default_connectivity();
size_t expansion_add = unum::usearch::default_expansion_add();
size_t connectivity = default_connectivity;
size_t expansion_add = default_expansion_add;
};
using USearchIndex = unum::usearch::index_dense_t;

View File

@ -14,7 +14,15 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar
-- Generate random values but with a fixed seed (conceptually), so that the data is deterministic.
-- Unfortunately, no random functions in ClickHouse accepts a seed. Instead, abuse the numbers table + hash functions to provide
-- deterministic randomness.
INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); -- 18446744073709551615 is the biggest UInt64
INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); -- 18446744073709551615 is the biggest UInt64
-- hnsw_candidate_list_size_for_search = 0 is illegal
WITH [0.5, 0.5] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3
SETTINGS hnsw_candidate_list_size_for_search = 0; -- { serverError INVALID_SETTING_VALUE }
DROP TABLE IF EXISTS results;
CREATE TABLE results(id Int32) ENGINE = Memory;