mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-15 12:14:18 +00:00
Update default HNSW parameter settings
This commit is contained in:
parent
d02a31da44
commit
5f94239f99
@ -56,7 +56,7 @@ Parameters:
|
||||
distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance)- the angle between two non-zero vectors).
|
||||
- `quantization`: either `f64`, `f32`, `f16`, `bf16`, or `i8` for storing the vector with reduced precision (optional, default: `bf16`)
|
||||
- `hnsw_max_connections_per_layer`: the number of neighbors per HNSW graph node, also known as `M` in the [HNSW
|
||||
paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 16)
|
||||
paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 32)
|
||||
- `hnsw_candidate_list_size_for_construction`: the size of the dynamic candidate list when constructing the HNSW graph, also known as
|
||||
`ef_construction` in the original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 128)
|
||||
|
||||
@ -143,7 +143,7 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Dista
|
||||
```
|
||||
:::
|
||||
|
||||
To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 64), also known as `ef_search` in the
|
||||
To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 256), also known as `ef_search` in the
|
||||
original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473), run the `SELECT` query with `SETTINGS hnsw_candidate_list_size_for_search
|
||||
= <value>`.
|
||||
|
||||
|
@ -5558,8 +5558,8 @@ Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin
|
||||
M(UInt64, max_limit_for_ann_queries, 1'000'000, R"(
|
||||
SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes.
|
||||
)", 0) \
|
||||
M(UInt64, hnsw_candidate_list_size_for_search, 0, R"(
|
||||
The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. 0 means USearch's default value (64).
|
||||
M(UInt64, hnsw_candidate_list_size_for_search, 256, R"(
|
||||
The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'.
|
||||
)", 0) \
|
||||
M(Bool, throw_on_unsupported_query_inside_transaction, true, R"(
|
||||
Throw exception if unsupported query is used inside transaction
|
||||
|
@ -42,6 +42,7 @@ namespace ErrorCodes
|
||||
extern const int INCORRECT_DATA;
|
||||
extern const int INCORRECT_NUMBER_OF_COLUMNS;
|
||||
extern const int INCORRECT_QUERY;
|
||||
extern const int INVALID_SETTING_VALUE;
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
}
|
||||
@ -110,7 +111,7 @@ USearchIndexWithSerialization::USearchIndexWithSerialization(
|
||||
{
|
||||
USearchIndex::metric_t metric(dimensions, metric_kind, scalar_kind);
|
||||
|
||||
unum::usearch::index_dense_config_t config(usearch_hnsw_params.connectivity, usearch_hnsw_params.expansion_add, unum::usearch::default_expansion_search());
|
||||
unum::usearch::index_dense_config_t config(usearch_hnsw_params.connectivity, usearch_hnsw_params.expansion_add, default_expansion_search);
|
||||
config.enable_key_lookups = false; /// we don't do row-to-vector lookups
|
||||
|
||||
auto result = USearchIndex::make(metric, config);
|
||||
@ -407,6 +408,9 @@ MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity
|
||||
, metric_kind(metric_kind_)
|
||||
, expansion_search(context->getSettingsRef()[Setting::hnsw_candidate_list_size_for_search])
|
||||
{
|
||||
if (expansion_search == 0)
|
||||
throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting 'hnsw_candidate_list_size_for_search' must not be 0");
|
||||
|
||||
}
|
||||
|
||||
bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const
|
||||
@ -447,7 +451,7 @@ std::vector<UInt64> MergeTreeIndexConditionVectorSimilarity::calculateApproximat
|
||||
/// synchronize index access, see https://github.com/unum-cloud/usearch/issues/500. As a workaround, we extended USearch' search method
|
||||
/// to accept a custom expansion_search setting. The config value is only used on the fly, i.e. not persisted in the index.
|
||||
|
||||
auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, (expansion_search == 0) ? unum::usearch::default_expansion_search() : expansion_search);
|
||||
auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search);
|
||||
if (!search_result)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", String(search_result.error.release()));
|
||||
|
||||
@ -558,7 +562,7 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta
|
||||
/// Call Usearch's own parameter validation method for HNSW-specific parameters
|
||||
UInt64 connectivity = index.arguments[3].safeGet<UInt64>();
|
||||
UInt64 expansion_add = index.arguments[4].safeGet<UInt64>();
|
||||
UInt64 expansion_search = unum::usearch::default_expansion_search();
|
||||
UInt64 expansion_search = default_expansion_search;
|
||||
|
||||
unum::usearch::index_dense_config_t config(connectivity, expansion_add, expansion_search);
|
||||
if (auto error = config.validate(); error)
|
||||
|
@ -11,10 +11,18 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Defaults for HNSW parameters. Instead of using the default parameters provided by USearch (default_connectivity(),
|
||||
/// default_expansion_add(), default_expansion_search()), we experimentally came up with our own default parameters. They provide better
|
||||
/// trade-offs with regard to index construction time, search precision and queries-per-second (speed).
|
||||
static constexpr size_t default_connectivity = 32;
|
||||
static constexpr size_t default_expansion_add = 128;
|
||||
static constexpr size_t default_expansion_search = 256;
|
||||
|
||||
/// Parameters for HNSW index construction.
|
||||
struct UsearchHnswParams
|
||||
{
|
||||
size_t connectivity = unum::usearch::default_connectivity();
|
||||
size_t expansion_add = unum::usearch::default_expansion_add();
|
||||
size_t connectivity = default_connectivity;
|
||||
size_t expansion_add = default_expansion_add;
|
||||
};
|
||||
|
||||
using USearchIndex = unum::usearch::index_dense_t;
|
||||
|
@ -14,7 +14,15 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar
|
||||
-- Generate random values but with a fixed seed (conceptually), so that the data is deterministic.
|
||||
-- Unfortunately, no random function in ClickHouse accepts a seed. Instead, abuse the numbers table + hash functions to provide
|
||||
-- deterministic randomness.
|
||||
INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); -- 18446744073709551615 is the biggest UInt64
|
||||
INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); -- 18446744073709551615 is the biggest UInt64
|
||||
|
||||
-- hnsw_candidate_list_size_for_search = 0 is illegal
|
||||
WITH [0.5, 0.5] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3
|
||||
SETTINGS hnsw_candidate_list_size_for_search = 0; -- { serverError INVALID_SETTING_VALUE }
|
||||
|
||||
DROP TABLE IF EXISTS results;
|
||||
CREATE TABLE results(id Int32) ENGINE = Memory;
|
||||
|
Loading…
Reference in New Issue
Block a user