From 7cc4f8460f8bcf31df623d90e6e8612b2379ebd5 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 11 Nov 2024 20:20:59 +0000 Subject: [PATCH] Scalar quantization for i8 --- contrib/SimSIMD | 2 +- contrib/usearch | 2 +- .../mergetree-family/annindexes.md | 5 +- .../MergeTree/MergeTreeIOSettings.cpp | 4 + src/Storages/MergeTree/MergeTreeIOSettings.h | 3 + .../MergeTreeIndexVectorSimilarity.cpp | 242 ++++++++++++++++-- .../MergeTreeIndexVectorSimilarity.h | 21 +- src/Storages/MergeTree/MergeTreeSettings.cpp | 4 +- .../02354_vector_search_queries.reference | 2 +- ...ector_search_scalar_quantization.reference | 3 + ...2354_vector_search_scalar_quantization.sql | 34 +++ .../aspell-ignore/en/aspell-dict.txt | 3 +- 12 files changed, 289 insertions(+), 36 deletions(-) create mode 100644 tests/queries/0_stateless/02354_vector_search_scalar_quantization.reference create mode 100644 tests/queries/0_stateless/02354_vector_search_scalar_quantization.sql diff --git a/contrib/SimSIMD b/contrib/SimSIMD index fa60f1b8e35..da2d3853729 160000 --- a/contrib/SimSIMD +++ b/contrib/SimSIMD @@ -1 +1 @@ -Subproject commit fa60f1b8e3582c50978f0ae86c2ebb6c9af957f3 +Subproject commit da2d38537299ade247c2499131d936fb8db38f03 diff --git a/contrib/usearch b/contrib/usearch index 7efe8b710c9..9561fcae124 160000 --- a/contrib/usearch +++ b/contrib/usearch @@ -1 +1 @@ -Subproject commit 7efe8b710c9831bfe06573b1df0fad001b04a2b5 +Subproject commit 9561fcae1249ea8effbf71250e8a7a7ea97e5dfe diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index fcdc16637e6..81c1b2e067e 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -93,7 +93,10 @@ Vector similarity indexes currently support two distance functions: ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). 
Vector similarity indexes allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16`, `bf16`, -and `i8`. If no scalar kind was specified during index creation, `bf16` is used as default. +and `i8`. If no scalar kind was specified during index creation, `bf16` is used as default. For scalar kinds `f64`, `f32`, `f16`, and `bf16`, the +values are simply downsampled. For `i8`, the values are mapped to range [-127, 127]. To improve precision, scalar quantization is applied to +the uncompressed values (currently only for indexes that use `cosineDistance`). The quantization quantile can be specified using MergeTree setting +`scalar_quantization_quantile_for_vector_similarity_index` (default: 0.99). For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no distance function was specified during index creation, `L2Distance` is used as default. diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.cpp b/src/Storages/MergeTree/MergeTreeIOSettings.cpp index bacfbbd5720..60a445d22ef 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeIOSettings.cpp @@ -26,6 +26,8 @@ namespace MergeTreeSetting extern const MergeTreeSettingsString primary_key_compression_codec; extern const MergeTreeSettingsBool use_adaptive_write_buffer_for_dynamic_subcolumns; extern const MergeTreeSettingsBool use_compact_variant_discriminators_serialization; + extern const MergeTreeSettingsFloat scalar_quantization_quantile_for_vector_similarity_index; + extern const MergeTreeSettingsUInt64 scalar_quantization_buffer_size_for_vector_similarity_index; } MergeTreeWriterSettings::MergeTreeWriterSettings( @@ -55,6 +57,8 @@ MergeTreeWriterSettings::MergeTreeWriterSettings( , use_compact_variant_discriminators_serialization((*storage_settings)[MergeTreeSetting::use_compact_variant_discriminators_serialization]) ,
use_adaptive_write_buffer_for_dynamic_subcolumns((*storage_settings)[MergeTreeSetting::use_adaptive_write_buffer_for_dynamic_subcolumns]) , adaptive_write_buffer_initial_size((*storage_settings)[MergeTreeSetting::adaptive_write_buffer_initial_size]) + , scalar_quantization_quantile_for_vector_similarity_index((*storage_settings)[MergeTreeSetting::scalar_quantization_quantile_for_vector_similarity_index]) + , scalar_quantization_buffer_size_for_vector_similarity_index((*storage_settings)[MergeTreeSetting::scalar_quantization_buffer_size_for_vector_similarity_index]) { } diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index 7506c726bc4..7ae3d25efc8 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -87,6 +87,9 @@ struct MergeTreeWriterSettings bool use_compact_variant_discriminators_serialization; bool use_adaptive_write_buffer_for_dynamic_subcolumns; size_t adaptive_write_buffer_initial_size; + + Float64 scalar_quantization_quantile_for_vector_similarity_index; + UInt64 scalar_quantization_buffer_size_for_vector_similarity_index; }; } diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp index 0b17fa05072..9dd9bc2a384 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -72,6 +72,13 @@ const std::unordered_map quantizationToSca {"i8", unum::usearch::scalar_kind_t::i8_k}}; /// Usearch provides more quantizations but ^^ above ones seem the only ones comprehensively supported across all distance functions. +/// The vector similarity index implements scalar quantization on top of Usearch. This is the target type (currently, only i8 is supported). +using QuantizedValue = unum::usearch::i8_t; + +/// The maximum number of dimensions for scalar quantization. 
The purpose is to be able to allocate space for the result row on the stack +/// (std::array) instead of the heap (std::vector). The value can be chosen arbitrarily as long as the stack doesn't overflow. +constexpr size_t MAX_DIMENSIONS_FOR_SCALAR_QUANTIZATION = 3000; + template concept is_set = std::same_as>; @@ -214,6 +221,16 @@ void MergeTreeIndexGranuleVectorSimilarity::serializeBinary(WriteBuffer & ostr) index->serialize(ostr); + writeIntBinary(index->scalar_quantization_codebooks ? static_cast(1) : static_cast(0), ostr); + if (index->scalar_quantization_codebooks) + { + for (const auto codebook : *(index->scalar_quantization_codebooks)) + { + writeFloatBinary(codebook.min, ostr); + writeFloatBinary(codebook.max, ostr); + } + } + auto statistics = index->getStatistics(); LOG_TRACE(logger, "Wrote vector similarity index: {}", statistics.toString()); } @@ -232,12 +249,27 @@ void MergeTreeIndexGranuleVectorSimilarity::deserializeBinary(ReadBuffer & istr, /// More fancy error handling would be: Set a flag on the index that it failed to load. During usage return all granules, i.e. /// behave as if the index does not exist. Since format changes are expected to happen only rarely and it is "only" an index, keep it simple for now.
- UInt64 dimension; - readIntBinary(dimension, istr); - index = std::make_shared(dimension, metric_kind, scalar_kind, usearch_hnsw_params); + UInt64 dimensions; + readIntBinary(dimensions, istr); + index = std::make_shared(dimensions, metric_kind, scalar_kind, usearch_hnsw_params); index->deserialize(istr); + UInt64 has_scalar_quantization_codebooks; + readIntBinary(has_scalar_quantization_codebooks, istr); + if (has_scalar_quantization_codebooks) + { + index->scalar_quantization_codebooks = std::make_optional(); + for (size_t dimension = 0; dimension < dimensions; ++dimension) + { + Float64 min; + Float64 max; + readFloatBinary(min, istr); + readFloatBinary(max, istr); + index->scalar_quantization_codebooks->push_back({min, max}); + } + } + auto statistics = index->getStatistics(); LOG_TRACE(logger, "Loaded vector similarity index: {}", statistics.toString()); } @@ -247,12 +279,16 @@ MergeTreeIndexAggregatorVectorSimilarity::MergeTreeIndexAggregatorVectorSimilari const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_, - UsearchHnswParams usearch_hnsw_params_) + UsearchHnswParams usearch_hnsw_params_, + Float64 scalar_quantization_quantile_, + size_t scalar_quantization_buffer_size_) : index_name(index_name_) , index_sample_block(index_sample_block_) , metric_kind(metric_kind_) , scalar_kind(scalar_kind_) , usearch_hnsw_params(usearch_hnsw_params_) + , scalar_quantization_quantile(scalar_quantization_quantile_) + , scalar_quantization_buffer_size(scalar_quantization_buffer_size_) { } @@ -266,8 +302,80 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorVectorSimilarity::getGranuleAnd namespace { +template +ScalarQuantizationCodebook calculateCodebook(std::vector & values, Float64 quantile) +{ + if (values.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "values is empty"); + + std::ranges::sort(values); + + size_t minimum_element_index = static_cast(values.size() * (1.0 - quantile)); + size_t 
maximum_element_index = std::min(static_cast(values.size() * quantile), values.size() - 1); + + return {values[minimum_element_index], values[maximum_element_index]}; +} + +template +void quantize( + const Value * values, size_t dimensions, const ScalarQuantizationCodebooks & codebooks, + std::array & quantized_vector) +{ + /// Does a similar calculation as in Usearch's cast_to_i8_gt::try_(byte_t const* input, std::size_t dim, byte_t* output) + + /// For some reason, USearch does not map into range [-std::numeric_limits, std::numeric_limits] + /// aka. [-128, 127], it maps into [-127, 127]. Do the same here. + constexpr QuantizedValue i8_min = -127; + constexpr QuantizedValue i8_max = 127; + + Float64 magnitude = 0.0; + for (size_t dimension = 0; dimension != dimensions; ++dimension) + { + Float64 value = static_cast(*(values + dimension)); + magnitude += value * value; + } + magnitude = std::sqrt(magnitude); + + if (magnitude == 0.0) + { + for (std::size_t dimension = 0; dimension != dimensions; ++dimension) + quantized_vector[dimension] = 0; + return; + } + + for (std::size_t dimension = 0; dimension != dimensions; ++dimension) + { + Float64 value = static_cast(*(values + dimension)); + + const ScalarQuantizationCodebook & codebook = codebooks[dimension]; + if (value < codebook.min) + { + quantized_vector[dimension] = i8_min; + continue; + } + if (value > codebook.max) + { + quantized_vector[dimension] = i8_max; + continue; + } + + quantized_vector[dimension] = static_cast(std::clamp(value * i8_max / magnitude, static_cast(i8_min), static_cast(i8_max))); + + } + + /// for (size_t dimension = 0; dimension < dimensions; ++dimension) + /// { + /// const ScalarQuantizationCodebook & codebook = codebooks[dimension]; + /// Float64 value = static_cast(*(values + dimension)); + /// LOG_TRACE(getLogger("Vector Similarity Index"), "{}: {} --> {} (cb: [{}, {}])", dimension, value, quantized_vector[dimension], codebook.min, codebook.max); + /// } +} + template -void 
updateImpl(const ColumnArray * column_array, const ColumnArray::Offsets & column_array_offsets, USearchIndexWithSerializationPtr & index, size_t dimensions, size_t rows) +void updateImpl( + const ColumnArray * column_array, const ColumnArray::Offsets & column_array_offsets, USearchIndexWithSerializationPtr & index, + size_t dimensions, size_t rows, + Float64 scalar_quantization_quantile, size_t scalar_quantization_buffer_size) { const auto & column_array_data = column_array->getData(); const auto & column_array_data_float = typeid_cast(column_array_data); @@ -278,6 +386,51 @@ void updateImpl(const ColumnArray * column_array, const ColumnArray::Offsets & c if (column_array_offsets[row + 1] - column_array_offsets[row] != dimensions) throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column with vector similarity index must have equal length"); + /// ------------------ + /// "Quantization" in Usearch means mere downsampling. We implement scalar quantization by ourselves. + /// The math only works for i8 and cosine distance. + /// --> compute for every dimension the quantiles and store them as "codebook" in the index. + if (index->scalar_kind() == unum::usearch::scalar_kind_t::i8_k + && index->metric_kind() == unum::usearch::metric_kind_t::cos_k + && scalar_quantization_buffer_size != 0 && dimensions < MAX_DIMENSIONS_FOR_SCALAR_QUANTIZATION) + { + const size_t buffer_size = std::min(rows, scalar_quantization_buffer_size); + /// Note: This function (update) can theoretically be called in a chunked fashion but this is currently not done, i.e. update is + /// called exactly once per index granule. This simplifies the code, so we make this assumption for now (otherwise, we'd need to + /// integrate with getGranuleAndReset which "finalizes" the insert of rows). 
+ using ColumnValue = std::conditional_t, Float32, Float64>; + std::vector> values_per_dimension; + + values_per_dimension.resize(dimensions); + for (auto & values : values_per_dimension) + values.resize(buffer_size); + + /// Row-to-column conversion, needed because calculateCodebook sorts along each dimension + for (size_t i = 0; i < buffer_size * dimensions; ++i) + { + ColumnValue value = column_array_data_float_data[i]; + size_t x = i % dimensions; + size_t y = i / dimensions; + values_per_dimension[x][y] = value; + } + + index->scalar_quantization_codebooks = std::make_optional(); + for (size_t dimension = 0; dimension < dimensions; ++dimension) + { + ScalarQuantizationCodebook codebook = calculateCodebook(values_per_dimension[dimension], scalar_quantization_quantile); + /// Invalid codebook that would lead to division-by-0 during quantization. May happen if buffer size is too small or the data + /// distribution is too weird. Continue without quantization. + if (codebook.min == codebook.max) + { + index->scalar_quantization_codebooks = std::nullopt; + break; + } + index->scalar_quantization_codebooks->push_back(codebook); + } + } + /// ------------------ + /// Reserving space is mandatory size_t max_thread_pool_size = Context::getGlobalContextInstance()->getServerSettings()[ServerSetting::max_build_vector_similarity_index_thread_pool_size]; if (max_thread_pool_size == 0) @@ -299,24 +452,33 @@ void updateImpl(const ColumnArray * column_array, const ColumnArray::Offsets & c if (thread_group) CurrentThread::attachToGroupIfDetached(thread_group); - /// add is thread-safe - auto result = index->add(key, &column_array_data_float_data[column_array_offsets[row - 1]]); - if (!result) + USearchIndexWithSerialization::add_result_t add_result; + + if (index->scalar_quantization_codebooks) { - throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index.
Error: {}", String(result.error.release())); + const ScalarQuantizationCodebooks & codebooks = *(index->scalar_quantization_codebooks); + std::array quantized_vector; + quantize(&column_array_data_float_data[column_array_offsets[row - 1]], dimensions, codebooks, quantized_vector); + add_result = index->add(key, quantized_vector.data()); + } + else + { + add_result = index->add(key, &column_array_data_float_data[column_array_offsets[row - 1]]); } + if (!add_result) + throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(add_result.error.release())); + ProfileEvents::increment(ProfileEvents::USearchAddCount); - ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, result.visited_members); - ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, result.computed_distances); + ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, add_result.visited_members); + ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, add_result.computed_distances); }; - size_t index_size = index->size(); - + const size_t index_size = index->size(); for (size_t row = 0; row < rows; ++row) { auto key = static_cast(index_size + row); - auto task = [group = CurrentThread::getGroup(), &add_vector_to_index, key, row] { add_vector_to_index(key, row, group); }; + auto task = [&add_vector_to_index, key, row, thread_group = CurrentThread::getGroup()] { add_vector_to_index(key, row, thread_group); }; thread_pool.scheduleOrThrowOnError(task); } @@ -386,13 +548,12 @@ void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_ const TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); if (WhichDataType(nested_type_index).isFloat32()) - updateImpl(column_array, column_array_offsets, index, dimensions, rows); + updateImpl(column_array, column_array_offsets, index, dimensions, rows, scalar_quantization_quantile, scalar_quantization_buffer_size); else if 
(WhichDataType(nested_type_index).isFloat64()) - updateImpl(column_array, column_array_offsets, index, dimensions, rows); + updateImpl(column_array, column_array_offsets, index, dimensions, rows, scalar_quantization_quantile, scalar_quantization_buffer_size); else throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected data type Array(Float*)"); - *pos += rows_read; } @@ -448,12 +609,35 @@ std::vector MergeTreeIndexConditionVectorSimilarity::calculateApproximat /// synchronize index access, see https://github.com/unum-cloud/usearch/issues/500. As a workaround, we extended USearch' search method /// to accept a custom expansion_add setting. The config value is only used on the fly, i.e. not persisted in the index. - auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search); - if (!search_result) - throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", String(search_result.error.release())); + std::vector neighbors; /// indexes of vectors which were closest to the reference vector - std::vector neighbors(search_result.size()); /// indexes of vectors which were closest to the reference vector - search_result.dump_to(neighbors.data()); + if (index->scalar_quantization_codebooks) + { + const ScalarQuantizationCodebooks & codebooks = *(index->scalar_quantization_codebooks); + std::array quantized_vector; + quantize(reference_vector.data(), index->dimensions(), codebooks, quantized_vector); + auto search_result = index->search(quantized_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search); + if (!search_result) + throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. 
Error: {}", search_result.error.release()); + neighbors.resize(search_result.size()); + search_result.dump_to(neighbors.data()); + + ProfileEvents::increment(ProfileEvents::USearchSearchCount); + ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, search_result.visited_members); + ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, search_result.computed_distances); + } + else + { + auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search); + if (!search_result) + throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", search_result.error.release()); + neighbors.resize(search_result.size()); + search_result.dump_to(neighbors.data()); + + ProfileEvents::increment(ProfileEvents::USearchSearchCount); + ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, search_result.visited_members); + ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, search_result.computed_distances); + } std::sort(neighbors.begin(), neighbors.end()); @@ -466,10 +650,6 @@ std::vector MergeTreeIndexConditionVectorSimilarity::calculateApproximat neighbors.erase(std::unique(neighbors.begin(), neighbors.end()), neighbors.end()); #endif - ProfileEvents::increment(ProfileEvents::USearchSearchCount); - ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, search_result.visited_members); - ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, search_result.computed_distances); - return neighbors; } @@ -490,9 +670,15 @@ MergeTreeIndexGranulePtr MergeTreeIndexVectorSimilarity::createIndexGranule() co return std::make_shared(index.name, metric_kind, scalar_kind, usearch_hnsw_params); } -MergeTreeIndexAggregatorPtr MergeTreeIndexVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const +MergeTreeIndexAggregatorPtr 
MergeTreeIndexVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings & settings) const { - return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params); + Float64 scalar_quantization_quantile = settings.scalar_quantization_quantile_for_vector_similarity_index; + size_t scalar_quantization_buffer_size = settings.scalar_quantization_buffer_size_for_vector_similarity_index; + + if (scalar_quantization_quantile < 0.5 || scalar_quantization_quantile > 1.0) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting 'scalar_quantization_quantile_for_vector_similarity_index' must be in [0.5, 1.0]"); + + return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params, scalar_quantization_quantile, scalar_quantization_buffer_size); } MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h index fe5049daf77..6c858c0b7f6 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -4,6 +4,7 @@ #if USE_USEARCH +#include #include #include #include @@ -25,6 +26,16 @@ struct UsearchHnswParams size_t expansion_add = default_expansion_add; }; +/// Statistics required to apply scalar quantization to a single dimension of a vector. +struct ScalarQuantizationCodebook +{ + Float64 min; + Float64 max; +}; + +/// Statistics required to apply scalar quantization to all dimensions of a vector. 
+using ScalarQuantizationCodebooks = std::vector; + using USearchIndex = unum::usearch::index_dense_t; class USearchIndexWithSerialization : public USearchIndex @@ -59,6 +70,8 @@ public: String toString() const; }; + std::optional scalar_quantization_codebooks; + Statistics getStatistics() const; }; @@ -100,7 +113,7 @@ private: /// Note: USearch prefixes the serialized data with its own version header. We can't rely on that because 1. the index in ClickHouse /// is (at least in theory) agnostic of specific vector search libraries, and 2. additional data (e.g. the number of dimensions) /// outside USearch exists which we should version separately. - static constexpr UInt64 FILE_FORMAT_VERSION = 1; + static constexpr UInt64 FILE_FORMAT_VERSION = 2; }; @@ -111,7 +124,9 @@ struct MergeTreeIndexAggregatorVectorSimilarity final : IMergeTreeIndexAggregato const Block & index_sample_block, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_, - UsearchHnswParams usearch_hnsw_params_); + UsearchHnswParams usearch_hnsw_params_, + Float64 scalar_quantization_quantile_, + size_t quantization_buffer_size_); ~MergeTreeIndexAggregatorVectorSimilarity() override = default; @@ -124,6 +139,8 @@ struct MergeTreeIndexAggregatorVectorSimilarity final : IMergeTreeIndexAggregato const unum::usearch::metric_kind_t metric_kind; const unum::usearch::scalar_kind_t scalar_kind; const UsearchHnswParams usearch_hnsw_params; + const Float64 scalar_quantization_quantile; + const size_t scalar_quantization_buffer_size; USearchIndexWithSerializationPtr index; }; diff --git a/src/Storages/MergeTree/MergeTreeSettings.cpp b/src/Storages/MergeTree/MergeTreeSettings.cpp index 28ae933c3de..5066afe9e6d 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeSettings.cpp @@ -117,8 +117,8 @@ namespace ErrorCodes DECLARE(Bool, add_implicit_sign_column_constraint_for_collapsing_engine, false, "If true, add implicit constraint for sign 
column for CollapsingMergeTree engine.", 0) \ DECLARE(Milliseconds, sleep_before_commit_local_part_in_replicated_table_ms, 0, "For testing. Do not change it.", 0) \ DECLARE(Bool, optimize_row_order, false, "Allow reshuffling of rows during part inserts and merges to improve the compressibility of the new part", 0) \ - DECLARE(Bool, use_adaptive_write_buffer_for_dynamic_subcolumns, true, "Allow to use adaptive writer buffers during writing dynamic subcolumns to reduce memory usage", 0) \ DECLARE(UInt64, adaptive_write_buffer_initial_size, 16 * 1024, "Initial size of an adaptive write buffer", 0) \ + DECLARE(Bool, use_adaptive_write_buffer_for_dynamic_subcolumns, true, "Allow to use adaptive writer buffers during writing dynamic subcolumns to reduce memory usage", 0) \ DECLARE(UInt64, min_free_disk_bytes_to_perform_insert, 0, "Minimum free disk space bytes to perform an insert.", 0) \ DECLARE(Float, min_free_disk_ratio_to_perform_insert, 0.0, "Minimum free disk space ratio to perform an insert.", 0) \ \ @@ -228,6 +228,8 @@ namespace ErrorCodes DECLARE(Bool, cache_populated_by_fetch, false, "Only available in ClickHouse Cloud", EXPERIMENTAL) \ DECLARE(Bool, force_read_through_cache_for_merges, false, "Force read-through filesystem cache for merges", EXPERIMENTAL) \ DECLARE(Bool, allow_experimental_replacing_merge_with_cleanup, false, "Allow experimental CLEANUP merges for ReplacingMergeTree with is_deleted column.", EXPERIMENTAL) \ + DECLARE(Float, scalar_quantization_quantile_for_vector_similarity_index, 0.99f, "The quantile for scalar quantization in the vector similarity index. Must be in [0.5, 1.0].", EXPERIMENTAL) \ + DECLARE(UInt64, scalar_quantization_buffer_size_for_vector_similarity_index, 10'000, "The buffer size for scalar quantization in the vector similarity index. 0 disables scalar quantization.", EXPERIMENTAL) \ \ /** Compress marks and primary key. 
*/ \ DECLARE(Bool, compress_marks, true, "Marks support compression, reduce mark file size and speed up network transmission.", 0) \ diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference index cf80f46f53c..46be0a9782b 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.reference +++ b/tests/queries/0_stateless/02354_vector_search_queries.reference @@ -247,7 +247,7 @@ Expression (Projection) Name: idx Description: vector_similarity GRANULARITY 2 Parts: 1/1 - Granules: 3/4 + Granules: 4/4 -- Index on Array(Float64) column 6 [0,2] 0 7 [0,2.1] 0.10000000000000009 diff --git a/tests/queries/0_stateless/02354_vector_search_scalar_quantization.reference b/tests/queries/0_stateless/02354_vector_search_scalar_quantization.reference new file mode 100644 index 00000000000..89cd5ebf948 --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_scalar_quantization.reference @@ -0,0 +1,3 @@ +4 [1.4,2.4] 9.830249853916663e-8 +14 [1.4,2.4] 9.830249853916663e-8 +3 [1.3,2.3] 0.14142142367226698 diff --git a/tests/queries/0_stateless/02354_vector_search_scalar_quantization.sql b/tests/queries/0_stateless/02354_vector_search_scalar_quantization.sql new file mode 100644 index 00000000000..656d098db70 --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_scalar_quantization.sql @@ -0,0 +1,34 @@ +-- Tags: no-fasttest, no-ordinary-database + +-- Tests various scalar quantization for vector similarity indexes with i8 quantization. +-- The effect of quantization is extremely subtle and hard to test, so we are only testing the related settings. 
+ +SET allow_experimental_vector_similarity_index = 1; + +SET enable_analyzer = 0; + +DROP TABLE IF EXISTS tab; + +-- Quantization interval invalid + +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5, scalar_quantization_quantile_for_vector_similarity_index = 1.1; +INSERT INTO tab VALUES (0, [1.0, 2.0]), (1, [1.1, 2.1]), (2, [1.2, 2.2]), (3, [1.3, 2.3]), (4, [1.4, 2.4]), (5, [1.5, 2.5]), (6, [1.6, 2.6]), (7, [1.7, 2.7]), (8, [1.8, 2.8]), (9, [1.9, 2.9]), (10, [1.0, 2.0]), (11, [1.1, 2.1]), (12, [1.2, 2.2]), (13, [1.3, 2.3]), (14, [1.4, 2.4]), (15, [1.5, 2.5]), (16, [1.6, 2.6]), (17, [1.7, 2.7]), (18, [1.8, 2.8]), (19, [1.9, 2.9]); -- { serverError INVALID_SETTING_VALUE } + +DROP TABLE tab; + +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5, scalar_quantization_quantile_for_vector_similarity_index = 0.4; +INSERT INTO tab VALUES (0, [1.0, 2.0]), (1, [1.1, 2.1]), (2, [1.2, 2.2]), (3, [1.3, 2.3]), (4, [1.4, 2.4]), (5, [1.5, 2.5]), (6, [1.6, 2.6]), (7, [1.7, 2.7]), (8, [1.8, 2.8]), (9, [1.9, 2.9]), (10, [1.0, 2.0]), (11, [1.1, 2.1]), (12, [1.2, 2.2]), (13, [1.3, 2.3]), (14, [1.4, 2.4]), (15, [1.5, 2.5]), (16, [1.6, 2.6]), (17, [1.7, 2.7]), (18, [1.8, 2.8]), (19, [1.9, 2.9]); -- { serverError INVALID_SETTING_VALUE } + +DROP TABLE tab; + +-- Test that no bad things happen +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5, scalar_quantization_quantile_for_vector_similarity_index = 0.9; +INSERT INTO tab VALUES (0, [1.0, 2.0]), (1, [1.1, 2.1]), (2, [1.2, 2.2]), (3, [1.3, 2.3]), (4, [1.4, 2.4]), (5, [1.5, 2.5]), (6, [1.6, 2.6]), (7, [1.7, 2.7]), 
(8, [1.8, 2.8]), (9, [1.9, 2.9]), (10, [1.0, 2.0]), (11, [1.1, 2.1]), (12, [1.2, 2.2]), (13, [1.3, 2.3]), (14, [1.4, 2.4]), (15, [1.5, 2.5]), (16, [1.6, 2.6]), (17, [1.7, 2.7]), (18, [1.8, 2.8]), (19, [1.9, 2.9]); + +WITH [1.4, 2.4] AS reference_vec +SELECT id, vec, L2Distance(vec, reference_vec) +FROM tab +ORDER BY L2Distance(vec, reference_vec), id +LIMIT 3; + +DROP TABLE tab; diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index a53104581bb..af8b02af839 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -49,6 +49,7 @@ AutoML Autocompletion AvroConfluent AzureQueue +BFloat BIGINT BIGSERIAL BORO @@ -1668,6 +1669,7 @@ domainWithoutWWWRFC dont dotProduct dotall +downsampled downsampling dplyr dragonbox @@ -3157,4 +3159,3 @@ znode znodes zookeeperSessionUptime zstd -BFloat