From 7cc4f8460f8bcf31df623d90e6e8612b2379ebd5 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Mon, 11 Nov 2024 20:20:59 +0000
Subject: [PATCH] Scalar quantization for i8

---
 contrib/SimSIMD                               |   2 +-
 contrib/usearch                               |   2 +-
 .../mergetree-family/annindexes.md            |   5 +-
 .../MergeTree/MergeTreeIOSettings.cpp         |   4 +
 src/Storages/MergeTree/MergeTreeIOSettings.h  |   3 +
 .../MergeTreeIndexVectorSimilarity.cpp        | 242 ++++++++++++++++--
 .../MergeTreeIndexVectorSimilarity.h          |  21 +-
 src/Storages/MergeTree/MergeTreeSettings.cpp  |   4 +-
 .../02354_vector_search_queries.reference     |   2 +-
 ...ector_search_scalar_quantization.reference |   3 +
 ...2354_vector_search_scalar_quantization.sql |  34 +++
 .../aspell-ignore/en/aspell-dict.txt          |   3 +-
 12 files changed, 289 insertions(+), 36 deletions(-)
 create mode 100644 tests/queries/0_stateless/02354_vector_search_scalar_quantization.reference
 create mode 100644 tests/queries/0_stateless/02354_vector_search_scalar_quantization.sql

diff --git a/contrib/SimSIMD b/contrib/SimSIMD
index fa60f1b8e35..da2d3853729 160000
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@@ -1 +1 @@
-Subproject commit fa60f1b8e3582c50978f0ae86c2ebb6c9af957f3
+Subproject commit da2d38537299ade247c2499131d936fb8db38f03
diff --git a/contrib/usearch b/contrib/usearch
index 7efe8b710c9..9561fcae124 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit 7efe8b710c9831bfe06573b1df0fad001b04a2b5
+Subproject commit 9561fcae1249ea8effbf71250e8a7a7ea97e5dfe
diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md
index fcdc16637e6..81c1b2e067e 100644
--- a/docs/en/engines/table-engines/mergetree-family/annindexes.md
+++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md
@@ -93,7 +93,10 @@ Vector similarity indexes currently support two distance functions:
   ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
 
 Vector similarity indexes allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16`, `bf16`,
-and `i8`. If no scalar kind was specified during index creation, `bf16` is used as default.
+and `i8`. If no scalar kind was specified during index creation, `bf16` is used as default. For scalar kinds `f64`, `f32`, and `bf16`, the
+values are simply downsampled. For `i8`, the values are mapped to range [-127, 127]. To improve precision, scalar quantization is applied to
+the uncompressed values. The quantization quantile can be specified using MergeTree setting
+`scalar_quantization_quantile_for_vector_similarity_index` (default: 0.99).
 
 For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
 distance function was specified during index creation, `L2Distance` is used as default.
diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.cpp b/src/Storages/MergeTree/MergeTreeIOSettings.cpp
index bacfbbd5720..60a445d22ef 100644
--- a/src/Storages/MergeTree/MergeTreeIOSettings.cpp
+++ b/src/Storages/MergeTree/MergeTreeIOSettings.cpp
@@ -26,6 +26,8 @@ namespace MergeTreeSetting
     extern const MergeTreeSettingsString primary_key_compression_codec;
     extern const MergeTreeSettingsBool use_adaptive_write_buffer_for_dynamic_subcolumns;
     extern const MergeTreeSettingsBool use_compact_variant_discriminators_serialization;
+    extern const MergeTreeSettingsFloat scalar_quantization_quantile_for_vector_similarity_index;
+    extern const MergeTreeSettingsUInt64 scalar_quantization_buffer_size_for_vector_similarity_index;
 }
 
 MergeTreeWriterSettings::MergeTreeWriterSettings(
@@ -55,6 +57,8 @@ MergeTreeWriterSettings::MergeTreeWriterSettings(
     , use_compact_variant_discriminators_serialization((*storage_settings)[MergeTreeSetting::use_compact_variant_discriminators_serialization])
     , use_adaptive_write_buffer_for_dynamic_subcolumns((*storage_settings)[MergeTreeSetting::use_adaptive_write_buffer_for_dynamic_subcolumns])
     , adaptive_write_buffer_initial_size((*storage_settings)[MergeTreeSetting::adaptive_write_buffer_initial_size])
+    , scalar_quantization_quantile_for_vector_similarity_index((*storage_settings)[MergeTreeSetting::scalar_quantization_quantile_for_vector_similarity_index])
+    , scalar_quantization_buffer_size_for_vector_similarity_index((*storage_settings)[MergeTreeSetting::scalar_quantization_buffer_size_for_vector_similarity_index])
 {
 }
 
diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h
index 7506c726bc4..7ae3d25efc8 100644
--- a/src/Storages/MergeTree/MergeTreeIOSettings.h
+++ b/src/Storages/MergeTree/MergeTreeIOSettings.h
@@ -87,6 +87,9 @@ struct MergeTreeWriterSettings
     bool use_compact_variant_discriminators_serialization;
     bool use_adaptive_write_buffer_for_dynamic_subcolumns;
     size_t adaptive_write_buffer_initial_size;
+
+    Float64 scalar_quantization_quantile_for_vector_similarity_index;
+    UInt64 scalar_quantization_buffer_size_for_vector_similarity_index;
 };
 
 }
diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
index 0b17fa05072..9dd9bc2a384 100644
--- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
@@ -72,6 +72,13 @@ const std::unordered_map<String, unum::usearch::scalar_kind_t> quantizationToSca
     {"i8", unum::usearch::scalar_kind_t::i8_k}};
 /// Usearch provides more quantizations but ^^ above ones seem the only ones comprehensively supported across all distance functions.
 
+/// The vector similarity index implements scalar quantization on top of Usearch. This is the target type (currently, only i8 is supported).
+using QuantizedValue = unum::usearch::i8_t;
+
+/// The maximum number of dimensions for scalar quantization. The purpose is to be able to allocate space for the result row on the stack
+/// (std::array) instead of the heap (std::vector). The value can be chosen randomly as long as the stack doesn't overflow.
+constexpr size_t MAX_DIMENSIONS_FOR_SCALAR_QUANTIZATION = 3000;
+
 template<typename T>
 concept is_set = std::same_as<T, std::set<typename T::key_type, typename T::key_compare, typename T::allocator_type>>;
 
@@ -214,6 +221,16 @@ void MergeTreeIndexGranuleVectorSimilarity::serializeBinary(WriteBuffer & ostr)
 
     index->serialize(ostr);
 
+    writeIntBinary(index->scalar_quantization_codebooks ? static_cast<UInt64>(1) : static_cast<UInt64>(0), ostr);
+    if (index->scalar_quantization_codebooks)
+    {
+        for (const auto codebook : *(index->scalar_quantization_codebooks))
+        {
+            writeFloatBinary(codebook.min, ostr);
+            writeFloatBinary(codebook.max, ostr);
+        }
+    }
+
     auto statistics = index->getStatistics();
     LOG_TRACE(logger, "Wrote vector similarity index: {}", statistics.toString());
 }
@@ -232,12 +249,27 @@ void MergeTreeIndexGranuleVectorSimilarity::deserializeBinary(ReadBuffer & istr,
         /// More fancy error handling would be: Set a flag on the index that it failed to load. During usage return all granules, i.e.
         /// behave as if the index does not exist. Since format changes are expected to happen only rarely and it is "only" an index, keep it simple for now.
 
-    UInt64 dimension;
-    readIntBinary(dimension, istr);
-    index = std::make_shared<USearchIndexWithSerialization>(dimension, metric_kind, scalar_kind, usearch_hnsw_params);
+    UInt64 dimensions;
+    readIntBinary(dimensions, istr);
+    index = std::make_shared<USearchIndexWithSerialization>(dimensions, metric_kind, scalar_kind, usearch_hnsw_params);
 
     index->deserialize(istr);
 
+    UInt64 has_scalar_quantization_codebooks;
+    readIntBinary(has_scalar_quantization_codebooks, istr);
+    if (has_scalar_quantization_codebooks)
+    {
+        index->scalar_quantization_codebooks = std::make_optional<ScalarQuantizationCodebooks>();
+        for (size_t dimension = 0; dimension < dimensions; ++dimension)
+        {
+            Float64 min;
+            Float64 max;
+            readFloatBinary(min, istr);
+            readFloatBinary(max, istr);
+            index->scalar_quantization_codebooks->push_back({min, max});
+        }
+    }
+
     auto statistics = index->getStatistics();
     LOG_TRACE(logger, "Loaded vector similarity index: {}", statistics.toString());
 }
@@ -247,12 +279,16 @@ MergeTreeIndexAggregatorVectorSimilarity::MergeTreeIndexAggregatorVectorSimilari
     const Block & index_sample_block_,
     unum::usearch::metric_kind_t metric_kind_,
     unum::usearch::scalar_kind_t scalar_kind_,
-    UsearchHnswParams usearch_hnsw_params_)
+    UsearchHnswParams usearch_hnsw_params_,
+    Float64 scalar_quantization_quantile_,
+    size_t scalar_quantization_buffer_size_)
     : index_name(index_name_)
     , index_sample_block(index_sample_block_)
     , metric_kind(metric_kind_)
     , scalar_kind(scalar_kind_)
     , usearch_hnsw_params(usearch_hnsw_params_)
+    , scalar_quantization_quantile(scalar_quantization_quantile_)
+    , scalar_quantization_buffer_size(scalar_quantization_buffer_size_)
 {
 }
 
@@ -266,8 +302,80 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorVectorSimilarity::getGranuleAnd
 namespace
 {
 
+template <typename Value>
+ScalarQuantizationCodebook calculateCodebook(std::vector<Value> & values, Float64 quantile)
+{
+    if (values.empty())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "values is empty");
+
+    std::ranges::sort(values);
+
+    size_t minimum_element_index = static_cast<size_t>(values.size() * (1.0 - quantile));
+    size_t maximum_element_index = std::min(static_cast<size_t>(values.size() * quantile), values.size() - 1);
+
+    return {values[minimum_element_index], values[maximum_element_index]};
+}
+
+template <typename Value>
+void quantize(
+    const Value * values, size_t dimensions, const ScalarQuantizationCodebooks & codebooks,
+    std::array<QuantizedValue, MAX_DIMENSIONS_FOR_SCALAR_QUANTIZATION> & quantized_vector)
+{
+    /// Does a similar calculation as in Usearch's cast_to_i8_gt::try_(byte_t const* input, std::size_t dim, byte_t* output)
+
+    /// For some reason, USearch does not map into range [-std::numeric_limits<Int8>, std::numeric_limits<Int8>]
+    /// aka. [-128, 127], it maps into [-127, 127]. Do the same here.
+    constexpr QuantizedValue i8_min = -127;
+    constexpr QuantizedValue i8_max = 127;
+
+    Float64 magnitude = 0.0;
+    for (size_t dimension = 0; dimension != dimensions; ++dimension)
+    {
+        Float64 value = static_cast<Float64>(*(values + dimension));
+        magnitude += value * value;
+    }
+    magnitude = std::sqrt(magnitude);
+
+    if (magnitude == 0.0)
+    {
+        for (std::size_t dimension = 0; dimension != dimensions; ++dimension)
+            quantized_vector[dimension] = 0;
+        return;
+    }
+
+    for (std::size_t dimension = 0; dimension != dimensions; ++dimension)
+    {
+        Float64 value = static_cast<Float64>(*(values + dimension));
+
+        const ScalarQuantizationCodebook & codebook = codebooks[dimension];
+        if (value < codebook.min)
+        {
+            quantized_vector[dimension] = i8_min;
+            continue;
+        }
+        if (value > codebook.max)
+        {
+            quantized_vector[dimension] = i8_max;
+            continue;
+        }
+
+        quantized_vector[dimension] = static_cast<QuantizedValue>(std::clamp(value * i8_max / magnitude, static_cast<Float64>(i8_min), static_cast<Float64>(i8_max)));
+
+    }
+
+    /// for (size_t dimension = 0; dimension < dimensions; ++dimension)
+    /// {
+    ///     const ScalarQuantizationCodebook & codebook = codebooks[dimension];
+    ///     Float64 value = static_cast<Float64>(*(values + dimension));
+    ///     LOG_TRACE(getLogger("Vector Similarity Index"), "{}: {} --> {} (cb: [{}, {}])", dimension, value, quantized_vector[dimension], codebook.min, codebook.max);
+    /// }
+}
+
 template <typename Column>
-void updateImpl(const ColumnArray * column_array, const ColumnArray::Offsets & column_array_offsets, USearchIndexWithSerializationPtr & index, size_t dimensions, size_t rows)
+void updateImpl(
+    const ColumnArray * column_array, const ColumnArray::Offsets & column_array_offsets, USearchIndexWithSerializationPtr & index,
+    size_t dimensions, size_t rows,
+    Float64 scalar_quantization_quantile, size_t scalar_quantization_buffer_size)
 {
     const auto & column_array_data = column_array->getData();
     const auto & column_array_data_float = typeid_cast<const Column &>(column_array_data);
@@ -278,6 +386,51 @@ void updateImpl(const ColumnArray * column_array, const ColumnArray::Offsets & c
         if (column_array_offsets[row + 1] - column_array_offsets[row] != dimensions)
             throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column with vector similarity index must have equal length");
 
+    /// ------------------
+    /// "Quantization" in Usearch means mere downsampling. We implement scalar quantization by ourselves.
+    /// The math only works for i8 and cosine distance.
+    /// --> compute for every dimension the quantiles and store them as "codebook" in the index.
+    if (index->scalar_kind() == unum::usearch::scalar_kind_t::i8_k
+        && index->metric_kind() == unum::usearch::metric_kind_t::cos_k
+        && scalar_quantization_buffer_size != 0 && dimensions < MAX_DIMENSIONS_FOR_SCALAR_QUANTIZATION)
+    {
+        const size_t buffer_size = std::min(rows, scalar_quantization_buffer_size);
+        /// Note: This function (update) can theoretically be called in a chunked fashion but this is currently not done, i.e. update is
+        /// called exactly once per index granule. This simplifies the code, so we make this assumption for now (otherwise, we'd need to
+        /// integrate with getGranuleAndReset which "finalizes" the insert of rows).
+
+        using ColumnValue = std::conditional_t<std::is_same_v<Column,ColumnFloat32>, Float32, Float64>;
+        std::vector<std::vector<ColumnValue>> values_per_dimension;
+
+        values_per_dimension.resize(dimensions);
+        for (auto & values : values_per_dimension)
+            values.resize(buffer_size);
+
+        /// Row-to-column conversion, needed because calculateCodebook sorts along each dimension
+        for (size_t i = 0; i < buffer_size * dimensions; ++i)
+        {
+            ColumnValue value = column_array_data_float_data[i];
+            size_t x = i % dimensions;
+            size_t y = i / dimensions;
+            values_per_dimension[x][y] = value;
+        }
+
+        index->scalar_quantization_codebooks = std::make_optional<ScalarQuantizationCodebooks>();
+        for (size_t dimension = 0; dimension < dimensions; ++dimension)
+        {
+            ScalarQuantizationCodebook codebook = calculateCodebook(values_per_dimension[dimension], scalar_quantization_quantile);
+            /// Invalid codebook that would lead to division-by-0 during quantizaiton. May happen if buffer size is too small or the data
+            /// distribution is too weird. Continue without quantization.
+            if (codebook.min == codebook.max)
+            {
+                index->scalar_quantization_codebooks = std::nullopt;
+                break;
+            }
+            index->scalar_quantization_codebooks->push_back(codebook);
+        }
+    }
+    /// ------------------
+
     /// Reserving space is mandatory
     size_t max_thread_pool_size = Context::getGlobalContextInstance()->getServerSettings()[ServerSetting::max_build_vector_similarity_index_thread_pool_size];
     if (max_thread_pool_size == 0)
@@ -299,24 +452,33 @@ void updateImpl(const ColumnArray * column_array, const ColumnArray::Offsets & c
         if (thread_group)
             CurrentThread::attachToGroupIfDetached(thread_group);
 
-        /// add is thread-safe
-        auto result = index->add(key, &column_array_data_float_data[column_array_offsets[row - 1]]);
-        if (!result)
+        USearchIndexWithSerialization::add_result_t add_result;
+
+        if (index->scalar_quantization_codebooks)
         {
-            throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(result.error.release()));
+            const ScalarQuantizationCodebooks & codebooks = *(index->scalar_quantization_codebooks);
+            std::array<QuantizedValue, MAX_DIMENSIONS_FOR_SCALAR_QUANTIZATION> quantized_vector;
+            quantize(&column_array_data_float_data[column_array_offsets[row - 1]], dimensions, codebooks, quantized_vector);
+            add_result = index->add(key, quantized_vector.data());
+        }
+        else
+        {
+            add_result = index->add(key, &column_array_data_float_data[column_array_offsets[row - 1]]);
         }
 
+        if (!add_result)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(add_result.error.release()));
+
         ProfileEvents::increment(ProfileEvents::USearchAddCount);
-        ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, result.visited_members);
-        ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, result.computed_distances);
+        ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, add_result.visited_members);
+        ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, add_result.computed_distances);
     };
 
-    size_t index_size = index->size();
-
+    const size_t index_size = index->size();
     for (size_t row = 0; row < rows; ++row)
     {
         auto key = static_cast<USearchIndex::vector_key_t>(index_size + row);
-        auto task = [group = CurrentThread::getGroup(), &add_vector_to_index, key, row] { add_vector_to_index(key, row, group); };
+        auto task = [&add_vector_to_index, key, row, thread_group = CurrentThread::getGroup()] { add_vector_to_index(key, row, thread_group); };
         thread_pool.scheduleOrThrowOnError(task);
     }
 
@@ -386,13 +548,12 @@ void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_
     const TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
 
     if (WhichDataType(nested_type_index).isFloat32())
-        updateImpl<ColumnFloat32>(column_array, column_array_offsets, index, dimensions, rows);
+        updateImpl<ColumnFloat32>(column_array, column_array_offsets, index, dimensions, rows, scalar_quantization_quantile, scalar_quantization_buffer_size);
     else if (WhichDataType(nested_type_index).isFloat64())
-        updateImpl<ColumnFloat64>(column_array, column_array_offsets, index, dimensions, rows);
+        updateImpl<ColumnFloat64>(column_array, column_array_offsets, index, dimensions, rows, scalar_quantization_quantile, scalar_quantization_buffer_size);
     else
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected data type Array(Float*)");
 
-
     *pos += rows_read;
 }
 
@@ -448,12 +609,35 @@ std::vector<UInt64> MergeTreeIndexConditionVectorSimilarity::calculateApproximat
     /// synchronize index access, see https://github.com/unum-cloud/usearch/issues/500. As a workaround, we extended USearch' search method
     /// to accept a custom expansion_add setting. The config value is only used on the fly, i.e. not persisted in the index.
 
-    auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search);
-    if (!search_result)
-        throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", String(search_result.error.release()));
+    std::vector<USearchIndex::vector_key_t> neighbors; /// indexes of vectors which were closest to the reference vector
 
-    std::vector<USearchIndex::vector_key_t> neighbors(search_result.size()); /// indexes of vectors which were closest to the reference vector
-    search_result.dump_to(neighbors.data());
+    if (index->scalar_quantization_codebooks)
+    {
+        const ScalarQuantizationCodebooks & codebooks = *(index->scalar_quantization_codebooks);
+        std::array<QuantizedValue, MAX_DIMENSIONS_FOR_SCALAR_QUANTIZATION> quantized_vector;
+        quantize(reference_vector.data(), index->dimensions(), codebooks, quantized_vector);
+        auto search_result = index->search(quantized_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search);
+        if (!search_result)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", search_result.error.release());
+        neighbors.resize(search_result.size());
+        search_result.dump_to(neighbors.data());
+
+        ProfileEvents::increment(ProfileEvents::USearchSearchCount);
+        ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, search_result.visited_members);
+        ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, search_result.computed_distances);
+    }
+    else
+    {
+        auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, expansion_search);
+        if (!search_result)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", search_result.error.release());
+        neighbors.resize(search_result.size());
+        search_result.dump_to(neighbors.data());
+
+        ProfileEvents::increment(ProfileEvents::USearchSearchCount);
+        ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, search_result.visited_members);
+        ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, search_result.computed_distances);
+    }
 
     std::sort(neighbors.begin(), neighbors.end());
 
@@ -466,10 +650,6 @@ std::vector<UInt64> MergeTreeIndexConditionVectorSimilarity::calculateApproximat
         neighbors.erase(std::unique(neighbors.begin(), neighbors.end()), neighbors.end());
 #endif
 
-    ProfileEvents::increment(ProfileEvents::USearchSearchCount);
-    ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, search_result.visited_members);
-    ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, search_result.computed_distances);
-
     return neighbors;
 }
 
@@ -490,9 +670,15 @@ MergeTreeIndexGranulePtr MergeTreeIndexVectorSimilarity::createIndexGranule() co
     return std::make_shared<MergeTreeIndexGranuleVectorSimilarity>(index.name, metric_kind, scalar_kind, usearch_hnsw_params);
 }
 
-MergeTreeIndexAggregatorPtr MergeTreeIndexVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const
+MergeTreeIndexAggregatorPtr MergeTreeIndexVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings & settings) const
 {
-    return std::make_shared<MergeTreeIndexAggregatorVectorSimilarity>(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params);
+    Float64 scalar_quantization_quantile = settings.scalar_quantization_quantile_for_vector_similarity_index;
+    size_t scalar_quantization_buffer_size = settings.scalar_quantization_buffer_size_for_vector_similarity_index;
+
+    if (scalar_quantization_quantile < 0.5 || scalar_quantization_quantile > 1.0)
+        throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Setting 'scalar_quantization_quantile_for_vector_similarity_index' must be in [0.5, 1.0]");
+
+    return std::make_shared<MergeTreeIndexAggregatorVectorSimilarity>(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params, scalar_quantization_quantile, scalar_quantization_buffer_size);
 }
 
 MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const
diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h
index fe5049daf77..6c858c0b7f6 100644
--- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h
+++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h
@@ -4,6 +4,7 @@
 
 #if USE_USEARCH
 
+#include <Storages/MergeTree/MergeTreeIOSettings.h>
 #include <Storages/MergeTree/VectorSimilarityCondition.h>
 #include <Common/Logger.h>
 #include <usearch/index_dense.hpp>
@@ -25,6 +26,16 @@ struct UsearchHnswParams
     size_t expansion_add = default_expansion_add;
 };
 
+/// Statistics required to apply scalar quantization to a single dimension of a vector.
+struct ScalarQuantizationCodebook
+{
+    Float64 min;
+    Float64 max;
+};
+
+/// Statistics required to apply scalar quantization to all dimensions of a vector.
+using ScalarQuantizationCodebooks = std::vector<ScalarQuantizationCodebook>;
+
 using USearchIndex = unum::usearch::index_dense_t;
 
 class USearchIndexWithSerialization : public USearchIndex
@@ -59,6 +70,8 @@ public:
         String toString() const;
     };
 
+    std::optional<ScalarQuantizationCodebooks> scalar_quantization_codebooks;
+
     Statistics getStatistics() const;
 };
 
@@ -100,7 +113,7 @@ private:
     /// Note: USearch prefixes the serialized data with its own version header. We can't rely on that because 1. the index in ClickHouse
     /// is (at least in theory) agnostic of specific vector search libraries, and 2. additional data (e.g. the number of dimensions)
     /// outside USearch exists which we should version separately.
-    static constexpr UInt64 FILE_FORMAT_VERSION = 1;
+    static constexpr UInt64 FILE_FORMAT_VERSION = 2;
 };
 
 
@@ -111,7 +124,9 @@ struct MergeTreeIndexAggregatorVectorSimilarity final : IMergeTreeIndexAggregato
         const Block & index_sample_block,
         unum::usearch::metric_kind_t metric_kind_,
         unum::usearch::scalar_kind_t scalar_kind_,
-        UsearchHnswParams usearch_hnsw_params_);
+        UsearchHnswParams usearch_hnsw_params_,
+        Float64 scalar_quantization_quantile_,
+        size_t quantization_buffer_size_);
 
     ~MergeTreeIndexAggregatorVectorSimilarity() override = default;
 
@@ -124,6 +139,8 @@ struct MergeTreeIndexAggregatorVectorSimilarity final : IMergeTreeIndexAggregato
     const unum::usearch::metric_kind_t metric_kind;
     const unum::usearch::scalar_kind_t scalar_kind;
     const UsearchHnswParams usearch_hnsw_params;
+    const Float64 scalar_quantization_quantile;
+    const size_t scalar_quantization_buffer_size;
     USearchIndexWithSerializationPtr index;
 };
 
diff --git a/src/Storages/MergeTree/MergeTreeSettings.cpp b/src/Storages/MergeTree/MergeTreeSettings.cpp
index 28ae933c3de..5066afe9e6d 100644
--- a/src/Storages/MergeTree/MergeTreeSettings.cpp
+++ b/src/Storages/MergeTree/MergeTreeSettings.cpp
@@ -117,8 +117,8 @@ namespace ErrorCodes
     DECLARE(Bool, add_implicit_sign_column_constraint_for_collapsing_engine, false, "If true, add implicit constraint for sign column for CollapsingMergeTree engine.", 0) \
     DECLARE(Milliseconds, sleep_before_commit_local_part_in_replicated_table_ms, 0, "For testing. Do not change it.", 0) \
     DECLARE(Bool, optimize_row_order, false, "Allow reshuffling of rows during part inserts and merges to improve the compressibility of the new part", 0) \
-    DECLARE(Bool, use_adaptive_write_buffer_for_dynamic_subcolumns, true, "Allow to use adaptive writer buffers during writing dynamic subcolumns to reduce memory usage", 0) \
     DECLARE(UInt64, adaptive_write_buffer_initial_size, 16 * 1024, "Initial size of an adaptive write buffer", 0) \
+    DECLARE(Bool, use_adaptive_write_buffer_for_dynamic_subcolumns, true, "Allow to use adaptive writer buffers during writing dynamic subcolumns to reduce memory usage", 0) \
     DECLARE(UInt64, min_free_disk_bytes_to_perform_insert, 0, "Minimum free disk space bytes to perform an insert.", 0) \
     DECLARE(Float, min_free_disk_ratio_to_perform_insert, 0.0, "Minimum free disk space ratio to perform an insert.", 0) \
     \
@@ -228,6 +228,8 @@ namespace ErrorCodes
     DECLARE(Bool, cache_populated_by_fetch, false, "Only available in ClickHouse Cloud", EXPERIMENTAL) \
     DECLARE(Bool, force_read_through_cache_for_merges, false, "Force read-through filesystem cache for merges", EXPERIMENTAL) \
     DECLARE(Bool, allow_experimental_replacing_merge_with_cleanup, false, "Allow experimental CLEANUP merges for ReplacingMergeTree with is_deleted column.", EXPERIMENTAL) \
+    DECLARE(Float, scalar_quantization_quantile_for_vector_similarity_index, 0.99f, "The quantile for scalar quantization in the vector similarity index. Must be in [0.5, 1.0].", EXPERIMENTAL) \
+    DECLARE(UInt64, scalar_quantization_buffer_size_for_vector_similarity_index, 10'000, "The buffer size for scalar quantization in the vector similarity index. 0 disables scalar quantization.", EXPERIMENTAL) \
     \
     /** Compress marks and primary key. */ \
     DECLARE(Bool, compress_marks, true, "Marks support compression, reduce mark file size and speed up network transmission.", 0) \
diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference
index cf80f46f53c..46be0a9782b 100644
--- a/tests/queries/0_stateless/02354_vector_search_queries.reference
+++ b/tests/queries/0_stateless/02354_vector_search_queries.reference
@@ -247,7 +247,7 @@ Expression (Projection)
             Name: idx
             Description: vector_similarity GRANULARITY 2
             Parts: 1/1
-            Granules: 3/4
+            Granules: 4/4
 -- Index on Array(Float64) column
 6	[0,2]	0
 7	[0,2.1]	0.10000000000000009
diff --git a/tests/queries/0_stateless/02354_vector_search_scalar_quantization.reference b/tests/queries/0_stateless/02354_vector_search_scalar_quantization.reference
new file mode 100644
index 00000000000..89cd5ebf948
--- /dev/null
+++ b/tests/queries/0_stateless/02354_vector_search_scalar_quantization.reference
@@ -0,0 +1,3 @@
+4	[1.4,2.4]	9.830249853916663e-8
+14	[1.4,2.4]	9.830249853916663e-8
+3	[1.3,2.3]	0.14142142367226698
diff --git a/tests/queries/0_stateless/02354_vector_search_scalar_quantization.sql b/tests/queries/0_stateless/02354_vector_search_scalar_quantization.sql
new file mode 100644
index 00000000000..656d098db70
--- /dev/null
+++ b/tests/queries/0_stateless/02354_vector_search_scalar_quantization.sql
@@ -0,0 +1,34 @@
+-- Tags: no-fasttest, no-ordinary-database
+
+-- Tests various scalar quantization for vector similarity indexes with i8 quantization.
+-- The effect of quantization is extremely subtle and hard to test, so we are only testing the related settings.
+
+SET allow_experimental_vector_similarity_index = 1;
+
+SET enable_analyzer = 0;
+
+DROP TABLE IF EXISTS tab;
+
+-- Quantization interval invalid
+
+CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5, scalar_quantization_quantile_for_vector_similarity_index = 1.1;
+INSERT INTO tab VALUES (0, [1.0, 2.0]), (1, [1.1, 2.1]), (2, [1.2, 2.2]), (3, [1.3, 2.3]), (4, [1.4, 2.4]), (5, [1.5, 2.5]), (6, [1.6, 2.6]), (7, [1.7, 2.7]), (8, [1.8, 2.8]), (9, [1.9, 2.9]), (10, [1.0, 2.0]), (11, [1.1, 2.1]), (12, [1.2, 2.2]), (13, [1.3, 2.3]), (14, [1.4, 2.4]), (15, [1.5, 2.5]), (16, [1.6, 2.6]), (17, [1.7, 2.7]), (18, [1.8, 2.8]), (19, [1.9, 2.9]); -- { serverError INVALID_SETTING_VALUE }
+
+DROP TABLE tab;
+
+CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5, scalar_quantization_quantile_for_vector_similarity_index = 0.4;
+INSERT INTO tab VALUES (0, [1.0, 2.0]), (1, [1.1, 2.1]), (2, [1.2, 2.2]), (3, [1.3, 2.3]), (4, [1.4, 2.4]), (5, [1.5, 2.5]), (6, [1.6, 2.6]), (7, [1.7, 2.7]), (8, [1.8, 2.8]), (9, [1.9, 2.9]), (10, [1.0, 2.0]), (11, [1.1, 2.1]), (12, [1.2, 2.2]), (13, [1.3, 2.3]), (14, [1.4, 2.4]), (15, [1.5, 2.5]), (16, [1.6, 2.6]), (17, [1.7, 2.7]), (18, [1.8, 2.8]), (19, [1.9, 2.9]); -- { serverError INVALID_SETTING_VALUE }
+
+DROP TABLE tab;
+
+-- Test that no bad things happen
+CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5, scalar_quantization_quantile_for_vector_similarity_index = 0.9;
+INSERT INTO tab VALUES (0, [1.0, 2.0]), (1, [1.1, 2.1]), (2, [1.2, 2.2]), (3, [1.3, 2.3]), (4, [1.4, 2.4]), (5, [1.5, 2.5]), (6, [1.6, 2.6]), (7, [1.7, 2.7]), (8, [1.8, 2.8]), (9, [1.9, 2.9]), (10, [1.0, 2.0]), (11, [1.1, 2.1]), (12, [1.2, 2.2]), (13, [1.3, 2.3]), (14, [1.4, 2.4]), (15, [1.5, 2.5]), (16, [1.6, 2.6]), (17, [1.7, 2.7]), (18, [1.8, 2.8]), (19, [1.9, 2.9]);
+
+WITH [1.4, 2.4] AS reference_vec
+SELECT id, vec, L2Distance(vec, reference_vec)
+FROM tab
+ORDER BY L2Distance(vec, reference_vec), id
+LIMIT 3;
+
+DROP TABLE tab;
diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt
index a53104581bb..af8b02af839 100644
--- a/utils/check-style/aspell-ignore/en/aspell-dict.txt
+++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@@ -49,6 +49,7 @@ AutoML
 Autocompletion
 AvroConfluent
 AzureQueue
+BFloat
 BIGINT
 BIGSERIAL
 BORO
@@ -1668,6 +1669,7 @@ domainWithoutWWWRFC
 dont
 dotProduct
 dotall
+downsampled
 downsampling
 dplyr
 dragonbox
@@ -3157,4 +3159,3 @@ znode
 znodes
 zookeeperSessionUptime
 zstd
-BFloat