diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp index 77259b18bb9..b215aeae495 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -55,6 +55,7 @@ const std::unordered_map quantizationToSca {"f32", unum::usearch::scalar_kind_t::f32_k}, {"f16", unum::usearch::scalar_kind_t::f16_k}, {"i8", unum::usearch::scalar_kind_t::i8_k}}; +/// Usearch provides more quantizations but ^^ above ones seem the only ones comprehensively supported accross all distance functions. template concept is_set = std::same_as>; @@ -253,8 +254,7 @@ void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_ throw Exception( ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. Position: {}, Block rows: {}.", - *pos, - block.rows()); + *pos, block.rows()); size_t rows_read = std::min(limit, block.rows() - *pos); @@ -268,63 +268,64 @@ void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_ throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column"); const String & index_column_name = index_sample_block.getByPosition(0).name; - ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); + const ColumnPtr & index_column = block.getByName(index_column_name).column; + ColumnPtr column_cut = index_column->cut(*pos, rows_read); - if (const auto & column_array = typeid_cast(column_cut.get())) + const auto * column_array = typeid_cast(column_cut.get()); + if (!column_array) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array(Float32) column"); + + if (column_array->empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty"); + + /// The vector similarity algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays + /// are INSERTed into an vector-similarity-indexed column or if no value was specified at all in which case the arrays take on their default + /// values which is also empty. + if (column_array->isDefaultAt(0)) + throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name); + + const size_t rows = column_array->size(); + + const auto & column_array_offsets = column_array->getOffsets(); + const size_t dimensions = column_array_offsets[0]; + + if (!index) + index = std::make_shared(dimensions, metric_kind, scalar_kind, usearch_hnsw_params); + + /// Also check that previously inserted blocks have the same size as this block. + /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across + /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42. + if (index->dimensions() != dimensions) + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column with vector similarity index must have equal length"); + + /// We use Usearch's index_dense_t as index type which supports only 4 bio entries according to https://github.com/unum-cloud/usearch/tree/main/cpp + if (index->size() + rows > std::numeric_limits::max()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Size of vector similarity index would exceed 4 billion entries"); + + const auto & column_array_data = column_array->getData(); + const auto & column_array_data_float = typeid_cast(column_array_data); + const auto & column_array_data_float_data = column_array_data_float.getData(); + + /// Check all sizes are the same + for (size_t row = 0; row < rows - 1; ++row) + if (column_array_offsets[row + 1] - column_array_offsets[row] != dimensions) + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column with vector similarity index must have equal length"); + + /// Reserving space is mandatory + if (!index->try_reserve(roundUpToPowerOfTwoOrZero(index->size() + rows))) + throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index"); + + for (size_t row = 0; row < rows; ++row) { - const auto & column_array_data = column_array->getData(); - const auto & column_array_data_float = typeid_cast(column_array_data); - const auto & column_array_data_float_data = column_array_data_float.getData(); - - const auto & column_array_offsets = column_array->getOffsets(); - const size_t num_rows = column_array_offsets.size(); - - if (column_array->empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty"); - - /// The vector similarity algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays - /// are INSERTed into an vector-similarity-indexed column or if no value was specified at all in which case the arrays take on their default - /// values which is also empty. - if (column_array->isDefaultAt(0)) - throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name); - - /// Check all sizes are the same - const size_t dimensions = column_array_offsets[0]; - for (size_t i = 0; i < num_rows - 1; ++i) - if (column_array_offsets[i + 1] - column_array_offsets[i] != dimensions) - throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); - - /// Also check that previously inserted blocks have the same size as this block. - /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across - /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42. - if (index && index->dimensions() != dimensions) - throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); - - if (!index) - index = std::make_shared(dimensions, metric_kind, scalar_kind, usearch_hnsw_params); - - /// We use Usearch's index_dense_t as index type which supports only 4 bio entries according to https://github.com/unum-cloud/usearch/tree/main/cpp - if (index->size() + num_rows > std::numeric_limits::max()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Size of vector similarity index in column {} would exceed 4 billion entries", index_column_name); - - /// Reserving space is mandatory - if (!index->try_reserve(roundUpToPowerOfTwoOrZero(index->size() + num_rows))) - throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index"); - - for (size_t row = 0; row < num_rows; ++row) + if (auto result = index->add(static_cast(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]); !result) + throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(result.error.release())); + else { - if (auto result = index->add(static_cast(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]); !result) - throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(result.error.release())); - else - { - ProfileEvents::increment(ProfileEvents::USearchAddCount); - ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, result.visited_members); - ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, result.computed_distances); - } + ProfileEvents::increment(ProfileEvents::USearchAddCount); + ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, result.visited_members); + ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, result.computed_distances); } } - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array(Float32) column"); *pos += rows_read; } @@ -483,7 +484,7 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta if (!quantizationToScalarKind.contains(index.arguments[2].safeGet())) throw Exception(ErrorCodes::INCORRECT_DATA, "Third argument (quantization) of vector similarity index is not supported. Supported quantizations are: {}", joinByComma(quantizationToScalarKind)); - /// Call Usearche's own parameter validation method for HNSW-specific parameters + /// Call Usearch's own parameter validation method for HNSW-specific parameters UInt64 m = index.arguments[3].safeGet(); UInt64 ef_construction = index.arguments[4].safeGet(); UInt64 ef_search = index.arguments[5].safeGet(); @@ -498,18 +499,14 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta if (index.column_names.size() != 1 || index.data_types.size() != 1) throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Vector similarity indexes must be created on a single column"); - /// Check data type of the indexed column: + /// Check that the data type is Array(Float32) DataTypePtr data_type = index.sample_block.getDataTypes()[0]; - if (const auto * data_type_array = typeid_cast(data_type.get())) - { - TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); - if (!WhichDataType(nested_type_index).isFloat32()) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)"); - } - else - { + const auto * data_type_array = typeid_cast(data_type.get()); + if (!data_type_array) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)"); + TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); + if (!WhichDataType(nested_type_index).isFloat32()) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)"); - } } } diff --git a/tests/queries/0_stateless/02354_vector_search_bugs.reference b/tests/queries/0_stateless/02354_vector_search_bugs.reference index a27b086e118..8da05c8a7c0 100644 --- a/tests/queries/0_stateless/02354_vector_search_bugs.reference +++ b/tests/queries/0_stateless/02354_vector_search_bugs.reference @@ -1,3 +1,4 @@ +Rejects INSERTs of Arrays with different sizes Issue #52258: Empty Arrays or Arrays with default values are rejected It is possible to create parts with different Array vector sizes but there will be an error at query time Correctness of index with > 1 mark diff --git a/tests/queries/0_stateless/02354_vector_search_bugs.sql b/tests/queries/0_stateless/02354_vector_search_bugs.sql index 7c66b4b8e45..51e2e6ce2b7 100644 --- a/tests/queries/0_stateless/02354_vector_search_bugs.sql +++ b/tests/queries/0_stateless/02354_vector_search_bugs.sql @@ -7,6 +7,12 @@ SET enable_analyzer = 1; -- 0 vs. 1 produce slightly different error codes, make DROP TABLE IF EXISTS tab; +SELECT 'Rejects INSERTs of Arrays with different sizes'; + +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; +INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA } +DROP TABLE tab; + SELECT 'Issue #52258: Empty Arrays or Arrays with default values are rejected'; CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree() ORDER BY id; diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference index f18daa6e02e..5963f4b5834 100644 --- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference +++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference @@ -5,4 +5,3 @@ Two or six index arguments 4nd argument (M), if given, must be UInt64 and > 1 Must be created on single column Must be created on Array(Float32) columns -Rejects INSERTs of Arrays with different sizes diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql index de9d37e1000..b39b8d3e754 100644 --- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql +++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql @@ -38,8 +38,3 @@ CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE vector_similarity('hn CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } - -SELECT 'Rejects INSERTs of Arrays with different sizes'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA } -DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference index 5cd5ac006db..374ed1f8abd 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.reference +++ b/tests/queries/0_stateless/02354_vector_search_queries.reference @@ -1,9 +1,7 @@ 10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block -- ORDER-BY-type 5 [0,2] 0 6 [0,2.1] 0.09999990463256836 7 [0,2.2] 0.20000004768371582 -- ORDER-BY-type, EXPLAIN Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -20,11 +18,9 @@ Expression (Projection) Parts: 1/1 Granules: 1/1 12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block -- ORDER-BY-type 6 [0,2] 0 7 [0,2.1] 0.09999990463256836 8 [0,2.2] 0.20000004768371582 -- ORDER-BY-type, EXPLAIN Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -41,11 +37,11 @@ Expression (Projection) Parts: 1/1 Granules: 2/4 Special cases -- ORDER-BY-type +-- Non-default metric, M, ef_construction, ef_search 6 [1,9.3] 0.005731362878640178 1 [2,3.2] 0.15200169244542905 7 [5.5,4.7] 0.3503476876550442 -- Special case: setting "max_limit_for_ann_queries" +-- Setting "max_limit_for_ann_queries" Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) diff --git a/tests/queries/0_stateless/02354_vector_search_queries.sql b/tests/queries/0_stateless/02354_vector_search_queries.sql index 486910ddbbb..2c6a7f10776 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.sql +++ b/tests/queries/0_stateless/02354_vector_search_queries.sql @@ -14,14 +14,12 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); -SELECT '- ORDER-BY-type'; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) FROM tab ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -SELECT '- ORDER-BY-type, EXPLAIN'; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) @@ -37,14 +35,12 @@ SELECT '12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexe CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]); -SELECT '- ORDER-BY-type'; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) FROM tab ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -SELECT '- ORDER-BY-type, EXPLAIN'; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) @@ -56,19 +52,18 @@ DROP TABLE tab; SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen. --- Test with non-default metric, M, ef_construction, ef_search +SELECT '-- Non-default metric, M, ef_construction, ef_search'; CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99, 66) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -SELECT '- ORDER-BY-type'; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, cosineDistance(vec, reference_vec) FROM tab ORDER BY cosineDistance(vec, reference_vec) LIMIT 3; -SELECT '- Special case: setting "max_limit_for_ann_queries"'; +SELECT '-- Setting "max_limit_for_ann_queries"'; EXPLAIN indexes=1 WITH [0.0, 2.0] as reference_vec SELECT id, vec, cosineDistance(vec, reference_vec)