From c5b92413cac091c4e28acb5160b89fcabc37a853 Mon Sep 17 00:00:00 2001 From: flynn Date: Fri, 30 Aug 2024 08:28:31 +0000 Subject: [PATCH 1/3] Fix vector similarity index not working for cosineDistance --- src/Storages/MergeTree/VectorSimilarityCondition.cpp | 2 ++ src/Storages/MergeTree/VectorSimilarityCondition.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/VectorSimilarityCondition.cpp b/src/Storages/MergeTree/VectorSimilarityCondition.cpp index c8f33857640..251cdde65ab 100644 --- a/src/Storages/MergeTree/VectorSimilarityCondition.cpp +++ b/src/Storages/MergeTree/VectorSimilarityCondition.cpp @@ -44,6 +44,8 @@ VectorSimilarityCondition::Info::DistanceFunction stringToDistanceFunction(std:: { if (distance_function == "L2Distance") return VectorSimilarityCondition::Info::DistanceFunction::L2; + else if (distance_function == "cosineDistance") + return VectorSimilarityCondition::Info::DistanceFunction::Cosine; else return VectorSimilarityCondition::Info::DistanceFunction::Unknown; } diff --git a/src/Storages/MergeTree/VectorSimilarityCondition.h b/src/Storages/MergeTree/VectorSimilarityCondition.h index 2380f8f46b0..e2946222f49 100644 --- a/src/Storages/MergeTree/VectorSimilarityCondition.h +++ b/src/Storages/MergeTree/VectorSimilarityCondition.h @@ -57,7 +57,8 @@ public: enum class DistanceFunction : uint8_t { Unknown, - L2 + L2, + Cosine }; std::vector reference_vector; From af7adfe4b2f8e9b3b7354314a91d19a19b7955f9 Mon Sep 17 00:00:00 2001 From: flynn Date: Fri, 30 Aug 2024 09:03:23 +0000 Subject: [PATCH 2/3] Remove unused code --- src/Storages/MergeTree/VectorSimilarityCondition.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/VectorSimilarityCondition.h b/src/Storages/MergeTree/VectorSimilarityCondition.h index e2946222f49..83ae1e19bfb 100644 --- a/src/Storages/MergeTree/VectorSimilarityCondition.h +++ b/src/Storages/MergeTree/VectorSimilarityCondition.h @@ -143,6 +143,7 
@@ private: void traverseOrderByAST(const ASTPtr & node, RPN & rpn); /// Returns true and stores ANNExpr if the query has valid WHERE section + /// TODO NOT implemented, WHERE does not supported. static bool matchRPNWhere(RPN & rpn, Info & info); /// Returns true and stores ANNExpr if the query has valid ORDERBY section @@ -151,9 +152,6 @@ private: /// Returns true and stores Length if we have valid LIMIT clause in query static bool matchRPNLimit(RPNElement & rpn, UInt64 & limit); - /// Matches dist function, reference vector, column name - static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, Info & info); - /// Gets float or int from AST node static float getFloatOrIntLiteralOrPanic(const RPN::iterator& iter); From 2620325b303b9318fdd347372f66faa131ca9401 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 4 Sep 2024 08:59:52 +0000 Subject: [PATCH 3/3] Minor fixups + add tests --- .../MergeTree/VectorSimilarityCondition.cpp | 4 ++-- .../MergeTree/VectorSimilarityCondition.h | 6 +----- .../02354_vector_search_queries.reference | 15 +++++++++++++++ .../0_stateless/02354_vector_search_queries.sql | 7 +++++++ 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/Storages/MergeTree/VectorSimilarityCondition.cpp b/src/Storages/MergeTree/VectorSimilarityCondition.cpp index 251cdde65ab..641b0037e7b 100644 --- a/src/Storages/MergeTree/VectorSimilarityCondition.cpp +++ b/src/Storages/MergeTree/VectorSimilarityCondition.cpp @@ -40,7 +40,7 @@ void extractReferenceVectorFromLiteral(std::vector & reference_vector, } } -VectorSimilarityCondition::Info::DistanceFunction stringToDistanceFunction(std::string_view distance_function) +VectorSimilarityCondition::Info::DistanceFunction stringToDistanceFunction(const String & distance_function) { if (distance_function == "L2Distance") return VectorSimilarityCondition::Info::DistanceFunction::L2; @@ -59,7 +59,7 @@ VectorSimilarityCondition::VectorSimilarityCondition(const SelectQueryInfo & que , 
index_is_useful(checkQueryStructure(query_info)) {} -bool VectorSimilarityCondition::alwaysUnknownOrTrue(String distance_function) const +bool VectorSimilarityCondition::alwaysUnknownOrTrue(const String & distance_function) const { if (!index_is_useful) return true; /// query isn't supported diff --git a/src/Storages/MergeTree/VectorSimilarityCondition.h b/src/Storages/MergeTree/VectorSimilarityCondition.h index 83ae1e19bfb..2e9e06a31d0 100644 --- a/src/Storages/MergeTree/VectorSimilarityCondition.h +++ b/src/Storages/MergeTree/VectorSimilarityCondition.h @@ -69,7 +69,7 @@ public: }; /// Returns false if query can be speeded up by an ANN index, true otherwise. - bool alwaysUnknownOrTrue(String distance_function) const; + bool alwaysUnknownOrTrue(const String & distance_function) const; std::vector getReferenceVector() const; size_t getDimensions() const; @@ -142,10 +142,6 @@ private: /// Traverses the AST of ORDERBY section void traverseOrderByAST(const ASTPtr & node, RPN & rpn); - /// Returns true and stores ANNExpr if the query has valid WHERE section - /// TODO NOT implemented, WHERE does not supported. 
- static bool matchRPNWhere(RPN & rpn, Info & info); - /// Returns true and stores ANNExpr if the query has valid ORDERBY section static bool matchRPNOrderBy(RPN & rpn, Info & info); diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference index faff306ef60..e42f91d05dc 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.reference +++ b/tests/queries/0_stateless/02354_vector_search_queries.reference @@ -41,6 +41,21 @@ Special cases 6 [1,9.3] 0.005731362878640178 1 [2,3.2] 0.15200169244542905 7 [5.5,4.7] 0.3503476876550442 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 2/4 -- Setting "max_limit_for_ann_queries" Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) diff --git a/tests/queries/0_stateless/02354_vector_search_queries.sql b/tests/queries/0_stateless/02354_vector_search_queries.sql index 17939992165..8769e5c56bb 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.sql +++ b/tests/queries/0_stateless/02354_vector_search_queries.sql @@ -63,6 +63,13 @@ FROM tab ORDER BY cosineDistance(vec, reference_vec) LIMIT 3; +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + SELECT '-- Setting "max_limit_for_ann_queries"'; EXPLAIN indexes=1 WITH [0.0, 2.0] as reference_vec