Merge pull request #69090 from ucasfl/fix-vector-index

Fix bug that vector similarity index does not work for cosine distance
This commit is contained in:
Robert Schulze 2024-09-06 10:24:46 +00:00 committed by GitHub
commit 8c9cf37858
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 29 additions and 10 deletions

View File

@ -40,10 +40,12 @@ void extractReferenceVectorFromLiteral(std::vector<Float64> & reference_vector,
}
}
VectorSimilarityCondition::Info::DistanceFunction stringToDistanceFunction(std::string_view distance_function)
VectorSimilarityCondition::Info::DistanceFunction stringToDistanceFunction(const String & distance_function)
{
if (distance_function == "L2Distance")
return VectorSimilarityCondition::Info::DistanceFunction::L2;
else if (distance_function == "cosineDistance")
return VectorSimilarityCondition::Info::DistanceFunction::Cosine;
else
return VectorSimilarityCondition::Info::DistanceFunction::Unknown;
}
@ -57,7 +59,7 @@ VectorSimilarityCondition::VectorSimilarityCondition(const SelectQueryInfo & que
, index_is_useful(checkQueryStructure(query_info))
{}
bool VectorSimilarityCondition::alwaysUnknownOrTrue(String distance_function) const
bool VectorSimilarityCondition::alwaysUnknownOrTrue(const String & distance_function) const
{
if (!index_is_useful)
return true; /// query isn't supported

View File

@ -57,7 +57,8 @@ public:
enum class DistanceFunction : uint8_t
{
Unknown,
L2
L2,
Cosine
};
std::vector<Float64> reference_vector;
@ -68,7 +69,7 @@ public:
};
/// Returns false if query can be speeded up by an ANN index, true otherwise.
bool alwaysUnknownOrTrue(String distance_function) const;
bool alwaysUnknownOrTrue(const String & distance_function) const;
std::vector<Float64> getReferenceVector() const;
size_t getDimensions() const;
@ -141,18 +142,12 @@ private:
/// Traverses the AST of ORDERBY section
void traverseOrderByAST(const ASTPtr & node, RPN & rpn);
/// Returns true and stores ANNExpr if the query has valid WHERE section
static bool matchRPNWhere(RPN & rpn, Info & info);
/// Returns true and stores ANNExpr if the query has valid ORDERBY section
static bool matchRPNOrderBy(RPN & rpn, Info & info);
/// Returns true and stores Length if we have valid LIMIT clause in query
static bool matchRPNLimit(RPNElement & rpn, UInt64 & limit);
/// Matches dist function, reference vector, column name
static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, Info & info);
/// Gets float or int from AST node
static float getFloatOrIntLiteralOrPanic(const RPN::iterator& iter);

View File

@ -41,6 +41,21 @@ Special cases
6 [1,9.3] 0.005731362878640178
1 [2,3.2] 0.15200169244542905
7 [5.5,4.7] 0.3503476876550442
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: idx
Description: vector_similarity GRANULARITY 2
Parts: 1/1
Granules: 2/4
-- Setting "max_limit_for_ann_queries"
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))

View File

@ -63,6 +63,13 @@ FROM tab
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3;
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3;
SELECT '-- Setting "max_limit_for_ann_queries"';
EXPLAIN indexes=1
WITH [0.0, 2.0] as reference_vec