diff --git a/.gitmodules b/.gitmodules index bbc8fc7d06c..a3b6450032a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -332,7 +332,7 @@ url = https://github.com/ClickHouse/usearch.git [submodule "contrib/SimSIMD"] path = contrib/SimSIMD - url = https://github.com/ashvardanian/SimSIMD.git + url = https://github.com/ClickHouse/SimSIMD.git [submodule "contrib/FP16"] path = contrib/FP16 url = https://github.com/Maratyszcza/FP16.git diff --git a/contrib/SimSIMD b/contrib/SimSIMD index ff51434d90c..935fef2964b 160000 --- a/contrib/SimSIMD +++ b/contrib/SimSIMD @@ -1 +1 @@ -Subproject commit ff51434d90c66f916e94ff05b24530b127aa4cff +Subproject commit 935fef2964bc38e995c5f465b42259a35b8cf0d3 diff --git a/contrib/usearch b/contrib/usearch index 1706420acaf..53799b84ca9 160000 --- a/contrib/usearch +++ b/contrib/usearch @@ -1 +1 @@ -Subproject commit 1706420acafbd83d852c512dcf343af0a4059e48 +Subproject commit 53799b84ca9ad708b060d0b1cfa5f039371721cd diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference index 223a18b57bf..cf80f46f53c 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.reference +++ b/tests/queries/0_stateless/02354_vector_search_queries.reference @@ -67,7 +67,7 @@ Expression (Projection) Condition: true Parts: 1/1 Granules: 4/4 --- Non-default quantization +-- Test all distance metrics x all quantization 1 [2,3.2] 2.3323807824711897 4 [2.4,5.2] 3.9999999046325727 2 [4.2,3.4] 4.427188573446585 @@ -75,7 +75,7 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_f64) + ReadFromMergeTree (default.tab_l2_f64) Indexes: PrimaryKey Condition: true @@ -93,7 +93,7 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_f32) + ReadFromMergeTree (default.tab_l2_f32) Indexes: PrimaryKey Condition: true @@ -111,7 +111,7 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_f16) + ReadFromMergeTree (default.tab_l2_f16) Indexes: PrimaryKey Condition: true @@ -129,7 +129,7 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_bf16) + ReadFromMergeTree (default.tab_l2_bf16) Indexes: PrimaryKey Condition: true @@ -147,7 +147,97 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_i8) + ReadFromMergeTree (default.tab_l2_i8) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_f64) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_f32) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_f16) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_bf16) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_i8) Indexes: PrimaryKey Condition: true diff --git a/tests/queries/0_stateless/02354_vector_search_queries.sql b/tests/queries/0_stateless/02354_vector_search_queries.sql index 71b8a1e520a..0941f9a43d6 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.sql +++ b/tests/queries/0_stateless/02354_vector_search_queries.sql @@ -81,88 +81,181 @@ SETTINGS max_limit_for_ann_queries = 2; -- LIMIT 3 > 2 --> don't use the ann ind DROP TABLE tab; -SELECT '-- Non-default quantization'; -CREATE TABLE tab_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f64', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -INSERT INTO tab_f64 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -INSERT INTO tab_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -INSERT INTO tab_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -INSERT INTO tab_bf16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -INSERT INTO tab_i8 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +SELECT '-- Test all distance metrics x all quantization'; + +DROP TABLE IF EXISTS tab_l2_f64; +DROP TABLE IF EXISTS tab_l2_f32; +DROP TABLE IF EXISTS tab_l2_f16; +DROP TABLE IF EXISTS tab_l2_bf16; +DROP TABLE IF EXISTS tab_l2_i8; +DROP TABLE IF EXISTS tab_cos_f64; +DROP TABLE IF EXISTS tab_cos_f32; +DROP TABLE IF EXISTS tab_cos_f16; +DROP TABLE IF EXISTS tab_cos_bf16; +DROP TABLE IF EXISTS tab_cos_i8; + +CREATE TABLE tab_l2_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f64', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_l2_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_l2_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_l2_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_l2_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f64', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'bf16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'i8', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; + +INSERT INTO tab_l2_f64 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_l2_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_l2_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_l2_bf16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_l2_i8 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_f64 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_bf16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_i8 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f64 +FROM tab_l2_f64 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f64 +FROM tab_l2_f64 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f32 +FROM tab_l2_f32 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f32 +FROM tab_l2_f32 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f16 +FROM tab_l2_f16 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f16 +FROM tab_l2_f16 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_bf16 +FROM tab_l2_bf16 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_bf16 +FROM tab_l2_bf16 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_i8 +FROM tab_l2_i8 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_i8 +FROM tab_l2_i8 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -DROP TABLE tab_f64; -DROP TABLE tab_f32; -DROP TABLE tab_f16; -DROP TABLE tab_bf16; -DROP TABLE tab_i8; +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f64 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f64 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f32 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f32 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f16 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f16 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_bf16 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_bf16 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_i8 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_i8 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +DROP TABLE tab_l2_f64; +DROP TABLE tab_l2_f32; +DROP TABLE tab_l2_f16; +DROP TABLE tab_l2_bf16; +DROP TABLE tab_l2_i8; +DROP TABLE tab_cos_f64; +DROP TABLE tab_cos_f32; +DROP TABLE tab_cos_f16; +DROP TABLE tab_cos_bf16; +DROP TABLE tab_cos_i8; SELECT '-- Index on Array(Float64) column'; CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;