2024-08-09 09:47:50 +00:00
|
|
|
-- Tags: no-fasttest, no-ordinary-database
|
2023-09-19 18:52:46 +00:00
|
|
|
|
2024-08-09 12:28:03 +00:00
|
|
|
-- Tests various simple approximate nearest neighbor (ANN) queries that utilize vector similarity indexes.
|
2023-09-19 18:52:46 +00:00
|
|
|
|
2024-08-09 14:21:25 +00:00
|
|
|
-- The vector similarity index type is gated behind an experimental setting; enable it for this session.
SET allow_experimental_vector_similarity_index = 1;
|
2023-09-19 18:52:46 +00:00
|
|
|
|
2024-08-09 09:36:39 +00:00
|
|
|
-- Run all queries below with the analyzer disabled.
-- NOTE(review): presumably the expected index usage is only pinned for the non-analyzer pipeline — confirm.
SET enable_analyzer = 0;
|
|
|
|
|
2024-08-09 10:15:03 +00:00
|
|
|
-- Case: all rows fit into one granule, which is covered by one index block.
SELECT '10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block';

DROP TABLE IF EXISTS tab;

-- The index has no GRANULARITY clause, so the default index granularity applies.
-- NOTE(review): the label above states that default as 1 million — confirm against the server version under test.
CREATE TABLE tab
(
    id Int32,
    vec Array(Float32),
    INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 8192;

-- Two groups of points: ids 0-4 on the x axis, ids 5-9 on the y axis.
INSERT INTO tab (id, vec) VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]);

-- Three nearest rows to the reference vector by L2 distance.
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

-- Same query under EXPLAIN to show which indexes are applied.
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

DROP TABLE tab;
|
2023-09-19 18:52:46 +00:00
|
|
|
|
|
|
|
|
2024-08-09 10:15:03 +00:00
|
|
|
-- Case: 12 rows with index_granularity = 3 give 4 granules; index GRANULARITY 2 groups them into 2 index blocks.
SELECT '12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block';

CREATE TABLE tab
(
    id Int32,
    vec Array(Float32),
    INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 3;

-- Two groups of points: ids 0-5 on the x axis, ids 6-11 on the y axis.
INSERT INTO tab (id, vec) VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]);

-- Three nearest rows to the reference vector by L2 distance.
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

-- Same query under EXPLAIN to show which indexes are applied.
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

DROP TABLE tab;
|
2023-09-19 18:52:46 +00:00
|
|
|
|
|
|
|
|
2024-08-09 10:15:03 +00:00
|
|
|
-- The checks from here on are smoke tests, not a systematic test: they only make sure nothing bad happens.
SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen.

SELECT '-- Non-default metric, M, ef_construction, ef_search';

-- Index built with cosineDistance instead of L2Distance and with explicit trailing parameters.
-- NOTE(review): 42, 99, 66 presumably map to M, ef_construction, ef_search in that order (per the label) — confirm.
CREATE TABLE tab
(
    id Int32,
    vec Array(Float32),
    INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99, 66) GRANULARITY 2
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 3;

INSERT INTO tab (id, vec) VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);

-- Three nearest rows to the reference vector by cosine distance.
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    cosineDistance(vec, reference_vec)
FROM tab
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3;

-- Same query under EXPLAIN to show which indexes are applied.
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    cosineDistance(vec, reference_vec)
FROM tab
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3;
|
|
|
|
|
2024-08-14 15:44:22 +00:00
|
|
|
-- Case: a LIMIT larger than max_limit_for_ann_queries must make the query skip the ANN index.
-- Reuses the cosineDistance table `tab` created just before this section and drops it afterwards.
SELECT '-- Setting "max_limit_for_ann_queries"';

-- Formatting aligned with the other EXPLAIN queries in this file (`indexes = 1`, uppercase `AS`).
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3
SETTINGS max_limit_for_ann_queries = 2; -- LIMIT 3 > 2 --> don't use the ann index

DROP TABLE tab;
|
2024-08-15 09:32:17 +00:00
|
|
|
|
|
|
|
-- Case: one table per quantization ('f64', 'f32', 'f16', 'bf16', 'i8'); everything else is identical.
-- NOTE(review): the trailing 0, 0, 0 presumably select default HNSW parameters — confirm.
SELECT '-- Non-default quantization';

CREATE TABLE tab_f64
(
    id Int32,
    vec Array(Float32),
    INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f64', 0, 0, 0) GRANULARITY 2
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 3;

CREATE TABLE tab_f32
(
    id Int32,
    vec Array(Float32),
    INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0, 0) GRANULARITY 2
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 3;

CREATE TABLE tab_f16
(
    id Int32,
    vec Array(Float32),
    INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0, 0) GRANULARITY 2
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 3;

CREATE TABLE tab_bf16
(
    id Int32,
    vec Array(Float32),
    INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 0, 0, 0) GRANULARITY 2
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 3;

CREATE TABLE tab_i8
(
    id Int32,
    vec Array(Float32),
    INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0, 0) GRANULARITY 2
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 3;

-- The same 12 rows go into every table.
INSERT INTO tab_f64 (id, vec) VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
INSERT INTO tab_f32 (id, vec) VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
INSERT INTO tab_f16 (id, vec) VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
INSERT INTO tab_bf16 (id, vec) VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
INSERT INTO tab_i8 (id, vec) VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
|
|
|
|
|
2024-08-21 14:11:33 +00:00
|
|
|
-- 'f64' quantization: three nearest rows by L2 distance, then the same query under EXPLAIN.
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_f64
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_f64
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
|
|
|
|
|
2024-08-15 09:32:17 +00:00
|
|
|
-- 'f32' quantization: three nearest rows by L2 distance, then the same query under EXPLAIN.
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_f32
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_f32
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
|
|
|
|
|
|
|
|
-- 'f16' quantization: three nearest rows by L2 distance, then the same query under EXPLAIN.
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_f16
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_f16
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
|
|
|
|
|
2024-08-21 14:11:33 +00:00
|
|
|
-- 'bf16' quantization: three nearest rows by L2 distance, then the same query under EXPLAIN.
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_bf16
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_bf16
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
|
|
|
|
|
2024-08-15 09:32:17 +00:00
|
|
|
-- 'i8' quantization: three nearest rows by L2 distance, then the same query under EXPLAIN.
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_i8
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab_i8
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
|
|
|
|
|
2024-08-21 14:11:33 +00:00
|
|
|
-- Remove the per-quantization tables; they are independent, so the drop order does not matter.
DROP TABLE tab_i8;
DROP TABLE tab_bf16;
DROP TABLE tab_f16;
DROP TABLE tab_f32;
DROP TABLE tab_f64;
|
2024-08-15 10:42:06 +00:00
|
|
|
|
|
|
|
-- Case: the indexed column is Array(Float64) instead of Array(Float32).
SELECT '-- Index on Array(Float64) column';

CREATE TABLE tab
(
    id Int32,
    vec Array(Float64),
    INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 3;

-- Two groups of points: ids 0-5 on the x axis, ids 6-11 on the y axis.
INSERT INTO tab (id, vec) VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]);

-- Three nearest rows to the reference vector by L2 distance.
WITH [0.0, 2.0] AS reference_vec
SELECT
    id,
    vec,
    L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;

DROP TABLE tab;
|