Merge pull request #63675 from rschu1ze/vector-search

Initial implementation of vector similarity index
This commit is contained in:
Robert Schulze 2024-08-13 15:06:20 +00:00 committed by GitHub
commit 2ffcc97af2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
50 changed files with 1492 additions and 2524 deletions

3
.gitmodules vendored
View File

@ -230,9 +230,6 @@
[submodule "contrib/minizip-ng"]
path = contrib/minizip-ng
url = https://github.com/zlib-ng/minizip-ng
[submodule "contrib/annoy"]
path = contrib/annoy
url = https://github.com/ClickHouse/annoy
[submodule "contrib/qpl"]
path = contrib/qpl
url = https://github.com/intel/qpl

View File

@ -205,9 +205,8 @@ add_contrib (morton-nd-cmake morton-nd)
if (ARCH_S390X)
add_contrib(crc32-s390x-cmake crc32-s390x)
endif()
add_contrib (annoy-cmake annoy)
option(ENABLE_USEARCH "Enable USearch (Approximate Neighborhood Search, HNSW) support" ${ENABLE_LIBRARIES})
option(ENABLE_USEARCH "Enable USearch" ${ENABLE_LIBRARIES})
if (ENABLE_USEARCH)
add_contrib (FP16-cmake FP16)
add_contrib (robin-map-cmake robin-map)

1
contrib/annoy vendored

@ -1 +0,0 @@
Subproject commit f2ac8e7b48f9a9cf676d3b58286e5455aba8e956

View File

@ -1,24 +0,0 @@
option(ENABLE_ANNOY "Enable Annoy index support" ${ENABLE_LIBRARIES})
# Annoy index should be disabled with undefined sanitizer. Because of memory storage optimizations
# (https://github.com/ClickHouse/annoy/blob/9d8a603a4cd252448589e84c9846f94368d5a289/src/annoylib.h#L442-L463)
# UBSan fails and leads to crash. Simmilar issue is already opened in Annoy repo
# https://github.com/spotify/annoy/issues/456
# Problem with aligment can lead to errors like
# (https://stackoverflow.com/questions/46790550/c-undefined-behavior-strict-aliasing-rule-or-incorrect-alignment)
# or will lead to crash on arm https://developer.arm.com/documentation/ka003038/latest
# This issues should be resolved before annoy became non-experimental (--> setting "allow_experimental_annoy_index")
if ((NOT ENABLE_ANNOY) OR (SANITIZE STREQUAL "undefined") OR (ARCH_AARCH64))
message (STATUS "Not using annoy")
return()
endif()
set(ANNOY_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/annoy")
set(ANNOY_SOURCE_DIR "${ANNOY_PROJECT_DIR}/src")
add_library(_annoy INTERFACE)
target_include_directories(_annoy SYSTEM INTERFACE ${ANNOY_SOURCE_DIR})
add_library(ch_contrib::annoy ALIAS _annoy)
target_compile_definitions(_annoy INTERFACE ENABLE_ANNOY)
target_compile_definitions(_annoy INTERFACE ANNOYLIB_MULTITHREADED_BUILD)

View File

@ -1,9 +1,7 @@
set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
set(USEARCH_SOURCE_DIR "${USEARCH_PROJECT_DIR}/include")
set(FP16_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/FP16")
set(ROBIN_MAP_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/robin-map")
set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD-map")
set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")
set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
add_library(_usearch INTERFACE)
@ -11,7 +9,6 @@ target_include_directories(_usearch SYSTEM INTERFACE
${FP16_PROJECT_DIR}/include
${ROBIN_MAP_PROJECT_DIR}/include
${SIMSIMD_PROJECT_DIR}/include
${USEARCH_SOURCE_DIR})
${USEARCH_PROJECT_DIR}/include)
add_library(ch_contrib::usearch ALIAS _usearch)
target_compile_definitions(_usearch INTERFACE ENABLE_USEARCH)

View File

@ -17,7 +17,7 @@ In terms of SQL, the nearest neighborhood problem can be expressed as follows:
``` sql
SELECT *
FROM table_with_ann_index
FROM table
ORDER BY Distance(vectors, Point)
LIMIT N
```
@ -27,75 +27,109 @@ Function `Distance` computes the distance between two vectors. Often, the Euclid
distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17,
0.33, ...)`, and `N` limits the number of search results.
An alternative formulation of the nearest neighborhood search problem looks as follows:
This query returns the top-`N` closest points to the reference point. Parameter `N` limits the number of returned values which is useful for
situations where `MaxDistance` is difficult to determine in advance.
``` sql
SELECT *
FROM table_with_ann_index
WHERE Distance(vectors, Point) < MaxDistance
LIMIT N
```
While the first query returns the top-`N` closest points to the reference point, the second query returns all points closer to the reference
point than a maximally allowed radius `MaxDistance`. Parameter `N` limits the number of returned values which is useful for situations where
`MaxDistance` is difficult to determine in advance.
With brute force search, both queries are expensive (linear in the number of points) because the distance between all points in `vectors` and
With brute force search, the query is expensive (linear in the number of points) because the distance between all points in `vectors` and
`Point` must be computed. To speed this process up, Approximate Nearest Neighbor Search Indexes (ANN indexes) store a compact representation
of the search space (using clustering, search trees, etc.) which allows to compute an approximate answer much quicker (in sub-linear time).
# Creating and Using ANN Indexes {#creating_using_ann_indexes}
# Creating and Using Vector Similarity Indexes
Syntax to create an ANN index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column:
Syntax to create a vector similarity index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column:
```sql
CREATE TABLE table_with_ann_index
CREATE TABLE table
(
`id` Int64,
`vectors` Array(Float32),
INDEX [ann_index_name vectors TYPE [ann_index_type]([ann_index_parameters]) [GRANULARITY [N]]
id Int64,
vectors Array(Float32),
INDEX index_name vectors TYPE vector_similarity(method, distance_function[, quantization, connectivity, expansion_add, expansion_search]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```
Parameters:
- `method`: Supports currently only `hnsw`.
- `distance_function`: either `L2Distance` (the [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) - the length of a
line between two points in Euclidean space), or `cosineDistance` (the [cosine
distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance)- the angle between two non-zero vectors).
- `quantization`: either `f32`, `f16`, or `i8` for storing the vector with reduced precision (optional, default: `f32`)
- `m`: the number of neighbors per graph node (optional, default: 16)
- `ef_construction`: (optional, default: 128)
- `ef_search`: (optional, default: 64)
Example:
```sql
CREATE TABLE table
(
id Int64,
vectors Array(Float32),
INDEX idx vectors TYPE vector_similarity('hnsw', 'L2Distance') -- Alternative syntax: TYPE vector_similarity(hnsw, L2Distance)
)
ENGINE = MergeTree
ORDER BY id;
```
Vector similarity indexes are based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
algorithm](https://arxiv.org/abs/1603.09320), i.e., a hierarchical graph where each point represents a vector and the edges represent
similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the
overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors, that are expensive
to load and compare. The library also has several hardware-specific SIMD optimizations to accelerate further distance computations on modern
Arm (NEON and SVE) and x86 (AVX2 and AVX-512) CPUs and OS-specific optimizations to allow efficient navigation around immutable persistent
files, without loading them into RAM.
USearch indexes are currently experimental, to use them you first need to `SET allow_experimental_vector_similarity_index = 1`.
Vector similarity indexes currently support two distance functions:
- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors
([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
Vector similarity indexes allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16` or `i8`.
If no scalar kind was specified during index creation, `f16` is used as default.
For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
distance function was specified during index creation, `L2Distance` is used as default.
:::note
All arrays must have same length. To avoid errors, you can use a
[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK
length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported.
:::
:::note
The vector similarity index currently does not work with per-table, non-default `index_granularity` settings (see
[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
:::
ANN indexes are built during column insertion and merge. As a result, `INSERT` and `OPTIMIZE` statements will be slower than for ordinary
tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively when are far more read requests than write
requests.
ANN indexes support two types of queries:
- ORDER BY queries:
ANN indexes support these queries:
``` sql
SELECT *
FROM table_with_ann_index
FROM table
[WHERE ...]
ORDER BY Distance(vectors, Point)
LIMIT N
```
- WHERE queries:
``` sql
SELECT *
FROM table_with_ann_index
WHERE Distance(vectors, Point) < MaxDistance
LIMIT N
```
:::tip
To avoid writing out large vectors, you can use [query
parameters](/docs/en/interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g.
```bash
clickhouse-client --param_vec='hello' --query="SELECT * FROM table_with_ann_index WHERE L2Distance(vectors, {vec: Array(Float32)}) < 1.0"
clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(vectors, {vec: Array(Float32)}) < 1.0"
```
:::
**Restrictions**: Queries that contain both a `WHERE Distance(vectors, Point) < MaxDistance` and an `ORDER BY Distance(vectors, Point)`
clause cannot use ANN indexes. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries
without `LIMIT` clause cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting
**Restrictions**: Approximate algorithms used to determine the nearest neighbors require a limit, hence queries without `LIMIT` clause
cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting
`max_limit_for_ann_queries` (default: 1 million rows). This is a safeguard to prevent large memory allocations by external libraries for
approximate neighbor search.
@ -122,128 +156,3 @@ brute-force distance calculation over all rows of the granules. With a small `GR
equally good, only the processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall
back to a smaller `GRANULARITY` values only in case of problems like excessive memory consumption of the ANN structures. If no `GRANULARITY`
was specified for ANN indexes, the default value is 100 million.
# Available ANN Indexes {#available_ann_indexes}
- [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy)
- [USearch](/docs/en/engines/table-engines/mergetree-family/annindexes.md#usearch-usearch)
## Annoy {#annoy}
Annoy indexes are currently experimental, to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently
disabled on ARM due to memory safety problems with the algorithm.
This type of ANN index is based on the [Annoy library](https://github.com/spotify/annoy) which recursively divides the space into random
linear surfaces (lines in 2D, planes in 3D etc.).
<div class='vimeo-container'>
<iframe src="//www.youtube.com/embed/QkCCyLW0ehU"
width="640"
height="360"
frameborder="0"
allow="autoplay;
fullscreen;
picture-in-picture"
allowfullscreen>
</iframe>
</div>
Syntax to create an Annoy index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column:
```sql
CREATE TABLE table_with_annoy_index
(
id Int64,
vectors Array(Float32),
INDEX [ann_index_name] vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```
Annoy currently supports two distance functions:
- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors
([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
distance function was specified during index creation, `L2Distance` is used as default.
Parameter `NumTrees` is the number of trees which the algorithm creates (default if not specified: 100). Higher values of `NumTree` mean
more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes.
:::note
All arrays must have same length. To avoid errors, you can use a
[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK
length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported.
:::
The creation of Annoy indexes (whenever a new part is build, e.g. at the end of a merge) is a relatively slow process. You can increase
setting `max_threads_for_annoy_index_creation` (default: 4) which controls how many threads are used to create an Annoy index. Please be
careful with this setting, it is possible that multiple indexes are created in parallel in which case there can be overparallelization.
Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger
values mean more accurate results at the cost of longer query runtime:
```sql
SELECT *
FROM table_name
ORDER BY L2Distance(vectors, Point)
LIMIT N
SETTINGS annoy_index_search_k_nodes=100;
```
:::note
The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
:::
## USearch {#usearch}
This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
algorithm](https://arxiv.org/abs/1603.09320), i.e., builds a hierarchical graph where each point represents a vector and the edges represent
similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the
overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors,
that are expensive to load and compare. The library also has several hardware-specific SIMD optimizations to accelerate further
distance computations on modern Arm (NEON and SVE) and x86 (AVX2 and AVX-512) CPUs and OS-specific optimizations to allow efficient
navigation around immutable persistent files, without loading them into RAM.
<div class='vimeo-container'>
<iframe src="//www.youtube.com/embed/UMrhB3icP9w"
width="640"
height="360"
frameborder="0"
allow="autoplay;
fullscreen;
picture-in-picture"
allowfullscreen>
</iframe>
</div>
Syntax to create an USearch index over an [Array](../../../sql-reference/data-types/array.md) column:
```sql
CREATE TABLE table_with_usearch_index
(
id Int64,
vectors Array(Float32),
INDEX [ann_index_name] vectors TYPE usearch([Distance[, ScalarKind]]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```
USearch currently supports two distance functions:
- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors
([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
USearch allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16` or `i8`. If no scalar kind
was specified during index creation, `f16` is used as default.
For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
distance function was specified during index creation, `L2Distance` is used as default.

View File

@ -601,10 +601,6 @@ endif()
dbms_target_link_libraries(PUBLIC ch_contrib::consistent_hashing)
if (TARGET ch_contrib::annoy)
dbms_target_link_libraries(PUBLIC ch_contrib::annoy)
endif()
if (TARGET ch_contrib::usearch)
dbms_target_link_libraries(PUBLIC ch_contrib::usearch)
endif()

View File

@ -58,6 +58,7 @@
#cmakedefine01 USE_FILELOG
#cmakedefine01 USE_ODBC
#cmakedefine01 USE_BLAKE3
#cmakedefine01 USE_USEARCH
#cmakedefine01 USE_SKIM
#cmakedefine01 USE_PRQL
#cmakedefine01 USE_ULID

View File

@ -908,14 +908,11 @@ class IColumn;
M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \
M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
M(Bool, allow_experimental_time_series_table, false, "Allows experimental TimeSeries table engine", 0) \
M(Bool, allow_experimental_vector_similarity_index, false, "Allow experimental vector similarity index", 0) \
M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \
M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \
M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \
M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \
M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \
M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
M(UInt64, max_threads_for_annoy_index_creation, 4, "Number of threads used to build Annoy indexes (0 means all cores, not recommended)", 0) \
M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \
M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \
M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \
M(Bool, implicit_transaction, false, "If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)", 0) \
@ -1038,6 +1035,10 @@ class IColumn;
MAKE_OBSOLETE(M, UInt64, parallel_replicas_min_number_of_granules_to_enable, 0) \
MAKE_OBSOLETE(M, Bool, query_plan_optimize_projection, true) \
MAKE_OBSOLETE(M, Bool, query_cache_store_results_of_queries_with_nondeterministic_functions, false) \
MAKE_OBSOLETE(M, Bool, allow_experimental_annoy_index, false) \
MAKE_OBSOLETE(M, UInt64, max_threads_for_annoy_index_creation, 4) \
MAKE_OBSOLETE(M, Int64, annoy_index_search_k_nodes, -1) \
MAKE_OBSOLETE(M, Bool, allow_experimental_usearch_index, false) \
MAKE_OBSOLETE(M, Bool, optimize_move_functions_out_of_any, false) \
MAKE_OBSOLETE(M, Bool, allow_experimental_undrop_table_query, true) \
MAKE_OBSOLETE(M, Bool, allow_experimental_s3queue, true) \

View File

@ -87,6 +87,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"allow_experimental_time_series_table", false, false, "Added new setting to allow the TimeSeries table engine"},
{"enable_analyzer", 1, 1, "Added an alias to a setting `allow_experimental_analyzer`."},
{"optimize_functions_to_subcolumns", false, true, "Enabled settings by default"},
{"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"},
}
},
{"24.7",

View File

@ -1153,8 +1153,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
query_context->setSetting("allow_experimental_object_type", 1);
query_context->setSetting("allow_experimental_variant_type", 1);
query_context->setSetting("allow_experimental_dynamic_type", 1);
query_context->setSetting("allow_experimental_annoy_index", 1);
query_context->setSetting("allow_experimental_usearch_index", 1);
query_context->setSetting("allow_experimental_vector_similarity_index", 1);
query_context->setSetting("allow_experimental_bigint_types", 1);
query_context->setSetting("allow_experimental_window_functions", 1);
query_context->setSetting("allow_experimental_geo_types", 1);

View File

@ -787,10 +787,8 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti
if (index_desc.type == INVERTED_INDEX_NAME && !settings.allow_experimental_inverted_index)
throw Exception(ErrorCodes::ILLEGAL_INDEX, "Please use index type 'full_text' instead of 'inverted'");
/// ----
if (index_desc.type == "annoy" && !settings.allow_experimental_annoy_index)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index is disabled. Turn on allow_experimental_annoy_index");
if (index_desc.type == "usearch" && !settings.allow_experimental_usearch_index)
throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index is disabled. Turn on allow_experimental_usearch_index");
if (index_desc.type == "vector_similarity" && !settings.allow_experimental_vector_similarity_index)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index is disabled. Turn on allow_experimental_vector_similarity_index");
properties.indices.push_back(index_desc);
}

View File

@ -13,8 +13,7 @@ class ASTIndexDeclaration : public IAST
{
public:
static const auto DEFAULT_INDEX_GRANULARITY = 1uz;
static const auto DEFAULT_ANNOY_INDEX_GRANULARITY = 100'000'000uz;
static const auto DEFAULT_USEARCH_INDEX_GRANULARITY = 100'000'000uz;
static const auto DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY = 100'000'000uz;
ASTIndexDeclaration(ASTPtr expression, ASTPtr type, const String & name_);

View File

@ -89,10 +89,8 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected
else
{
auto index_type = index->getType();
if (index_type && index_type->name == "annoy")
index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
else if (index_type && index_type->name == "usearch")
index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY;
if (index_type && index_type->name == "vector_similarity")
index->granularity = ASTIndexDeclaration::DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY;
else
index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;
}

View File

@ -214,10 +214,8 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe
else
{
auto index_type = index->getType();
if (index_type->name == "annoy")
index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
else if (index_type->name == "usearch")
index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY;
if (index_type->name == "vector_similarity")
index->granularity = ASTIndexDeclaration::DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY;
else
index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;
}

View File

@ -24,8 +24,8 @@
#include <Processors/Transforms/SelectByIndicesTransform.h>
#include <QueryPipeline/QueryPipelineBuilder.h>
#include <Storages/MergeTree/MergeTreeDataSelectExecutor.h>
#include <Storages/MergeTree/MergeTreeIndexAnnoy.h>
#include <Storages/MergeTree/MergeTreeIndexUSearch.h>
#include <Storages/MergeTree/MergeTreeIndexVectorSimilarity.h>
#include <Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h>
#include <Storages/MergeTree/MergeTreeReadPool.h>
#include <Storages/MergeTree/MergeTreePrefetchedReadPool.h>
#include <Storages/MergeTree/MergeTreeReadPoolInOrder.h>
@ -52,6 +52,8 @@
#include <memory>
#include <unordered_map>
#include "config.h"
using namespace DB;
namespace
@ -1474,16 +1476,14 @@ static void buildIndexes(
else
{
MergeTreeIndexConditionPtr condition;
if (index_helper->isVectorSearch())
if (index_helper->isVectorSimilarityIndex())
{
#ifdef ENABLE_ANNOY
if (const auto * annoy = typeid_cast<const MergeTreeIndexAnnoy *>(index_helper.get()))
condition = annoy->createIndexCondition(query_info, context);
#endif
#ifdef ENABLE_USEARCH
if (const auto * usearch = typeid_cast<const MergeTreeIndexUSearch *>(index_helper.get()))
condition = usearch->createIndexCondition(query_info, context);
#if USE_USEARCH
if (const auto * vector_similarity_index = typeid_cast<const MergeTreeIndexVectorSimilarity *>(index_helper.get()))
condition = vector_similarity_index->createIndexCondition(query_info, context);
#endif
if (const auto * legacy_vector_similarity_index = typeid_cast<const MergeTreeIndexLegacyVectorSimilarity *>(index_helper.get()))
condition = legacy_vector_similarity_index->createIndexCondition(query_info, context);
if (!condition)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown vector search index {}", index_helper->index.name);
}

View File

@ -3,6 +3,7 @@
#include <Storages/IndicesDescription.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTIndexDeclaration.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ParserCreateQuery.h>
@ -130,10 +131,15 @@ IndexDescription IndexDescription::getIndexFromAST(const ASTPtr & definition_ast
{
for (size_t i = 0; i < index_type->arguments->children.size(); ++i)
{
const auto * argument = index_type->arguments->children[i]->as<ASTLiteral>();
if (!argument)
const auto & child = index_type->arguments->children[i];
if (const auto * ast_literal = child->as<ASTLiteral>(); ast_literal != nullptr)
/// E.g. INDEX index_name column_name TYPE vector_similarity('hnsw', 'f32')
result.arguments.emplace_back(ast_literal->value);
else if (const auto * ast_identifier = child->as<ASTIdentifier>(); ast_identifier != nullptr)
/// E.g. INDEX index_name column_name TYPE vector_similarity(hnsw, f32)
result.arguments.emplace_back(ast_identifier->name());
else
throw Exception(ErrorCodes::INCORRECT_QUERY, "Only literals can be skip index arguments");
result.arguments.emplace_back(argument->value);
}
}

View File

@ -1,507 +0,0 @@
#include <Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h>
#include <Core/Settings.h>
#include <Interpreters/Context.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTOrderByElement.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Storages/MergeTree/KeyCondition.h>
#include <Storages/MergeTree/MergeTreeSettings.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int INCORRECT_QUERY;
}
namespace
{
template <typename Literal>
void extractReferenceVectorFromLiteral(ApproximateNearestNeighborInformation::Embedding & reference_vector, Literal literal)
{
Float64 float_element_of_reference_vector;
Int64 int_element_of_reference_vector;
for (const auto & value : literal.value())
{
if (value.tryGet(float_element_of_reference_vector))
reference_vector.emplace_back(float_element_of_reference_vector);
else if (value.tryGet(int_element_of_reference_vector))
reference_vector.emplace_back(static_cast<float>(int_element_of_reference_vector));
else
throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong type of elements in reference vector. Only float or int are supported.");
}
}
ApproximateNearestNeighborInformation::Metric stringToMetric(std::string_view metric)
{
if (metric == "L2Distance")
return ApproximateNearestNeighborInformation::Metric::L2;
else if (metric == "LpDistance")
return ApproximateNearestNeighborInformation::Metric::Lp;
else
return ApproximateNearestNeighborInformation::Metric::Unknown;
}
}
ApproximateNearestNeighborCondition::ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context)
: block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context))
, index_granularity(context->getMergeTreeSettings().index_granularity)
, max_limit_for_ann_queries(context->getSettingsRef().max_limit_for_ann_queries)
, index_is_useful(checkQueryStructure(query_info))
{}
bool ApproximateNearestNeighborCondition::alwaysUnknownOrTrue(String metric) const
{
if (!index_is_useful)
return true; // Query isn't supported
// If query is supported, check metrics for match
return !(stringToMetric(metric) == query_information->metric);
}
float ApproximateNearestNeighborCondition::getComparisonDistanceForWhereQuery() const
{
if (index_is_useful && query_information.has_value()
&& query_information->type == ApproximateNearestNeighborInformation::Type::Where)
return query_information->distance;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not supported method for this query type");
}
UInt64 ApproximateNearestNeighborCondition::getLimit() const
{
if (index_is_useful && query_information.has_value())
return query_information->limit;
throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported");
}
std::vector<float> ApproximateNearestNeighborCondition::getReferenceVector() const
{
if (index_is_useful && query_information.has_value())
return query_information->reference_vector;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Reference vector was requested for useless or uninitialized index.");
}
size_t ApproximateNearestNeighborCondition::getDimensions() const
{
if (index_is_useful && query_information.has_value())
return query_information->reference_vector.size();
throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of dimensions was requested for useless or uninitialized index.");
}
String ApproximateNearestNeighborCondition::getColumnName() const
{
if (index_is_useful && query_information.has_value())
return query_information->column_name;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Column name was requested for useless or uninitialized index.");
}
ApproximateNearestNeighborInformation::Metric ApproximateNearestNeighborCondition::getMetricType() const
{
if (index_is_useful && query_information.has_value())
return query_information->metric;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Metric name was requested for useless or uninitialized index.");
}
float ApproximateNearestNeighborCondition::getPValueForLpDistance() const
{
if (index_is_useful && query_information.has_value())
return query_information->p_for_lp_dist;
throw Exception(ErrorCodes::LOGICAL_ERROR, "P from LPDistance was requested for useless or uninitialized index.");
}
ApproximateNearestNeighborInformation::Type ApproximateNearestNeighborCondition::getQueryType() const
{
if (index_is_useful && query_information.has_value())
return query_information->type;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Query type was requested for useless or uninitialized index.");
}
bool ApproximateNearestNeighborCondition::checkQueryStructure(const SelectQueryInfo & query)
{
/// RPN-s for different sections of the query
RPN rpn_prewhere_clause;
RPN rpn_where_clause;
RPN rpn_order_by_clause;
RPNElement rpn_limit;
UInt64 limit;
ApproximateNearestNeighborInformation prewhere_info;
ApproximateNearestNeighborInformation where_info;
ApproximateNearestNeighborInformation order_by_info;
/// Build rpns for query sections
const auto & select = query.query->as<ASTSelectQuery &>();
/// If query has PREWHERE clause
if (select.prewhere())
traverseAST(select.prewhere(), rpn_prewhere_clause);
/// If query has WHERE clause
if (select.where())
traverseAST(select.where(), rpn_where_clause);
/// If query has LIMIT clause
if (select.limitLength())
traverseAtomAST(select.limitLength(), rpn_limit);
if (select.orderBy()) // If query has ORDERBY clause
traverseOrderByAST(select.orderBy(), rpn_order_by_clause);
/// Reverse RPNs for conveniences during parsing
std::reverse(rpn_prewhere_clause.begin(), rpn_prewhere_clause.end());
std::reverse(rpn_where_clause.begin(), rpn_where_clause.end());
std::reverse(rpn_order_by_clause.begin(), rpn_order_by_clause.end());
/// Match rpns with supported types and extract information
const bool prewhere_is_valid = matchRPNWhere(rpn_prewhere_clause, prewhere_info);
const bool where_is_valid = matchRPNWhere(rpn_where_clause, where_info);
const bool order_by_is_valid = matchRPNOrderBy(rpn_order_by_clause, order_by_info);
const bool limit_is_valid = matchRPNLimit(rpn_limit, limit);
/// Query without a LIMIT clause or with a limit greater than a restriction is not supported
if (!limit_is_valid || max_limit_for_ann_queries < limit)
return false;
/// Search type query in both sections isn't supported
if (prewhere_is_valid && where_is_valid)
return false;
/// Search type should be in WHERE or PREWHERE clause
if (prewhere_is_valid || where_is_valid)
query_information = std::move(prewhere_is_valid ? prewhere_info : where_info);
if (order_by_is_valid)
{
/// Query with valid where and order by type is not supported
if (query_information.has_value())
return false;
query_information = std::move(order_by_info);
}
if (query_information)
query_information->limit = limit;
return query_information.has_value();
}
void ApproximateNearestNeighborCondition::traverseAST(const ASTPtr & node, RPN & rpn)
{
// If the node is ASTFunction, it may have children nodes
if (const auto * func = node->as<ASTFunction>())
{
const ASTs & children = func->arguments->children;
// Traverse children nodes
for (const auto& child : children)
traverseAST(child, rpn);
}
RPNElement element;
/// Get the data behind node
if (!traverseAtomAST(node, element))
element.function = RPNElement::FUNCTION_UNKNOWN;
rpn.emplace_back(std::move(element));
}
bool ApproximateNearestNeighborCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out)
{
/// Match Functions
if (const auto * function = node->as<ASTFunction>())
{
/// Set the name
out.func_name = function->name;
if (function->name == "L1Distance" ||
function->name == "L2Distance" ||
function->name == "LinfDistance" ||
function->name == "cosineDistance" ||
function->name == "dotProduct" ||
function->name == "LpDistance")
out.function = RPNElement::FUNCTION_DISTANCE;
else if (function->name == "tuple")
out.function = RPNElement::FUNCTION_TUPLE;
else if (function->name == "array")
out.function = RPNElement::FUNCTION_ARRAY;
else if (function->name == "less" ||
function->name == "greater" ||
function->name == "lessOrEquals" ||
function->name == "greaterOrEquals")
out.function = RPNElement::FUNCTION_COMPARISON;
else if (function->name == "_CAST")
out.function = RPNElement::FUNCTION_CAST;
else
return false;
return true;
}
/// Match identifier
else if (const auto * identifier = node->as<ASTIdentifier>())
{
out.function = RPNElement::FUNCTION_IDENTIFIER;
out.identifier.emplace(identifier->name());
out.func_name = "column identifier";
return true;
}
/// Check if we have constants behind the node
return tryCastToConstType(node, out);
}
bool ApproximateNearestNeighborCondition::tryCastToConstType(const ASTPtr & node, RPNElement & out)
{
Field const_value;
DataTypePtr const_type;
if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
{
/// Check for constant types
if (const_value.getType() == Field::Types::Float64)
{
out.function = RPNElement::FUNCTION_FLOAT_LITERAL;
out.float_literal.emplace(const_value.safeGet<Float32>());
out.func_name = "Float literal";
return true;
}
if (const_value.getType() == Field::Types::UInt64)
{
out.function = RPNElement::FUNCTION_INT_LITERAL;
out.int_literal.emplace(const_value.safeGet<UInt64>());
out.func_name = "Int literal";
return true;
}
if (const_value.getType() == Field::Types::Int64)
{
out.function = RPNElement::FUNCTION_INT_LITERAL;
out.int_literal.emplace(const_value.safeGet<Int64>());
out.func_name = "Int literal";
return true;
}
if (const_value.getType() == Field::Types::Tuple)
{
out.function = RPNElement::FUNCTION_LITERAL_TUPLE;
out.tuple_literal = const_value.safeGet<Tuple>();
out.func_name = "Tuple literal";
return true;
}
if (const_value.getType() == Field::Types::Array)
{
out.function = RPNElement::FUNCTION_LITERAL_ARRAY;
out.array_literal = const_value.safeGet<Array>();
out.func_name = "Array literal";
return true;
}
if (const_value.getType() == Field::Types::String)
{
out.function = RPNElement::FUNCTION_STRING_LITERAL;
out.func_name = const_value.safeGet<String>();
return true;
}
}
return false;
}
void ApproximateNearestNeighborCondition::traverseOrderByAST(const ASTPtr & node, RPN & rpn)
{
if (const auto * expr_list = node->as<ASTExpressionList>())
if (const auto * order_by_element = expr_list->children.front()->as<ASTOrderByElement>())
traverseAST(order_by_element->children.front(), rpn);
}
/// Returns true and stores ApproximateNearestNeighborInformation if the query has valid WHERE clause
bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info)
{
/// Fill query type field
ann_info.type = ApproximateNearestNeighborInformation::Type::Where;
/// WHERE section must have at least 5 expressions
/// Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(ReferenceVector(floats))
if (rpn.size() < 5)
return false;
auto iter = rpn.begin();
/// Query starts from operator less
if (iter->function != RPNElement::FUNCTION_COMPARISON)
return false;
const bool greater_case = iter->func_name == "greater" || iter->func_name == "greaterOrEquals";
const bool less_case = iter->func_name == "less" || iter->func_name == "lessOrEquals";
++iter;
if (less_case)
{
if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL)
return false;
ann_info.distance = getFloatOrIntLiteralOrPanic(iter);
if (ann_info.distance < 0)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", ann_info.distance);
++iter;
}
else if (!greater_case)
return false;
auto end = rpn.end();
if (!matchMainParts(iter, end, ann_info))
return false;
if (greater_case)
{
if (ann_info.reference_vector.size() < 2)
return false;
ann_info.distance = ann_info.reference_vector.back();
if (ann_info.distance < 0)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", ann_info.distance);
ann_info.reference_vector.pop_back();
}
/// query is ok
return true;
}
/// Returns true and stores ANNExpr if the query has valid ORDERBY clause
bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info)
{
/// Fill query type field
ann_info.type = ApproximateNearestNeighborInformation::Type::OrderBy;
// ORDER BY clause must have at least 3 expressions
if (rpn.size() < 3)
return false;
auto iter = rpn.begin();
auto end = rpn.end();
return ApproximateNearestNeighborCondition::matchMainParts(iter, end, ann_info);
}
/// Returns true and stores Length if we have valid LIMIT clause in query
bool ApproximateNearestNeighborCondition::matchRPNLimit(RPNElement & rpn, UInt64 & limit)
{
if (rpn.function == RPNElement::FUNCTION_INT_LITERAL)
{
limit = rpn.int_literal.value();
return true;
}
return false;
}
/// Matches dist function, referencer vector, column name
bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info)
{
bool identifier_found = false;
/// Matches DistanceFunc->[Column]->[Tuple(array)Func]->ReferenceVector(floats)->[Column]
if (iter->function != RPNElement::FUNCTION_DISTANCE)
return false;
ann_info.metric = stringToMetric(iter->func_name);
++iter;
if (ann_info.metric == ApproximateNearestNeighborInformation::Metric::Lp)
{
if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL &&
iter->function != RPNElement::FUNCTION_INT_LITERAL)
return false;
ann_info.p_for_lp_dist = getFloatOrIntLiteralOrPanic(iter);
++iter;
}
if (iter->function == RPNElement::FUNCTION_IDENTIFIER)
{
identifier_found = true;
ann_info.column_name = std::move(iter->identifier.value());
++iter;
}
if (iter->function == RPNElement::FUNCTION_TUPLE || iter->function == RPNElement::FUNCTION_ARRAY)
++iter;
if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE)
{
extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal);
++iter;
}
if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY)
{
extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal);
++iter;
}
/// further conditions are possible if there is no tuple or array, or no identifier is found
/// the tuple or array can be inside a cast function. For other cases, see the loop after this condition
if (iter != end && iter->function == RPNElement::FUNCTION_CAST)
{
++iter;
/// Cast should be made to array or tuple
if (!iter->func_name.starts_with("Array") && !iter->func_name.starts_with("Tuple"))
return false;
++iter;
if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE)
{
extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal);
++iter;
}
else if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY)
{
extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal);
++iter;
}
else
return false;
}
while (iter != end)
{
if (iter->function == RPNElement::FUNCTION_FLOAT_LITERAL ||
iter->function == RPNElement::FUNCTION_INT_LITERAL)
ann_info.reference_vector.emplace_back(getFloatOrIntLiteralOrPanic(iter));
else if (iter->function == RPNElement::FUNCTION_IDENTIFIER)
{
if (identifier_found)
return false;
ann_info.column_name = std::move(iter->identifier.value());
identifier_found = true;
}
else
return false;
++iter;
}
/// Final checks of correctness
return identifier_found && !ann_info.reference_vector.empty();
}
/// Gets float or int from AST node
float ApproximateNearestNeighborCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter)
{
if (iter->float_literal.has_value())
return iter->float_literal.value();
if (iter->int_literal.has_value())
return static_cast<float>(iter->int_literal.value());
throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong parsed AST in buildRPN\n");
}
}

View File

@ -11,6 +11,7 @@
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
#include <Storages/MergeTree/StorageFromMergeTreeDataPart.h>
#include <Storages/MergeTree/MergeTreeIndexFullText.h>
#include <Storages/MergeTree/VectorSimilarityCondition.h>
#include <Storages/ReadInOrderOptimizer.h>
#include <Storages/VirtualColumnUtils.h>
#include <Parsers/ASTIdentifier.h>
@ -48,7 +49,6 @@
#include <Functions/IFunction.h>
#include <IO/WriteBufferFromOStream.h>
#include <Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h>
namespace CurrentMetrics
{
@ -1406,11 +1406,10 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex(
if (index_mark != index_range.begin || !granule || last_index_mark != index_range.begin)
reader.read(granule);
auto ann_condition = std::dynamic_pointer_cast<IMergeTreeIndexConditionApproximateNearestNeighbor>(condition);
if (ann_condition != nullptr)
if (index_helper->isVectorSimilarityIndex())
{
/// An array of indices of useful ranges.
auto result = ann_condition->getUsefulRanges(granule);
auto result = condition->getUsefulRanges(granule);
for (auto range : result)
{

View File

@ -27,7 +27,6 @@ MergeTreeWriterSettings::MergeTreeWriterSettings(
, rewrite_primary_key(rewrite_primary_key_)
, blocks_are_granules_size(blocks_are_granules_size_)
, query_write_settings(query_write_settings_)
, max_threads_for_annoy_index_creation(global_settings.max_threads_for_annoy_index_creation)
, low_cardinality_max_dictionary_size(global_settings.low_cardinality_max_dictionary_size)
, low_cardinality_use_single_dictionary_for_part(global_settings.low_cardinality_use_single_dictionary_for_part != 0)
, use_compact_variant_discriminators_serialization(storage_settings->use_compact_variant_discriminators_serialization)

View File

@ -77,8 +77,6 @@ struct MergeTreeWriterSettings
bool blocks_are_granules_size;
WriteSettings query_write_settings;
size_t max_threads_for_annoy_index_creation;
size_t low_cardinality_max_dictionary_size;
bool low_cardinality_use_single_dictionary_for_part;
bool use_compact_variant_discriminators_serialization;

View File

@ -1,416 +0,0 @@
#ifdef ENABLE_ANNOY
#include <Storages/MergeTree/MergeTreeIndexAnnoy.h>
#include <Columns/ColumnArray.h>
#include <Common/typeid_cast.h>
#include <Core/Field.h>
#include <Core/Settings.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <Interpreters/castColumn.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int INCORRECT_DATA;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int INCORRECT_QUERY;
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
}
template <typename Distance>
AnnoyIndexWithSerialization<Distance>::AnnoyIndexWithSerialization(size_t dimensions)
: Base::AnnoyIndex(static_cast<int>(dimensions))
{
}
template<typename Distance>
void AnnoyIndexWithSerialization<Distance>::serialize(WriteBuffer & ostr) const
{
chassert(Base::_built);
writeIntBinary(Base::_s, ostr);
writeIntBinary(Base::_n_items, ostr);
writeIntBinary(Base::_n_nodes, ostr);
writeIntBinary(Base::_nodes_size, ostr);
writeIntBinary(Base::_K, ostr);
writeIntBinary(Base::_seed, ostr);
writeVectorBinary(Base::_roots, ostr);
ostr.write(reinterpret_cast<const char *>(Base::_nodes), Base::_s * Base::_n_nodes);
}
template<typename Distance>
void AnnoyIndexWithSerialization<Distance>::deserialize(ReadBuffer & istr)
{
chassert(!Base::_built);
readIntBinary(Base::_s, istr);
readIntBinary(Base::_n_items, istr);
readIntBinary(Base::_n_nodes, istr);
readIntBinary(Base::_nodes_size, istr);
readIntBinary(Base::_K, istr);
readIntBinary(Base::_seed, istr);
readVectorBinary(Base::_roots, istr);
Base::_nodes = realloc(Base::_nodes, Base::_s * Base::_n_nodes);
istr.readStrict(reinterpret_cast<char *>(Base::_nodes), Base::_s * Base::_n_nodes);
Base::_fd = 0;
// set flags
Base::_loaded = false;
Base::_verbose = false;
Base::_on_disk = false;
Base::_built = true;
}
template<typename Distance>
size_t AnnoyIndexWithSerialization<Distance>::getDimensions() const
{
return Base::get_f();
}
template <typename Distance>
MergeTreeIndexGranuleAnnoy<Distance>::MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_)
: index_name(index_name_)
, index_sample_block(index_sample_block_)
, index(nullptr)
{}
template <typename Distance>
MergeTreeIndexGranuleAnnoy<Distance>::MergeTreeIndexGranuleAnnoy(
const String & index_name_,
const Block & index_sample_block_,
AnnoyIndexWithSerializationPtr<Distance> index_)
: index_name(index_name_)
, index_sample_block(index_sample_block_)
, index(std::move(index_))
{}
template <typename Distance>
void MergeTreeIndexGranuleAnnoy<Distance>::serializeBinary(WriteBuffer & ostr) const
{
/// Number of dimensions is required in the index constructor,
/// so it must be written and read separately from the other part
writeIntBinary(static_cast<UInt64>(index->getDimensions()), ostr); // write dimension
index->serialize(ostr);
}
template <typename Distance>
void MergeTreeIndexGranuleAnnoy<Distance>::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
{
UInt64 dimension;
readIntBinary(dimension, istr);
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(dimension);
index->deserialize(istr);
}
template <typename Distance>
MergeTreeIndexAggregatorAnnoy<Distance>::MergeTreeIndexAggregatorAnnoy(
const String & index_name_,
const Block & index_sample_block_,
UInt64 trees_,
size_t max_threads_for_creation_)
: index_name(index_name_)
, index_sample_block(index_sample_block_)
, trees(trees_)
, max_threads_for_creation(max_threads_for_creation_)
{}
template <typename Distance>
MergeTreeIndexGranulePtr MergeTreeIndexAggregatorAnnoy<Distance>::getGranuleAndReset()
{
int threads = (max_threads_for_creation == 0) ? -1 : static_cast<int>(max_threads_for_creation);
/// clang-tidy reports a false positive: it considers %p with an outdated pointer in fprintf() (used by logging which we don't do) dereferencing
index->build(static_cast<int>(trees), threads);
auto granule = std::make_shared<MergeTreeIndexGranuleAnnoy<Distance>>(index_name, index_sample_block, index);
index = nullptr;
return granule;
}
template <typename Distance>
void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t * pos, size_t limit)
{
if (*pos >= block.rows())
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"The provided position is not less than the number of block rows. Position: {}, Block rows: {}.",
*pos, block.rows());
size_t rows_read = std::min(limit, block.rows() - *pos);
if (rows_read == 0)
return;
if (rows_read > std::numeric_limits<uint32_t>::max())
throw Exception(ErrorCodes::INCORRECT_DATA, "Index granularity is too big: more than 4B rows per index granule.");
if (index_sample_block.columns() > 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column");
const String & index_column_name = index_sample_block.getByPosition(0).name;
ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read);
if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
{
const auto & column_array_data = column_array->getData();
const auto & column_array_data_float = typeid_cast<const ColumnFloat32 &>(column_array_data);
const auto & column_array_data_float_data = column_array_data_float.getData();
const auto & column_array_offsets = column_array->getOffsets();
const size_t num_rows = column_array_offsets.size();
if (column_array->empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty");
/// The Annoy algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
/// are INSERTed into an Annoy-indexed column or if no value was specified at all in which case the arrays take on their default
/// value which is also empty.
if (column_array->isDefaultAt(0))
throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
/// Check all sizes are the same
size_t dimension = column_array_offsets[0];
for (size_t i = 0; i < num_rows - 1; ++i)
if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
/// Also check that previously inserted blocks have the same size as this block.
/// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
/// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
if (index && index->getDimensions() != dimension)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
if (!index)
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(dimension);
/// Add all rows of block
index->add_item(index->get_n_items(), column_array_data_float_data.data());
for (size_t current_row = 1; current_row < num_rows; ++current_row)
index->add_item(index->get_n_items(), &column_array_data_float_data[column_array_offsets[current_row - 1]]);
}
else if (const auto & column_tuple = typeid_cast<const ColumnTuple *>(column_cut.get()))
{
const auto & column_tuple_columns = column_tuple->getColumns();
/// TODO check if calling index->add_item() directly on the block's tuples is faster than materializing everything
std::vector<std::vector<Float32>> data(column_tuple->size(), std::vector<Float32>());
for (const auto & column : column_tuple_columns)
{
const auto & pod_array = typeid_cast<const ColumnFloat32 *>(column.get())->getData();
for (size_t i = 0; i < pod_array.size(); ++i)
data[i].push_back(pod_array[i]);
}
if (data.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read);
if (!index)
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(data[0].size());
for (const auto & item : data)
index->add_item(index->get_n_items(), item.data());
}
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array or Tuple column");
*pos += rows_read;
}
MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy(
const IndexDescription & /*index_description*/,
const SelectQueryInfo & query,
const String & distance_function_,
ContextPtr context)
: ann_condition(query, context)
, distance_function(distance_function_)
, search_k(context->getSettingsRef().annoy_index_search_k_nodes)
{}
bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes");
}
bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const
{
return ann_condition.alwaysUnknownOrTrue(distance_function);
}
std::vector<size_t> MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const
{
if (distance_function == DISTANCE_FUNCTION_L2)
return getUsefulRangesImpl<Annoy::Euclidean>(idx_granule);
else if (distance_function == DISTANCE_FUNCTION_COSINE)
return getUsefulRangesImpl<Annoy::Angular>(idx_granule);
std::unreachable();
}
template <typename Distance>
std::vector<size_t> MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const
{
const UInt64 limit = ann_condition.getLimit();
const UInt64 index_granularity = ann_condition.getIndexGranularity();
const std::optional<float> comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where
? std::optional<float>(ann_condition.getComparisonDistanceForWhereQuery())
: std::nullopt;
if (comparison_distance && comparison_distance.value() < 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance");
const std::vector<float> reference_vector = ann_condition.getReferenceVector();
const auto granule = std::dynamic_pointer_cast<MergeTreeIndexGranuleAnnoy<Distance>>(idx_granule);
if (granule == nullptr)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type");
const AnnoyIndexWithSerializationPtr<Distance> annoy = granule->index;
if (ann_condition.getDimensions() != annoy->getDimensions())
throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) "
"does not match the dimension in the index ({})",
ann_condition.getDimensions(), annoy->getDimensions());
std::vector<UInt64> neighbors; /// indexes of dots which were closest to the reference vector
std::vector<Float32> distances;
neighbors.reserve(limit);
distances.reserve(limit);
annoy->get_nns_by_vector(reference_vector.data(), limit, static_cast<int>(search_k), &neighbors, &distances);
chassert(neighbors.size() == distances.size());
std::vector<size_t> granules;
granules.reserve(neighbors.size());
for (size_t i = 0; i < neighbors.size(); ++i)
{
if (comparison_distance && distances[i] > comparison_distance)
continue;
granules.push_back(neighbors[i] / index_granularity);
}
/// make unique
std::sort(granules.begin(), granules.end());
granules.erase(std::unique(granules.begin(), granules.end()), granules.end());
return granules;
}
MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_)
: IMergeTreeIndex(index_)
, trees(trees_)
, distance_function(distance_function_)
{}
MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const
{
if (distance_function == DISTANCE_FUNCTION_L2)
return std::make_shared<MergeTreeIndexGranuleAnnoy<Annoy::Euclidean>>(index.name, index.sample_block);
else if (distance_function == DISTANCE_FUNCTION_COSINE)
return std::make_shared<MergeTreeIndexGranuleAnnoy<Annoy::Angular>>(index.name, index.sample_block);
std::unreachable();
}
MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator(const MergeTreeWriterSettings & settings) const
{
/// TODO: Support more metrics. Available metrics: https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171
if (distance_function == DISTANCE_FUNCTION_L2)
return std::make_shared<MergeTreeIndexAggregatorAnnoy<Annoy::Euclidean>>(index.name, index.sample_block, trees, settings.max_threads_for_annoy_index_creation);
else if (distance_function == DISTANCE_FUNCTION_COSINE)
return std::make_shared<MergeTreeIndexAggregatorAnnoy<Annoy::Angular>>(index.name, index.sample_block, trees, settings.max_threads_for_annoy_index_creation);
std::unreachable();
}
MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const
{
return std::make_shared<MergeTreeIndexConditionAnnoy>(index, query, distance_function, context);
};
MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(const ActionsDAG *, ContextPtr) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG");
}
MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index)
{
static constexpr auto DEFAULT_DISTANCE_FUNCTION = DISTANCE_FUNCTION_L2;
String distance_function = DEFAULT_DISTANCE_FUNCTION;
if (!index.arguments.empty())
distance_function = index.arguments[0].safeGet<String>();
static constexpr auto DEFAULT_TREES = 100uz;
UInt64 trees = DEFAULT_TREES;
if (index.arguments.size() > 1)
trees = index.arguments[1].safeGet<UInt64>();
return std::make_shared<MergeTreeIndexAnnoy>(index, trees, distance_function);
}
void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
{
/// Check number and type of Annoy index arguments:
if (index.arguments.size() > 2)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index must not have more than two parameters");
if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance function argument of Annoy index must be of type String");
if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Number of trees argument of Annoy index must be of type UInt64");
/// Check that the index is created on a single column
if (index.column_names.size() != 1 || index.data_types.size() != 1)
throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column");
/// Check that a supported metric was passed as first argument
if (!index.arguments.empty())
{
String distance_name = index.arguments[0].safeGet<String>();
if (distance_name != DISTANCE_FUNCTION_L2 && distance_name != DISTANCE_FUNCTION_COSINE)
throw Exception(ErrorCodes::INCORRECT_DATA, "Annoy index only supports distance functions '{}' and '{}'", DISTANCE_FUNCTION_L2, DISTANCE_FUNCTION_COSINE);
}
/// Check data type of indexed column:
auto throw_unsupported_underlying_column_exception = []()
{
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
};
DataTypePtr data_type = index.sample_block.getDataTypes()[0];
if (const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get()))
{
TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
if (!WhichDataType(nested_type_index).isFloat32())
throw_unsupported_underlying_column_exception();
}
else if (const auto * data_type_tuple = typeid_cast<const DataTypeTuple *>(data_type.get()))
{
const DataTypes & inner_types = data_type_tuple->getElements();
for (const auto & inner_type : inner_types)
{
TypeIndex nested_type_index = inner_type->getTypeId();
if (!WhichDataType(nested_type_index).isFloat32())
throw_unsupported_underlying_column_exception();
}
}
else
throw_unsupported_underlying_column_exception();
}
}
#endif

View File

@ -1,112 +0,0 @@
#pragma once
#ifdef ENABLE_ANNOY
#include <Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h>
#include <annoylib.h>
#include <kissrandom.h>
namespace DB
{
template <typename Distance>
class AnnoyIndexWithSerialization : public Annoy::AnnoyIndex<UInt64, Float32, Distance, Annoy::Kiss64Random, Annoy::AnnoyIndexMultiThreadedBuildPolicy>
{
using Base = Annoy::AnnoyIndex<UInt64, Float32, Distance, Annoy::Kiss64Random, Annoy::AnnoyIndexMultiThreadedBuildPolicy>;
public:
explicit AnnoyIndexWithSerialization(size_t dimensions);
void serialize(WriteBuffer & ostr) const;
void deserialize(ReadBuffer & istr);
size_t getDimensions() const;
};
template <typename Distance>
using AnnoyIndexWithSerializationPtr = std::shared_ptr<AnnoyIndexWithSerialization<Distance>>;
template <typename Distance>
struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule
{
MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_);
MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_, AnnoyIndexWithSerializationPtr<Distance> index_);
~MergeTreeIndexGranuleAnnoy() override = default;
void serializeBinary(WriteBuffer & ostr) const override;
void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
bool empty() const override { return !index.get(); }
const String index_name;
const Block index_sample_block;
AnnoyIndexWithSerializationPtr<Distance> index;
};
template <typename Distance>
struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator
{
MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, UInt64 trees, size_t max_threads_for_creation);
~MergeTreeIndexAggregatorAnnoy() override = default;
bool empty() const override { return !index || index->get_n_items() == 0; }
MergeTreeIndexGranulePtr getGranuleAndReset() override;
void update(const Block & block, size_t * pos, size_t limit) override;
const String index_name;
const Block index_sample_block;
const UInt64 trees;
const size_t max_threads_for_creation;
AnnoyIndexWithSerializationPtr<Distance> index;
};
class MergeTreeIndexConditionAnnoy final : public IMergeTreeIndexConditionApproximateNearestNeighbor
{
public:
MergeTreeIndexConditionAnnoy(
const IndexDescription & index_description,
const SelectQueryInfo & query,
const String & distance_function,
ContextPtr context);
~MergeTreeIndexConditionAnnoy() override = default;
bool alwaysUnknownOrTrue() const override;
bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
std::vector<size_t> getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override;
private:
template <typename Distance>
std::vector<size_t> getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const;
const ApproximateNearestNeighborCondition ann_condition;
const String distance_function;
const Int64 search_k;
};
class MergeTreeIndexAnnoy final : public IMergeTreeIndex
{
public:
MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_);
~MergeTreeIndexAnnoy() override = default;
MergeTreeIndexGranulePtr createIndexGranule() const override;
MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override;
MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const;
MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override;
bool isVectorSearch() const override { return true; }
private:
const UInt64 trees;
const String distance_function;
};
}
#endif

View File

@ -0,0 +1,45 @@
#include <Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_INDEX;
}
MergeTreeIndexLegacyVectorSimilarity::MergeTreeIndexLegacyVectorSimilarity(const IndexDescription & index_)
: IMergeTreeIndex(index_)
{
}
MergeTreeIndexGranulePtr MergeTreeIndexLegacyVectorSimilarity::createIndexGranule() const
{
throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'");
}
MergeTreeIndexAggregatorPtr MergeTreeIndexLegacyVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings &) const
{
throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'");
}
MergeTreeIndexConditionPtr MergeTreeIndexLegacyVectorSimilarity::createIndexCondition(const SelectQueryInfo &, ContextPtr) const
{
throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'");
};
MergeTreeIndexConditionPtr MergeTreeIndexLegacyVectorSimilarity::createIndexCondition(const ActionsDAG *, ContextPtr) const
{
throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'");
}
MergeTreeIndexPtr legacyVectorSimilarityIndexCreator(const IndexDescription & index)
{
return std::make_shared<MergeTreeIndexLegacyVectorSimilarity>(index);
}
void legacyVectorSimilarityIndexValidator(const IndexDescription &, bool)
{
}
}

View File

@ -0,0 +1,26 @@
#pragma once
#include <Storages/MergeTree/VectorSimilarityCondition.h>
/// Walking corpse implementation for removed skipping index of type "annoy" and "usearch".
/// Its only purpose is to allow loading old tables with indexes of these types.
/// Data insertion and index usage/search will throw an exception, suggesting to migrate to "vector_similarity" indexes.
namespace DB
{
class MergeTreeIndexLegacyVectorSimilarity : public IMergeTreeIndex
{
public:
explicit MergeTreeIndexLegacyVectorSimilarity(const IndexDescription & index_);
~MergeTreeIndexLegacyVectorSimilarity() override = default;
MergeTreeIndexGranulePtr createIndexGranule() const override;
MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings &) const override;
MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo &, ContextPtr) const;
MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override;
bool isVectorSimilarityIndex() const override { return true; }
};
}

View File

@ -1,463 +0,0 @@
#ifdef ENABLE_USEARCH
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#include <Storages/MergeTree/MergeTreeIndexUSearch.h>
#include <Columns/ColumnArray.h>
#include <Common/typeid_cast.h>
#include <Core/Field.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <Interpreters/castColumn.h>
namespace ProfileEvents
{
extern const Event USearchAddCount;
extern const Event USearchAddVisitedMembers;
extern const Event USearchAddComputedDistances;
extern const Event USearchSearchCount;
extern const Event USearchSearchVisitedMembers;
extern const Event USearchSearchComputedDistances;
}
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
extern const int ILLEGAL_COLUMN;
extern const int INCORRECT_DATA;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int INCORRECT_QUERY;
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
}
namespace
{
std::unordered_map<String, unum::usearch::scalar_kind_t> nameToScalarKind = {
{"f64", unum::usearch::scalar_kind_t::f64_k},
{"f32", unum::usearch::scalar_kind_t::f32_k},
{"f16", unum::usearch::scalar_kind_t::f16_k},
{"i8", unum::usearch::scalar_kind_t::i8_k}};
}
template <unum::usearch::metric_kind_t Metric>
USearchIndexWithSerialization<Metric>::USearchIndexWithSerialization(size_t dimensions, unum::usearch::scalar_kind_t scalar_kind)
: Base(Base::make(unum::usearch::metric_punned_t(dimensions, Metric, scalar_kind)))
{
}
template <unum::usearch::metric_kind_t Metric>
void USearchIndexWithSerialization<Metric>::serialize(WriteBuffer & ostr) const
{
auto callback = [&ostr](void * from, size_t n)
{
ostr.write(reinterpret_cast<const char *>(from), n);
return true;
};
Base::save_to_stream(callback);
}
template <unum::usearch::metric_kind_t Metric>
void USearchIndexWithSerialization<Metric>::deserialize(ReadBuffer & istr)
{
auto callback = [&istr](void * from, size_t n)
{
istr.readStrict(reinterpret_cast<char *>(from), n);
return true;
};
Base::load_from_stream(callback);
}
template <unum::usearch::metric_kind_t Metric>
size_t USearchIndexWithSerialization<Metric>::getDimensions() const
{
return Base::dimensions();
}
template <unum::usearch::metric_kind_t Metric>
MergeTreeIndexGranuleUSearch<Metric>::MergeTreeIndexGranuleUSearch(
const String & index_name_,
const Block & index_sample_block_,
unum::usearch::scalar_kind_t scalar_kind_)
: index_name(index_name_)
, index_sample_block(index_sample_block_)
, scalar_kind(scalar_kind_)
, index(nullptr)
{
}
template <unum::usearch::metric_kind_t Metric>
MergeTreeIndexGranuleUSearch<Metric>::MergeTreeIndexGranuleUSearch(
const String & index_name_,
const Block & index_sample_block_,
unum::usearch::scalar_kind_t scalar_kind_,
USearchIndexWithSerializationPtr<Metric> index_)
: index_name(index_name_)
, index_sample_block(index_sample_block_)
, scalar_kind(scalar_kind_)
, index(std::move(index_))
{
}
template <unum::usearch::metric_kind_t Metric>
void MergeTreeIndexGranuleUSearch<Metric>::serializeBinary(WriteBuffer & ostr) const
{
/// Number of dimensions is required in the index constructor,
/// so it must be written and read separately from the other part
writeIntBinary(static_cast<UInt64>(index->getDimensions()), ostr); // write dimension
index->serialize(ostr);
}
template <unum::usearch::metric_kind_t Metric>
void MergeTreeIndexGranuleUSearch<Metric>::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
{
UInt64 dimension;
readIntBinary(dimension, istr);
index = std::make_shared<USearchIndexWithSerialization<Metric>>(dimension, scalar_kind);
index->deserialize(istr);
}
template <unum::usearch::metric_kind_t Metric>
MergeTreeIndexAggregatorUSearch<Metric>::MergeTreeIndexAggregatorUSearch(
const String & index_name_,
const Block & index_sample_block_,
unum::usearch::scalar_kind_t scalar_kind_)
: index_name(index_name_)
, index_sample_block(index_sample_block_)
, scalar_kind(scalar_kind_)
{
}
template <unum::usearch::metric_kind_t Metric>
MergeTreeIndexGranulePtr MergeTreeIndexAggregatorUSearch<Metric>::getGranuleAndReset()
{
auto granule = std::make_shared<MergeTreeIndexGranuleUSearch<Metric>>(index_name, index_sample_block, scalar_kind, index);
index = nullptr;
return granule;
}
template <unum::usearch::metric_kind_t Metric>
void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t * pos, size_t limit)
{
if (*pos >= block.rows())
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"The provided position is not less than the number of block rows. Position: {}, Block rows: {}.",
*pos,
block.rows());
size_t rows_read = std::min(limit, block.rows() - *pos);
if (rows_read == 0)
return;
if (rows_read > std::numeric_limits<uint32_t>::max())
throw Exception(ErrorCodes::INCORRECT_DATA, "Index granularity is too big: more than 4B rows per index granule.");
if (index_sample_block.columns() > 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column");
const String & index_column_name = index_sample_block.getByPosition(0).name;
ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read);
if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
{
const auto & column_array_data = column_array->getData();
const auto & column_array_data_float = typeid_cast<const ColumnFloat32 &>(column_array_data);
const auto & column_array_data_float_data = column_array_data_float.getData();
const auto & column_array_offsets = column_array->getOffsets();
const size_t num_rows = column_array_offsets.size();
if (column_array->empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty");
/// The Usearch algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
/// are INSERTed into an Usearch-indexed column or if no value was specified at all in which case the arrays take on their default
/// values which is also empty.
if (column_array->isDefaultAt(0))
throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
/// Check all sizes are the same
size_t dimension = column_array_offsets[0];
for (size_t i = 0; i < num_rows - 1; ++i)
if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
/// Also check that previously inserted blocks have the same size as this block.
/// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
/// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
if (index && index->getDimensions() != dimension)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
if (!index)
index = std::make_shared<USearchIndexWithSerialization<Metric>>(dimension, scalar_kind);
/// Add all rows of block
if (!index->reserve(unum::usearch::ceil2(index->size() + num_rows)))
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for usearch index");
for (size_t current_row = 0; current_row < num_rows; ++current_row)
{
auto rc = index->add(static_cast<uint32_t>(index->size()), &column_array_data_float_data[column_array_offsets[current_row - 1]]);
if (!rc)
throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, rc.error.release());
ProfileEvents::increment(ProfileEvents::USearchAddCount);
ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members);
ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, rc.computed_distances);
}
}
else if (const auto & column_tuple = typeid_cast<const ColumnTuple *>(column_cut.get()))
{
const auto & column_tuple_columns = column_tuple->getColumns();
std::vector<std::vector<Float32>> data(column_tuple->size(), std::vector<Float32>());
for (const auto & column : column_tuple_columns)
{
const auto & pod_array = typeid_cast<const ColumnFloat32 *>(column.get())->getData();
for (size_t i = 0; i < pod_array.size(); ++i)
data[i].push_back(pod_array[i]);
}
if (data.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read);
if (!index)
index = std::make_shared<USearchIndexWithSerialization<Metric>>(data[0].size(), scalar_kind);
if (!index->reserve(unum::usearch::ceil2(index->size() + data.size())))
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for usearch index");
for (const auto & item : data)
{
auto rc = index->add(static_cast<uint32_t>(index->size()), item.data());
if (!rc)
throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, rc.error.release());
ProfileEvents::increment(ProfileEvents::USearchAddCount);
ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members);
ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, rc.computed_distances);
}
}
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array or Tuple column");
*pos += rows_read;
}
MergeTreeIndexConditionUSearch::MergeTreeIndexConditionUSearch(
const IndexDescription & /*index_description*/,
const SelectQueryInfo & query,
const String & distance_function_,
ContextPtr context)
: ann_condition(query, context)
, distance_function(distance_function_)
{
}
bool MergeTreeIndexConditionUSearch::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes");
}
bool MergeTreeIndexConditionUSearch::alwaysUnknownOrTrue() const
{
return ann_condition.alwaysUnknownOrTrue(distance_function);
}
std::vector<size_t> MergeTreeIndexConditionUSearch::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const
{
if (distance_function == DISTANCE_FUNCTION_L2)
return getUsefulRangesImpl<unum::usearch::metric_kind_t::l2sq_k>(idx_granule);
else if (distance_function == DISTANCE_FUNCTION_COSINE)
return getUsefulRangesImpl<unum::usearch::metric_kind_t::cos_k>(idx_granule);
std::unreachable();
}
template <unum::usearch::metric_kind_t Metric>
std::vector<size_t> MergeTreeIndexConditionUSearch::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const
{
const UInt64 limit = ann_condition.getLimit();
const UInt64 index_granularity = ann_condition.getIndexGranularity();
const std::optional<float> comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where
? std::optional<float>(ann_condition.getComparisonDistanceForWhereQuery())
: std::nullopt;
if (comparison_distance && comparison_distance.value() < 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance");
const std::vector<float> reference_vector = ann_condition.getReferenceVector();
const auto granule = std::dynamic_pointer_cast<MergeTreeIndexGranuleUSearch<Metric>>(idx_granule);
if (granule == nullptr)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type");
const USearchIndexWithSerializationPtr<Metric> index = granule->index;
if (ann_condition.getDimensions() != index->dimensions())
throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) "
"does not match the dimension in the index ({})",
ann_condition.getDimensions(), index->dimensions());
auto result = index->search(reference_vector.data(), limit);
ProfileEvents::increment(ProfileEvents::USearchSearchCount);
ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, result.visited_members);
ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, result.computed_distances);
std::vector<UInt32> neighbors(result.size()); /// indexes of dots which were closest to the reference vector
std::vector<Float32> distances(result.size());
result.dump_to(neighbors.data(), distances.data());
std::vector<size_t> granules;
granules.reserve(neighbors.size());
for (size_t i = 0; i < neighbors.size(); ++i)
{
if (comparison_distance && distances[i] > comparison_distance)
continue;
granules.push_back(neighbors[i] / index_granularity);
}
/// make unique
std::sort(granules.begin(), granules.end());
granules.erase(std::unique(granules.begin(), granules.end()), granules.end());
return granules;
}
MergeTreeIndexUSearch::MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_, unum::usearch::scalar_kind_t scalar_kind_)
: IMergeTreeIndex(index_)
, distance_function(distance_function_)
, scalar_kind(scalar_kind_)
{
}
MergeTreeIndexGranulePtr MergeTreeIndexUSearch::createIndexGranule() const
{
if (distance_function == DISTANCE_FUNCTION_L2)
return std::make_shared<MergeTreeIndexGranuleUSearch<unum::usearch::metric_kind_t::l2sq_k>>(index.name, index.sample_block, scalar_kind);
else if (distance_function == DISTANCE_FUNCTION_COSINE)
return std::make_shared<MergeTreeIndexGranuleUSearch<unum::usearch::metric_kind_t::cos_k>>(index.name, index.sample_block, scalar_kind);
std::unreachable();
}
MergeTreeIndexAggregatorPtr MergeTreeIndexUSearch::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const
{
if (distance_function == DISTANCE_FUNCTION_L2)
return std::make_shared<MergeTreeIndexAggregatorUSearch<unum::usearch::metric_kind_t::l2sq_k>>(index.name, index.sample_block, scalar_kind);
else if (distance_function == DISTANCE_FUNCTION_COSINE)
return std::make_shared<MergeTreeIndexAggregatorUSearch<unum::usearch::metric_kind_t::cos_k>>(index.name, index.sample_block, scalar_kind);
std::unreachable();
}
MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const
{
return std::make_shared<MergeTreeIndexConditionUSearch>(index, query, distance_function, context);
};
MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const ActionsDAG *, ContextPtr) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG");
}
MergeTreeIndexPtr usearchIndexCreator(const IndexDescription & index)
{
static constexpr auto default_distance_function = DISTANCE_FUNCTION_L2;
String distance_function = default_distance_function;
if (!index.arguments.empty())
distance_function = index.arguments[0].safeGet<String>();
static constexpr auto default_scalar_kind = unum::usearch::scalar_kind_t::f16_k;
auto scalar_kind = default_scalar_kind;
if (index.arguments.size() > 1)
scalar_kind = nameToScalarKind.at(index.arguments[1].safeGet<String>());
return std::make_shared<MergeTreeIndexUSearch>(index, distance_function, scalar_kind);
}
void usearchIndexValidator(const IndexDescription & index, bool /* attach */)
{
/// Check number and type of USearch index arguments:
if (index.arguments.size() > 2)
throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index must not have more than one parameters");
if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of USearch index (distance function) must be of type String");
if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of USearch index (scalar type) must be of type String");
/// Check that the index is created on a single column
if (index.column_names.size() != 1 || index.data_types.size() != 1)
throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "USearch indexes must be created on a single column");
/// Check that a supported metric was passed as first argument
if (!index.arguments.empty())
{
String distance_name = index.arguments[0].safeGet<String>();
if (distance_name != DISTANCE_FUNCTION_L2 && distance_name != DISTANCE_FUNCTION_COSINE)
throw Exception(ErrorCodes::INCORRECT_DATA, "USearch index only supports distance functions '{}' and '{}'", DISTANCE_FUNCTION_L2, DISTANCE_FUNCTION_COSINE);
}
/// Check that a supported kind was passed as a second argument
if (index.arguments.size() > 1 && !nameToScalarKind.contains(index.arguments[1].safeGet<String>()))
{
String supported_kinds;
for (const auto & [name, kind] : nameToScalarKind)
{
if (!supported_kinds.empty())
supported_kinds += ", ";
supported_kinds += name;
}
throw Exception(ErrorCodes::INCORRECT_DATA, "Unrecognized scalar kind (second argument) for USearch index. Supported kinds are: {}", supported_kinds);
}
/// Check data type of indexed column:
auto throw_unsupported_underlying_column_exception = []()
{
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"USearch can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
};
DataTypePtr data_type = index.sample_block.getDataTypes()[0];
if (const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get()))
{
TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
if (!WhichDataType(nested_type_index).isFloat32())
throw_unsupported_underlying_column_exception();
}
else if (const auto * data_type_tuple = typeid_cast<const DataTypeTuple *>(data_type.get()))
{
const DataTypes & inner_types = data_type_tuple->getElements();
for (const auto & inner_type : inner_types)
{
TypeIndex nested_type_index = inner_type->getTypeId();
if (!WhichDataType(nested_type_index).isFloat32())
throw_unsupported_underlying_column_exception();
}
}
else
throw_unsupported_underlying_column_exception();
}
}
#endif

View File

@ -1,116 +0,0 @@
#pragma once
#ifdef ENABLE_USEARCH
#include <Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h>
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#include <usearch/index_dense.hpp>
#pragma clang diagnostic pop
namespace DB
{
using USearchImplType = unum::usearch::index_dense_gt</* key_at */ uint32_t, /* compressed_slot_at */ uint32_t>;
template <unum::usearch::metric_kind_t Metric>
class USearchIndexWithSerialization : public USearchImplType
{
using Base = USearchImplType;
public:
USearchIndexWithSerialization(size_t dimensions, unum::usearch::scalar_kind_t scalar_kind);
void serialize(WriteBuffer & ostr) const;
void deserialize(ReadBuffer & istr);
size_t getDimensions() const;
};
template <unum::usearch::metric_kind_t Metric>
using USearchIndexWithSerializationPtr = std::shared_ptr<USearchIndexWithSerialization<Metric>>;
template <unum::usearch::metric_kind_t Metric>
struct MergeTreeIndexGranuleUSearch final : public IMergeTreeIndexGranule
{
MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_, unum::usearch::scalar_kind_t scalar_kind_);
MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_, unum::usearch::scalar_kind_t scalar_kind_, USearchIndexWithSerializationPtr<Metric> index_);
~MergeTreeIndexGranuleUSearch() override = default;
void serializeBinary(WriteBuffer & ostr) const override;
void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
bool empty() const override { return !index.get(); }
const String index_name;
const Block index_sample_block;
const unum::usearch::scalar_kind_t scalar_kind;
USearchIndexWithSerializationPtr<Metric> index;
};
template <unum::usearch::metric_kind_t Metric>
struct MergeTreeIndexAggregatorUSearch final : IMergeTreeIndexAggregator
{
MergeTreeIndexAggregatorUSearch(const String & index_name_, const Block & index_sample_block, unum::usearch::scalar_kind_t scalar_kind_);
~MergeTreeIndexAggregatorUSearch() override = default;
bool empty() const override { return !index || index->size() == 0; }
MergeTreeIndexGranulePtr getGranuleAndReset() override;
void update(const Block & block, size_t * pos, size_t limit) override;
const String index_name;
const Block index_sample_block;
const unum::usearch::scalar_kind_t scalar_kind;
USearchIndexWithSerializationPtr<Metric> index;
};
class MergeTreeIndexConditionUSearch final : public IMergeTreeIndexConditionApproximateNearestNeighbor
{
public:
MergeTreeIndexConditionUSearch(
const IndexDescription & index_description,
const SelectQueryInfo & query,
const String & distance_function,
ContextPtr context);
~MergeTreeIndexConditionUSearch() override = default;
bool alwaysUnknownOrTrue() const override;
bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
std::vector<size_t> getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override;
private:
template <unum::usearch::metric_kind_t Metric>
std::vector<size_t> getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const;
const ApproximateNearestNeighborCondition ann_condition;
const String distance_function;
};
class MergeTreeIndexUSearch : public IMergeTreeIndex
{
public:
MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_, unum::usearch::scalar_kind_t scalar_kind_);
~MergeTreeIndexUSearch() override = default;
MergeTreeIndexGranulePtr createIndexGranule() const override;
MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override;
MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const;
MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override;
bool isVectorSearch() const override { return true; }
private:
const String distance_function;
const unum::usearch::scalar_kind_t scalar_kind;
};
}
#endif

View File

@ -0,0 +1,492 @@
#include <Storages/MergeTree/MergeTreeIndexVectorSimilarity.h>
#if USE_USEARCH
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#include <Columns/ColumnArray.h>
#include <Common/BitHelpers.h>
#include <Common/formatReadable.h>
#include <Common/logger_useful.h>
#include <Common/typeid_cast.h>
#include <Core/Field.h>
#include <DataTypes/DataTypeArray.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <Interpreters/castColumn.h>
namespace ProfileEvents
{
extern const Event USearchAddCount;
extern const Event USearchAddVisitedMembers;
extern const Event USearchAddComputedDistances;
extern const Event USearchSearchCount;
extern const Event USearchSearchVisitedMembers;
extern const Event USearchSearchComputedDistances;
}
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
extern const int FORMAT_VERSION_TOO_OLD;
extern const int ILLEGAL_COLUMN;
extern const int INCORRECT_DATA;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int INCORRECT_QUERY;
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
}
namespace
{
/// The only indexing method currently supported by USearch
std::set<String> methods = {"hnsw"};
/// Maps from user-facing name to internal name
std::unordered_map<String, unum::usearch::metric_kind_t> distanceFunctionToMetricKind = {
{"L2Distance", unum::usearch::metric_kind_t::l2sq_k},
{"cosineDistance", unum::usearch::metric_kind_t::cos_k}};
/// Maps from user-facing name to internal name
std::unordered_map<String, unum::usearch::scalar_kind_t> quantizationToScalarKind = {
{"f32", unum::usearch::scalar_kind_t::f32_k},
{"f16", unum::usearch::scalar_kind_t::f16_k},
{"i8", unum::usearch::scalar_kind_t::i8_k}};
template<typename T>
concept is_set = std::same_as<T, std::set<typename T::key_type, typename T::key_compare, typename T::allocator_type>>;
template<typename T>
concept is_unordered_map = std::same_as<T, std::unordered_map<typename T::key_type, typename T::mapped_type, typename T::hasher, typename T::key_equal, typename T::allocator_type>>;
template <typename T>
String joinByComma(const T & t)
{
if constexpr (is_set<T>)
{
return fmt::format("{}", fmt::join(t, ", "));
}
else if constexpr (is_unordered_map<T>)
{
String joined_keys;
for (const auto & [k, _] : t)
{
if (!joined_keys.empty())
joined_keys += ", ";
joined_keys += k;
}
return joined_keys;
}
/// TODO once our libcxx is recent enough, replace above by
/// return fmt::format("{}", fmt::join(std::views::keys(t)), ", "));
std::unreachable();
}
}
USearchIndexWithSerialization::USearchIndexWithSerialization(
size_t dimensions,
unum::usearch::metric_kind_t metric_kind,
unum::usearch::scalar_kind_t scalar_kind,
UsearchHnswParams usearch_hnsw_params)
: Base(Base::make(unum::usearch::metric_punned_t(dimensions, metric_kind, scalar_kind),
unum::usearch::index_dense_config_t(usearch_hnsw_params.m, usearch_hnsw_params.ef_construction, usearch_hnsw_params.ef_search)))
{
}
void USearchIndexWithSerialization::serialize(WriteBuffer & ostr) const
{
auto callback = [&ostr](void * from, size_t n)
{
ostr.write(reinterpret_cast<const char *>(from), n);
return true;
};
auto result = Base::save_to_stream(callback);
if (result.error)
throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not save vector similarity index, error: " + String(result.error.release()));
}
void USearchIndexWithSerialization::deserialize(ReadBuffer & istr)
{
auto callback = [&istr](void * from, size_t n)
{
istr.readStrict(reinterpret_cast<char *>(from), n);
return true;
};
auto result = Base::load_from_stream(callback);
if (result.error)
/// See the comment in MergeTreeIndexGranuleVectorSimilarity::deserializeBinary why we throw here
throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not load vector similarity index, error: " + String(result.error.release()) + " Please drop the index and create it again.");
}
USearchIndexWithSerialization::Statistics USearchIndexWithSerialization::getStatistics() const
{
Statistics statistics = {
.max_level = max_level(),
.connectivity = connectivity(),
.size = size(), /// number of vectors
.capacity = capacity(), /// number of vectors reserved
.memory_usage = memory_usage(), /// in bytes, the value is not exact
.bytes_per_vector = bytes_per_vector(),
.scalar_words = scalar_words(),
.statistics = stats()};
return statistics;
}
MergeTreeIndexGranuleVectorSimilarity::MergeTreeIndexGranuleVectorSimilarity(
const String & index_name_,
const Block & index_sample_block_,
unum::usearch::metric_kind_t metric_kind_,
unum::usearch::scalar_kind_t scalar_kind_,
UsearchHnswParams usearch_hnsw_params_)
: MergeTreeIndexGranuleVectorSimilarity(index_name_, index_sample_block_, metric_kind_, scalar_kind_, usearch_hnsw_params_, nullptr)
{
}
MergeTreeIndexGranuleVectorSimilarity::MergeTreeIndexGranuleVectorSimilarity(
const String & index_name_,
const Block & index_sample_block_,
unum::usearch::metric_kind_t metric_kind_,
unum::usearch::scalar_kind_t scalar_kind_,
UsearchHnswParams usearch_hnsw_params_,
USearchIndexWithSerializationPtr index_)
: index_name(index_name_)
, index_sample_block(index_sample_block_)
, metric_kind(metric_kind_)
, scalar_kind(scalar_kind_)
, usearch_hnsw_params(usearch_hnsw_params_)
, index(std::move(index_))
{
}
void MergeTreeIndexGranuleVectorSimilarity::serializeBinary(WriteBuffer & ostr) const
{
if (empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to write empty minmax index {}", backQuote(index_name));
writeIntBinary(FILE_FORMAT_VERSION, ostr);
/// Number of dimensions is required in the index constructor,
/// so it must be written and read separately from the other part
writeIntBinary(static_cast<UInt64>(index->dimensions()), ostr);
index->serialize(ostr);
auto statistics = index->getStatistics();
LOG_TRACE(logger, "Wrote vector similarity index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}",
statistics.max_level, statistics.connectivity, statistics.size, statistics.capacity, ReadableSize(statistics.memory_usage));
}
void MergeTreeIndexGranuleVectorSimilarity::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
{
UInt64 file_version;
readIntBinary(file_version, istr);
if (file_version != FILE_FORMAT_VERSION)
throw Exception(
ErrorCodes::FORMAT_VERSION_TOO_OLD,
"Vector similarity index could not be loaded because its version is too old (current version: {}, persisted version: {}). Please drop the index and create it again.",
FILE_FORMAT_VERSION, file_version);
/// More fancy error handling would be: Set a flag on the index that it failed to load. During usage return all granules, i.e.
/// behave as if the index does not exist. Since format changes are expected to happen only rarely and it is "only" an index, keep it simple for now.
UInt64 dimension;
readIntBinary(dimension, istr);
index = std::make_shared<USearchIndexWithSerialization>(dimension, metric_kind, scalar_kind, usearch_hnsw_params);
index->deserialize(istr);
auto statistics = index->getStatistics();
LOG_TRACE(logger, "Loaded vector similarity index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}",
statistics.max_level, statistics.connectivity, statistics.size, statistics.capacity, ReadableSize(statistics.memory_usage));
}
MergeTreeIndexAggregatorVectorSimilarity::MergeTreeIndexAggregatorVectorSimilarity(
const String & index_name_,
const Block & index_sample_block_,
unum::usearch::metric_kind_t metric_kind_,
unum::usearch::scalar_kind_t scalar_kind_,
UsearchHnswParams usearch_hnsw_params_)
: index_name(index_name_)
, index_sample_block(index_sample_block_)
, metric_kind(metric_kind_)
, scalar_kind(scalar_kind_)
, usearch_hnsw_params(usearch_hnsw_params_)
{
}
MergeTreeIndexGranulePtr MergeTreeIndexAggregatorVectorSimilarity::getGranuleAndReset()
{
auto granule = std::make_shared<MergeTreeIndexGranuleVectorSimilarity>(index_name, index_sample_block, metric_kind, scalar_kind, usearch_hnsw_params, index);
index = nullptr;
return granule;
}
void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_t * pos, size_t limit)
{
if (*pos >= block.rows())
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"The provided position is not less than the number of block rows. Position: {}, Block rows: {}.",
*pos,
block.rows());
size_t rows_read = std::min(limit, block.rows() - *pos);
if (rows_read == 0)
return;
if (rows_read > std::numeric_limits<UInt32>::max())
throw Exception(ErrorCodes::INCORRECT_DATA, "Index granularity is too big: more than {} rows per index granule.", std::numeric_limits<UInt32>::max());
if (index_sample_block.columns() > 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column");
const String & index_column_name = index_sample_block.getByPosition(0).name;
ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read);
if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
{
const auto & column_array_data = column_array->getData();
const auto & column_array_data_float = typeid_cast<const ColumnFloat32 &>(column_array_data);
const auto & column_array_data_float_data = column_array_data_float.getData();
const auto & column_array_offsets = column_array->getOffsets();
const size_t num_rows = column_array_offsets.size();
if (column_array->empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty");
/// The vector similarity algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
/// are INSERTed into an vector-similarity-indexed column or if no value was specified at all in which case the arrays take on their default
/// values which is also empty.
if (column_array->isDefaultAt(0))
throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
/// Check all sizes are the same
const size_t dimensions = column_array_offsets[0];
for (size_t i = 0; i < num_rows - 1; ++i)
if (column_array_offsets[i + 1] - column_array_offsets[i] != dimensions)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
/// Also check that previously inserted blocks have the same size as this block.
/// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
/// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
if (index && index->dimensions() != dimensions)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
if (!index)
index = std::make_shared<USearchIndexWithSerialization>(dimensions, metric_kind, scalar_kind, usearch_hnsw_params);
/// Reserving space is mandatory
if (!index->reserve(roundUpToPowerOfTwoOrZero(index->size() + num_rows)))
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index");
for (size_t row = 0; row < num_rows; ++row)
{
auto rc = index->add(static_cast<UInt32>(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]);
if (!rc)
throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index, error: " + String(rc.error.release()));
ProfileEvents::increment(ProfileEvents::USearchAddCount);
ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members);
ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, rc.computed_distances);
}
}
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array(Float32) column");
*pos += rows_read;
}
MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity(
const IndexDescription & /*index_description*/,
const SelectQueryInfo & query,
unum::usearch::metric_kind_t metric_kind_,
ContextPtr context)
: vector_similarity_condition(query, context)
, metric_kind(metric_kind_)
{
}
bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes");
}
bool MergeTreeIndexConditionVectorSimilarity::alwaysUnknownOrTrue() const
{
String index_distance_function;
switch (metric_kind)
{
case unum::usearch::metric_kind_t::l2sq_k: index_distance_function = "L2Distance"; break;
case unum::usearch::metric_kind_t::cos_k: index_distance_function = "cosineDistance"; break;
default: std::unreachable();
}
return vector_similarity_condition.alwaysUnknownOrTrue(index_distance_function);
}
std::vector<size_t> MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(MergeTreeIndexGranulePtr granule_) const
{
const UInt64 limit = vector_similarity_condition.getLimit();
const UInt64 index_granularity = vector_similarity_condition.getIndexGranularity();
const auto granule = std::dynamic_pointer_cast<MergeTreeIndexGranuleVectorSimilarity>(granule_);
if (granule == nullptr)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type");
const USearchIndexWithSerializationPtr index = granule->index;
if (vector_similarity_condition.getDimensions() != index->dimensions())
throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) "
"does not match the dimension in the index ({})",
vector_similarity_condition.getDimensions(), index->dimensions());
const std::vector<float> reference_vector = vector_similarity_condition.getReferenceVector();
auto result = index->search(reference_vector.data(), limit);
if (result.error)
throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index, error: " + String(result.error.release()));
ProfileEvents::increment(ProfileEvents::USearchSearchCount);
ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, result.visited_members);
ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, result.computed_distances);
std::vector<USearchIndex::key_t> neighbors(result.size()); /// indexes of dots which were closest to the reference vector
std::vector<USearchIndex::distance_t> distances(result.size());
result.dump_to(neighbors.data(), distances.data());
std::vector<size_t> granules;
granules.reserve(neighbors.size());
for (auto neighbor : neighbors)
granules.push_back(neighbor / index_granularity);
/// make unique
std::sort(granules.begin(), granules.end());
granules.erase(std::unique(granules.begin(), granules.end()), granules.end());
return granules;
}
MergeTreeIndexVectorSimilarity::MergeTreeIndexVectorSimilarity(
const IndexDescription & index_,
unum::usearch::metric_kind_t metric_kind_,
unum::usearch::scalar_kind_t scalar_kind_,
UsearchHnswParams usearch_hnsw_params_)
: IMergeTreeIndex(index_)
, metric_kind(metric_kind_)
, scalar_kind(scalar_kind_)
, usearch_hnsw_params(usearch_hnsw_params_)
{
}
MergeTreeIndexGranulePtr MergeTreeIndexVectorSimilarity::createIndexGranule() const
{
return std::make_shared<MergeTreeIndexGranuleVectorSimilarity>(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params);
}
MergeTreeIndexAggregatorPtr MergeTreeIndexVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const
{
return std::make_shared<MergeTreeIndexAggregatorVectorSimilarity>(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params);
}
MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const
{
return std::make_shared<MergeTreeIndexConditionVectorSimilarity>(index, query, metric_kind, context);
};
MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const ActionsDAG *, ContextPtr) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG");
}
MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index)
{
const bool has_six_args = (index.arguments.size() == 6);
unum::usearch::metric_kind_t metric_kind = distanceFunctionToMetricKind.at(index.arguments[1].safeGet<String>());
/// use defaults for the other parameters
unum::usearch::scalar_kind_t scalar_kind = unum::usearch::scalar_kind_t::f32_k;
UsearchHnswParams usearch_hnsw_params;
if (has_six_args)
{
scalar_kind = quantizationToScalarKind.at(index.arguments[2].safeGet<String>());
usearch_hnsw_params = {.m = index.arguments[3].safeGet<UInt64>(),
.ef_construction = index.arguments[4].safeGet<UInt64>(),
.ef_search = index.arguments[5].safeGet<UInt64>()};
}
return std::make_shared<MergeTreeIndexVectorSimilarity>(index, metric_kind, scalar_kind, usearch_hnsw_params);
}
void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* attach */)
{
const bool has_two_args = (index.arguments.size() == 2);
const bool has_six_args = (index.arguments.size() == 6);
/// Check number and type of arguments
if (!has_two_args && !has_six_args)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index must have two or six arguments");
if (index.arguments[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of vector similarity index (method) must be of type String");
if (index.arguments[1].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of vector similarity index (metric) must be of type String");
if (has_six_args)
{
if (index.arguments[2].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Third argument of vector similarity index (quantization) must be of type String");
if (index.arguments[3].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Fourth argument of vector similarity index (M) must be of type UInt64");
if (index.arguments[4].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Fifth argument of vector similarity index (ef_construction) must be of type UInt64");
if (index.arguments[5].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Sixth argument of vector similarity index (ef_search) must be of type UInt64");
}
/// Check that passed arguments are supported
if (!methods.contains(index.arguments[0].safeGet<String>()))
throw Exception(ErrorCodes::INCORRECT_DATA, "First argument (method) of vector similarity index is not supported. Supported methods are: {}", joinByComma(methods));
if (!distanceFunctionToMetricKind.contains(index.arguments[1].safeGet<String>()))
throw Exception(ErrorCodes::INCORRECT_DATA, "Second argument (distance function) of vector similarity index is not supported. Supported distance function are: {}", joinByComma(distanceFunctionToMetricKind));
if (has_six_args)
{
if (!quantizationToScalarKind.contains(index.arguments[2].safeGet<String>()))
throw Exception(ErrorCodes::INCORRECT_DATA, "Third argument (quantization) of vector similarity index is not supported. Supported quantizations are: {}", joinByComma(quantizationToScalarKind));
if (index.arguments[3].safeGet<UInt64>() < 2)
throw Exception(ErrorCodes::INCORRECT_DATA, "Fourth argument (M) of vector similarity index must be > 1");
if (index.arguments[4].safeGet<UInt64>() < 1)
throw Exception(ErrorCodes::INCORRECT_DATA, "Fifth argument (ef_construction) of vector similarity index must be > 0");
if (index.arguments[5].safeGet<UInt64>() < 1)
throw Exception(ErrorCodes::INCORRECT_DATA, "Sixth argument (ef_search) of vector similarity index must be > 0");
}
/// Check that the index is created on a single column
if (index.column_names.size() != 1 || index.data_types.size() != 1)
throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Vector similarity indexes must be created on a single column");
/// Check data type of the indexed column:
DataTypePtr data_type = index.sample_block.getDataTypes()[0];
if (const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get()))
{
TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
if (!WhichDataType(nested_type_index).isFloat32())
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)");
}
else
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)");
}
}
}
#endif

View File

@ -0,0 +1,172 @@
#pragma once
#include "config.h"
#if USE_USEARCH
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
# include <Storages/MergeTree/VectorSimilarityCondition.h>
# include <Common/Logger.h>
# include <usearch/index_dense.hpp>
#pragma clang diagnostic pop
namespace DB
{
struct UsearchHnswParams
{
size_t m = unum::usearch::default_connectivity();
size_t ef_construction = unum::usearch::default_expansion_add();
size_t ef_search = unum::usearch::default_expansion_search();
};
using USearchIndex = unum::usearch::index_dense_gt</*key_at*/ uint32_t, /*compressed_slot_at*/ uint32_t>;
class USearchIndexWithSerialization : public USearchIndex
{
using Base = USearchIndex;
public:
USearchIndexWithSerialization(
size_t dimensions,
unum::usearch::metric_kind_t metric_kind,
unum::usearch::scalar_kind_t scalar_kind,
UsearchHnswParams usearch_hnsw_params);
void serialize(WriteBuffer & ostr) const;
void deserialize(ReadBuffer & istr);
struct Statistics
{
size_t max_level;
size_t connectivity;
size_t size;
size_t capacity;
size_t memory_usage;
/// advanced stats:
size_t bytes_per_vector;
size_t scalar_words;
Base::stats_t statistics;
};
Statistics getStatistics() const;
};
using USearchIndexWithSerializationPtr = std::shared_ptr<USearchIndexWithSerialization>;
struct MergeTreeIndexGranuleVectorSimilarity final : public IMergeTreeIndexGranule
{
MergeTreeIndexGranuleVectorSimilarity(
const String & index_name_,
const Block & index_sample_block_,
unum::usearch::metric_kind_t metric_kind_,
unum::usearch::scalar_kind_t scalar_kind_,
UsearchHnswParams usearch_hnsw_params_);
MergeTreeIndexGranuleVectorSimilarity(
const String & index_name_,
const Block & index_sample_block_,
unum::usearch::metric_kind_t metric_kind_,
unum::usearch::scalar_kind_t scalar_kind_,
UsearchHnswParams usearch_hnsw_params_,
USearchIndexWithSerializationPtr index_);
~MergeTreeIndexGranuleVectorSimilarity() override = default;
void serializeBinary(WriteBuffer & ostr) const override;
void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
bool empty() const override { return !index || index->size() == 0; }
const String index_name;
const Block index_sample_block;
const unum::usearch::metric_kind_t metric_kind;
const unum::usearch::scalar_kind_t scalar_kind;
const UsearchHnswParams usearch_hnsw_params;
USearchIndexWithSerializationPtr index;
LoggerPtr logger = getLogger("VectorSimilarityIndex");
private:
/// The version of the persistence format of USearch index. Increment whenever you change the format.
/// Note: USearch prefixes the serialized data with its own version header. We can't rely on that because 1. the index in ClickHouse
/// is (at least in theory) agnostic of specific vector search libraries, and 2. additional data (e.g. the number of dimensions)
/// outside USearch exists which we should version separately.
static constexpr UInt64 FILE_FORMAT_VERSION = 1;
};
struct MergeTreeIndexAggregatorVectorSimilarity final : IMergeTreeIndexAggregator
{
MergeTreeIndexAggregatorVectorSimilarity(
const String & index_name_,
const Block & index_sample_block,
unum::usearch::metric_kind_t metric_kind_,
unum::usearch::scalar_kind_t scalar_kind_,
UsearchHnswParams usearch_hnsw_params_);
~MergeTreeIndexAggregatorVectorSimilarity() override = default;
bool empty() const override { return !index || index->size() == 0; }
MergeTreeIndexGranulePtr getGranuleAndReset() override;
void update(const Block & block, size_t * pos, size_t limit) override;
const String index_name;
const Block index_sample_block;
const unum::usearch::metric_kind_t metric_kind;
const unum::usearch::scalar_kind_t scalar_kind;
const UsearchHnswParams usearch_hnsw_params;
USearchIndexWithSerializationPtr index;
};
class MergeTreeIndexConditionVectorSimilarity final : public IMergeTreeIndexCondition
{
public:
MergeTreeIndexConditionVectorSimilarity(
const IndexDescription & index_description,
const SelectQueryInfo & query,
unum::usearch::metric_kind_t metric_kind_,
ContextPtr context);
~MergeTreeIndexConditionVectorSimilarity() override = default;
bool alwaysUnknownOrTrue() const override;
bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const override;
std::vector<size_t> getUsefulRanges(MergeTreeIndexGranulePtr granule) const override;
private:
const VectorSimilarityCondition vector_similarity_condition;
const unum::usearch::metric_kind_t metric_kind;
};
class MergeTreeIndexVectorSimilarity : public IMergeTreeIndex
{
public:
MergeTreeIndexVectorSimilarity(
const IndexDescription & index_,
unum::usearch::metric_kind_t metric_kind_,
unum::usearch::scalar_kind_t scalar_kind_,
UsearchHnswParams usearch_hnsw_params_);
~MergeTreeIndexVectorSimilarity() override = default;
MergeTreeIndexGranulePtr createIndexGranule() const override;
MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override;
MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const;
MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override;
bool isVectorSimilarityIndex() const override { return true; }
private:
const unum::usearch::metric_kind_t metric_kind;
const unum::usearch::scalar_kind_t scalar_kind;
const UsearchHnswParams usearch_hnsw_params;
};
}
#endif

View File

@ -127,15 +127,21 @@ MergeTreeIndexFactory::MergeTreeIndexFactory()
registerCreator("hypothesis", hypothesisIndexCreator);
registerValidator("hypothesis", hypothesisIndexValidator);
#ifdef ENABLE_ANNOY
registerCreator("annoy", annoyIndexCreator);
registerValidator("annoy", annoyIndexValidator);
#endif
#ifdef ENABLE_USEARCH
registerCreator("usearch", usearchIndexCreator);
registerValidator("usearch", usearchIndexValidator);
#if USE_USEARCH
registerCreator("vector_similarity", vectorSimilarityIndexCreator);
registerValidator("vector_similarity", vectorSimilarityIndexValidator);
#endif
/// ------
/// TODO: remove this block at the end of 2024.
/// Index types 'annoy' and 'usearch' are no longer supported as of June 2024. Their successor is index type 'vector_similarity'.
/// To support loading tables with old indexes during a transition period, register dummy indexes which allow load/attaching but
/// throw an exception when the user attempts to use them.
registerCreator("annoy", legacyVectorSimilarityIndexCreator);
registerValidator("annoy", legacyVectorSimilarityIndexValidator);
registerCreator("usearch", legacyVectorSimilarityIndexCreator);
registerValidator("usearch", legacyVectorSimilarityIndexValidator);
/// ------
registerCreator("inverted", fullTextIndexCreator);
registerValidator("inverted", fullTextIndexValidator);

View File

@ -15,6 +15,7 @@
#include <Interpreters/ExpressionActions.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include "config.h"
constexpr auto INDEX_FILE_PREFIX = "skp_idx_";
@ -92,6 +93,13 @@ public:
virtual bool alwaysUnknownOrTrue() const = 0;
virtual bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const = 0;
/// Special stuff for vector similarity indexes
/// - Returns vector of indexes of ranges in granule which are useful for query.
virtual std::vector<size_t> getUsefulRanges(MergeTreeIndexGranulePtr) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for non-vector-similarity indexes.");
}
};
using MergeTreeIndexConditionPtr = std::shared_ptr<IMergeTreeIndexCondition>;
@ -169,7 +177,7 @@ struct IMergeTreeIndex
virtual MergeTreeIndexConditionPtr createIndexCondition(
const ActionsDAG * filter_actions_dag, ContextPtr context) const = 0;
virtual bool isVectorSearch() const { return false; }
virtual bool isVectorSimilarityIndex() const { return false; }
virtual MergeTreeIndexMergedConditionPtr createIndexMergedCondition(
const SelectQueryInfo & /*query_info*/, StorageMetadataPtr /*storage_metadata*/) const
@ -230,17 +238,15 @@ void bloomFilterIndexValidator(const IndexDescription & index, bool attach);
MergeTreeIndexPtr hypothesisIndexCreator(const IndexDescription & index);
void hypothesisIndexValidator(const IndexDescription & index, bool attach);
#ifdef ENABLE_ANNOY
MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index);
void annoyIndexValidator(const IndexDescription & index, bool attach);
#if USE_USEARCH
MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index);
void vectorSimilarityIndexValidator(const IndexDescription & index, bool attach);
#endif
#ifdef ENABLE_USEARCH
MergeTreeIndexPtr usearchIndexCreator(const IndexDescription& index);
void usearchIndexValidator(const IndexDescription& index, bool attach);
#endif
MergeTreeIndexPtr legacyVectorSimilarityIndexCreator(const IndexDescription & index);
void legacyVectorSimilarityIndexValidator(const IndexDescription & index, bool attach);
MergeTreeIndexPtr fullTextIndexCreator(const IndexDescription& index);
void fullTextIndexValidator(const IndexDescription& index, bool attach);
MergeTreeIndexPtr fullTextIndexCreator(const IndexDescription & index);
void fullTextIndexValidator(const IndexDescription & index, bool attach);
}

View File

@ -0,0 +1,350 @@
#include <Storages/MergeTree/VectorSimilarityCondition.h>
#include <Core/Settings.h>
#include <Interpreters/Context.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTOrderByElement.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Storages/MergeTree/KeyCondition.h>
#include <Storages/MergeTree/MergeTreeSettings.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int INCORRECT_QUERY;
}
namespace
{
template <typename Literal>
void extractReferenceVectorFromLiteral(std::vector<Float32> & reference_vector, Literal literal)
{
Float64 float_element_of_reference_vector;
Int64 int_element_of_reference_vector;
for (const auto & value : literal.value())
{
if (value.tryGet(float_element_of_reference_vector))
reference_vector.emplace_back(float_element_of_reference_vector);
else if (value.tryGet(int_element_of_reference_vector))
reference_vector.emplace_back(static_cast<float>(int_element_of_reference_vector));
else
throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong type of elements in reference vector. Only float or int are supported.");
}
}
VectorSimilarityCondition::Info::DistanceFunction stringToDistanceFunction(std::string_view distance_function)
{
if (distance_function == "L2Distance")
return VectorSimilarityCondition::Info::DistanceFunction::L2;
else
return VectorSimilarityCondition::Info::DistanceFunction::Unknown;
}
}
VectorSimilarityCondition::VectorSimilarityCondition(const SelectQueryInfo & query_info, ContextPtr context)
: block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context))
, index_granularity(context->getMergeTreeSettings().index_granularity)
, max_limit_for_ann_queries(context->getSettingsRef().max_limit_for_ann_queries)
, index_is_useful(checkQueryStructure(query_info))
{}
bool VectorSimilarityCondition::alwaysUnknownOrTrue(String distance_function) const
{
if (!index_is_useful)
return true; /// query isn't supported
/// If query is supported, check if distance function of index is the same as distance function in query
return !(stringToDistanceFunction(distance_function) == query_information->distance_function);
}
UInt64 VectorSimilarityCondition::getLimit() const
{
if (index_is_useful && query_information.has_value())
return query_information->limit;
throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported");
}
std::vector<float> VectorSimilarityCondition::getReferenceVector() const
{
if (index_is_useful && query_information.has_value())
return query_information->reference_vector;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Reference vector was requested for useless or uninitialized index.");
}
size_t VectorSimilarityCondition::getDimensions() const
{
if (index_is_useful && query_information.has_value())
return query_information->reference_vector.size();
throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of dimensions was requested for useless or uninitialized index.");
}
String VectorSimilarityCondition::getColumnName() const
{
if (index_is_useful && query_information.has_value())
return query_information->column_name;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Column name was requested for useless or uninitialized index.");
}
VectorSimilarityCondition::Info::DistanceFunction VectorSimilarityCondition::getDistanceFunction() const
{
if (index_is_useful && query_information.has_value())
return query_information->distance_function;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Distance function was requested for useless or uninitialized index.");
}
bool VectorSimilarityCondition::checkQueryStructure(const SelectQueryInfo & query)
{
Info order_by_info;
/// Build rpns for query sections
const auto & select = query.query->as<ASTSelectQuery &>();
RPN rpn_order_by;
RPNElement rpn_limit;
UInt64 limit;
if (select.limitLength())
traverseAtomAST(select.limitLength(), rpn_limit);
if (select.orderBy())
traverseOrderByAST(select.orderBy(), rpn_order_by);
/// Reverse RPNs for conveniences during parsing
std::reverse(rpn_order_by.begin(), rpn_order_by.end());
const bool order_by_is_valid = matchRPNOrderBy(rpn_order_by, order_by_info);
const bool limit_is_valid = matchRPNLimit(rpn_limit, limit);
if (!limit_is_valid || limit > max_limit_for_ann_queries)
return false;
if (order_by_is_valid)
{
query_information = std::move(order_by_info);
query_information->limit = limit;
return true;
}
return false;
}
void VectorSimilarityCondition::traverseAST(const ASTPtr & node, RPN & rpn)
{
/// If the node is ASTFunction, it may have children nodes
if (const auto * func = node->as<ASTFunction>())
{
const ASTs & children = func->arguments->children;
/// Traverse children nodes
for (const auto& child : children)
traverseAST(child, rpn);
}
RPNElement element;
/// Get the data behind node
if (!traverseAtomAST(node, element))
element.function = RPNElement::FUNCTION_UNKNOWN;
rpn.emplace_back(std::move(element));
}
bool VectorSimilarityCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out)
{
/// Match Functions
if (const auto * function = node->as<ASTFunction>())
{
/// Set the name
out.func_name = function->name;
if (function->name == "L1Distance" ||
function->name == "L2Distance" ||
function->name == "LinfDistance" ||
function->name == "cosineDistance" ||
function->name == "dotProduct")
out.function = RPNElement::FUNCTION_DISTANCE;
else if (function->name == "array")
out.function = RPNElement::FUNCTION_ARRAY;
else if (function->name == "_CAST")
out.function = RPNElement::FUNCTION_CAST;
else
return false;
return true;
}
/// Match identifier
else if (const auto * identifier = node->as<ASTIdentifier>())
{
out.function = RPNElement::FUNCTION_IDENTIFIER;
out.identifier.emplace(identifier->name());
out.func_name = "column identifier";
return true;
}
/// Check if we have constants behind the node
return tryCastToConstType(node, out);
}
bool VectorSimilarityCondition::tryCastToConstType(const ASTPtr & node, RPNElement & out)
{
Field const_value;
DataTypePtr const_type;
if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
{
/// Check for constant types
if (const_value.getType() == Field::Types::Float64)
{
out.function = RPNElement::FUNCTION_FLOAT_LITERAL;
out.float_literal.emplace(const_value.safeGet<Float32>());
out.func_name = "Float literal";
return true;
}
if (const_value.getType() == Field::Types::UInt64)
{
out.function = RPNElement::FUNCTION_INT_LITERAL;
out.int_literal.emplace(const_value.safeGet<UInt64>());
out.func_name = "Int literal";
return true;
}
if (const_value.getType() == Field::Types::Int64)
{
out.function = RPNElement::FUNCTION_INT_LITERAL;
out.int_literal.emplace(const_value.safeGet<Int64>());
out.func_name = "Int literal";
return true;
}
if (const_value.getType() == Field::Types::Array)
{
out.function = RPNElement::FUNCTION_LITERAL_ARRAY;
out.array_literal = const_value.safeGet<Array>();
out.func_name = "Array literal";
return true;
}
if (const_value.getType() == Field::Types::String)
{
out.function = RPNElement::FUNCTION_STRING_LITERAL;
out.func_name = const_value.safeGet<String>();
return true;
}
}
return false;
}
void VectorSimilarityCondition::traverseOrderByAST(const ASTPtr & node, RPN & rpn)
{
if (const auto * expr_list = node->as<ASTExpressionList>())
if (const auto * order_by_element = expr_list->children.front()->as<ASTOrderByElement>())
traverseAST(order_by_element->children.front(), rpn);
}
/// Returns true and stores ANNExpr if the query has valid ORDERBY clause
bool VectorSimilarityCondition::matchRPNOrderBy(RPN & rpn, Info & info)
{
/// ORDER BY clause must have at least 3 expressions
if (rpn.size() < 3)
return false;
auto iter = rpn.begin();
auto end = rpn.end();
bool identifier_found = false;
/// Matches DistanceFunc->[Column]->[ArrayFunc]->ReferenceVector(floats)->[Column]
if (iter->function != RPNElement::FUNCTION_DISTANCE)
return false;
info.distance_function = stringToDistanceFunction(iter->func_name);
++iter;
if (iter->function == RPNElement::FUNCTION_IDENTIFIER)
{
identifier_found = true;
info.column_name = std::move(iter->identifier.value());
++iter;
}
if (iter->function == RPNElement::FUNCTION_ARRAY)
++iter;
if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY)
{
extractReferenceVectorFromLiteral(info.reference_vector, iter->array_literal);
++iter;
}
/// further conditions are possible if there is no array, or no identifier is found
/// the array can be inside a cast function. For other cases, see the loop after this condition
if (iter != end && iter->function == RPNElement::FUNCTION_CAST)
{
++iter;
/// Cast should be made to array
if (!iter->func_name.starts_with("Array"))
return false;
++iter;
if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY)
{
extractReferenceVectorFromLiteral(info.reference_vector, iter->array_literal);
++iter;
}
else
return false;
}
while (iter != end)
{
if (iter->function == RPNElement::FUNCTION_FLOAT_LITERAL ||
iter->function == RPNElement::FUNCTION_INT_LITERAL)
info.reference_vector.emplace_back(getFloatOrIntLiteralOrPanic(iter));
else if (iter->function == RPNElement::FUNCTION_IDENTIFIER)
{
if (identifier_found)
return false;
info.column_name = std::move(iter->identifier.value());
identifier_found = true;
}
else
return false;
++iter;
}
/// Final checks of correctness
return identifier_found && !info.reference_vector.empty();
}
/// Returns true and stores Length if we have valid LIMIT clause in query
bool VectorSimilarityCondition::matchRPNLimit(RPNElement & rpn, UInt64 & limit)
{
if (rpn.function == RPNElement::FUNCTION_INT_LITERAL)
{
limit = rpn.int_literal.value();
return true;
}
return false;
}
/// Gets float or int from AST node
float VectorSimilarityCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter)
{
if (iter->float_literal.has_value())
return iter->float_literal.value();
if (iter->int_literal.has_value())
return static_cast<float>(iter->int_literal.value());
throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong parsed AST in buildRPN\n");
}
}

View File

@ -9,52 +9,9 @@
namespace DB
{
static constexpr auto DISTANCE_FUNCTION_L2 = "L2Distance";
static constexpr auto DISTANCE_FUNCTION_COSINE = "cosineDistance";
/// Approximate Nearest Neighbour queries have a similar structure:
/// - reference vector from which all distances are calculated
/// - metric name (e.g L2Distance, LpDistance, etc.)
/// - name of column with embeddings
/// - type of query
/// - maximum number of returned elements (LIMIT)
///
/// And two optional parameters:
/// - p for LpDistance function
/// - distance to compare with (only for where queries)
///
/// This struct holds all these components.
struct ApproximateNearestNeighborInformation
{
using Embedding = std::vector<float>;
Embedding reference_vector;
enum class Metric : uint8_t
{
Unknown,
L2,
Lp
};
Metric metric;
String column_name;
UInt64 limit;
enum class Type : uint8_t
{
OrderBy,
Where
};
Type type;
float p_for_lp_dist = -1.0;
float distance = -1.0;
};
// Class ANNCondition, is responsible for recognizing if the query is an ANN queries which can utilize ANN indexes. It parses the SQL query
/// and checks if it matches ANNIndexes. Method alwaysUnknownOrTrue returns false if we can speed up the query, and true otherwise. It has
/// only one argument, the name of the metric with which index was built. Two main patterns of queries are supported
/// Class VectorSimilarityCondition is responsible for recognizing if the query can utilize vector similarity indexes.
/// Method alwaysUnknownOrTrue returns false if we can speed up the query, and true otherwise. It has
/// only one argument, the name of the distance function with which index was built. Two main patterns of queries are supported
///
/// - 1. WHERE queries:
/// SELECT * FROM * WHERE DistanceFunc(column, reference_vector) < floatLiteral LIMIT count
@ -64,14 +21,14 @@ struct ApproximateNearestNeighborInformation
///
/// Queries without LIMIT count are not supported
/// If the query is both of type 1. and 2., than we can't use the index and alwaysUnknownOrTrue returns true.
/// reference_vector should have float coordinates, e.g. (0.2, 0.1, .., 0.5)
/// reference_vector should have float coordinates, e.g. [0.2, 0.1, .., 0.5]
///
/// If the query matches one of these two types, then this class extracts the main information needed for ANN indexes from the query.
/// If the query matches one of these two types, then this class extracts the main information needed for vector similarity indexes from the
/// query.
///
/// From matching query it extracts
/// - referenceVector
/// - metricName(DistanceFunction)
/// - dimension size if query uses LpDistance
/// - distance function
/// - distance to compare(ONLY for search types, otherwise you get exception)
/// - spaceDimension(which is referenceVector's components count)
/// - column
@ -79,35 +36,45 @@ struct ApproximateNearestNeighborInformation
/// - queryHasOrderByClause and queryHasWhereClause return true if query matches the type
///
/// Search query type is also recognized for PREWHERE clause
class ApproximateNearestNeighborCondition
class VectorSimilarityCondition
{
public:
ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context);
VectorSimilarityCondition(const SelectQueryInfo & query_info, ContextPtr context);
/// Approximate nearest neighbour (ANN) / vector similarity queries have a similar structure:
/// - reference vector from which all distances are calculated
/// - distance function, e.g L2Distance
/// - name of column with embeddings
/// - type of query
/// - maximum number of returned elements (LIMIT)
///
/// And one optional parameter:
/// - distance to compare with (only for where queries)
///
/// This struct holds all these components.
struct Info
{
enum class DistanceFunction : uint8_t
{
Unknown,
L2
};
std::vector<Float32> reference_vector;
DistanceFunction distance_function;
String column_name;
UInt64 limit;
float distance = -1.0;
};
/// Returns false if query can be speeded up by an ANN index, true otherwise.
bool alwaysUnknownOrTrue(String metric) const;
bool alwaysUnknownOrTrue(String distance_function) const;
/// Returns the distance to compare with for search query
float getComparisonDistanceForWhereQuery() const;
/// Distance should be calculated regarding to referenceVector
std::vector<float> getReferenceVector() const;
/// Reference vector's dimension count
size_t getDimensions() const;
String getColumnName() const;
ApproximateNearestNeighborInformation::Metric getMetricType() const;
/// The P- value if the metric is 'LpDistance'
float getPValueForLpDistance() const;
ApproximateNearestNeighborInformation::Type getQueryType() const;
Info::DistanceFunction getDistanceFunction() const;
UInt64 getIndexGranularity() const { return index_granularity; }
/// Length's value from LIMIT clause
UInt64 getLimit() const;
private:
@ -118,9 +85,6 @@ private:
/// DistanceFunctions
FUNCTION_DISTANCE,
//tuple(0.1, ..., 0.1)
FUNCTION_TUPLE,
//array(0.1, ..., 0.1)
FUNCTION_ARRAY,
@ -139,9 +103,6 @@ private:
/// Unknown, can be any value
FUNCTION_UNKNOWN,
/// (0.1, ...., 0.1) vector without word 'tuple'
FUNCTION_LITERAL_TUPLE,
/// [0.1, ...., 0.1] vector without word 'array'
FUNCTION_LITERAL_ARRAY,
@ -154,19 +115,14 @@ private:
explicit RPNElement(Function function_ = FUNCTION_UNKNOWN)
: function(function_)
, func_name("Unknown")
, float_literal(std::nullopt)
, identifier(std::nullopt)
{}
Function function;
String func_name;
String func_name = "Unknown";
std::optional<float> float_literal;
std::optional<String> identifier;
std::optional<int64_t> int_literal;
std::optional<Tuple> tuple_literal;
std::optional<Array> array_literal;
UInt32 dim = 0;
@ -186,16 +142,16 @@ private:
void traverseOrderByAST(const ASTPtr & node, RPN & rpn);
/// Returns true and stores ANNExpr if the query has valid WHERE section
static bool matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info);
static bool matchRPNWhere(RPN & rpn, Info & info);
/// Returns true and stores ANNExpr if the query has valid ORDERBY section
static bool matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info);
static bool matchRPNOrderBy(RPN & rpn, Info & info);
/// Returns true and stores Length if we have valid LIMIT clause in query
static bool matchRPNLimit(RPNElement & rpn, UInt64 & limit);
/* Matches dist function, reference vector, column name */
static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info);
/// Matches dist function, reference vector, column name
static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, Info & info);
/// Gets float or int from AST node
static float getFloatOrIntLiteralOrPanic(const RPN::iterator& iter);
@ -203,7 +159,7 @@ private:
Block block_with_constants;
/// true if we have one of two supported query types
std::optional<ApproximateNearestNeighborInformation> query_information;
std::optional<Info> query_information;
// Get from settings ANNIndex parameters
const UInt64 index_granularity;
@ -214,13 +170,4 @@ private:
bool index_is_useful = false;
};
/// Common interface of ANN indexes.
class IMergeTreeIndexConditionApproximateNearestNeighbor : public IMergeTreeIndexCondition
{
public:
/// Returns vector of indexes of ranges in granule which are useful for query.
virtual std::vector<size_t> getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const = 0;
};
}

View File

@ -164,6 +164,9 @@ endif()
if (TARGET ch_contrib::bcrypt)
set(USE_BCRYPT 1)
endif()
if (TARGET ch_contrib::usearch)
set(USE_USEARCH 1)
endif()
if (TARGET ch_contrib::ssh)
set(USE_SSH 1)
endif()

View File

@ -1,17 +1,5 @@
Issue #52258: Empty Arrays or Arrays with default values are rejected
- Annoy
- Usearch
It is possible to create parts with different Array vector sizes but there will be an error at query time
- Annoy
- Usearch
Correctness of index with > 1 mark
- Annoy
1 [1,0] 0
9000 [9000,0] 0
1 (1,0) 0
9000 (9000,0) 0
- Usearch
1 [1,0] 0
9000 [9000,0] 0
1 (1,0) 0
9000 (9000,0) 0

View File

@ -1,61 +1,22 @@
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database, no-asan
-- Tags: no-fasttest, no-ordinary-database
-- Tests vector search in ClickHouse, i.e. Annoy and Usearch indexes. Both index types share similarities in implementation and usage,
-- therefore they are tested in a single file.
-- This file contains tests for various bugs and special cases
SET allow_experimental_annoy_index = 1;
SET allow_experimental_usearch_index = 1;
-- Tests various bugs and special cases for vector indexes.
SET allow_experimental_vector_similarity_index = 1;
SET enable_analyzer = 1; -- 0 vs. 1 produce slightly different error codes, make it future-proof
DROP TABLE IF EXISTS tab;
SELECT 'Issue #52258: Empty Arrays or Arrays with default values are rejected';
SELECT '- Annoy';
CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree() ORDER BY (id);
CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree() ORDER BY id;
INSERT INTO tab VALUES (1, []); -- { serverError INCORRECT_DATA }
INSERT INTO tab (id) VALUES (1); -- { serverError INCORRECT_DATA }
DROP TABLE tab;
CREATE TABLE tab (id UInt64, vec Tuple(Float32, Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree() ORDER BY (id);
INSERT INTO tab (id) VALUES (1); -- works fine, takes on default tuple (0.0, 0.0)
DROP TABLE tab;
SELECT '- Usearch';
CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree() ORDER BY (id);
INSERT INTO tab VALUES (1, []); -- { serverError INCORRECT_DATA }
INSERT INTO tab (id) VALUES (1); -- { serverError INCORRECT_DATA }
DROP TABLE tab;
CREATE TABLE tab (id UInt64, vec Tuple(Float32, Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree() ORDER BY (id);
INSERT INTO tab (id) VALUES (1); -- works fine, takes on default tuple (0.0, 0.0)
DROP TABLE tab;
SELECT 'It is possible to create parts with different Array vector sizes but there will be an error at query time';
SELECT '- Annoy';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id;
SYSTEM STOP MERGES tab;
INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2]);
INSERT INTO tab values (2, [2.2, 2.3, 2.4]) (3, [3.1, 3.2, 3.3]);
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3; -- { serverError SIZES_OF_ARRAYS_DONT_MATCH }
DROP TABLE tab;
SELECT '- Usearch';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id;
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id;
SYSTEM STOP MERGES tab;
INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2]);
INSERT INTO tab values (2, [2.2, 2.3, 2.4]) (3, [3.1, 3.2, 3.3]);
@ -70,9 +31,7 @@ DROP TABLE tab;
SELECT 'Correctness of index with > 1 mark';
SELECT '- Annoy';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity=8192; -- disable adaptive granularity due to bug
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity = 8192; -- disable adaptive granularity due to bug
INSERT INTO tab SELECT number, [toFloat32(number), 0.0] from numbers(10000);
WITH [1.0, 0.0] AS reference_vec
@ -88,58 +47,3 @@ ORDER BY L2Distance(vec, reference_vec)
LIMIT 1;
DROP TABLE tab;
-- same, but with Tuples
CREATE TABLE tab(id Int32, vec Tuple(Float32, Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity=8192; -- disable adaptive granularity due to bug
INSERT INTO tab SELECT number, (toFloat32(number), 0.0) from numbers(10000);
WITH (1.0, 0.0) AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 1;
WITH (9000.0, 0.0) AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 1;
DROP TABLE tab;
SELECT '- Usearch';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity=8192; -- disable adaptive granularity due to bug
INSERT INTO tab SELECT number, [toFloat32(number), 0.0] from numbers(10000);
WITH [1.0, 0.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 1;
WITH [9000.0, 0.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 1;
DROP TABLE tab;
-- same, but with Tuples
CREATE TABLE tab(id Int32, vec Tuple(Float32, Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity=8192; -- disable adaptive granularity due to bug
INSERT INTO tab SELECT number, (toFloat32(number), 0.0) from numbers(10000);
WITH (1.0, 0.0) AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 1;
WITH (9000.0, 0.0) AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 1;
DROP TABLE tab;

View File

@ -1,7 +1,2 @@
Test the default index granularity for vector search indexes (CREATE TABLE AND ALTER TABLE), should be 100 million for Annoy and USearch
- Annoy
100000000
100000000
- Usearch
100000000
100000000

View File

@ -1,35 +1,18 @@
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database, no-asan
-- Tags: no-fasttest, no-ordinary-database
-- Tests vector search in ClickHouse, i.e. Annoy and Usearch indexes. Both index types share similarities in implementation and usage,
-- therefore they are tested in a single file.
-- Tests that vector search indexes use a (non-standard) index granularity of 100 mio by default.
-- This file contains tests for the non-standard default granularity of vector search indexes.
SET allow_experimental_annoy_index = 1;
SET allow_experimental_usearch_index = 1;
SELECT 'Test the default index granularity for vector search indexes (CREATE TABLE AND ALTER TABLE), should be 100 million for Annoy and USearch';
SELECT '- Annoy';
SET allow_experimental_vector_similarity_index = 1;
-- After CREATE TABLE
DROP TABLE IF EXISTS tab;
CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE annoy) ENGINE=MergeTree ORDER BY id;
SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx';
DROP TABLE tab;
CREATE TABLE tab (id Int32, vec Array(Float32)) ENGINE=MergeTree ORDER BY id;
ALTER TABLE tab ADD INDEX idx(vec) TYPE annoy;
SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx';
SELECT '- Usearch';
DROP TABLE tab;
CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE usearch) ENGINE=MergeTree ORDER BY id;
SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx';
DROP TABLE tab;
CREATE TABLE tab (id Int32, vec Array(Float32)) ENGINE=MergeTree ORDER BY id;
ALTER TABLE tab ADD INDEX idx(vec) TYPE usearch;
CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id;
SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx';
-- After ALTER TABLE
DROP TABLE tab;
CREATE TABLE tab (id Int32, vec Array(Float32)) ENGINE = MergeTree ORDER BY id;
ALTER TABLE tab ADD INDEX idx(vec) TYPE vector_similarity('hnsw', 'L2Distance');
SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx';
DROP TABLE tab;

View File

@ -0,0 +1,3 @@
5 [0,2] 0
6 [0,2.1] 0.09999990463256836
7 [0,2.2] 0.20000004768371582

View File

@ -0,0 +1,20 @@
-- Tags: no-fasttest, no-ordinary-database
-- Tests that vector similarity indexes can be detached/attached.
SET allow_experimental_vector_similarity_index = 1;
DROP TABLE IF EXISTS tab;
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192;
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]);
DETACH TABLE tab SYNC;
ATTACH TABLE tab;
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
DROP TABLE tab;

View File

@ -1,8 +1,10 @@
At most two index arguments
1st argument (distance function) must be String
Rejects unsupported distance functions
2nd argument (Annoy: number of trees, USearch: scalar kind) must be UInt64 (Annoy) / String (Usearch)
Rejects unsupported scalar kinds (only Usearch)
Two or six index arguments
1st argument (method) must be String and hnsw
2nd argument (distance function) must be String and L2Distance or cosineDistance
3nd argument (quantization), if given, must be String and f32, f16, ...
4nd argument (M), if given, must be UInt64 and > 1
5nd argument (ef_construction), if given, must be UInt64 and > 0
6nd argument (ef_search), if given, must be UInt64 and > 0
Must be created on single column
Must be created on Array(Float32) or Tuple(Float32, Float, ...) columns
Must be created on Array(Float32) columns
Rejects INSERTs of Arrays with different sizes

View File

@ -1,63 +1,51 @@
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database, no-asan
-- Tags: no-fasttest, no-ordinary-database
-- Tests vector search in ClickHouse, i.e. Annoy and Usearch indexes. Both index types share similarities in implementation and usage,
-- therefore they are tested in a single file.
-- Tests that various conditions are checked during creation of vector search indexes.
-- This file tests that various conditions are checked during creation of vector search indexes.
SET allow_experimental_annoy_index = 1;
SET allow_experimental_usearch_index = 1;
SET allow_experimental_vector_similarity_index = 1;
DROP TABLE IF EXISTS tab;
SELECT 'At most two index arguments';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('too', 'many', 'args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
SELECT 'Two or six index arguments';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant_have_one_arg')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'three_args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'more', 'than', 'six', 'args', '!')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
SELECT '1st argument (distance function) must be String';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
SELECT '1st argument (method) must be String and hnsw';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity(3, 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('not_hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
SELECT 'Rejects unsupported distance functions';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy('invalidDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('invalidDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
SELECT '2nd argument (distance function) must be String and L2Distance or cosineDistance';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'invalid_distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
SELECT '2nd argument (Annoy: number of trees, USearch: scalar kind) must be UInt64 (Annoy) / String (Usearch)';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy('L2Distance', 'not an UInt64')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
SELECT 'Rejects unsupported scalar kinds (only Usearch)';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('L2Distance', 'invalidKind')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
SELECT '3nd argument (quantization), if given, must be String and f32, f16, ...';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 1, 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'invalid', 2, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
SELECT '4nd argument (M), if given, must be UInt64 and > 1';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 'invalid', 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
SELECT '5nd argument (ef_construction), if given, must be UInt64 and > 0';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 'invalid', 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 0, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
SELECT '6nd argument (ef_search), if given, must be UInt64 and > 0';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 1, 'invalid')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 1, 0)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
SELECT 'Must be created on single column';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS }
SELECT 'Must be created on Array(Float32) or Tuple(Float32, Float, ...) columns';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS }
SELECT 'Must be created on Array(Float32) columns';
SET allow_suspicious_low_cardinality_types = 1;
CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Tuple(Float64), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Tuple(Float64), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec UInt64, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
SELECT 'Rejects INSERTs of Arrays with different sizes';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id;
INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA }
DROP TABLE tab;
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id;
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id;
INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA }
DROP TABLE tab;

View File

@ -0,0 +1,2 @@
Annoy
Usearch

View File

@ -0,0 +1,43 @@
-- Indexes of type 'annoy' or 'usearch' are no longer supported.
-- Test what happens when ClickHouse encounters tables with the old index type.
DROP TABLE IF EXISTS tab;
SELECT 'Annoy';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX vec_idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id;
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); -- { serverError ILLEGAL_INDEX }
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
-- (*) The search succeeds because the index contains no data (i.e. some shortcut)
-- If it had data (can't really test in SQL tests ...), this statement would also return an error, trust me.
-- Detach and attach should work.
DETACH TABLE tab;
ATTACH TABLE tab;
DROP TABLE tab;
SELECT 'Usearch';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX vec_idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id;
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); -- { serverError ILLEGAL_INDEX }
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
-- see above: (*)
-- Detach and attach should work.
DETACH TABLE tab;
ATTACH TABLE tab;
DROP TABLE tab;

View File

@ -0,0 +1,14 @@
-- Tags: no-fasttest, no-ordinary-database
-- Tests that multiple vector similarity indexes can be created on the same column (even if that makes no sense)
SET allow_experimental_vector_similarity_index = 1;
DROP TABLE IF EXISTS tab;
CREATE TABLE tab (id Int32, vec Array(Float32), PRIMARY KEY id, INDEX vec_idx(vec) TYPE vector_similarity('hnsw', 'L2Distance'));
ALTER TABLE tab ADD INDEX idx(vec) TYPE minmax;
ALTER TABLE tab ADD INDEX vec_idx1(vec) TYPE vector_similarity('hnsw', 'cosineDistance');
ALTER TABLE tab ADD INDEX vec_idx2(vec) TYPE vector_similarity('hnsw', 'L2Distance'); -- silly but creating the same index also works for non-vector indexes ...
DROP TABLE tab;

View File

@ -1,41 +1,14 @@
ARRAY, 10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block
- Annoy: WHERE-type
10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block
- ORDER-BY-type
5 [0,2] 0
6 [0,2.1] 0.09999990463256836
7 [0,2.2] 0.20000004768371582
- Annoy: ORDER-BY-type
5 [0,2] 0
6 [0,2.1] 0.09999990463256836
7 [0,2.2] 0.20000004768371582
- Usearch: WHERE-type
5 [0,2] 0
6 [0,2.1] 0.09999990463256836
7 [0,2.2] 0.20000004768371582
- Usearch: ORDER-BY-type
5 [0,2] 0
6 [0,2.1] 0.09999990463256836
7 [0,2.2] 0.20000004768371582
- Annoy: WHERE-type, EXPLAIN
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
Expression
ReadFromMergeTree (default.tab_annoy)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 1/1
Skip
Name: idx
Description: annoy GRANULARITY 100000000
Parts: 1/1
Granules: 1/1
- Annoy: ORDER-BY-type, EXPLAIN
- ORDER-BY-type, EXPLAIN
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab_annoy)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
@ -43,78 +16,20 @@ Expression (Projection)
Granules: 1/1
Skip
Name: idx
Description: annoy GRANULARITY 100000000
Description: vector_similarity GRANULARITY 100000000
Parts: 1/1
Granules: 1/1
- Usearch: WHERE-type, EXPLAIN
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
Expression
ReadFromMergeTree (default.tab_usearch)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 1/1
Skip
Name: idx
Description: usearch GRANULARITY 100000000
Parts: 1/1
Granules: 1/1
- Usearch: ORDER-BY-type, EXPLAIN
12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block
- ORDER-BY-type
6 [0,2] 0
7 [0,2.1] 0.09999990463256836
8 [0,2.2] 0.20000004768371582
- ORDER-BY-type, EXPLAIN
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab_usearch)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 1/1
Skip
Name: idx
Description: usearch GRANULARITY 100000000
Parts: 1/1
Granules: 1/1
ARRAY vectors, 12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block
- Annoy: WHERE-type
6 [0,2] 0
7 [0,2.1] 0.09999990463256836
8 [0,2.2] 0.20000004768371582
- Annoy: ORDER-BY-type
6 [0,2] 0
7 [0,2.1] 0.09999990463256836
8 [0,2.2] 0.20000004768371582
- Usearch: WHERE-type
6 [0,2] 0
7 [0,2.1] 0.09999990463256836
8 [0,2.2] 0.20000004768371582
- Usearch: ORDER-BY-type
6 [0,2] 0
7 [0,2.1] 0.09999990463256836
8 [0,2.2] 0.20000004768371582
- Annoy: WHERE-type, EXPLAIN
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
Expression
ReadFromMergeTree (default.tab_annoy)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: idx
Description: annoy GRANULARITY 2
Parts: 1/1
Granules: 1/4
- Annoy: ORDER-BY-type, EXPLAIN
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab_annoy)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
@ -122,77 +37,20 @@ Expression (Projection)
Granules: 4/4
Skip
Name: idx
Description: annoy GRANULARITY 2
Description: vector_similarity GRANULARITY 2
Parts: 1/1
Granules: 2/4
- Usearch: WHERE-type, EXPLAIN
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
Expression
ReadFromMergeTree (default.tab_usearch)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: idx
Description: usearch GRANULARITY 2
Parts: 1/1
Granules: 1/4
- Usearch: ORDER-BY-type, EXPLAIN
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab_usearch)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: idx
Description: usearch GRANULARITY 2
Parts: 1/1
Granules: 2/4
TUPLE vectors and special cases
- Annoy: WHERE-type
0 (4.6,2.3) 0.5527864045000421
1 (2,3.2) 0.15200169244542905
2 (4.2,3.4) 0.37080174340866845
- Annoy: ORDER-BY-type
6 (1,9.3) 0.005731362878640178
1 (2,3.2) 0.15200169244542905
7 (5.5,4.7) 0.3503476876550442
- Usearch: WHERE-type
0 (4.6,2.3) 0.5527864045000421
1 (2,3.2) 0.15200169244542905
2 (4.2,3.4) 0.37080174340866845
- Usearch: ORDER-BY-type
6 (1,9.3) 0.005731362878640178
1 (2,3.2) 0.15200169244542905
7 (5.5,4.7) 0.3503476876550442
- Special case: MaximumDistance is negative
- Special case: MaximumDistance is negative
- Special case: setting "annoy_index_search_k_nodes"
Special cases
- ORDER-BY-type
6 [1,9.3] 0.005731362878640178
1 [2,3.2] 0.15200169244542905
7 [5.5,4.7] 0.3503476876550442
- Special case: setting "max_limit_for_ann_queries"
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab_annoy)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
- Special case: setting "max_limit_for_ann_queries"
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab_usearch)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true

View File

@ -1,238 +1,80 @@
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database, no-asan
-- Tags: no-fasttest, no-ordinary-database
-- Tests vector search in ClickHouse, i.e. Annoy and Usearch indexes. Both index types share similarities in implementation and usage,
-- therefore they are tested in a single file.
-- Tests various simple approximate nearest neighborhood (ANN) queries that utilize vector search indexes.
-- This file tests various simple approximate nearest neighborhood (ANN) queries that utilize vector search indexes.
SET allow_experimental_vector_similarity_index = 1;
SET allow_experimental_annoy_index = 1;
SET allow_experimental_usearch_index = 1;
SET enable_analyzer = 0;
SELECT 'ARRAY, 10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block';
SELECT '10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block';
DROP TABLE IF EXISTS tab_annoy;
DROP TABLE IF EXISTS tab_usearch;
DROP TABLE IF EXISTS tab;
CREATE TABLE tab_annoy(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192;
INSERT INTO tab_annoy VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]);
CREATE TABLE tab_usearch(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192;
INSERT INTO tab_usearch VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]);
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192;
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]);
SELECT '- Annoy: WHERE-type';
SELECT '- ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_annoy
WHERE L2Distance(vec, reference_vec) < 1.0
LIMIT 3;
SELECT '- Annoy: ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_annoy
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
SELECT '- Usearch: WHERE-type';
SELECT '- ORDER-BY-type, EXPLAIN';
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_usearch
WHERE L2Distance(vec, reference_vec) < 1.0
LIMIT 3;
SELECT '- Usearch: ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_usearch
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
SELECT '- Annoy: WHERE-type, EXPLAIN';
EXPLAIN indexes=1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_annoy
WHERE L2Distance(vec, reference_vec) < 1.0
LIMIT 3;
DROP TABLE tab;
SELECT '- Annoy: ORDER-BY-type, EXPLAIN';
EXPLAIN indexes=1
SELECT '12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]);
SELECT '- ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_annoy
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
SELECT '- Usearch: WHERE-type, EXPLAIN';
EXPLAIN indexes=1
SELECT '- ORDER-BY-type, EXPLAIN';
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_usearch
WHERE L2Distance(vec, reference_vec) < 1.0
LIMIT 3;
SELECT '- Usearch: ORDER-BY-type, EXPLAIN';
EXPLAIN indexes=1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_usearch
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
DROP TABLE tab_annoy;
DROP TABLE tab_usearch;
DROP TABLE tab;
SELECT 'ARRAY vectors, 12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block';
SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen.
-- Test with non-default metric, M, ef_construction, ef_search
CREATE TABLE tab_annoy(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab_annoy VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]);
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99, 66) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
CREATE TABLE tab_usearch(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab_usearch VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]);
SELECT '- Annoy: WHERE-type';
SELECT '- ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_annoy
WHERE L2Distance(vec, reference_vec) < 1.0
LIMIT 3;
SELECT '- Annoy: ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_annoy
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
SELECT '- Usearch: WHERE-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_usearch
WHERE L2Distance(vec, reference_vec) < 1.0
LIMIT 3;
SELECT '- Usearch: ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_usearch
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
SELECT '- Annoy: WHERE-type, EXPLAIN';
EXPLAIN indexes=1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_annoy
WHERE L2Distance(vec, reference_vec) < 1.0
LIMIT 3;
SELECT '- Annoy: ORDER-BY-type, EXPLAIN';
EXPLAIN indexes=1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_annoy
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
SELECT '- Usearch: WHERE-type, EXPLAIN';
EXPLAIN indexes=1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_usearch
WHERE L2Distance(vec, reference_vec) < 1.0
LIMIT 3;
SELECT '- Usearch: ORDER-BY-type, EXPLAIN';
EXPLAIN indexes=1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_usearch
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
DROP TABLE tab_annoy;
DROP TABLE tab_usearch;
SELECT 'TUPLE vectors and special cases';
-- Not a systematic test, just to check that no bad things happen.
-- Just for jun, use metric = 'cosineDistance' (Annoy/Usearch), tree_count = 200 (Annoy), scalarKind = 'f64' (Usearch)
CREATE TABLE tab_annoy(id Int32, vec Tuple(Float32, Float32), INDEX idx vec TYPE annoy('cosineDistance', 200) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab_annoy VALUES (0, (4.6, 2.3)), (1, (2.0, 3.2)), (2, (4.2, 3.4)), (3, (5.3, 2.9)), (4, (2.4, 5.2)), (5, (5.3, 2.3)), (6, (1.0, 9.3)), (7, (5.5, 4.7)), (8, (6.4, 3.5)), (9, (5.3, 2.5)), (10, (6.4, 3.4)), (11, (6.4, 3.2));
CREATE TABLE tab_usearch(id Int32, vec Tuple(Float32, Float32), INDEX idx vec TYPE usearch('cosineDistance', 'f64') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab_usearch VALUES (0, (4.6, 2.3)), (1, (2.0, 3.2)), (2, (4.2, 3.4)), (3, (5.3, 2.9)), (4, (2.4, 5.2)), (5, (5.3, 2.3)), (6, (1.0, 9.3)), (7, (5.5, 4.7)), (8, (6.4, 3.5)), (9, (5.3, 2.5)), (10, (6.4, 3.4)), (11, (6.4, 3.2));
SELECT '- Annoy: WHERE-type';
WITH (0.0, 2.0) AS reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab_annoy
WHERE cosineDistance(vec, reference_vec) < 1.0
LIMIT 3;
SELECT '- Annoy: ORDER-BY-type';
WITH (0.0, 2.0) AS reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab_annoy
FROM tab
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3;
SELECT '- Usearch: WHERE-type';
WITH (0.0, 2.0) AS reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab_usearch
WHERE cosineDistance(vec, reference_vec) < 1.0
LIMIT 3;
SELECT '- Usearch: ORDER-BY-type';
WITH (0.0, 2.0) AS reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab_usearch
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3;
SELECT '- Special case: MaximumDistance is negative';
WITH (0.0, 2.0) as reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab_annoy
WHERE cosineDistance(vec, reference_vec) < -1.0
LIMIT 3; -- { serverError INCORRECT_QUERY }
SELECT '- Special case: MaximumDistance is negative';
WITH (0.0, 2.0) as reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab_usearch
WHERE cosineDistance(vec, reference_vec) < -1.0
LIMIT 3; -- { serverError INCORRECT_QUERY }
SELECT '- Special case: setting "annoy_index_search_k_nodes"';
WITH (0.0, 2.0) as reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab_annoy
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3
SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results
SELECT '- Special case: setting "max_limit_for_ann_queries"';
EXPLAIN indexes=1
WITH (0.0, 2.0) as reference_vec
WITH [0.0, 2.0] as reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab_annoy
FROM tab
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3
SETTINGS max_limit_for_ann_queries=2; -- LIMIT 3 > 2 --> don't use the ann index
SETTINGS max_limit_for_ann_queries = 2; -- LIMIT 3 > 2 --> don't use the ann index
SELECT '- Special case: setting "max_limit_for_ann_queries"';
EXPLAIN indexes=1
WITH (0.0, 2.0) as reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab_usearch
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3
SETTINGS max_limit_for_ann_queries=2; -- LIMIT 3 > 2 --> don't use the ann index
DROP TABLE tab_annoy;
DROP TABLE tab_usearch;
DROP TABLE tab;

View File

@ -0,0 +1,23 @@
-- Tags: no-fasttest, no-ordinary-database
SET allow_experimental_vector_similarity_index = 1;
-- Tests that quoted and unquoted parameters can be passed to vector search indexes.
DROP TABLE IF EXISTS tab1;
DROP TABLE IF EXISTS tab2;
CREATE TABLE tab1 (id Int32, vec Array(Float32), PRIMARY KEY id, INDEX vec_idx(vec) TYPE vector_similarity('hnsw', 'L2Distance'));
CREATE TABLE tab2 (id Int32, vec Array(Float32), PRIMARY KEY id, INDEX vec_idx(vec) TYPE vector_similarity(hnsw, L2Distance));
DROP TABLE tab1;
DROP TABLE tab2;
CREATE TABLE tab1 (id Int32, vec Array(Float32), PRIMARY KEY id);
CREATE TABLE tab2 (id Int32, vec Array(Float32), PRIMARY KEY id);
ALTER TABLE tab1 ADD INDEX idx1(vec) TYPE vector_similarity('hnsw', 'L2Distance');
ALTER TABLE tab2 ADD INDEX idx2(vec) TYPE vector_similarity(hnsw, L2Distance);
DROP TABLE tab1;
DROP TABLE tab2;