diff --git a/.gitmodules b/.gitmodules
index 7fdfb1103c5..0a66031de8d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -230,9 +230,6 @@
[submodule "contrib/minizip-ng"]
path = contrib/minizip-ng
url = https://github.com/zlib-ng/minizip-ng
-[submodule "contrib/annoy"]
- path = contrib/annoy
- url = https://github.com/ClickHouse/annoy
[submodule "contrib/qpl"]
path = contrib/qpl
url = https://github.com/intel/qpl
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index b33e7083e32..dc2ad2a3150 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -205,9 +205,8 @@ add_contrib (morton-nd-cmake morton-nd)
if (ARCH_S390X)
add_contrib(crc32-s390x-cmake crc32-s390x)
endif()
-add_contrib (annoy-cmake annoy)
-option(ENABLE_USEARCH "Enable USearch (Approximate Neighborhood Search, HNSW) support" ${ENABLE_LIBRARIES})
+option(ENABLE_USEARCH "Enable USearch" ${ENABLE_LIBRARIES})
if (ENABLE_USEARCH)
add_contrib (FP16-cmake FP16)
add_contrib (robin-map-cmake robin-map)
diff --git a/contrib/annoy b/contrib/annoy
deleted file mode 160000
index f2ac8e7b48f..00000000000
--- a/contrib/annoy
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit f2ac8e7b48f9a9cf676d3b58286e5455aba8e956
diff --git a/contrib/annoy-cmake/CMakeLists.txt b/contrib/annoy-cmake/CMakeLists.txt
deleted file mode 100644
index bdef7d92132..00000000000
--- a/contrib/annoy-cmake/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-option(ENABLE_ANNOY "Enable Annoy index support" ${ENABLE_LIBRARIES})
-
-# Annoy index should be disabled with undefined sanitizer. Because of memory storage optimizations
-# (https://github.com/ClickHouse/annoy/blob/9d8a603a4cd252448589e84c9846f94368d5a289/src/annoylib.h#L442-L463)
-# UBSan fails and leads to crash. Simmilar issue is already opened in Annoy repo
-# https://github.com/spotify/annoy/issues/456
-# Problem with aligment can lead to errors like
-# (https://stackoverflow.com/questions/46790550/c-undefined-behavior-strict-aliasing-rule-or-incorrect-alignment)
-# or will lead to crash on arm https://developer.arm.com/documentation/ka003038/latest
-# This issues should be resolved before annoy became non-experimental (--> setting "allow_experimental_annoy_index")
-if ((NOT ENABLE_ANNOY) OR (SANITIZE STREQUAL "undefined") OR (ARCH_AARCH64))
- message (STATUS "Not using annoy")
- return()
-endif()
-
-set(ANNOY_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/annoy")
-set(ANNOY_SOURCE_DIR "${ANNOY_PROJECT_DIR}/src")
-
-add_library(_annoy INTERFACE)
-target_include_directories(_annoy SYSTEM INTERFACE ${ANNOY_SOURCE_DIR})
-
-add_library(ch_contrib::annoy ALIAS _annoy)
-target_compile_definitions(_annoy INTERFACE ENABLE_ANNOY)
-target_compile_definitions(_annoy INTERFACE ANNOYLIB_MULTITHREADED_BUILD)
diff --git a/contrib/usearch-cmake/CMakeLists.txt b/contrib/usearch-cmake/CMakeLists.txt
index 29fbe57106c..6be622275ae 100644
--- a/contrib/usearch-cmake/CMakeLists.txt
+++ b/contrib/usearch-cmake/CMakeLists.txt
@@ -1,9 +1,7 @@
-set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
-set(USEARCH_SOURCE_DIR "${USEARCH_PROJECT_DIR}/include")
-
set(FP16_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/FP16")
set(ROBIN_MAP_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/robin-map")
-set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD-map")
+set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")
+set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
add_library(_usearch INTERFACE)
@@ -11,7 +9,6 @@ target_include_directories(_usearch SYSTEM INTERFACE
${FP16_PROJECT_DIR}/include
${ROBIN_MAP_PROJECT_DIR}/include
${SIMSIMD_PROJECT_DIR}/include
- ${USEARCH_SOURCE_DIR})
+ ${USEARCH_PROJECT_DIR}/include)
add_library(ch_contrib::usearch ALIAS _usearch)
-target_compile_definitions(_usearch INTERFACE ENABLE_USEARCH)
diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md
index 5a81313f62e..e73d6f07a32 100644
--- a/docs/en/engines/table-engines/mergetree-family/annindexes.md
+++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md
@@ -17,7 +17,7 @@ In terms of SQL, the nearest neighborhood problem can be expressed as follows:
``` sql
SELECT *
-FROM table_with_ann_index
+FROM table
ORDER BY Distance(vectors, Point)
LIMIT N
```
@@ -27,75 +27,109 @@ Function `Distance` computes the distance between two vectors. Often, the Euclid
distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17,
0.33, ...)`, and `N` limits the number of search results.
-An alternative formulation of the nearest neighborhood search problem looks as follows:
+This query returns the top-`N` closest points to the reference point. Parameter `N` limits the number of returned values, which is useful for
+situations where a maximum distance threshold is difficult to determine in advance.
-``` sql
-SELECT *
-FROM table_with_ann_index
-WHERE Distance(vectors, Point) < MaxDistance
-LIMIT N
-```
-
-While the first query returns the top-`N` closest points to the reference point, the second query returns all points closer to the reference
-point than a maximally allowed radius `MaxDistance`. Parameter `N` limits the number of returned values which is useful for situations where
-`MaxDistance` is difficult to determine in advance.
-
-With brute force search, both queries are expensive (linear in the number of points) because the distance between all points in `vectors` and
+With brute force search, the query is expensive (linear in the number of points) because the distance between all points in `vectors` and
`Point` must be computed. To speed this process up, Approximate Nearest Neighbor Search Indexes (ANN indexes) store a compact representation
of the search space (using clustering, search trees, etc.) which allows to compute an approximate answer much quicker (in sub-linear time).
-# Creating and Using ANN Indexes {#creating_using_ann_indexes}
+# Creating and Using Vector Similarity Indexes
-Syntax to create an ANN index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column:
+Syntax to create a vector similarity index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column:
```sql
-CREATE TABLE table_with_ann_index
+CREATE TABLE table
(
- `id` Int64,
- `vectors` Array(Float32),
- INDEX [ann_index_name vectors TYPE [ann_index_type]([ann_index_parameters]) [GRANULARITY [N]]
+ id Int64,
+ vectors Array(Float32),
+ INDEX index_name vectors TYPE vector_similarity(method, distance_function[, quantization, connectivity, expansion_add, expansion_search]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```
+Parameters:
+- `method`: Currently only `hnsw` is supported.
+- `distance_function`: either `L2Distance` (the [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) - the length of a
+ line between two points in Euclidean space), or `cosineDistance` (the [cosine
+ distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance) - the angle between two non-zero vectors).
+- `quantization`: either `f32`, `f16`, or `i8` for storing the vector with reduced precision (optional, default: `f32`)
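+- `connectivity`: the number of neighbors per graph node, i.e. HNSW parameter `m` (optional, default: 16)
+- `expansion_add`: the size of the candidate list used while building the index, i.e. HNSW parameter `ef_construction` (optional, default: 128)
+- `expansion_search`: the size of the candidate list used while searching the index, i.e. HNSW parameter `ef_search` (optional, default: 64)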
+
+Example:
+
+```sql
+CREATE TABLE table
+(
+ id Int64,
+ vectors Array(Float32),
+ INDEX idx vectors TYPE vector_similarity('hnsw', 'L2Distance') -- Alternative syntax: TYPE vector_similarity(hnsw, L2Distance)
+)
+ENGINE = MergeTree
+ORDER BY id;
+```
+
+Vector similarity indexes are based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
+algorithm](https://arxiv.org/abs/1603.09320), i.e., a hierarchical graph where each point represents a vector and the edges represent
+similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the
+overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors, that are expensive
+to load and compare. The library also has several hardware-specific SIMD optimizations to accelerate further distance computations on modern
+Arm (NEON and SVE) and x86 (AVX2 and AVX-512) CPUs and OS-specific optimizations to allow efficient navigation around immutable persistent
+files, without loading them into RAM.
+
+Vector similarity indexes are currently experimental. To use them, you first need to `SET allow_experimental_vector_similarity_index = 1`.
+
+Vector similarity indexes currently support two distance functions:
+- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
+ ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
+- `cosineDistance`, derived from the cosine similarity, is one minus the cosine of the angle between two (non-zero) vectors
+ ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
+
+Vector similarity indexes allow storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16` or `i8`.
+If no scalar kind was specified during index creation, `f16` is used as default.
+
+For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
+distance function was specified during index creation, `L2Distance` is used as default.
+
+:::note
+All arrays must have the same length. To avoid errors, you can use a
+[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK
+length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported.
+:::
+
+:::note
+The vector similarity index currently does not work with per-table, non-default `index_granularity` settings (see
+[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
+:::
+
ANN indexes are built during column insertion and merge. As a result, `INSERT` and `OPTIMIZE` statements will be slower than for ordinary
tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively when are far more read requests than write
requests.
-ANN indexes support two types of queries:
-
-- ORDER BY queries:
+ANN indexes support these queries:
``` sql
SELECT *
- FROM table_with_ann_index
+ FROM table
[WHERE ...]
ORDER BY Distance(vectors, Point)
LIMIT N
```
-- WHERE queries:
-
- ``` sql
- SELECT *
- FROM table_with_ann_index
- WHERE Distance(vectors, Point) < MaxDistance
- LIMIT N
- ```
-
:::tip
To avoid writing out large vectors, you can use [query
parameters](/docs/en/interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g.
```bash
-clickhouse-client --param_vec='hello' --query="SELECT * FROM table_with_ann_index WHERE L2Distance(vectors, {vec: Array(Float32)}) < 1.0"
+clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(vectors, {vec: Array(Float32)}) < 1.0"
```
:::
-**Restrictions**: Queries that contain both a `WHERE Distance(vectors, Point) < MaxDistance` and an `ORDER BY Distance(vectors, Point)`
-clause cannot use ANN indexes. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries
-without `LIMIT` clause cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting
+**Restrictions**: Approximate algorithms used to determine the nearest neighbors require a limit, hence queries without a `LIMIT` clause
+cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than the setting
`max_limit_for_ann_queries` (default: 1 million rows). This is a safeguard to prevent large memory allocations by external libraries for
approximate neighbor search.
@@ -122,128 +156,3 @@ brute-force distance calculation over all rows of the granules. With a small `GR
equally good, only the processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall
back to a smaller `GRANULARITY` values only in case of problems like excessive memory consumption of the ANN structures. If no `GRANULARITY`
was specified for ANN indexes, the default value is 100 million.
-
-
-# Available ANN Indexes {#available_ann_indexes}
-
-- [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy)
-
-- [USearch](/docs/en/engines/table-engines/mergetree-family/annindexes.md#usearch-usearch)
-
-## Annoy {#annoy}
-
-Annoy indexes are currently experimental, to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently
-disabled on ARM due to memory safety problems with the algorithm.
-
-This type of ANN index is based on the [Annoy library](https://github.com/spotify/annoy) which recursively divides the space into random
-linear surfaces (lines in 2D, planes in 3D etc.).
-
-
-
-
-
-Syntax to create an Annoy index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column:
-
-```sql
-CREATE TABLE table_with_annoy_index
-(
- id Int64,
- vectors Array(Float32),
- INDEX [ann_index_name] vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N]
-)
-ENGINE = MergeTree
-ORDER BY id;
-```
-
-Annoy currently supports two distance functions:
-- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
- ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
-- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors
- ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
-
-For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
-distance function was specified during index creation, `L2Distance` is used as default.
-
-Parameter `NumTrees` is the number of trees which the algorithm creates (default if not specified: 100). Higher values of `NumTree` mean
-more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes.
-
-:::note
-All arrays must have same length. To avoid errors, you can use a
-[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK
-length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported.
-:::
-
-The creation of Annoy indexes (whenever a new part is build, e.g. at the end of a merge) is a relatively slow process. You can increase
-setting `max_threads_for_annoy_index_creation` (default: 4) which controls how many threads are used to create an Annoy index. Please be
-careful with this setting, it is possible that multiple indexes are created in parallel in which case there can be overparallelization.
-
-Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger
-values mean more accurate results at the cost of longer query runtime:
-
-```sql
-SELECT *
-FROM table_name
-ORDER BY L2Distance(vectors, Point)
-LIMIT N
-SETTINGS annoy_index_search_k_nodes=100;
-```
-
-:::note
-The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
-[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
-:::
-
-## USearch {#usearch}
-
-This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
-algorithm](https://arxiv.org/abs/1603.09320), i.e., builds a hierarchical graph where each point represents a vector and the edges represent
-similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the
-overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors,
-that are expensive to load and compare. The library also has several hardware-specific SIMD optimizations to accelerate further
-distance computations on modern Arm (NEON and SVE) and x86 (AVX2 and AVX-512) CPUs and OS-specific optimizations to allow efficient
-navigation around immutable persistent files, without loading them into RAM.
-
-
-
-
-
-Syntax to create an USearch index over an [Array](../../../sql-reference/data-types/array.md) column:
-
-```sql
-CREATE TABLE table_with_usearch_index
-(
- id Int64,
- vectors Array(Float32),
- INDEX [ann_index_name] vectors TYPE usearch([Distance[, ScalarKind]]) [GRANULARITY N]
-)
-ENGINE = MergeTree
-ORDER BY id;
-```
-
-USearch currently supports two distance functions:
-- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
- ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
-- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors
- ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
-
-USearch allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16` or `i8`. If no scalar kind
-was specified during index creation, `f16` is used as default.
-
-For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
-distance function was specified during index creation, `L2Distance` is used as default.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index db3778f3ceb..95e431b54be 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -601,10 +601,6 @@ endif()
dbms_target_link_libraries(PUBLIC ch_contrib::consistent_hashing)
-if (TARGET ch_contrib::annoy)
- dbms_target_link_libraries(PUBLIC ch_contrib::annoy)
-endif()
-
if (TARGET ch_contrib::usearch)
dbms_target_link_libraries(PUBLIC ch_contrib::usearch)
endif()
diff --git a/src/Common/config.h.in b/src/Common/config.h.in
index e3f8882850f..2e3b8d84366 100644
--- a/src/Common/config.h.in
+++ b/src/Common/config.h.in
@@ -58,6 +58,7 @@
#cmakedefine01 USE_FILELOG
#cmakedefine01 USE_ODBC
#cmakedefine01 USE_BLAKE3
+#cmakedefine01 USE_USEARCH
#cmakedefine01 USE_SKIM
#cmakedefine01 USE_PRQL
#cmakedefine01 USE_ULID
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index acdc8316a4d..0808e8eb49f 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -908,14 +908,11 @@ class IColumn;
M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \
M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
M(Bool, allow_experimental_time_series_table, false, "Allows experimental TimeSeries table engine", 0) \
+ M(Bool, allow_experimental_vector_similarity_index, false, "Allow experimental vector similarity index", 0) \
M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \
M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \
- M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \
- M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \
M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \
M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
- M(UInt64, max_threads_for_annoy_index_creation, 4, "Number of threads used to build Annoy indexes (0 means all cores, not recommended)", 0) \
- M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \
M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \
M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \
M(Bool, implicit_transaction, false, "If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)", 0) \
@@ -1038,6 +1035,10 @@ class IColumn;
MAKE_OBSOLETE(M, UInt64, parallel_replicas_min_number_of_granules_to_enable, 0) \
MAKE_OBSOLETE(M, Bool, query_plan_optimize_projection, true) \
MAKE_OBSOLETE(M, Bool, query_cache_store_results_of_queries_with_nondeterministic_functions, false) \
+ MAKE_OBSOLETE(M, Bool, allow_experimental_annoy_index, false) \
+ MAKE_OBSOLETE(M, UInt64, max_threads_for_annoy_index_creation, 4) \
+ MAKE_OBSOLETE(M, Int64, annoy_index_search_k_nodes, -1) \
+ MAKE_OBSOLETE(M, Bool, allow_experimental_usearch_index, false) \
MAKE_OBSOLETE(M, Bool, optimize_move_functions_out_of_any, false) \
MAKE_OBSOLETE(M, Bool, allow_experimental_undrop_table_query, true) \
MAKE_OBSOLETE(M, Bool, allow_experimental_s3queue, true) \
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index 844aac05f37..20a8721c10e 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -87,6 +87,7 @@ static std::initializer_listsetSetting("allow_experimental_object_type", 1);
query_context->setSetting("allow_experimental_variant_type", 1);
query_context->setSetting("allow_experimental_dynamic_type", 1);
- query_context->setSetting("allow_experimental_annoy_index", 1);
- query_context->setSetting("allow_experimental_usearch_index", 1);
+ query_context->setSetting("allow_experimental_vector_similarity_index", 1);
query_context->setSetting("allow_experimental_bigint_types", 1);
query_context->setSetting("allow_experimental_window_functions", 1);
query_context->setSetting("allow_experimental_geo_types", 1);
diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp
index 8bcb7f18a0f..95143031707 100644
--- a/src/Interpreters/InterpreterCreateQuery.cpp
+++ b/src/Interpreters/InterpreterCreateQuery.cpp
@@ -787,10 +787,8 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti
if (index_desc.type == INVERTED_INDEX_NAME && !settings.allow_experimental_inverted_index)
throw Exception(ErrorCodes::ILLEGAL_INDEX, "Please use index type 'full_text' instead of 'inverted'");
/// ----
- if (index_desc.type == "annoy" && !settings.allow_experimental_annoy_index)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index is disabled. Turn on allow_experimental_annoy_index");
- if (index_desc.type == "usearch" && !settings.allow_experimental_usearch_index)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index is disabled. Turn on allow_experimental_usearch_index");
+ if (index_desc.type == "vector_similarity" && !settings.allow_experimental_vector_similarity_index)
+ throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index is disabled. Turn on allow_experimental_vector_similarity_index");
properties.indices.push_back(index_desc);
}
diff --git a/src/Parsers/ASTIndexDeclaration.h b/src/Parsers/ASTIndexDeclaration.h
index dd05ad08184..72f3f017a99 100644
--- a/src/Parsers/ASTIndexDeclaration.h
+++ b/src/Parsers/ASTIndexDeclaration.h
@@ -13,8 +13,7 @@ class ASTIndexDeclaration : public IAST
{
public:
static const auto DEFAULT_INDEX_GRANULARITY = 1uz;
- static const auto DEFAULT_ANNOY_INDEX_GRANULARITY = 100'000'000uz;
- static const auto DEFAULT_USEARCH_INDEX_GRANULARITY = 100'000'000uz;
+ static const auto DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY = 100'000'000uz;
ASTIndexDeclaration(ASTPtr expression, ASTPtr type, const String & name_);
diff --git a/src/Parsers/ParserCreateIndexQuery.cpp b/src/Parsers/ParserCreateIndexQuery.cpp
index 9ebee4cc852..ed89b80edca 100644
--- a/src/Parsers/ParserCreateIndexQuery.cpp
+++ b/src/Parsers/ParserCreateIndexQuery.cpp
@@ -89,10 +89,8 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected
else
{
auto index_type = index->getType();
- if (index_type && index_type->name == "annoy")
- index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
- else if (index_type && index_type->name == "usearch")
- index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY;
+ if (index_type && index_type->name == "vector_similarity")
+ index->granularity = ASTIndexDeclaration::DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY;
else
index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;
}
diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp
index 318fe9da1b4..cc4e02f46a3 100644
--- a/src/Parsers/ParserCreateQuery.cpp
+++ b/src/Parsers/ParserCreateQuery.cpp
@@ -214,10 +214,8 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe
else
{
auto index_type = index->getType();
- if (index_type->name == "annoy")
- index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
- else if (index_type->name == "usearch")
- index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY;
+ if (index_type->name == "vector_similarity")
+ index->granularity = ASTIndexDeclaration::DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY;
else
index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;
}
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index 901d7c61167..348019d7d10 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -24,8 +24,8 @@
#include
#include
#include
-#include
-#include
+#include
+#include
#include
#include
#include
@@ -52,6 +52,8 @@
#include
#include
+#include "config.h"
+
using namespace DB;
namespace
@@ -1474,16 +1476,14 @@ static void buildIndexes(
else
{
MergeTreeIndexConditionPtr condition;
- if (index_helper->isVectorSearch())
+ if (index_helper->isVectorSimilarityIndex())
{
-#ifdef ENABLE_ANNOY
- if (const auto * annoy = typeid_cast(index_helper.get()))
- condition = annoy->createIndexCondition(query_info, context);
-#endif
-#ifdef ENABLE_USEARCH
- if (const auto * usearch = typeid_cast(index_helper.get()))
- condition = usearch->createIndexCondition(query_info, context);
+#if USE_USEARCH
+ if (const auto * vector_similarity_index = typeid_cast(index_helper.get()))
+ condition = vector_similarity_index->createIndexCondition(query_info, context);
#endif
+ if (const auto * legacy_vector_similarity_index = typeid_cast(index_helper.get()))
+ condition = legacy_vector_similarity_index->createIndexCondition(query_info, context);
if (!condition)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown vector search index {}", index_helper->index.name);
}
diff --git a/src/Storages/IndicesDescription.cpp b/src/Storages/IndicesDescription.cpp
index cef8fd85f97..753fbf1d635 100644
--- a/src/Storages/IndicesDescription.cpp
+++ b/src/Storages/IndicesDescription.cpp
@@ -3,6 +3,7 @@
#include
#include
+#include
#include
#include
#include
@@ -130,10 +131,15 @@ IndexDescription IndexDescription::getIndexFromAST(const ASTPtr & definition_ast
{
for (size_t i = 0; i < index_type->arguments->children.size(); ++i)
{
- const auto * argument = index_type->arguments->children[i]->as();
- if (!argument)
+ const auto & child = index_type->arguments->children[i];
+ if (const auto * ast_literal = child->as(); ast_literal != nullptr)
+ /// E.g. INDEX index_name column_name TYPE vector_similarity('hnsw', 'f32')
+ result.arguments.emplace_back(ast_literal->value);
+ else if (const auto * ast_identifier = child->as(); ast_identifier != nullptr)
+ /// E.g. INDEX index_name column_name TYPE vector_similarity(hnsw, f32)
+ result.arguments.emplace_back(ast_identifier->name());
+ else
throw Exception(ErrorCodes::INCORRECT_QUERY, "Only literals can be skip index arguments");
- result.arguments.emplace_back(argument->value);
}
}
diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp
deleted file mode 100644
index d6a8af3238e..00000000000
--- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp
+++ /dev/null
@@ -1,507 +0,0 @@
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
- extern const int LOGICAL_ERROR;
- extern const int INCORRECT_QUERY;
-}
-
-namespace
-{
-
-template
-void extractReferenceVectorFromLiteral(ApproximateNearestNeighborInformation::Embedding & reference_vector, Literal literal)
-{
- Float64 float_element_of_reference_vector;
- Int64 int_element_of_reference_vector;
-
- for (const auto & value : literal.value())
- {
- if (value.tryGet(float_element_of_reference_vector))
- reference_vector.emplace_back(float_element_of_reference_vector);
- else if (value.tryGet(int_element_of_reference_vector))
- reference_vector.emplace_back(static_cast(int_element_of_reference_vector));
- else
- throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong type of elements in reference vector. Only float or int are supported.");
- }
-}
-
-ApproximateNearestNeighborInformation::Metric stringToMetric(std::string_view metric)
-{
- if (metric == "L2Distance")
- return ApproximateNearestNeighborInformation::Metric::L2;
- else if (metric == "LpDistance")
- return ApproximateNearestNeighborInformation::Metric::Lp;
- else
- return ApproximateNearestNeighborInformation::Metric::Unknown;
-}
-
-}
-
-ApproximateNearestNeighborCondition::ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context)
- : block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context))
- , index_granularity(context->getMergeTreeSettings().index_granularity)
- , max_limit_for_ann_queries(context->getSettingsRef().max_limit_for_ann_queries)
- , index_is_useful(checkQueryStructure(query_info))
-{}
-
-bool ApproximateNearestNeighborCondition::alwaysUnknownOrTrue(String metric) const
-{
- if (!index_is_useful)
- return true; // Query isn't supported
- // If query is supported, check metrics for match
- return !(stringToMetric(metric) == query_information->metric);
-}
-
-float ApproximateNearestNeighborCondition::getComparisonDistanceForWhereQuery() const
-{
- if (index_is_useful && query_information.has_value()
- && query_information->type == ApproximateNearestNeighborInformation::Type::Where)
- return query_information->distance;
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Not supported method for this query type");
-}
-
-UInt64 ApproximateNearestNeighborCondition::getLimit() const
-{
- if (index_is_useful && query_information.has_value())
- return query_information->limit;
- throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported");
-}
-
-std::vector ApproximateNearestNeighborCondition::getReferenceVector() const
-{
- if (index_is_useful && query_information.has_value())
- return query_information->reference_vector;
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Reference vector was requested for useless or uninitialized index.");
-}
-
-size_t ApproximateNearestNeighborCondition::getDimensions() const
-{
- if (index_is_useful && query_information.has_value())
- return query_information->reference_vector.size();
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of dimensions was requested for useless or uninitialized index.");
-}
-
-String ApproximateNearestNeighborCondition::getColumnName() const
-{
- if (index_is_useful && query_information.has_value())
- return query_information->column_name;
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Column name was requested for useless or uninitialized index.");
-}
-
-ApproximateNearestNeighborInformation::Metric ApproximateNearestNeighborCondition::getMetricType() const
-{
- if (index_is_useful && query_information.has_value())
- return query_information->metric;
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Metric name was requested for useless or uninitialized index.");
-}
-
-float ApproximateNearestNeighborCondition::getPValueForLpDistance() const
-{
- if (index_is_useful && query_information.has_value())
- return query_information->p_for_lp_dist;
- throw Exception(ErrorCodes::LOGICAL_ERROR, "P from LPDistance was requested for useless or uninitialized index.");
-}
-
-ApproximateNearestNeighborInformation::Type ApproximateNearestNeighborCondition::getQueryType() const
-{
- if (index_is_useful && query_information.has_value())
- return query_information->type;
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Query type was requested for useless or uninitialized index.");
-}
-
-bool ApproximateNearestNeighborCondition::checkQueryStructure(const SelectQueryInfo & query)
-{
- /// RPN-s for different sections of the query
- RPN rpn_prewhere_clause;
- RPN rpn_where_clause;
- RPN rpn_order_by_clause;
- RPNElement rpn_limit;
- UInt64 limit;
-
- ApproximateNearestNeighborInformation prewhere_info;
- ApproximateNearestNeighborInformation where_info;
- ApproximateNearestNeighborInformation order_by_info;
-
- /// Build rpns for query sections
- const auto & select = query.query->as();
-
- /// If query has PREWHERE clause
- if (select.prewhere())
- traverseAST(select.prewhere(), rpn_prewhere_clause);
-
- /// If query has WHERE clause
- if (select.where())
- traverseAST(select.where(), rpn_where_clause);
-
- /// If query has LIMIT clause
- if (select.limitLength())
- traverseAtomAST(select.limitLength(), rpn_limit);
-
- if (select.orderBy()) // If query has ORDERBY clause
- traverseOrderByAST(select.orderBy(), rpn_order_by_clause);
-
- /// Reverse RPNs for conveniences during parsing
- std::reverse(rpn_prewhere_clause.begin(), rpn_prewhere_clause.end());
- std::reverse(rpn_where_clause.begin(), rpn_where_clause.end());
- std::reverse(rpn_order_by_clause.begin(), rpn_order_by_clause.end());
-
- /// Match rpns with supported types and extract information
- const bool prewhere_is_valid = matchRPNWhere(rpn_prewhere_clause, prewhere_info);
- const bool where_is_valid = matchRPNWhere(rpn_where_clause, where_info);
- const bool order_by_is_valid = matchRPNOrderBy(rpn_order_by_clause, order_by_info);
- const bool limit_is_valid = matchRPNLimit(rpn_limit, limit);
-
- /// Query without a LIMIT clause or with a limit greater than a restriction is not supported
- if (!limit_is_valid || max_limit_for_ann_queries < limit)
- return false;
-
- /// Search type query in both sections isn't supported
- if (prewhere_is_valid && where_is_valid)
- return false;
-
- /// Search type should be in WHERE or PREWHERE clause
- if (prewhere_is_valid || where_is_valid)
- query_information = std::move(prewhere_is_valid ? prewhere_info : where_info);
-
- if (order_by_is_valid)
- {
- /// Query with valid where and order by type is not supported
- if (query_information.has_value())
- return false;
-
- query_information = std::move(order_by_info);
- }
-
- if (query_information)
- query_information->limit = limit;
-
- return query_information.has_value();
-}
-
-void ApproximateNearestNeighborCondition::traverseAST(const ASTPtr & node, RPN & rpn)
-{
- // If the node is ASTFunction, it may have children nodes
- if (const auto * func = node->as())
- {
- const ASTs & children = func->arguments->children;
- // Traverse children nodes
- for (const auto& child : children)
- traverseAST(child, rpn);
- }
-
- RPNElement element;
- /// Get the data behind node
- if (!traverseAtomAST(node, element))
- element.function = RPNElement::FUNCTION_UNKNOWN;
-
- rpn.emplace_back(std::move(element));
-}
-
-bool ApproximateNearestNeighborCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out)
-{
- /// Match Functions
- if (const auto * function = node->as())
- {
- /// Set the name
- out.func_name = function->name;
-
- if (function->name == "L1Distance" ||
- function->name == "L2Distance" ||
- function->name == "LinfDistance" ||
- function->name == "cosineDistance" ||
- function->name == "dotProduct" ||
- function->name == "LpDistance")
- out.function = RPNElement::FUNCTION_DISTANCE;
- else if (function->name == "tuple")
- out.function = RPNElement::FUNCTION_TUPLE;
- else if (function->name == "array")
- out.function = RPNElement::FUNCTION_ARRAY;
- else if (function->name == "less" ||
- function->name == "greater" ||
- function->name == "lessOrEquals" ||
- function->name == "greaterOrEquals")
- out.function = RPNElement::FUNCTION_COMPARISON;
- else if (function->name == "_CAST")
- out.function = RPNElement::FUNCTION_CAST;
- else
- return false;
-
- return true;
- }
- /// Match identifier
- else if (const auto * identifier = node->as())
- {
- out.function = RPNElement::FUNCTION_IDENTIFIER;
- out.identifier.emplace(identifier->name());
- out.func_name = "column identifier";
-
- return true;
- }
-
- /// Check if we have constants behind the node
- return tryCastToConstType(node, out);
-}
-
-bool ApproximateNearestNeighborCondition::tryCastToConstType(const ASTPtr & node, RPNElement & out)
-{
- Field const_value;
- DataTypePtr const_type;
-
- if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
- {
- /// Check for constant types
- if (const_value.getType() == Field::Types::Float64)
- {
- out.function = RPNElement::FUNCTION_FLOAT_LITERAL;
- out.float_literal.emplace(const_value.safeGet());
- out.func_name = "Float literal";
- return true;
- }
-
- if (const_value.getType() == Field::Types::UInt64)
- {
- out.function = RPNElement::FUNCTION_INT_LITERAL;
- out.int_literal.emplace(const_value.safeGet());
- out.func_name = "Int literal";
- return true;
- }
-
- if (const_value.getType() == Field::Types::Int64)
- {
- out.function = RPNElement::FUNCTION_INT_LITERAL;
- out.int_literal.emplace(const_value.safeGet());
- out.func_name = "Int literal";
- return true;
- }
-
- if (const_value.getType() == Field::Types::Tuple)
- {
- out.function = RPNElement::FUNCTION_LITERAL_TUPLE;
- out.tuple_literal = const_value.safeGet();
- out.func_name = "Tuple literal";
- return true;
- }
-
- if (const_value.getType() == Field::Types::Array)
- {
- out.function = RPNElement::FUNCTION_LITERAL_ARRAY;
- out.array_literal = const_value.safeGet();
- out.func_name = "Array literal";
- return true;
- }
-
- if (const_value.getType() == Field::Types::String)
- {
- out.function = RPNElement::FUNCTION_STRING_LITERAL;
- out.func_name = const_value.safeGet();
- return true;
- }
- }
-
- return false;
-}
-
-void ApproximateNearestNeighborCondition::traverseOrderByAST(const ASTPtr & node, RPN & rpn)
-{
- if (const auto * expr_list = node->as())
- if (const auto * order_by_element = expr_list->children.front()->as())
- traverseAST(order_by_element->children.front(), rpn);
-}
-
-/// Returns true and stores ApproximateNearestNeighborInformation if the query has valid WHERE clause
-bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info)
-{
- /// Fill query type field
- ann_info.type = ApproximateNearestNeighborInformation::Type::Where;
-
- /// WHERE section must have at least 5 expressions
- /// Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(ReferenceVector(floats))
- if (rpn.size() < 5)
- return false;
-
- auto iter = rpn.begin();
-
- /// Query starts from operator less
- if (iter->function != RPNElement::FUNCTION_COMPARISON)
- return false;
-
- const bool greater_case = iter->func_name == "greater" || iter->func_name == "greaterOrEquals";
- const bool less_case = iter->func_name == "less" || iter->func_name == "lessOrEquals";
-
- ++iter;
-
- if (less_case)
- {
- if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL)
- return false;
-
- ann_info.distance = getFloatOrIntLiteralOrPanic(iter);
- if (ann_info.distance < 0)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", ann_info.distance);
-
- ++iter;
-
- }
- else if (!greater_case)
- return false;
-
- auto end = rpn.end();
- if (!matchMainParts(iter, end, ann_info))
- return false;
-
- if (greater_case)
- {
- if (ann_info.reference_vector.size() < 2)
- return false;
- ann_info.distance = ann_info.reference_vector.back();
- if (ann_info.distance < 0)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", ann_info.distance);
- ann_info.reference_vector.pop_back();
- }
-
- /// query is ok
- return true;
-}
-
-/// Returns true and stores ANNExpr if the query has valid ORDERBY clause
-bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info)
-{
- /// Fill query type field
- ann_info.type = ApproximateNearestNeighborInformation::Type::OrderBy;
-
- // ORDER BY clause must have at least 3 expressions
- if (rpn.size() < 3)
- return false;
-
- auto iter = rpn.begin();
- auto end = rpn.end();
-
- return ApproximateNearestNeighborCondition::matchMainParts(iter, end, ann_info);
-}
-
-/// Returns true and stores Length if we have valid LIMIT clause in query
-bool ApproximateNearestNeighborCondition::matchRPNLimit(RPNElement & rpn, UInt64 & limit)
-{
- if (rpn.function == RPNElement::FUNCTION_INT_LITERAL)
- {
- limit = rpn.int_literal.value();
- return true;
- }
-
- return false;
-}
-
-/// Matches dist function, referencer vector, column name
-bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info)
-{
- bool identifier_found = false;
-
- /// Matches DistanceFunc->[Column]->[Tuple(array)Func]->ReferenceVector(floats)->[Column]
- if (iter->function != RPNElement::FUNCTION_DISTANCE)
- return false;
-
- ann_info.metric = stringToMetric(iter->func_name);
- ++iter;
-
- if (ann_info.metric == ApproximateNearestNeighborInformation::Metric::Lp)
- {
- if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL &&
- iter->function != RPNElement::FUNCTION_INT_LITERAL)
- return false;
- ann_info.p_for_lp_dist = getFloatOrIntLiteralOrPanic(iter);
- ++iter;
- }
-
- if (iter->function == RPNElement::FUNCTION_IDENTIFIER)
- {
- identifier_found = true;
- ann_info.column_name = std::move(iter->identifier.value());
- ++iter;
- }
-
- if (iter->function == RPNElement::FUNCTION_TUPLE || iter->function == RPNElement::FUNCTION_ARRAY)
- ++iter;
-
- if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE)
- {
- extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal);
- ++iter;
- }
-
- if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY)
- {
- extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal);
- ++iter;
- }
-
- /// further conditions are possible if there is no tuple or array, or no identifier is found
- /// the tuple or array can be inside a cast function. For other cases, see the loop after this condition
- if (iter != end && iter->function == RPNElement::FUNCTION_CAST)
- {
- ++iter;
- /// Cast should be made to array or tuple
- if (!iter->func_name.starts_with("Array") && !iter->func_name.starts_with("Tuple"))
- return false;
- ++iter;
- if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE)
- {
- extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal);
- ++iter;
- }
- else if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY)
- {
- extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal);
- ++iter;
- }
- else
- return false;
- }
-
- while (iter != end)
- {
- if (iter->function == RPNElement::FUNCTION_FLOAT_LITERAL ||
- iter->function == RPNElement::FUNCTION_INT_LITERAL)
- ann_info.reference_vector.emplace_back(getFloatOrIntLiteralOrPanic(iter));
- else if (iter->function == RPNElement::FUNCTION_IDENTIFIER)
- {
- if (identifier_found)
- return false;
- ann_info.column_name = std::move(iter->identifier.value());
- identifier_found = true;
- }
- else
- return false;
-
- ++iter;
- }
-
- /// Final checks of correctness
- return identifier_found && !ann_info.reference_vector.empty();
-}
-
-/// Gets float or int from AST node
-float ApproximateNearestNeighborCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter)
-{
- if (iter->float_literal.has_value())
- return iter->float_literal.value();
- if (iter->int_literal.has_value())
- return static_cast(iter->int_literal.value());
- throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong parsed AST in buildRPN\n");
-}
-
-}
diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
index a6ef0063069..59f3a299c99 100644
--- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@@ -11,6 +11,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -48,7 +49,6 @@
#include
#include
-#include
namespace CurrentMetrics
{
@@ -1406,11 +1406,10 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex(
if (index_mark != index_range.begin || !granule || last_index_mark != index_range.begin)
reader.read(granule);
- auto ann_condition = std::dynamic_pointer_cast(condition);
- if (ann_condition != nullptr)
+ if (index_helper->isVectorSimilarityIndex())
{
/// An array of indices of useful ranges.
- auto result = ann_condition->getUsefulRanges(granule);
+ auto result = condition->getUsefulRanges(granule);
for (auto range : result)
{
diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.cpp b/src/Storages/MergeTree/MergeTreeIOSettings.cpp
index 58c3bd28d6a..24cb25afe47 100644
--- a/src/Storages/MergeTree/MergeTreeIOSettings.cpp
+++ b/src/Storages/MergeTree/MergeTreeIOSettings.cpp
@@ -27,7 +27,6 @@ MergeTreeWriterSettings::MergeTreeWriterSettings(
, rewrite_primary_key(rewrite_primary_key_)
, blocks_are_granules_size(blocks_are_granules_size_)
, query_write_settings(query_write_settings_)
- , max_threads_for_annoy_index_creation(global_settings.max_threads_for_annoy_index_creation)
, low_cardinality_max_dictionary_size(global_settings.low_cardinality_max_dictionary_size)
, low_cardinality_use_single_dictionary_for_part(global_settings.low_cardinality_use_single_dictionary_for_part != 0)
, use_compact_variant_discriminators_serialization(storage_settings->use_compact_variant_discriminators_serialization)
diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h
index c79ca1e66ee..47b174b2e29 100644
--- a/src/Storages/MergeTree/MergeTreeIOSettings.h
+++ b/src/Storages/MergeTree/MergeTreeIOSettings.h
@@ -77,8 +77,6 @@ struct MergeTreeWriterSettings
bool blocks_are_granules_size;
WriteSettings query_write_settings;
- size_t max_threads_for_annoy_index_creation;
-
size_t low_cardinality_max_dictionary_size;
bool low_cardinality_use_single_dictionary_for_part;
bool use_compact_variant_discriminators_serialization;
diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
deleted file mode 100644
index b68e48eeb3a..00000000000
--- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
+++ /dev/null
@@ -1,416 +0,0 @@
-#ifdef ENABLE_ANNOY
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
- extern const int ILLEGAL_COLUMN;
- extern const int INCORRECT_DATA;
- extern const int INCORRECT_NUMBER_OF_COLUMNS;
- extern const int INCORRECT_QUERY;
- extern const int LOGICAL_ERROR;
- extern const int NOT_IMPLEMENTED;
-}
-
-template
-AnnoyIndexWithSerialization::AnnoyIndexWithSerialization(size_t dimensions)
- : Base::AnnoyIndex(static_cast(dimensions))
-{
-}
-
-template
-void AnnoyIndexWithSerialization::serialize(WriteBuffer & ostr) const
-{
- chassert(Base::_built);
- writeIntBinary(Base::_s, ostr);
- writeIntBinary(Base::_n_items, ostr);
- writeIntBinary(Base::_n_nodes, ostr);
- writeIntBinary(Base::_nodes_size, ostr);
- writeIntBinary(Base::_K, ostr);
- writeIntBinary(Base::_seed, ostr);
- writeVectorBinary(Base::_roots, ostr);
- ostr.write(reinterpret_cast(Base::_nodes), Base::_s * Base::_n_nodes);
-}
-
-template
-void AnnoyIndexWithSerialization::deserialize(ReadBuffer & istr)
-{
- chassert(!Base::_built);
- readIntBinary(Base::_s, istr);
- readIntBinary(Base::_n_items, istr);
- readIntBinary(Base::_n_nodes, istr);
- readIntBinary(Base::_nodes_size, istr);
- readIntBinary(Base::_K, istr);
- readIntBinary(Base::_seed, istr);
- readVectorBinary(Base::_roots, istr);
- Base::_nodes = realloc(Base::_nodes, Base::_s * Base::_n_nodes);
- istr.readStrict(reinterpret_cast(Base::_nodes), Base::_s * Base::_n_nodes);
-
- Base::_fd = 0;
- // set flags
- Base::_loaded = false;
- Base::_verbose = false;
- Base::_on_disk = false;
- Base::_built = true;
-}
-
-template
-size_t AnnoyIndexWithSerialization::getDimensions() const
-{
- return Base::get_f();
-}
-
-
-template
-MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_)
- : index_name(index_name_)
- , index_sample_block(index_sample_block_)
- , index(nullptr)
-{}
-
-template
-MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(
- const String & index_name_,
- const Block & index_sample_block_,
- AnnoyIndexWithSerializationPtr index_)
- : index_name(index_name_)
- , index_sample_block(index_sample_block_)
- , index(std::move(index_))
-{}
-
-template
-void MergeTreeIndexGranuleAnnoy::serializeBinary(WriteBuffer & ostr) const
-{
- /// Number of dimensions is required in the index constructor,
- /// so it must be written and read separately from the other part
- writeIntBinary(static_cast(index->getDimensions()), ostr); // write dimension
- index->serialize(ostr);
-}
-
-template
-void MergeTreeIndexGranuleAnnoy::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
-{
- UInt64 dimension;
- readIntBinary(dimension, istr);
- index = std::make_shared>(dimension);
- index->deserialize(istr);
-}
-
-template
-MergeTreeIndexAggregatorAnnoy::MergeTreeIndexAggregatorAnnoy(
- const String & index_name_,
- const Block & index_sample_block_,
- UInt64 trees_,
- size_t max_threads_for_creation_)
- : index_name(index_name_)
- , index_sample_block(index_sample_block_)
- , trees(trees_)
- , max_threads_for_creation(max_threads_for_creation_)
-{}
-
-template
-MergeTreeIndexGranulePtr MergeTreeIndexAggregatorAnnoy::getGranuleAndReset()
-{
- int threads = (max_threads_for_creation == 0) ? -1 : static_cast(max_threads_for_creation);
- /// clang-tidy reports a false positive: it considers %p with an outdated pointer in fprintf() (used by logging which we don't do) dereferencing
- index->build(static_cast(trees), threads);
- auto granule = std::make_shared>(index_name, index_sample_block, index);
- index = nullptr;
- return granule;
-}
-
-template
-void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t * pos, size_t limit)
-{
- if (*pos >= block.rows())
- throw Exception(
- ErrorCodes::LOGICAL_ERROR,
- "The provided position is not less than the number of block rows. Position: {}, Block rows: {}.",
- *pos, block.rows());
-
- size_t rows_read = std::min(limit, block.rows() - *pos);
-
- if (rows_read == 0)
- return;
-
- if (rows_read > std::numeric_limits::max())
- throw Exception(ErrorCodes::INCORRECT_DATA, "Index granularity is too big: more than 4B rows per index granule.");
-
- if (index_sample_block.columns() > 1)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column");
-
- const String & index_column_name = index_sample_block.getByPosition(0).name;
- ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read);
-
- if (const auto & column_array = typeid_cast(column_cut.get()))
- {
- const auto & column_array_data = column_array->getData();
- const auto & column_array_data_float = typeid_cast(column_array_data);
- const auto & column_array_data_float_data = column_array_data_float.getData();
-
- const auto & column_array_offsets = column_array->getOffsets();
- const size_t num_rows = column_array_offsets.size();
-
- if (column_array->empty())
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty");
-
- /// The Annoy algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
- /// are INSERTed into an Annoy-indexed column or if no value was specified at all in which case the arrays take on their default
- /// value which is also empty.
- if (column_array->isDefaultAt(0))
- throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
-
- /// Check all sizes are the same
- size_t dimension = column_array_offsets[0];
- for (size_t i = 0; i < num_rows - 1; ++i)
- if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
- throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
-
- /// Also check that previously inserted blocks have the same size as this block.
- /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
- /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
- if (index && index->getDimensions() != dimension)
- throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
-
- if (!index)
- index = std::make_shared>(dimension);
-
- /// Add all rows of block
- index->add_item(index->get_n_items(), column_array_data_float_data.data());
- for (size_t current_row = 1; current_row < num_rows; ++current_row)
- index->add_item(index->get_n_items(), &column_array_data_float_data[column_array_offsets[current_row - 1]]);
- }
- else if (const auto & column_tuple = typeid_cast(column_cut.get()))
- {
- const auto & column_tuple_columns = column_tuple->getColumns();
-
- /// TODO check if calling index->add_item() directly on the block's tuples is faster than materializing everything
- std::vector> data(column_tuple->size(), std::vector());
- for (const auto & column : column_tuple_columns)
- {
- const auto & pod_array = typeid_cast(column.get())->getData();
- for (size_t i = 0; i < pod_array.size(); ++i)
- data[i].push_back(pod_array[i]);
- }
-
- if (data.empty())
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read);
-
- if (!index)
- index = std::make_shared>(data[0].size());
-
- for (const auto & item : data)
- index->add_item(index->get_n_items(), item.data());
- }
- else
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array or Tuple column");
-
- *pos += rows_read;
-}
-
-
-MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy(
- const IndexDescription & /*index_description*/,
- const SelectQueryInfo & query,
- const String & distance_function_,
- ContextPtr context)
- : ann_condition(query, context)
- , distance_function(distance_function_)
- , search_k(context->getSettingsRef().annoy_index_search_k_nodes)
-{}
-
-bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const
-{
- throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes");
-}
-
-bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const
-{
- return ann_condition.alwaysUnknownOrTrue(distance_function);
-}
-
-std::vector MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const
-{
- if (distance_function == DISTANCE_FUNCTION_L2)
- return getUsefulRangesImpl(idx_granule);
- else if (distance_function == DISTANCE_FUNCTION_COSINE)
- return getUsefulRangesImpl(idx_granule);
- std::unreachable();
-}
-
-template
-std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const
-{
- const UInt64 limit = ann_condition.getLimit();
- const UInt64 index_granularity = ann_condition.getIndexGranularity();
- const std::optional comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where
- ? std::optional(ann_condition.getComparisonDistanceForWhereQuery())
- : std::nullopt;
-
- if (comparison_distance && comparison_distance.value() < 0)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance");
-
- const std::vector reference_vector = ann_condition.getReferenceVector();
-
- const auto granule = std::dynamic_pointer_cast>(idx_granule);
- if (granule == nullptr)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type");
-
- const AnnoyIndexWithSerializationPtr annoy = granule->index;
-
- if (ann_condition.getDimensions() != annoy->getDimensions())
- throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) "
- "does not match the dimension in the index ({})",
- ann_condition.getDimensions(), annoy->getDimensions());
-
- std::vector neighbors; /// indexes of dots which were closest to the reference vector
- std::vector distances;
- neighbors.reserve(limit);
- distances.reserve(limit);
-
- annoy->get_nns_by_vector(reference_vector.data(), limit, static_cast(search_k), &neighbors, &distances);
-
- chassert(neighbors.size() == distances.size());
-
- std::vector granules;
- granules.reserve(neighbors.size());
- for (size_t i = 0; i < neighbors.size(); ++i)
- {
- if (comparison_distance && distances[i] > comparison_distance)
- continue;
- granules.push_back(neighbors[i] / index_granularity);
- }
-
- /// make unique
- std::sort(granules.begin(), granules.end());
- granules.erase(std::unique(granules.begin(), granules.end()), granules.end());
-
- return granules;
-}
-
-MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_)
- : IMergeTreeIndex(index_)
- , trees(trees_)
- , distance_function(distance_function_)
-{}
-
-MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const
-{
- if (distance_function == DISTANCE_FUNCTION_L2)
- return std::make_shared>(index.name, index.sample_block);
- else if (distance_function == DISTANCE_FUNCTION_COSINE)
- return std::make_shared>(index.name, index.sample_block);
- std::unreachable();
-}
-
-MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator(const MergeTreeWriterSettings & settings) const
-{
- /// TODO: Support more metrics. Available metrics: https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171
- if (distance_function == DISTANCE_FUNCTION_L2)
- return std::make_shared>(index.name, index.sample_block, trees, settings.max_threads_for_annoy_index_creation);
- else if (distance_function == DISTANCE_FUNCTION_COSINE)
- return std::make_shared>(index.name, index.sample_block, trees, settings.max_threads_for_annoy_index_creation);
- std::unreachable();
-}
-
-MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const
-{
- return std::make_shared(index, query, distance_function, context);
-};
-
-MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(const ActionsDAG *, ContextPtr) const
-{
- throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG");
-}
-
-MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index)
-{
- static constexpr auto DEFAULT_DISTANCE_FUNCTION = DISTANCE_FUNCTION_L2;
- String distance_function = DEFAULT_DISTANCE_FUNCTION;
- if (!index.arguments.empty())
- distance_function = index.arguments[0].safeGet();
-
- static constexpr auto DEFAULT_TREES = 100uz;
- UInt64 trees = DEFAULT_TREES;
- if (index.arguments.size() > 1)
- trees = index.arguments[1].safeGet();
-
- return std::make_shared(index, trees, distance_function);
-}
-
-void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
-{
- /// Check number and type of Annoy index arguments:
-
- if (index.arguments.size() > 2)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index must not have more than two parameters");
-
- if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance function argument of Annoy index must be of type String");
-
- if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::UInt64)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "Number of trees argument of Annoy index must be of type UInt64");
-
- /// Check that the index is created on a single column
-
- if (index.column_names.size() != 1 || index.data_types.size() != 1)
- throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column");
-
- /// Check that a supported metric was passed as first argument
-
- if (!index.arguments.empty())
- {
- String distance_name = index.arguments[0].safeGet();
- if (distance_name != DISTANCE_FUNCTION_L2 && distance_name != DISTANCE_FUNCTION_COSINE)
- throw Exception(ErrorCodes::INCORRECT_DATA, "Annoy index only supports distance functions '{}' and '{}'", DISTANCE_FUNCTION_L2, DISTANCE_FUNCTION_COSINE);
- }
-
- /// Check data type of indexed column:
-
- auto throw_unsupported_underlying_column_exception = []()
- {
- throw Exception(
- ErrorCodes::ILLEGAL_COLUMN,
- "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
- };
-
- DataTypePtr data_type = index.sample_block.getDataTypes()[0];
-
- if (const auto * data_type_array = typeid_cast(data_type.get()))
- {
- TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
- if (!WhichDataType(nested_type_index).isFloat32())
- throw_unsupported_underlying_column_exception();
- }
- else if (const auto * data_type_tuple = typeid_cast(data_type.get()))
- {
- const DataTypes & inner_types = data_type_tuple->getElements();
- for (const auto & inner_type : inner_types)
- {
- TypeIndex nested_type_index = inner_type->getTypeId();
- if (!WhichDataType(nested_type_index).isFloat32())
- throw_unsupported_underlying_column_exception();
- }
- }
- else
- throw_unsupported_underlying_column_exception();
-}
-
-}
-
-#endif
diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h
deleted file mode 100644
index 282920c608e..00000000000
--- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#pragma once
-
-#ifdef ENABLE_ANNOY
-
-#include
-
-#include
-#include
-
-namespace DB
-{
-
-template
-class AnnoyIndexWithSerialization : public Annoy::AnnoyIndex
-{
- using Base = Annoy::AnnoyIndex;
-
-public:
- explicit AnnoyIndexWithSerialization(size_t dimensions);
- void serialize(WriteBuffer & ostr) const;
- void deserialize(ReadBuffer & istr);
- size_t getDimensions() const;
-};
-
-template
-using AnnoyIndexWithSerializationPtr = std::shared_ptr>;
-
-
-template
-struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule
-{
- MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_);
- MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_, AnnoyIndexWithSerializationPtr index_);
-
- ~MergeTreeIndexGranuleAnnoy() override = default;
-
- void serializeBinary(WriteBuffer & ostr) const override;
- void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
-
- bool empty() const override { return !index.get(); }
-
- const String index_name;
- const Block index_sample_block;
- AnnoyIndexWithSerializationPtr index;
-};
-
-
-template
-struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator
-{
- MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, UInt64 trees, size_t max_threads_for_creation);
- ~MergeTreeIndexAggregatorAnnoy() override = default;
-
- bool empty() const override { return !index || index->get_n_items() == 0; }
- MergeTreeIndexGranulePtr getGranuleAndReset() override;
- void update(const Block & block, size_t * pos, size_t limit) override;
-
- const String index_name;
- const Block index_sample_block;
- const UInt64 trees;
- const size_t max_threads_for_creation;
- AnnoyIndexWithSerializationPtr index;
-};
-
-
-class MergeTreeIndexConditionAnnoy final : public IMergeTreeIndexConditionApproximateNearestNeighbor
-{
-public:
- MergeTreeIndexConditionAnnoy(
- const IndexDescription & index_description,
- const SelectQueryInfo & query,
- const String & distance_function,
- ContextPtr context);
-
- ~MergeTreeIndexConditionAnnoy() override = default;
-
- bool alwaysUnknownOrTrue() const override;
- bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
- std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override;
-
-private:
- template
- std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const;
-
- const ApproximateNearestNeighborCondition ann_condition;
- const String distance_function;
- const Int64 search_k;
-};
-
-
-class MergeTreeIndexAnnoy final : public IMergeTreeIndex
-{
-public:
-
- MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_);
-
- ~MergeTreeIndexAnnoy() override = default;
-
- MergeTreeIndexGranulePtr createIndexGranule() const override;
- MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override;
- MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const;
- MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override;
- bool isVectorSearch() const override { return true; }
-
-private:
- const UInt64 trees;
- const String distance_function;
-};
-
-}
-
-#endif
diff --git a/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp
new file mode 100644
index 00000000000..29de109d4fc
--- /dev/null
+++ b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp
@@ -0,0 +1,65 @@
+#include <Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_INDEX;
+}
+
+namespace
+{
+
+/// Every operation on a legacy index fails the same way. Keeping the throw in one helper means the
+/// user-facing migration hint is written exactly once instead of being repeated in each method.
+[[noreturn]] void throwLegacyIndexNotSupported()
+{
+    throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'");
+}
+
+}
+
+/// Only forwards the index description to the base class; no index state of its own is kept.
+MergeTreeIndexLegacyVectorSimilarity::MergeTreeIndexLegacyVectorSimilarity(const IndexDescription & index_)
+    : IMergeTreeIndex(index_)
+{
+}
+
+/// Always throws ILLEGAL_INDEX; never returns a granule.
+MergeTreeIndexGranulePtr MergeTreeIndexLegacyVectorSimilarity::createIndexGranule() const
+{
+    throwLegacyIndexNotSupported();
+}
+
+/// Always throws ILLEGAL_INDEX; never returns an aggregator. The writer settings are ignored.
+MergeTreeIndexAggregatorPtr MergeTreeIndexLegacyVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings &) const
+{
+    throwLegacyIndexNotSupported();
+}
+
+/// Always throws ILLEGAL_INDEX; never returns a condition. Both arguments are ignored.
+MergeTreeIndexConditionPtr MergeTreeIndexLegacyVectorSimilarity::createIndexCondition(const SelectQueryInfo &, ContextPtr) const
+{
+    throwLegacyIndexNotSupported();
+}
+
+/// Always throws ILLEGAL_INDEX; never returns a condition. Both arguments are ignored.
+MergeTreeIndexConditionPtr MergeTreeIndexLegacyVectorSimilarity::createIndexCondition(const ActionsDAG *, ContextPtr) const
+{
+    throwLegacyIndexNotSupported();
+}
+
+/// Factory function: wraps the given description in a MergeTreeIndexLegacyVectorSimilarity.
+MergeTreeIndexPtr legacyVectorSimilarityIndexCreator(const IndexDescription & index)
+{
+    return std::make_shared<MergeTreeIndexLegacyVectorSimilarity>(index);
+}
+
+/// Deliberately a no-op: every description is accepted and the second (bool) argument is ignored.
+/// NOTE(review): presumably this keeps old table definitions loadable - confirm against the caller.
+void legacyVectorSimilarityIndexValidator(const IndexDescription &, bool)
+{
+}
+
+}
diff --git a/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h
new file mode 100644
index 00000000000..1015401823d
--- /dev/null
+++ b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include
+
+/// Placeholder ("walking corpse") implementation for the removed skipping index types "annoy" and "usearch".
+/// Its only purpose is to allow loading old tables that still have indexes of these types.
+/// Data insertion and index usage/search throw an exception that suggests migrating to "vector_similarity" indexes.
+
+namespace DB
+{
+
+class MergeTreeIndexLegacyVectorSimilarity : public IMergeTreeIndex /// Stub index: each create* method below throws ILLEGAL_INDEX.
+{
+public:
+    explicit MergeTreeIndexLegacyVectorSimilarity(const IndexDescription & index_); /// Passes index_ on to IMergeTreeIndex.
+    ~MergeTreeIndexLegacyVectorSimilarity() override = default;
+
+    MergeTreeIndexGranulePtr createIndexGranule() const override; /// Always throws.
+    MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings &) const override; /// Always throws.
+    MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo &, ContextPtr) const; /// Always throws. Declared without `override`.
+    MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override; /// Always throws.
+
+    bool isVectorSimilarityIndex() const override { return true; } /// Identifies this type as a vector similarity index.
+};
+
+}
diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
deleted file mode 100644
index efd9bb754e1..00000000000
--- a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
+++ /dev/null
@@ -1,463 +0,0 @@
-#ifdef ENABLE_USEARCH
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wpass-failed"
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace ProfileEvents
-{
- extern const Event USearchAddCount;
- extern const Event USearchAddVisitedMembers;
- extern const Event USearchAddComputedDistances;
- extern const Event USearchSearchCount;
- extern const Event USearchSearchVisitedMembers;
- extern const Event USearchSearchComputedDistances;
-}
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
- extern const int CANNOT_ALLOCATE_MEMORY;
- extern const int ILLEGAL_COLUMN;
- extern const int INCORRECT_DATA;
- extern const int INCORRECT_NUMBER_OF_COLUMNS;
- extern const int INCORRECT_QUERY;
- extern const int LOGICAL_ERROR;
- extern const int NOT_IMPLEMENTED;
-}
-
-namespace
-{
-
-std::unordered_map nameToScalarKind = {
- {"f64", unum::usearch::scalar_kind_t::f64_k},
- {"f32", unum::usearch::scalar_kind_t::f32_k},
- {"f16", unum::usearch::scalar_kind_t::f16_k},
- {"i8", unum::usearch::scalar_kind_t::i8_k}};
-
-}
-
-template
-USearchIndexWithSerialization::USearchIndexWithSerialization(size_t dimensions, unum::usearch::scalar_kind_t scalar_kind)
- : Base(Base::make(unum::usearch::metric_punned_t(dimensions, Metric, scalar_kind)))
-{
-}
-
-template
-void USearchIndexWithSerialization::serialize(WriteBuffer & ostr) const
-{
- auto callback = [&ostr](void * from, size_t n)
- {
- ostr.write(reinterpret_cast(from), n);
- return true;
- };
-
- Base::save_to_stream(callback);
-}
-
-template
-void USearchIndexWithSerialization::deserialize(ReadBuffer & istr)
-{
- auto callback = [&istr](void * from, size_t n)
- {
- istr.readStrict(reinterpret_cast(from), n);
- return true;
- };
-
- Base::load_from_stream(callback);
-}
-
-template
-size_t USearchIndexWithSerialization::getDimensions() const
-{
- return Base::dimensions();
-}
-
-template
-MergeTreeIndexGranuleUSearch::MergeTreeIndexGranuleUSearch(
- const String & index_name_,
- const Block & index_sample_block_,
- unum::usearch::scalar_kind_t scalar_kind_)
- : index_name(index_name_)
- , index_sample_block(index_sample_block_)
- , scalar_kind(scalar_kind_)
- , index(nullptr)
-{
-}
-
-template
-MergeTreeIndexGranuleUSearch::MergeTreeIndexGranuleUSearch(
- const String & index_name_,
- const Block & index_sample_block_,
- unum::usearch::scalar_kind_t scalar_kind_,
- USearchIndexWithSerializationPtr index_)
- : index_name(index_name_)
- , index_sample_block(index_sample_block_)
- , scalar_kind(scalar_kind_)
- , index(std::move(index_))
-{
-}
-
-template
-void MergeTreeIndexGranuleUSearch::serializeBinary(WriteBuffer & ostr) const
-{
- /// Number of dimensions is required in the index constructor,
- /// so it must be written and read separately from the other part
- writeIntBinary(static_cast(index->getDimensions()), ostr); // write dimension
- index->serialize(ostr);
-}
-
-template
-void MergeTreeIndexGranuleUSearch::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
-{
- UInt64 dimension;
- readIntBinary(dimension, istr);
- index = std::make_shared>(dimension, scalar_kind);
- index->deserialize(istr);
-}
-
-template
-MergeTreeIndexAggregatorUSearch::MergeTreeIndexAggregatorUSearch(
- const String & index_name_,
- const Block & index_sample_block_,
- unum::usearch::scalar_kind_t scalar_kind_)
- : index_name(index_name_)
- , index_sample_block(index_sample_block_)
- , scalar_kind(scalar_kind_)
-{
-}
-
-template
-MergeTreeIndexGranulePtr MergeTreeIndexAggregatorUSearch::getGranuleAndReset()
-{
- auto granule = std::make_shared>(index_name, index_sample_block, scalar_kind, index);
- index = nullptr;
- return granule;
-}
-
-template
-void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t * pos, size_t limit)
-{
- if (*pos >= block.rows())
- throw Exception(
- ErrorCodes::LOGICAL_ERROR,
- "The provided position is not less than the number of block rows. Position: {}, Block rows: {}.",
- *pos,
- block.rows());
-
- size_t rows_read = std::min(limit, block.rows() - *pos);
-
- if (rows_read == 0)
- return;
-
- if (rows_read > std::numeric_limits::max())
- throw Exception(ErrorCodes::INCORRECT_DATA, "Index granularity is too big: more than 4B rows per index granule.");
-
- if (index_sample_block.columns() > 1)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column");
-
- const String & index_column_name = index_sample_block.getByPosition(0).name;
- ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read);
-
- if (const auto & column_array = typeid_cast(column_cut.get()))
- {
- const auto & column_array_data = column_array->getData();
- const auto & column_array_data_float = typeid_cast(column_array_data);
- const auto & column_array_data_float_data = column_array_data_float.getData();
-
- const auto & column_array_offsets = column_array->getOffsets();
- const size_t num_rows = column_array_offsets.size();
-
- if (column_array->empty())
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty");
-
- /// The Usearch algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
- /// are INSERTed into an Usearch-indexed column or if no value was specified at all in which case the arrays take on their default
- /// values which is also empty.
- if (column_array->isDefaultAt(0))
- throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
-
- /// Check all sizes are the same
- size_t dimension = column_array_offsets[0];
- for (size_t i = 0; i < num_rows - 1; ++i)
- if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
- throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
-
- /// Also check that previously inserted blocks have the same size as this block.
- /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
- /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
- if (index && index->getDimensions() != dimension)
- throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
-
- if (!index)
- index = std::make_shared>(dimension, scalar_kind);
-
- /// Add all rows of block
- if (!index->reserve(unum::usearch::ceil2(index->size() + num_rows)))
- throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for usearch index");
-
- for (size_t current_row = 0; current_row < num_rows; ++current_row)
- {
- auto rc = index->add(static_cast(index->size()), &column_array_data_float_data[column_array_offsets[current_row - 1]]);
- if (!rc)
- throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, rc.error.release());
-
- ProfileEvents::increment(ProfileEvents::USearchAddCount);
- ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members);
- ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, rc.computed_distances);
- }
- }
- else if (const auto & column_tuple = typeid_cast(column_cut.get()))
- {
- const auto & column_tuple_columns = column_tuple->getColumns();
- std::vector> data(column_tuple->size(), std::vector());
- for (const auto & column : column_tuple_columns)
- {
- const auto & pod_array = typeid_cast(column.get())->getData();
- for (size_t i = 0; i < pod_array.size(); ++i)
- data[i].push_back(pod_array[i]);
- }
-
- if (data.empty())
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read);
-
- if (!index)
- index = std::make_shared>(data[0].size(), scalar_kind);
-
- if (!index->reserve(unum::usearch::ceil2(index->size() + data.size())))
- throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for usearch index");
-
- for (const auto & item : data)
- {
- auto rc = index->add(static_cast(index->size()), item.data());
- if (!rc)
- throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, rc.error.release());
-
- ProfileEvents::increment(ProfileEvents::USearchAddCount);
- ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members);
- ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, rc.computed_distances);
- }
- }
- else
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array or Tuple column");
-
- *pos += rows_read;
-}
-
-MergeTreeIndexConditionUSearch::MergeTreeIndexConditionUSearch(
- const IndexDescription & /*index_description*/,
- const SelectQueryInfo & query,
- const String & distance_function_,
- ContextPtr context)
- : ann_condition(query, context)
- , distance_function(distance_function_)
-{
-}
-
-bool MergeTreeIndexConditionUSearch::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const
-{
- throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes");
-}
-
-bool MergeTreeIndexConditionUSearch::alwaysUnknownOrTrue() const
-{
- return ann_condition.alwaysUnknownOrTrue(distance_function);
-}
-
-std::vector MergeTreeIndexConditionUSearch::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const
-{
- if (distance_function == DISTANCE_FUNCTION_L2)
- return getUsefulRangesImpl(idx_granule);
- else if (distance_function == DISTANCE_FUNCTION_COSINE)
- return getUsefulRangesImpl(idx_granule);
- std::unreachable();
-}
-
-template
-std::vector MergeTreeIndexConditionUSearch::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const
-{
- const UInt64 limit = ann_condition.getLimit();
- const UInt64 index_granularity = ann_condition.getIndexGranularity();
- const std::optional comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where
- ? std::optional(ann_condition.getComparisonDistanceForWhereQuery())
- : std::nullopt;
-
- if (comparison_distance && comparison_distance.value() < 0)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance");
-
- const std::vector reference_vector = ann_condition.getReferenceVector();
-
- const auto granule = std::dynamic_pointer_cast>(idx_granule);
- if (granule == nullptr)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type");
-
- const USearchIndexWithSerializationPtr index = granule->index;
-
- if (ann_condition.getDimensions() != index->dimensions())
- throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) "
- "does not match the dimension in the index ({})",
- ann_condition.getDimensions(), index->dimensions());
-
- auto result = index->search(reference_vector.data(), limit);
-
- ProfileEvents::increment(ProfileEvents::USearchSearchCount);
- ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, result.visited_members);
- ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, result.computed_distances);
-
- std::vector neighbors(result.size()); /// indexes of dots which were closest to the reference vector
- std::vector distances(result.size());
- result.dump_to(neighbors.data(), distances.data());
-
- std::vector granules;
- granules.reserve(neighbors.size());
- for (size_t i = 0; i < neighbors.size(); ++i)
- {
- if (comparison_distance && distances[i] > comparison_distance)
- continue;
- granules.push_back(neighbors[i] / index_granularity);
- }
-
- /// make unique
- std::sort(granules.begin(), granules.end());
- granules.erase(std::unique(granules.begin(), granules.end()), granules.end());
-
- return granules;
-}
-
-MergeTreeIndexUSearch::MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_, unum::usearch::scalar_kind_t scalar_kind_)
- : IMergeTreeIndex(index_)
- , distance_function(distance_function_)
- , scalar_kind(scalar_kind_)
-{
-}
-
-MergeTreeIndexGranulePtr MergeTreeIndexUSearch::createIndexGranule() const
-{
- if (distance_function == DISTANCE_FUNCTION_L2)
- return std::make_shared>(index.name, index.sample_block, scalar_kind);
- else if (distance_function == DISTANCE_FUNCTION_COSINE)
- return std::make_shared>(index.name, index.sample_block, scalar_kind);
- std::unreachable();
-}
-
-MergeTreeIndexAggregatorPtr MergeTreeIndexUSearch::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const
-{
- if (distance_function == DISTANCE_FUNCTION_L2)
- return std::make_shared>(index.name, index.sample_block, scalar_kind);
- else if (distance_function == DISTANCE_FUNCTION_COSINE)
- return std::make_shared>(index.name, index.sample_block, scalar_kind);
- std::unreachable();
-}
-
-MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const
-{
- return std::make_shared(index, query, distance_function, context);
-};
-
-MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const ActionsDAG *, ContextPtr) const
-{
- throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG");
-}
-
-MergeTreeIndexPtr usearchIndexCreator(const IndexDescription & index)
-{
- static constexpr auto default_distance_function = DISTANCE_FUNCTION_L2;
- String distance_function = default_distance_function;
- if (!index.arguments.empty())
- distance_function = index.arguments[0].safeGet();
-
- static constexpr auto default_scalar_kind = unum::usearch::scalar_kind_t::f16_k;
- auto scalar_kind = default_scalar_kind;
- if (index.arguments.size() > 1)
- scalar_kind = nameToScalarKind.at(index.arguments[1].safeGet());
-
- return std::make_shared(index, distance_function, scalar_kind);
-}
-
-void usearchIndexValidator(const IndexDescription & index, bool /* attach */)
-{
- /// Check number and type of USearch index arguments:
-
- if (index.arguments.size() > 2)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index must not have more than one parameters");
-
- if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of USearch index (distance function) must be of type String");
- if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String)
- throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of USearch index (scalar type) must be of type String");
-
- /// Check that the index is created on a single column
-
- if (index.column_names.size() != 1 || index.data_types.size() != 1)
- throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "USearch indexes must be created on a single column");
-
- /// Check that a supported metric was passed as first argument
-
- if (!index.arguments.empty())
- {
- String distance_name = index.arguments[0].safeGet();
- if (distance_name != DISTANCE_FUNCTION_L2 && distance_name != DISTANCE_FUNCTION_COSINE)
- throw Exception(ErrorCodes::INCORRECT_DATA, "USearch index only supports distance functions '{}' and '{}'", DISTANCE_FUNCTION_L2, DISTANCE_FUNCTION_COSINE);
- }
-
- /// Check that a supported kind was passed as a second argument
-
- if (index.arguments.size() > 1 && !nameToScalarKind.contains(index.arguments[1].safeGet()))
- {
- String supported_kinds;
- for (const auto & [name, kind] : nameToScalarKind)
- {
- if (!supported_kinds.empty())
- supported_kinds += ", ";
- supported_kinds += name;
- }
- throw Exception(ErrorCodes::INCORRECT_DATA, "Unrecognized scalar kind (second argument) for USearch index. Supported kinds are: {}", supported_kinds);
- }
-
- /// Check data type of indexed column:
-
- auto throw_unsupported_underlying_column_exception = []()
- {
- throw Exception(
- ErrorCodes::ILLEGAL_COLUMN,
- "USearch can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
- };
-
- DataTypePtr data_type = index.sample_block.getDataTypes()[0];
-
- if (const auto * data_type_array = typeid_cast(data_type.get()))
- {
- TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
- if (!WhichDataType(nested_type_index).isFloat32())
- throw_unsupported_underlying_column_exception();
- }
- else if (const auto * data_type_tuple = typeid_cast(data_type.get()))
- {
- const DataTypes & inner_types = data_type_tuple->getElements();
- for (const auto & inner_type : inner_types)
- {
- TypeIndex nested_type_index = inner_type->getTypeId();
- if (!WhichDataType(nested_type_index).isFloat32())
- throw_unsupported_underlying_column_exception();
- }
- }
- else
- throw_unsupported_underlying_column_exception();
-}
-
-}
-
-#endif
diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.h b/src/Storages/MergeTree/MergeTreeIndexUSearch.h
deleted file mode 100644
index 41de94402c9..00000000000
--- a/src/Storages/MergeTree/MergeTreeIndexUSearch.h
+++ /dev/null
@@ -1,116 +0,0 @@
-#pragma once
-
-#ifdef ENABLE_USEARCH
-
-#include
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wpass-failed"
-#include
-#pragma clang diagnostic pop
-
-namespace DB
-{
-
-using USearchImplType = unum::usearch::index_dense_gt* key_at */ uint32_t, /* compressed_slot_at */ uint32_t>;
-
-template
-class USearchIndexWithSerialization : public USearchImplType
-{
- using Base = USearchImplType;
-
-public:
- USearchIndexWithSerialization(size_t dimensions, unum::usearch::scalar_kind_t scalar_kind);
- void serialize(WriteBuffer & ostr) const;
- void deserialize(ReadBuffer & istr);
- size_t getDimensions() const;
-};
-
-template
-using USearchIndexWithSerializationPtr = std::shared_ptr>;
-
-
-template
-struct MergeTreeIndexGranuleUSearch final : public IMergeTreeIndexGranule
-{
- MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_, unum::usearch::scalar_kind_t scalar_kind_);
- MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_, unum::usearch::scalar_kind_t scalar_kind_, USearchIndexWithSerializationPtr index_);
-
- ~MergeTreeIndexGranuleUSearch() override = default;
-
- void serializeBinary(WriteBuffer & ostr) const override;
- void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
-
- bool empty() const override { return !index.get(); }
-
- const String index_name;
- const Block index_sample_block;
- const unum::usearch::scalar_kind_t scalar_kind;
- USearchIndexWithSerializationPtr index;
-};
-
-
-template
-struct MergeTreeIndexAggregatorUSearch final : IMergeTreeIndexAggregator
-{
- MergeTreeIndexAggregatorUSearch(const String & index_name_, const Block & index_sample_block, unum::usearch::scalar_kind_t scalar_kind_);
- ~MergeTreeIndexAggregatorUSearch() override = default;
-
- bool empty() const override { return !index || index->size() == 0; }
- MergeTreeIndexGranulePtr getGranuleAndReset() override;
- void update(const Block & block, size_t * pos, size_t limit) override;
-
- const String index_name;
- const Block index_sample_block;
- const unum::usearch::scalar_kind_t scalar_kind;
- USearchIndexWithSerializationPtr index;
-};
-
-
-class MergeTreeIndexConditionUSearch final : public IMergeTreeIndexConditionApproximateNearestNeighbor
-{
-public:
- MergeTreeIndexConditionUSearch(
- const IndexDescription & index_description,
- const SelectQueryInfo & query,
- const String & distance_function,
- ContextPtr context);
-
- ~MergeTreeIndexConditionUSearch() override = default;
-
- bool alwaysUnknownOrTrue() const override;
- bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
- std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override;
-
-private:
- template
- std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const;
-
- const ApproximateNearestNeighborCondition ann_condition;
- const String distance_function;
-};
-
-
-class MergeTreeIndexUSearch : public IMergeTreeIndex
-{
-public:
- MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_, unum::usearch::scalar_kind_t scalar_kind_);
-
- ~MergeTreeIndexUSearch() override = default;
-
- MergeTreeIndexGranulePtr createIndexGranule() const override;
- MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override;
- MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const;
- MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override;
- bool isVectorSearch() const override { return true; }
-
-private:
- const String distance_function;
- const unum::usearch::scalar_kind_t scalar_kind;
-};
-
-}
-
-
-#endif
-
diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
new file mode 100644
index 00000000000..5b0793fa0c8
--- /dev/null
+++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
@@ -0,0 +1,492 @@
+#include
+
+#if USE_USEARCH
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpass-failed"
+
+#include
+#include
+#include
+#include
+#include
+#include