diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e045170561d..3dcce68ab46 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -59,6 +59,9 @@ At a minimum, the following information should be added (but add more as needed) - [ ] Exclude: All with TSAN, MSAN, UBSAN, Coverage - [ ] Exclude: All with aarch64, release, debug --- +- [ ] Run only fuzzers related jobs (libFuzzer fuzzers, AST fuzzers, etc.) +- [ ] Exclude: AST fuzzers +--- - [ ] Do not test - [ ] Woolen Wolfdog - [ ] Upload binaries for special builds diff --git a/.gitmodules b/.gitmodules index 7fdfb1103c5..0a66031de8d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -230,9 +230,6 @@ [submodule "contrib/minizip-ng"] path = contrib/minizip-ng url = https://github.com/zlib-ng/minizip-ng -[submodule "contrib/annoy"] - path = contrib/annoy - url = https://github.com/ClickHouse/annoy [submodule "contrib/qpl"] path = contrib/qpl url = https://github.com/intel/qpl diff --git a/base/base/CMakeLists.txt b/base/base/CMakeLists.txt index 341c92d3042..247028b96e0 100644 --- a/base/base/CMakeLists.txt +++ b/base/base/CMakeLists.txt @@ -1,4 +1,4 @@ -add_compile_options($<$,$>:${COVERAGE_FLAGS}>) +add_compile_options("$<$,$>:${COVERAGE_FLAGS}>") if (USE_CLANG_TIDY) set (CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_PATH}") diff --git a/base/poco/Net/include/Poco/Net/HTTPServerSession.h b/base/poco/Net/include/Poco/Net/HTTPServerSession.h index 3df7995509a..b0659ca405c 100644 --- a/base/poco/Net/include/Poco/Net/HTTPServerSession.h +++ b/base/poco/Net/include/Poco/Net/HTTPServerSession.h @@ -58,6 +58,10 @@ namespace Net void setKeepAliveTimeout(Poco::Timespan keepAliveTimeout); + size_t getKeepAliveTimeout() const { return _keepAliveTimeout.totalSeconds(); } + + size_t getMaxKeepAliveRequests() const { return _maxKeepAliveRequests; } + private: bool _firstRequest; Poco::Timespan _keepAliveTimeout; diff --git a/base/poco/Net/src/HTTPServerSession.cpp b/base/poco/Net/src/HTTPServerSession.cpp index f67a63a9e0e..8eec3e14872 100644 --- a/base/poco/Net/src/HTTPServerSession.cpp +++ b/base/poco/Net/src/HTTPServerSession.cpp @@ -19,11 +19,11 @@ namespace Poco { namespace Net { -HTTPServerSession::HTTPServerSession(const StreamSocket& socket, HTTPServerParams::Ptr pParams): - HTTPSession(socket, pParams->getKeepAlive()), - _firstRequest(true), - _keepAliveTimeout(pParams->getKeepAliveTimeout()), - _maxKeepAliveRequests(pParams->getMaxKeepAliveRequests()) +HTTPServerSession::HTTPServerSession(const StreamSocket & socket, HTTPServerParams::Ptr pParams) + : HTTPSession(socket, pParams->getKeepAlive()) + , _firstRequest(true) + , _keepAliveTimeout(pParams->getKeepAliveTimeout()) + , _maxKeepAliveRequests(pParams->getMaxKeepAliveRequests()) { setTimeout(pParams->getTimeout()); } @@ -52,11 +52,12 @@ bool HTTPServerSession::hasMoreRequests() } else if (_maxKeepAliveRequests != 0 && getKeepAlive()) { - if (_maxKeepAliveRequests > 0) - --_maxKeepAliveRequests; - return buffered() > 0 || socket().poll(_keepAliveTimeout, Socket::SELECT_READ); - } - else return false; + if (_maxKeepAliveRequests > 0) + --_maxKeepAliveRequests; + return buffered() > 0 || socket().poll(_keepAliveTimeout, Socket::SELECT_READ); + } + else + return false; } diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index d8d211b9329..091e20e1387 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -57,8 +57,8 @@ option(WITH_COVERAGE "Instrumentation for code coverage with default implementat if (WITH_COVERAGE) message (STATUS "Enabled instrumentation for code coverage") - set(COVERAGE_FLAGS "SHELL:-fprofile-instr-generate -fcoverage-mapping") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fprofile-instr-generate -fcoverage-mapping") + set (COVERAGE_FLAGS -fprofile-instr-generate -fcoverage-mapping) + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fprofile-instr-generate -fcoverage-mapping") endif() option (SANITIZE_COVERAGE "Instrumentation for code coverage with custom callbacks" OFF) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index b33e7083e32..dc2ad2a3150 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -205,9 +205,8 @@ add_contrib (morton-nd-cmake morton-nd) if (ARCH_S390X) add_contrib(crc32-s390x-cmake crc32-s390x) endif() -add_contrib (annoy-cmake annoy) -option(ENABLE_USEARCH "Enable USearch (Approximate Neighborhood Search, HNSW) support" ${ENABLE_LIBRARIES}) +option(ENABLE_USEARCH "Enable USearch" ${ENABLE_LIBRARIES}) if (ENABLE_USEARCH) add_contrib (FP16-cmake FP16) add_contrib (robin-map-cmake robin-map) diff --git a/contrib/annoy b/contrib/annoy deleted file mode 160000 index f2ac8e7b48f..00000000000 --- a/contrib/annoy +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f2ac8e7b48f9a9cf676d3b58286e5455aba8e956 diff --git a/contrib/annoy-cmake/CMakeLists.txt b/contrib/annoy-cmake/CMakeLists.txt deleted file mode 100644 index bdef7d92132..00000000000 --- a/contrib/annoy-cmake/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -option(ENABLE_ANNOY "Enable Annoy index support" ${ENABLE_LIBRARIES}) - -# Annoy index should be disabled with undefined sanitizer. Because of memory storage optimizations -# (https://github.com/ClickHouse/annoy/blob/9d8a603a4cd252448589e84c9846f94368d5a289/src/annoylib.h#L442-L463) -# UBSan fails and leads to crash. Simmilar issue is already opened in Annoy repo -# https://github.com/spotify/annoy/issues/456 -# Problem with aligment can lead to errors like -# (https://stackoverflow.com/questions/46790550/c-undefined-behavior-strict-aliasing-rule-or-incorrect-alignment) -# or will lead to crash on arm https://developer.arm.com/documentation/ka003038/latest -# This issues should be resolved before annoy became non-experimental (--> setting "allow_experimental_annoy_index") -if ((NOT ENABLE_ANNOY) OR (SANITIZE STREQUAL "undefined") OR (ARCH_AARCH64)) - message (STATUS "Not using annoy") - return() -endif() - -set(ANNOY_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/annoy") -set(ANNOY_SOURCE_DIR "${ANNOY_PROJECT_DIR}/src") - -add_library(_annoy INTERFACE) -target_include_directories(_annoy SYSTEM INTERFACE ${ANNOY_SOURCE_DIR}) - -add_library(ch_contrib::annoy ALIAS _annoy) -target_compile_definitions(_annoy INTERFACE ENABLE_ANNOY) -target_compile_definitions(_annoy INTERFACE ANNOYLIB_MULTITHREADED_BUILD) diff --git a/contrib/libprotobuf-mutator b/contrib/libprotobuf-mutator index 1f95f808306..b922c8ab900 160000 --- a/contrib/libprotobuf-mutator +++ b/contrib/libprotobuf-mutator @@ -1 +1 @@ -Subproject commit 1f95f8083066f5b38fd2db172e7e7f9aa7c49d2d +Subproject commit b922c8ab9004ef9944982e4f165e2747b13223fa diff --git a/contrib/usearch-cmake/CMakeLists.txt b/contrib/usearch-cmake/CMakeLists.txt index 29fbe57106c..6be622275ae 100644 --- a/contrib/usearch-cmake/CMakeLists.txt +++ b/contrib/usearch-cmake/CMakeLists.txt @@ -1,9 +1,7 @@ -set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch") -set(USEARCH_SOURCE_DIR "${USEARCH_PROJECT_DIR}/include") - set(FP16_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/FP16") set(ROBIN_MAP_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/robin-map") -set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD-map") +set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD") +set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch") add_library(_usearch INTERFACE) @@ -11,7 +9,6 @@ target_include_directories(_usearch SYSTEM INTERFACE ${FP16_PROJECT_DIR}/include ${ROBIN_MAP_PROJECT_DIR}/include ${SIMSIMD_PROJECT_DIR}/include - ${USEARCH_SOURCE_DIR}) + ${USEARCH_PROJECT_DIR}/include) add_library(ch_contrib::usearch ALIAS _usearch) -target_compile_definitions(_usearch INTERFACE ENABLE_USEARCH) diff --git a/docker/packager/binary-builder/build.sh b/docker/packager/binary-builder/build.sh index bd5f2fe8466..ba9311c6880 100755 --- a/docker/packager/binary-builder/build.sh +++ b/docker/packager/binary-builder/build.sh @@ -108,7 +108,8 @@ if [ -n "$MAKE_DEB" ]; then bash -x /build/packages/build fi -mv ./programs/clickhouse* /output || mv ./programs/*_fuzzer /output +mv ./programs/clickhouse* /output ||: +mv ./programs/*_fuzzer /output ||: [ -x ./programs/self-extracting/clickhouse ] && mv ./programs/self-extracting/clickhouse /output [ -x ./programs/self-extracting/clickhouse-stripped ] && mv ./programs/self-extracting/clickhouse-stripped /output [ -x ./programs/self-extracting/clickhouse-keeper ] && mv ./programs/self-extracting/clickhouse-keeper /output diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 5a81313f62e..e73d6f07a32 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -17,7 +17,7 @@ In terms of SQL, the nearest neighborhood problem can be expressed as follows: ``` sql SELECT * -FROM table_with_ann_index +FROM table ORDER BY Distance(vectors, Point) LIMIT N ``` @@ -27,75 +27,109 @@ Function `Distance` computes the distance between two vectors. Often, the Euclid distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17, 0.33, ...)`, and `N` limits the number of search results. -An alternative formulation of the nearest neighborhood search problem looks as follows: +This query returns the top-`N` closest points to the reference point. Parameter `N` limits the number of returned values which is useful for +situations where `MaxDistance` is difficult to determine in advance. -``` sql -SELECT * -FROM table_with_ann_index -WHERE Distance(vectors, Point) < MaxDistance -LIMIT N -``` - -While the first query returns the top-`N` closest points to the reference point, the second query returns all points closer to the reference -point than a maximally allowed radius `MaxDistance`. Parameter `N` limits the number of returned values which is useful for situations where -`MaxDistance` is difficult to determine in advance. - -With brute force search, both queries are expensive (linear in the number of points) because the distance between all points in `vectors` and +With brute force search, the query is expensive (linear in the number of points) because the distance between all points in `vectors` and `Point` must be computed. To speed this process up, Approximate Nearest Neighbor Search Indexes (ANN indexes) store a compact representation of the search space (using clustering, search trees, etc.) which allows to compute an approximate answer much quicker (in sub-linear time). -# Creating and Using ANN Indexes {#creating_using_ann_indexes} +# Creating and Using Vector Similarity Indexes -Syntax to create an ANN index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column: +Syntax to create a vector similarity index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column: ```sql -CREATE TABLE table_with_ann_index +CREATE TABLE table ( - `id` Int64, - `vectors` Array(Float32), - INDEX [ann_index_name vectors TYPE [ann_index_type]([ann_index_parameters]) [GRANULARITY [N]] + id Int64, + vectors Array(Float32), + INDEX index_name vectors TYPE vector_similarity(method, distance_function[, quantization, connectivity, expansion_add, expansion_search]) [GRANULARITY N] ) ENGINE = MergeTree ORDER BY id; ``` +Parameters: +- `method`: Supports currently only `hnsw`. +- `distance_function`: either `L2Distance` (the [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) - the length of a + line between two points in Euclidean space), or `cosineDistance` (the [cosine + distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance)- the angle between two non-zero vectors). +- `quantization`: either `f32`, `f16`, or `i8` for storing the vector with reduced precision (optional, default: `f32`) +- `m`: the number of neighbors per graph node (optional, default: 16) +- `ef_construction`: (optional, default: 128) +- `ef_search`: (optional, default: 64) + +Example: + +```sql +CREATE TABLE table +( + id Int64, + vectors Array(Float32), + INDEX idx vectors TYPE vector_similarity('hnsw', 'L2Distance') -- Alternative syntax: TYPE vector_similarity(hnsw, L2Distance) +) +ENGINE = MergeTree +ORDER BY id; +``` + +Vector similarity indexes are based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW +algorithm](https://arxiv.org/abs/1603.09320), i.e., a hierarchical graph where each point represents a vector and the edges represent +similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the +overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors, that are expensive +to load and compare. The library also has several hardware-specific SIMD optimizations to accelerate further distance computations on modern +Arm (NEON and SVE) and x86 (AVX2 and AVX-512) CPUs and OS-specific optimizations to allow efficient navigation around immutable persistent +files, without loading them into RAM. + +USearch indexes are currently experimental, to use them you first need to `SET allow_experimental_vector_similarity_index = 1`. + +Vector similarity indexes currently support two distance functions: +- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space + ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)). +- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors + ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). + +Vector similarity indexes allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16` or `i8`. +If no scalar kind was specified during index creation, `f16` is used as default. + +For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no +distance function was specified during index creation, `L2Distance` is used as default. + +:::note +All arrays must have same length. To avoid errors, you can use a +[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK +length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported. +::: + +:::note +The vector similarity index currently does not work with per-table, non-default `index_granularity` settings (see +[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml. +::: + ANN indexes are built during column insertion and merge. As a result, `INSERT` and `OPTIMIZE` statements will be slower than for ordinary tables. ANNIndexes are ideally used only with immutable or rarely changed data, respectively when are far more read requests than write requests. -ANN indexes support two types of queries: - -- ORDER BY queries: +ANN indexes support these queries: ``` sql SELECT * - FROM table_with_ann_index + FROM table [WHERE ...] ORDER BY Distance(vectors, Point) LIMIT N ``` -- WHERE queries: - - ``` sql - SELECT * - FROM table_with_ann_index - WHERE Distance(vectors, Point) < MaxDistance - LIMIT N - ``` - :::tip To avoid writing out large vectors, you can use [query parameters](/docs/en/interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g. ```bash -clickhouse-client --param_vec='hello' --query="SELECT * FROM table_with_ann_index WHERE L2Distance(vectors, {vec: Array(Float32)}) < 1.0" +clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(vectors, {vec: Array(Float32)}) < 1.0" ``` ::: -**Restrictions**: Queries that contain both a `WHERE Distance(vectors, Point) < MaxDistance` and an `ORDER BY Distance(vectors, Point)` -clause cannot use ANN indexes. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries -without `LIMIT` clause cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting +**Restrictions**: Approximate algorithms used to determine the nearest neighbors require a limit, hence queries without `LIMIT` clause +cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting `max_limit_for_ann_queries` (default: 1 million rows). This is a safeguard to prevent large memory allocations by external libraries for approximate neighbor search. @@ -122,128 +156,3 @@ brute-force distance calculation over all rows of the granules. With a small `GR equally good, only the processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall back to a smaller `GRANULARITY` values only in case of problems like excessive memory consumption of the ANN structures. If no `GRANULARITY` was specified for ANN indexes, the default value is 100 million. - - -# Available ANN Indexes {#available_ann_indexes} - -- [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy) - -- [USearch](/docs/en/engines/table-engines/mergetree-family/annindexes.md#usearch-usearch) - -## Annoy {#annoy} - -Annoy indexes are currently experimental, to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently -disabled on ARM due to memory safety problems with the algorithm. - -This type of ANN index is based on the [Annoy library](https://github.com/spotify/annoy) which recursively divides the space into random -linear surfaces (lines in 2D, planes in 3D etc.). - -
- -
- -Syntax to create an Annoy index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column: - -```sql -CREATE TABLE table_with_annoy_index -( - id Int64, - vectors Array(Float32), - INDEX [ann_index_name] vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N] -) -ENGINE = MergeTree -ORDER BY id; -``` - -Annoy currently supports two distance functions: -- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space - ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)). -- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors - ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). - -For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no -distance function was specified during index creation, `L2Distance` is used as default. - -Parameter `NumTrees` is the number of trees which the algorithm creates (default if not specified: 100). Higher values of `NumTree` mean -more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes. - -:::note -All arrays must have same length. To avoid errors, you can use a -[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK -length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported. -::: - -The creation of Annoy indexes (whenever a new part is build, e.g. at the end of a merge) is a relatively slow process. You can increase -setting `max_threads_for_annoy_index_creation` (default: 4) which controls how many threads are used to create an Annoy index. Please be -careful with this setting, it is possible that multiple indexes are created in parallel in which case there can be overparallelization. - -Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger -values mean more accurate results at the cost of longer query runtime: - -```sql -SELECT * -FROM table_name -ORDER BY L2Distance(vectors, Point) -LIMIT N -SETTINGS annoy_index_search_k_nodes=100; -``` - -:::note -The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see -[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml. -::: - -## USearch {#usearch} - -This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW -algorithm](https://arxiv.org/abs/1603.09320), i.e., builds a hierarchical graph where each point represents a vector and the edges represent -similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the -overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors, -that are expensive to load and compare. The library also has several hardware-specific SIMD optimizations to accelerate further -distance computations on modern Arm (NEON and SVE) and x86 (AVX2 and AVX-512) CPUs and OS-specific optimizations to allow efficient -navigation around immutable persistent files, without loading them into RAM. - -
- -
- -Syntax to create an USearch index over an [Array](../../../sql-reference/data-types/array.md) column: - -```sql -CREATE TABLE table_with_usearch_index -( - id Int64, - vectors Array(Float32), - INDEX [ann_index_name] vectors TYPE usearch([Distance[, ScalarKind]]) [GRANULARITY N] -) -ENGINE = MergeTree -ORDER BY id; -``` - -USearch currently supports two distance functions: -- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space - ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)). -- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors - ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). - -USearch allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16` or `i8`. If no scalar kind -was specified during index creation, `f16` is used as default. - -For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no -distance function was specified during index creation, `L2Distance` is used as default. diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 68f61650e00..9fce83a0dc4 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1400,6 +1400,16 @@ The number of seconds that ClickHouse waits for incoming requests before closing 10 ``` +## max_keep_alive_requests {#max-keep-alive-requests} + +Maximal number of requests through a single keep-alive connection until it will be closed by ClickHouse server. Default to 10000. + +**Example** + +``` xml +10 +``` + ## listen_host {#listen_host} Restriction on hosts that requests can come from. If you want the server to answer all of them, specify `::`. diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index ce3a4659e0e..3add371b30f 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -1,4 +1,4 @@ -add_compile_options($<$,$>:${COVERAGE_FLAGS}>) +add_compile_options("$<$,$>:${COVERAGE_FLAGS}>") if (USE_CLANG_TIDY) set (CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_PATH}") diff --git a/programs/library-bridge/LibraryBridge.cpp b/programs/library-bridge/LibraryBridge.cpp index d35e7a9b832..261484ac744 100644 --- a/programs/library-bridge/LibraryBridge.cpp +++ b/programs/library-bridge/LibraryBridge.cpp @@ -27,7 +27,7 @@ std::string LibraryBridge::bridgeName() const LibraryBridge::HandlerFactoryPtr LibraryBridge::getHandlerFactoryPtr(ContextPtr context) const { - return std::make_shared("LibraryRequestHandlerFactory", keep_alive_timeout, context); + return std::make_shared("LibraryRequestHandlerFactory", context); } } diff --git a/programs/library-bridge/LibraryBridgeHandlerFactory.cpp b/programs/library-bridge/LibraryBridgeHandlerFactory.cpp index e5ab22f2d40..234904c6265 100644 --- a/programs/library-bridge/LibraryBridgeHandlerFactory.cpp +++ b/programs/library-bridge/LibraryBridgeHandlerFactory.cpp @@ -9,12 +9,10 @@ namespace DB { LibraryBridgeHandlerFactory::LibraryBridgeHandlerFactory( const std::string & name_, - size_t keep_alive_timeout_, ContextPtr context_) : WithContext(context_) , log(getLogger(name_)) , name(name_) - , keep_alive_timeout(keep_alive_timeout_) { } @@ -26,17 +24,17 @@ std::unique_ptr LibraryBridgeHandlerFactory::createRequestHa if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) { if (uri.getPath() == "/extdict_ping") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); else if (uri.getPath() == "/catboost_ping") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); } if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST) { if (uri.getPath() == "/extdict_request") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); else if (uri.getPath() == "/catboost_request") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); } return nullptr; diff --git a/programs/library-bridge/LibraryBridgeHandlerFactory.h b/programs/library-bridge/LibraryBridgeHandlerFactory.h index 5b0f088bc29..c65394efa3b 100644 --- a/programs/library-bridge/LibraryBridgeHandlerFactory.h +++ b/programs/library-bridge/LibraryBridgeHandlerFactory.h @@ -13,7 +13,6 @@ class LibraryBridgeHandlerFactory : public HTTPRequestHandlerFactory, WithContex public: LibraryBridgeHandlerFactory( const std::string & name_, - size_t keep_alive_timeout_, ContextPtr context_); std::unique_ptr createRequestHandler(const HTTPServerRequest & request) override; @@ -21,7 +20,6 @@ public: private: LoggerPtr log; const std::string name; - const size_t keep_alive_timeout; }; } diff --git a/programs/library-bridge/LibraryBridgeHandlers.cpp b/programs/library-bridge/LibraryBridgeHandlers.cpp index 53f4b31ca06..236256dc36b 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.cpp +++ b/programs/library-bridge/LibraryBridgeHandlers.cpp @@ -87,10 +87,8 @@ static void writeData(Block data, OutputFormatPtr format) } -ExternalDictionaryLibraryBridgeRequestHandler::ExternalDictionaryLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , keep_alive_timeout(keep_alive_timeout_) - , log(getLogger("ExternalDictionaryLibraryBridgeRequestHandler")) +ExternalDictionaryLibraryBridgeRequestHandler::ExternalDictionaryLibraryBridgeRequestHandler(ContextPtr context_) + : WithContext(context_), log(getLogger("ExternalDictionaryLibraryBridgeRequestHandler")) { } @@ -137,7 +135,7 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ const String & dictionary_id = params.get("dictionary_id"); LOG_TRACE(log, "Library method: '{}', dictionary id: {}", method, dictionary_id); - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { @@ -374,10 +372,8 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ } -ExternalDictionaryLibraryBridgeExistsHandler::ExternalDictionaryLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , keep_alive_timeout(keep_alive_timeout_) - , log(getLogger("ExternalDictionaryLibraryBridgeExistsHandler")) +ExternalDictionaryLibraryBridgeExistsHandler::ExternalDictionaryLibraryBridgeExistsHandler(ContextPtr context_) + : WithContext(context_), log(getLogger("ExternalDictionaryLibraryBridgeExistsHandler")) { } @@ -401,7 +397,7 @@ void ExternalDictionaryLibraryBridgeExistsHandler::handleRequest(HTTPServerReque String res = library_handler ? "1" : "0"; - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); LOG_TRACE(log, "Sending ping response: {} (dictionary id: {})", res, dictionary_id); response.sendBuffer(res.data(), res.size()); } @@ -412,11 +408,8 @@ void ExternalDictionaryLibraryBridgeExistsHandler::handleRequest(HTTPServerReque } -CatBoostLibraryBridgeRequestHandler::CatBoostLibraryBridgeRequestHandler( - size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , keep_alive_timeout(keep_alive_timeout_) - , log(getLogger("CatBoostLibraryBridgeRequestHandler")) +CatBoostLibraryBridgeRequestHandler::CatBoostLibraryBridgeRequestHandler(ContextPtr context_) + : WithContext(context_), log(getLogger("CatBoostLibraryBridgeRequestHandler")) { } @@ -455,7 +448,7 @@ void CatBoostLibraryBridgeRequestHandler::handleRequest(HTTPServerRequest & requ const String & method = params.get("method"); LOG_TRACE(log, "Library method: '{}'", method); - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { @@ -617,10 +610,8 @@ void CatBoostLibraryBridgeRequestHandler::handleRequest(HTTPServerRequest & requ } -CatBoostLibraryBridgeExistsHandler::CatBoostLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , keep_alive_timeout(keep_alive_timeout_) - , log(getLogger("CatBoostLibraryBridgeExistsHandler")) +CatBoostLibraryBridgeExistsHandler::CatBoostLibraryBridgeExistsHandler(ContextPtr context_) + : WithContext(context_), log(getLogger("CatBoostLibraryBridgeExistsHandler")) { } @@ -634,7 +625,7 @@ void CatBoostLibraryBridgeExistsHandler::handleRequest(HTTPServerRequest & reque String res = "1"; - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); LOG_TRACE(log, "Sending ping response: {}", res); response.sendBuffer(res.data(), res.size()); } diff --git a/programs/library-bridge/LibraryBridgeHandlers.h b/programs/library-bridge/LibraryBridgeHandlers.h index 62fbf2caede..367d25a6e2d 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.h +++ b/programs/library-bridge/LibraryBridgeHandlers.h @@ -18,14 +18,13 @@ namespace DB class ExternalDictionaryLibraryBridgeRequestHandler : public HTTPRequestHandler, WithContext { public: - ExternalDictionaryLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_); + explicit ExternalDictionaryLibraryBridgeRequestHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: static constexpr auto FORMAT = "RowBinary"; - const size_t keep_alive_timeout; LoggerPtr log; }; @@ -34,12 +33,11 @@ private: class ExternalDictionaryLibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext { public: - ExternalDictionaryLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_); + explicit ExternalDictionaryLibraryBridgeExistsHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: - const size_t keep_alive_timeout; LoggerPtr log; }; @@ -63,12 +61,11 @@ private: class CatBoostLibraryBridgeRequestHandler : public HTTPRequestHandler, WithContext { public: - CatBoostLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_); + explicit CatBoostLibraryBridgeRequestHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: - const size_t keep_alive_timeout; LoggerPtr log; }; @@ -77,12 +74,11 @@ private: class CatBoostLibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext { public: - CatBoostLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_); + explicit CatBoostLibraryBridgeExistsHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: - const size_t keep_alive_timeout; LoggerPtr log; }; diff --git a/programs/odbc-bridge/ColumnInfoHandler.cpp b/programs/odbc-bridge/ColumnInfoHandler.cpp index a738d5e8c12..6c149d7a08c 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.cpp +++ b/programs/odbc-bridge/ColumnInfoHandler.cpp @@ -202,10 +202,7 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ if (columns.empty()) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Columns definition was not returned"); - WriteBufferFromHTTPServerResponse out( - response, - request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, - keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { writeStringBinary(columns.toString(), out); diff --git a/programs/odbc-bridge/ColumnInfoHandler.h b/programs/odbc-bridge/ColumnInfoHandler.h index 610fb128c9d..1edd62dcba3 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.h +++ b/programs/odbc-bridge/ColumnInfoHandler.h @@ -15,18 +15,12 @@ namespace DB class ODBCColumnsInfoHandler : public HTTPRequestHandler, WithContext { public: - ODBCColumnsInfoHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , log(getLogger("ODBCColumnsInfoHandler")) - , keep_alive_timeout(keep_alive_timeout_) - { - } + explicit ODBCColumnsInfoHandler(ContextPtr context_) : WithContext(context_), log(getLogger("ODBCColumnsInfoHandler")) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: LoggerPtr log; - size_t keep_alive_timeout; }; } diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.cpp b/programs/odbc-bridge/IdentifierQuoteHandler.cpp index 2f00e1171bb..778e6019be1 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.cpp +++ b/programs/odbc-bridge/IdentifierQuoteHandler.cpp @@ -74,7 +74,7 @@ void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServ auto identifier = getIdentifierQuote(std::move(connection)); - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { writeStringBinary(identifier, out); diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.h b/programs/odbc-bridge/IdentifierQuoteHandler.h index 7b78c5b4b93..a85b56a9f6a 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.h +++ b/programs/odbc-bridge/IdentifierQuoteHandler.h @@ -14,18 +14,12 @@ namespace DB class IdentifierQuoteHandler : public HTTPRequestHandler, WithContext { public: - IdentifierQuoteHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , log(getLogger("IdentifierQuoteHandler")) - , keep_alive_timeout(keep_alive_timeout_) - { - } + explicit IdentifierQuoteHandler(ContextPtr context_) : WithContext(context_), log(getLogger("IdentifierQuoteHandler")) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: LoggerPtr log; - size_t keep_alive_timeout; }; } diff --git a/programs/odbc-bridge/MainHandler.cpp b/programs/odbc-bridge/MainHandler.cpp index 404f66eab3b..160add9d2ed 100644 --- a/programs/odbc-bridge/MainHandler.cpp +++ b/programs/odbc-bridge/MainHandler.cpp @@ -132,7 +132,7 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse return; } - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { diff --git a/programs/odbc-bridge/MainHandler.h b/programs/odbc-bridge/MainHandler.h index ed0c6b2e28c..0fcad61d274 100644 --- a/programs/odbc-bridge/MainHandler.h +++ b/programs/odbc-bridge/MainHandler.h @@ -20,12 +20,10 @@ class ODBCHandler : public HTTPRequestHandler, WithContext { public: ODBCHandler( - size_t keep_alive_timeout_, ContextPtr context_, const String & mode_) : WithContext(context_) , log(getLogger("ODBCHandler")) - , keep_alive_timeout(keep_alive_timeout_) , mode(mode_) { } @@ -35,7 +33,6 @@ public: private: LoggerPtr log; - size_t keep_alive_timeout; String mode; static inline std::mutex mutex; diff --git a/programs/odbc-bridge/ODBCBridge.cpp b/programs/odbc-bridge/ODBCBridge.cpp index 155e26c8a27..096d1b2dcca 100644 --- a/programs/odbc-bridge/ODBCBridge.cpp +++ b/programs/odbc-bridge/ODBCBridge.cpp @@ -27,7 +27,7 @@ std::string ODBCBridge::bridgeName() const ODBCBridge::HandlerFactoryPtr ODBCBridge::getHandlerFactoryPtr(ContextPtr context) const { - return std::make_shared("ODBCRequestHandlerFactory-factory", keep_alive_timeout, context); + return std::make_shared("ODBCRequestHandlerFactory-factory", context); } } diff --git a/programs/odbc-bridge/ODBCHandlerFactory.cpp b/programs/odbc-bridge/ODBCHandlerFactory.cpp index eebb0c24c7a..b5d0be908f4 100644 --- a/programs/odbc-bridge/ODBCHandlerFactory.cpp +++ b/programs/odbc-bridge/ODBCHandlerFactory.cpp @@ -9,11 +9,8 @@ namespace DB { -ODBCBridgeHandlerFactory::ODBCBridgeHandlerFactory(const std::string & name_, size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , log(getLogger(name_)) - , name(name_) - , keep_alive_timeout(keep_alive_timeout_) +ODBCBridgeHandlerFactory::ODBCBridgeHandlerFactory(const std::string & name_, ContextPtr context_) + : WithContext(context_), log(getLogger(name_)), name(name_) { } @@ -23,33 +20,33 @@ std::unique_ptr ODBCBridgeHandlerFactory::createRequestHandl LOG_TRACE(log, "Request URI: {}", uri.toString()); if (uri.getPath() == "/ping" && request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) - return std::make_unique(keep_alive_timeout); + return std::make_unique(); if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST) { if (uri.getPath() == "/columns_info") #if USE_ODBC - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); #else return nullptr; #endif else if (uri.getPath() == "/identifier_quote") #if USE_ODBC - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); #else return nullptr; #endif else if (uri.getPath() == "/schema_allowed") #if USE_ODBC - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); #else return nullptr; #endif else if (uri.getPath() == "/write") - return std::make_unique(keep_alive_timeout, getContext(), "write"); + return std::make_unique(getContext(), "write"); else - return std::make_unique(keep_alive_timeout, getContext(), "read"); + return std::make_unique(getContext(), "read"); } return nullptr; } diff --git a/programs/odbc-bridge/ODBCHandlerFactory.h b/programs/odbc-bridge/ODBCHandlerFactory.h index 4aaf1b55453..f4a2717dc9f 100644 --- a/programs/odbc-bridge/ODBCHandlerFactory.h +++ b/programs/odbc-bridge/ODBCHandlerFactory.h @@ -17,14 +17,13 @@ namespace DB class ODBCBridgeHandlerFactory : public HTTPRequestHandlerFactory, WithContext { public: - ODBCBridgeHandlerFactory(const std::string & name_, size_t keep_alive_timeout_, ContextPtr context_); + ODBCBridgeHandlerFactory(const std::string & name_, ContextPtr context_); std::unique_ptr createRequestHandler(const HTTPServerRequest & request) override; private: LoggerPtr log; std::string name; - size_t keep_alive_timeout; }; } diff --git a/programs/odbc-bridge/PingHandler.cpp b/programs/odbc-bridge/PingHandler.cpp index 80d0e2bf4a9..e5d094fb7eb 100644 --- a/programs/odbc-bridge/PingHandler.cpp +++ b/programs/odbc-bridge/PingHandler.cpp @@ -10,7 +10,7 @@ void PingHandler::handleRequest(HTTPServerRequest & /* request */, HTTPServerRes { try { - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); const char * data = "Ok.\n"; response.sendBuffer(data, strlen(data)); } diff --git a/programs/odbc-bridge/PingHandler.h b/programs/odbc-bridge/PingHandler.h index c5447107e0c..4c557bd3cf6 100644 --- a/programs/odbc-bridge/PingHandler.h +++ b/programs/odbc-bridge/PingHandler.h @@ -9,11 +9,7 @@ namespace DB class PingHandler : public HTTPRequestHandler { public: - explicit PingHandler(size_t keep_alive_timeout_) : keep_alive_timeout(keep_alive_timeout_) {} void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; - -private: - size_t keep_alive_timeout; }; } diff --git a/programs/odbc-bridge/SchemaAllowedHandler.cpp b/programs/odbc-bridge/SchemaAllowedHandler.cpp index dfb4333240b..e17d7c5b880 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.cpp +++ b/programs/odbc-bridge/SchemaAllowedHandler.cpp @@ -88,7 +88,7 @@ void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServer bool result = isSchemaAllowed(std::move(connection)); - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { writeBoolText(result, out); diff --git a/programs/odbc-bridge/SchemaAllowedHandler.h b/programs/odbc-bridge/SchemaAllowedHandler.h index 8dc725dbb33..59022151b53 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.h +++ b/programs/odbc-bridge/SchemaAllowedHandler.h @@ -17,18 +17,12 @@ class Context; class SchemaAllowedHandler : public HTTPRequestHandler, WithContext { public: - SchemaAllowedHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , log(getLogger("SchemaAllowedHandler")) - , keep_alive_timeout(keep_alive_timeout_) - { - } + explicit SchemaAllowedHandler(ContextPtr context_) : WithContext(context_), log(getLogger("SchemaAllowedHandler")) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: LoggerPtr log; - size_t keep_alive_timeout; }; } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 51fcd494111..196af80f2cd 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -2428,6 +2428,7 @@ void Server::createServers( Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; http_params->setTimeout(settings.http_receive_timeout); http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); + http_params->setMaxKeepAliveRequests(static_cast(global_context->getServerSettings().max_keep_alive_requests)); Poco::Util::AbstractConfiguration::Keys protocols; config.keys("protocols", protocols); diff --git a/programs/server/fuzzers/tcp_protocol_fuzzer.cpp b/programs/server/fuzzers/tcp_protocol_fuzzer.cpp index 950ea09669a..7cebdc2ad65 100644 --- a/programs/server/fuzzers/tcp_protocol_fuzzer.cpp +++ b/programs/server/fuzzers/tcp_protocol_fuzzer.cpp @@ -10,6 +10,7 @@ #include #include +#include #include @@ -25,6 +26,12 @@ static int64_t port = 9000; using namespace std::chrono_literals; +void on_exit() +{ + BaseDaemon::terminate(); + main_app.wait(); +} + extern "C" int LLVMFuzzerInitialize(int * argc, char ***argv) { @@ -60,6 +67,8 @@ int LLVMFuzzerInitialize(int * argc, char ***argv) exit(-1); } + atexit(on_exit); + return 0; } diff --git a/src/AggregateFunctions/fuzzers/CMakeLists.txt b/src/AggregateFunctions/fuzzers/CMakeLists.txt index 3ff7c86a00c..1ce0c52feb7 100644 --- a/src/AggregateFunctions/fuzzers/CMakeLists.txt +++ b/src/AggregateFunctions/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable(aggregate_function_state_deserialization_fuzzer aggregate_function_state_deserialization_fuzzer.cpp ${SRCS}) -target_link_libraries(aggregate_function_state_deserialization_fuzzer PRIVATE dbms clickhouse_aggregate_functions clickhouse_functions) +target_link_libraries(aggregate_function_state_deserialization_fuzzer PRIVATE clickhouse_functions clickhouse_aggregate_functions) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 43092d10dd2..95e431b54be 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -add_compile_options($<$,$>:${COVERAGE_FLAGS}>) +add_compile_options("$<$,$>:${COVERAGE_FLAGS}>") if (USE_INCLUDE_WHAT_YOU_USE) set (CMAKE_CXX_INCLUDE_WHAT_YOU_USE ${IWYU_PATH}) @@ -601,10 +601,6 @@ endif() dbms_target_link_libraries(PUBLIC ch_contrib::consistent_hashing) -if (TARGET ch_contrib::annoy) - dbms_target_link_libraries(PUBLIC ch_contrib::annoy) -endif() - if (TARGET ch_contrib::usearch) dbms_target_link_libraries(PUBLIC ch_contrib::usearch) endif() diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 3f4c1cab31b..8f69eaf54b4 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -2751,7 +2751,7 @@ void ClientBase::runLibFuzzer() for (auto & arg : fuzzer_args_holder) fuzzer_args.emplace_back(arg.data()); - int fuzzer_argc = fuzzer_args.size(); + int fuzzer_argc = static_cast(fuzzer_args.size()); char ** fuzzer_argv = fuzzer_args.data(); LLVMFuzzerRunDriver(&fuzzer_argc, &fuzzer_argv, [](const uint8_t * data, size_t size) diff --git a/src/Common/config.h.in b/src/Common/config.h.in index e3f8882850f..2e3b8d84366 100644 --- a/src/Common/config.h.in +++ b/src/Common/config.h.in @@ -58,6 +58,7 @@ #cmakedefine01 USE_FILELOG #cmakedefine01 USE_ODBC #cmakedefine01 USE_BLAKE3 +#cmakedefine01 USE_USEARCH #cmakedefine01 USE_SKIM #cmakedefine01 USE_PRQL #cmakedefine01 USE_ULID diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index d13e6251ca9..256f78ddea0 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -134,6 +134,7 @@ namespace DB M(Bool, async_load_databases, false, "Enable asynchronous loading of databases and tables to speedup server startup. Queries to not yet loaded entity will be blocked until load is finished.", 0) \ M(Bool, display_secrets_in_show_and_select, false, "Allow showing secrets in SHOW and SELECT queries via a format setting and a grant", 0) \ M(Seconds, keep_alive_timeout, DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT, "The number of seconds that ClickHouse waits for incoming requests before closing the connection.", 0) \ + M(UInt64, max_keep_alive_requests, 10000, "The maximum number of requests handled via a single http keepalive connection before the server closes this connection.", 0) \ M(Seconds, replicated_fetches_http_connection_timeout, 0, "HTTP connection timeout for part fetch requests. Inherited from default profile `http_connection_timeout` if not set explicitly.", 0) \ M(Seconds, replicated_fetches_http_send_timeout, 0, "HTTP send timeout for part fetch requests. Inherited from default profile `http_send_timeout` if not set explicitly.", 0) \ M(Seconds, replicated_fetches_http_receive_timeout, 0, "HTTP receive timeout for fetch part requests. Inherited from default profile `http_receive_timeout` if not set explicitly.", 0) \ diff --git a/src/Core/Settings.h b/src/Core/Settings.h index cbb62227da0..98ec33d6a91 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -908,14 +908,11 @@ class IColumn; M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, allow_experimental_time_series_table, false, "Allows experimental TimeSeries table engine", 0) \ + M(Bool, allow_experimental_vector_similarity_index, false, "Allow experimental vector similarity index", 0) \ M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \ M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \ - M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ - M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \ M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ - M(UInt64, max_threads_for_annoy_index_creation, 4, "Number of threads used to build Annoy indexes (0 means all cores, not recommended)", 0) \ - M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ M(Bool, implicit_transaction, false, "If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)", 0) \ @@ -1039,6 +1036,10 @@ class IColumn; MAKE_OBSOLETE(M, UInt64, parallel_replicas_min_number_of_granules_to_enable, 0) \ MAKE_OBSOLETE(M, Bool, query_plan_optimize_projection, true) \ MAKE_OBSOLETE(M, Bool, query_cache_store_results_of_queries_with_nondeterministic_functions, false) \ + MAKE_OBSOLETE(M, Bool, allow_experimental_annoy_index, false) \ + MAKE_OBSOLETE(M, UInt64, max_threads_for_annoy_index_creation, 4) \ + MAKE_OBSOLETE(M, Int64, annoy_index_search_k_nodes, -1) \ + MAKE_OBSOLETE(M, Bool, allow_experimental_usearch_index, false) \ MAKE_OBSOLETE(M, Bool, optimize_move_functions_out_of_any, false) \ MAKE_OBSOLETE(M, Bool, allow_experimental_undrop_table_query, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_s3queue, true) \ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index d2a6f0423b2..e9cb333a92f 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -88,6 +88,7 @@ static std::initializer_listsetSetting("allow_experimental_object_type", 1); query_context->setSetting("allow_experimental_variant_type", 1); query_context->setSetting("allow_experimental_dynamic_type", 1); - query_context->setSetting("allow_experimental_annoy_index", 1); - query_context->setSetting("allow_experimental_usearch_index", 1); + query_context->setSetting("allow_experimental_vector_similarity_index", 1); query_context->setSetting("allow_experimental_bigint_types", 1); query_context->setSetting("allow_experimental_window_functions", 1); query_context->setSetting("allow_experimental_geo_types", 1); diff --git a/src/Formats/fuzzers/CMakeLists.txt b/src/Formats/fuzzers/CMakeLists.txt index 99e57bfbca3..b8a7e78b6e2 100644 --- a/src/Formats/fuzzers/CMakeLists.txt +++ b/src/Formats/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable(format_fuzzer format_fuzzer.cpp ${SRCS}) -target_link_libraries(format_fuzzer PRIVATE dbms clickhouse_aggregate_functions clickhouse_functions) +target_link_libraries(format_fuzzer PRIVATE clickhouse_functions clickhouse_aggregate_functions) diff --git a/src/Formats/fuzzers/format_fuzzer.cpp b/src/Formats/fuzzers/format_fuzzer.cpp index 408e7218221..12cd40f9442 100644 --- a/src/Formats/fuzzers/format_fuzzer.cpp +++ b/src/Formats/fuzzers/format_fuzzer.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include diff --git a/src/IO/HTTPCommon.cpp b/src/IO/HTTPCommon.cpp index 9704d034b2a..fcd96e97b4e 100644 --- a/src/IO/HTTPCommon.cpp +++ b/src/IO/HTTPCommon.cpp @@ -34,14 +34,20 @@ namespace ErrorCodes extern const int RECEIVED_ERROR_TOO_MANY_REQUESTS; } -void setResponseDefaultHeaders(HTTPServerResponse & response, size_t keep_alive_timeout) +void setResponseDefaultHeaders(HTTPServerResponse & response) { if (!response.getKeepAlive()) return; - Poco::Timespan timeout(keep_alive_timeout, 0); - if (timeout.totalSeconds()) - response.set("Keep-Alive", "timeout=" + std::to_string(timeout.totalSeconds())); + const size_t keep_alive_timeout = response.getSession().getKeepAliveTimeout(); + const size_t keep_alive_max_requests = response.getSession().getMaxKeepAliveRequests(); + if (keep_alive_timeout) + { + if (keep_alive_max_requests) + response.set("Keep-Alive", fmt::format("timeout={}, max={}", keep_alive_timeout, keep_alive_max_requests)); + else + response.set("Keep-Alive", fmt::format("timeout={}", keep_alive_timeout)); + } } HTTPSessionPtr makeHTTPSession( diff --git a/src/IO/HTTPCommon.h b/src/IO/HTTPCommon.h index 3a1fa5bebee..4d0580acaba 100644 --- a/src/IO/HTTPCommon.h +++ b/src/IO/HTTPCommon.h @@ -54,7 +54,7 @@ private: using HTTPSessionPtr = std::shared_ptr; -void setResponseDefaultHeaders(HTTPServerResponse & response, size_t keep_alive_timeout); +void setResponseDefaultHeaders(HTTPServerResponse & response); /// Create session object to perform requests and set required parameters. HTTPSessionPtr makeHTTPSession( diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 8bcb7f18a0f..95143031707 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -787,10 +787,8 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti if (index_desc.type == INVERTED_INDEX_NAME && !settings.allow_experimental_inverted_index) throw Exception(ErrorCodes::ILLEGAL_INDEX, "Please use index type 'full_text' instead of 'inverted'"); /// ---- - if (index_desc.type == "annoy" && !settings.allow_experimental_annoy_index) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index is disabled. Turn on allow_experimental_annoy_index"); - if (index_desc.type == "usearch" && !settings.allow_experimental_usearch_index) - throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index is disabled. Turn on allow_experimental_usearch_index"); + if (index_desc.type == "vector_similarity" && !settings.allow_experimental_vector_similarity_index) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index is disabled. Turn on allow_experimental_vector_similarity_index"); properties.indices.push_back(index_desc); } diff --git a/src/Parsers/ASTIndexDeclaration.h b/src/Parsers/ASTIndexDeclaration.h index dd05ad08184..72f3f017a99 100644 --- a/src/Parsers/ASTIndexDeclaration.h +++ b/src/Parsers/ASTIndexDeclaration.h @@ -13,8 +13,7 @@ class ASTIndexDeclaration : public IAST { public: static const auto DEFAULT_INDEX_GRANULARITY = 1uz; - static const auto DEFAULT_ANNOY_INDEX_GRANULARITY = 100'000'000uz; - static const auto DEFAULT_USEARCH_INDEX_GRANULARITY = 100'000'000uz; + static const auto DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY = 100'000'000uz; ASTIndexDeclaration(ASTPtr expression, ASTPtr type, const String & name_); diff --git a/src/Parsers/ParserCreateIndexQuery.cpp b/src/Parsers/ParserCreateIndexQuery.cpp index 9ebee4cc852..ed89b80edca 100644 --- a/src/Parsers/ParserCreateIndexQuery.cpp +++ b/src/Parsers/ParserCreateIndexQuery.cpp @@ -89,10 +89,8 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected else { auto index_type = index->getType(); - if (index_type && index_type->name == "annoy") - index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY; - else if (index_type && index_type->name == "usearch") - index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY; + if (index_type && index_type->name == "vector_similarity") + index->granularity = ASTIndexDeclaration::DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY; else index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY; } diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index 318fe9da1b4..cc4e02f46a3 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -214,10 +214,8 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe else { auto index_type = index->getType(); - if (index_type->name == "annoy") - index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY; - else if (index_type->name == "usearch") - index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY; + if (index_type->name == "vector_similarity") + index->granularity = ASTIndexDeclaration::DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY; else index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY; } diff --git a/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt b/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt index 20fd951d390..74fdcff79f7 100644 --- a/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt +++ b/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt @@ -39,7 +39,7 @@ set(CMAKE_INCLUDE_CURRENT_DIR TRUE) clickhouse_add_executable(codegen_select_fuzzer ${FUZZER_SRCS}) -set_source_files_properties("${PROTO_SRCS}" "out.cpp" PROPERTIES COMPILE_FLAGS "-Wno-reserved-identifier") +set_source_files_properties("${PROTO_SRCS}" "out.cpp" PROPERTIES COMPILE_FLAGS "-Wno-reserved-identifier -Wno-extra-semi-stmt -Wno-used-but-marked-unused") # contrib/libprotobuf-mutator/src/libfuzzer/libfuzzer_macro.h:143:44: error: no newline at end of file [-Werror,-Wnewline-eof] target_compile_options (codegen_select_fuzzer PRIVATE -Wno-newline-eof) diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 6f543a05fba..4a7a23158ff 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -203,25 +204,15 @@ template void ORCBlockOutputFormat::writeNumbers( orc::ColumnVectorBatch & orc_column, const IColumn & column, - const PaddedPODArray * null_bytemap, + const PaddedPODArray * /*null_bytemap*/, ConvertFunc convert) { NumberVectorBatch & number_orc_column = dynamic_cast(orc_column); const auto & number_column = assert_cast &>(column); - number_orc_column.resize(number_column.size()); + number_orc_column.data.resize(number_column.size()); for (size_t i = 0; i != number_column.size(); ++i) - { - if (null_bytemap && (*null_bytemap)[i]) - { - number_orc_column.notNull[i] = 0; - continue; - } - - number_orc_column.notNull[i] = 1; number_orc_column.data[i] = convert(number_column.getElement(i)); - } - number_orc_column.numElements = number_column.size(); } template @@ -229,7 +220,7 @@ void ORCBlockOutputFormat::writeDecimals( orc::ColumnVectorBatch & orc_column, const IColumn & column, DataTypePtr & type, - const PaddedPODArray * null_bytemap, + const PaddedPODArray * /*null_bytemap*/, ConvertFunc convert) { DecimalVectorBatch & decimal_orc_column = dynamic_cast(orc_column); @@ -238,71 +229,49 @@ void ORCBlockOutputFormat::writeDecimals( decimal_orc_column.precision = decimal_type->getPrecision(); decimal_orc_column.scale = decimal_type->getScale(); decimal_orc_column.resize(decimal_column.size()); - for (size_t i = 0; i != decimal_column.size(); ++i) - { - if (null_bytemap && (*null_bytemap)[i]) - { - decimal_orc_column.notNull[i] = 0; - continue; - } - decimal_orc_column.notNull[i] = 1; + decimal_orc_column.values.resize(decimal_column.size()); + for (size_t i = 0; i != decimal_column.size(); ++i) decimal_orc_column.values[i] = convert(decimal_column.getElement(i).value); - } - decimal_orc_column.numElements = decimal_column.size(); } template void ORCBlockOutputFormat::writeStrings( orc::ColumnVectorBatch & orc_column, const IColumn & column, - const PaddedPODArray * null_bytemap) + const PaddedPODArray * /*null_bytemap*/) { orc::StringVectorBatch & string_orc_column = dynamic_cast(orc_column); const auto & string_column = assert_cast(column); - string_orc_column.resize(string_column.size()); + string_orc_column.data.resize(string_column.size()); + string_orc_column.length.resize(string_column.size()); for (size_t i = 0; i != string_column.size(); ++i) { - if (null_bytemap && (*null_bytemap)[i]) - { - string_orc_column.notNull[i] = 0; - continue; - } - - string_orc_column.notNull[i] = 1; const std::string_view & string = string_column.getDataAt(i).toView(); string_orc_column.data[i] = const_cast(string.data()); string_orc_column.length[i] = string.size(); } - string_orc_column.numElements = string_column.size(); } template void ORCBlockOutputFormat::writeDateTimes( orc::ColumnVectorBatch & orc_column, const IColumn & column, - const PaddedPODArray * null_bytemap, + const PaddedPODArray * /*null_bytemap*/, GetSecondsFunc get_seconds, GetNanosecondsFunc get_nanoseconds) { orc::TimestampVectorBatch & timestamp_orc_column = dynamic_cast(orc_column); const auto & timestamp_column = assert_cast(column); - timestamp_orc_column.resize(timestamp_column.size()); + timestamp_orc_column.data.resize(timestamp_column.size()); + timestamp_orc_column.nanoseconds.resize(timestamp_column.size()); for (size_t i = 0; i != timestamp_column.size(); ++i) { - if (null_bytemap && (*null_bytemap)[i]) - { - timestamp_orc_column.notNull[i] = 0; - continue; - } - - timestamp_orc_column.notNull[i] = 1; timestamp_orc_column.data[i] = static_cast(get_seconds(timestamp_column.getElement(i))); timestamp_orc_column.nanoseconds[i] = static_cast(get_nanoseconds(timestamp_column.getElement(i))); } - timestamp_orc_column.numElements = timestamp_column.size(); } void ORCBlockOutputFormat::writeColumn( @@ -311,9 +280,27 @@ void ORCBlockOutputFormat::writeColumn( DataTypePtr & type, const PaddedPODArray * null_bytemap) { - orc_column.notNull.resize(column.size()); + size_t rows = column.size(); + orc_column.resize(rows); + orc_column.numElements = rows; + + /// Calculate orc_column.hasNulls if (null_bytemap) - orc_column.hasNulls = true; + orc_column.hasNulls = !memoryIsZero(null_bytemap->data(), 0, null_bytemap->size()); + else + orc_column.hasNulls = false; + + /// Fill orc_column.notNull + if (orc_column.hasNulls) + { + for (size_t i = 0; i < rows; ++i) + orc_column.notNull[i] = !(*null_bytemap)[i]; + } + else + { + for (size_t i = 0; i < rows; ++i) + orc_column.notNull[i] = 1; + } /// ORC doesn't have unsigned types, so cast everything to signed and sign-extend to Int64 to /// make the ORC library calculate min and max correctly. @@ -471,6 +458,7 @@ void ORCBlockOutputFormat::writeColumn( } case TypeIndex::Nullable: { + chassert(!null_bytemap); const auto & nullable_column = assert_cast(column); const PaddedPODArray & new_null_bytemap = assert_cast &>(*nullable_column.getNullMapColumnPtr()).getData(); auto nested_type = removeNullable(type); @@ -485,19 +473,15 @@ void ORCBlockOutputFormat::writeColumn( const ColumnArray::Offsets & offsets = list_column.getOffsets(); size_t column_size = list_column.size(); - list_orc_column.resize(column_size); + list_orc_column.offsets.resize(column_size + 1); /// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i]. list_orc_column.offsets[0] = 0; for (size_t i = 0; i != column_size; ++i) - { list_orc_column.offsets[i + 1] = offsets[i]; - list_orc_column.notNull[i] = 1; - } orc::ColumnVectorBatch & nested_orc_column = *list_orc_column.elements; - writeColumn(nested_orc_column, list_column.getData(), nested_type, null_bytemap); - list_orc_column.numElements = column_size; + writeColumn(nested_orc_column, list_column.getData(), nested_type, nullptr); break; } case TypeIndex::Tuple: @@ -505,10 +489,8 @@ void ORCBlockOutputFormat::writeColumn( orc::StructVectorBatch & struct_orc_column = dynamic_cast(orc_column); const auto & tuple_column = assert_cast(column); auto nested_types = assert_cast(type.get())->getElements(); - for (size_t i = 0; i != tuple_column.size(); ++i) - struct_orc_column.notNull[i] = 1; for (size_t i = 0; i != tuple_column.tupleSize(); ++i) - writeColumn(*struct_orc_column.fields[i], tuple_column.getColumn(i), nested_types[i], null_bytemap); + writeColumn(*struct_orc_column.fields[i], tuple_column.getColumn(i), nested_types[i], nullptr); break; } case TypeIndex::Map: @@ -520,25 +502,21 @@ void ORCBlockOutputFormat::writeColumn( size_t column_size = list_column.size(); - map_orc_column.resize(list_column.size()); + map_orc_column.offsets.resize(column_size + 1); /// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i]. map_orc_column.offsets[0] = 0; for (size_t i = 0; i != column_size; ++i) - { map_orc_column.offsets[i + 1] = offsets[i]; - map_orc_column.notNull[i] = 1; - } + const auto nested_columns = assert_cast(list_column.getDataPtr().get())->getColumns(); orc::ColumnVectorBatch & keys_orc_column = *map_orc_column.keys; auto key_type = map_type.getKeyType(); - writeColumn(keys_orc_column, *nested_columns[0], key_type, null_bytemap); + writeColumn(keys_orc_column, *nested_columns[0], key_type, nullptr); orc::ColumnVectorBatch & values_orc_column = *map_orc_column.elements; auto value_type = map_type.getValueType(); - writeColumn(values_orc_column, *nested_columns[1], value_type, null_bytemap); - - map_orc_column.numElements = column_size; + writeColumn(values_orc_column, *nested_columns[1], value_type, nullptr); break; } default: @@ -546,27 +524,6 @@ void ORCBlockOutputFormat::writeColumn( } } -size_t ORCBlockOutputFormat::getColumnSize(const IColumn & column, DataTypePtr & type) -{ - if (type->getTypeId() == TypeIndex::Array) - { - auto nested_type = assert_cast(*type).getNestedType(); - const IColumn & nested_column = assert_cast(column).getData(); - return std::max(column.size(), getColumnSize(nested_column, nested_type)); - } - - return column.size(); -} - -size_t ORCBlockOutputFormat::getMaxColumnSize(Chunk & chunk) -{ - size_t columns_num = chunk.getNumColumns(); - size_t max_column_size = 0; - for (size_t i = 0; i != columns_num; ++i) - max_column_size = std::max(max_column_size, getColumnSize(*chunk.getColumns()[i], data_types[i])); - return max_column_size; -} - void ORCBlockOutputFormat::consume(Chunk chunk) { if (!writer) @@ -575,10 +532,7 @@ void ORCBlockOutputFormat::consume(Chunk chunk) size_t columns_num = chunk.getNumColumns(); size_t rows_num = chunk.getNumRows(); - /// getMaxColumnSize is needed to write arrays. - /// The size of the batch must be no less than total amount of array elements - /// and no less than the number of rows (ORC writes a null bit for every row). - std::unique_ptr batch = writer->createRowBatch(getMaxColumnSize(chunk)); + std::unique_ptr batch = writer->createRowBatch(chunk.getNumRows()); orc::StructVectorBatch & root = dynamic_cast(*batch); auto columns = chunk.detachColumns(); diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h index 28837193d1a..06ecac9b820 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h @@ -69,11 +69,6 @@ private: void writeColumn(orc::ColumnVectorBatch & orc_column, const IColumn & column, DataTypePtr & type, const PaddedPODArray * null_bytemap); - /// These two functions are needed to know maximum nested size of arrays to - /// create an ORC Batch with the appropriate size - size_t getColumnSize(const IColumn & column, DataTypePtr & type); - size_t getMaxColumnSize(Chunk & chunk); - void prepareWriter(); const FormatSettings format_settings; diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index ab7e81eeffc..523d4ce386a 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -24,8 +24,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include @@ -52,6 +52,8 @@ #include #include +#include "config.h" + using namespace DB; namespace @@ -1519,16 +1521,14 @@ static void buildIndexes( else { MergeTreeIndexConditionPtr condition; - if (index_helper->isVectorSearch()) + if (index_helper->isVectorSimilarityIndex()) { -#ifdef ENABLE_ANNOY - if (const auto * annoy = typeid_cast(index_helper.get())) - condition = annoy->createIndexCondition(query_info, context); -#endif -#ifdef ENABLE_USEARCH - if (const auto * usearch = typeid_cast(index_helper.get())) - condition = usearch->createIndexCondition(query_info, context); +#if USE_USEARCH + if (const auto * vector_similarity_index = typeid_cast(index_helper.get())) + condition = vector_similarity_index->createIndexCondition(query_info, context); #endif + if (const auto * legacy_vector_similarity_index = typeid_cast(index_helper.get())) + condition = legacy_vector_similarity_index->createIndexCondition(query_info, context); if (!condition) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown vector search index {}", index_helper->index.name); } diff --git a/src/Server/HTTP/HTTPServerResponse.h b/src/Server/HTTP/HTTPServerResponse.h index ac4f52e7766..51f5814556d 100644 --- a/src/Server/HTTP/HTTPServerResponse.h +++ b/src/Server/HTTP/HTTPServerResponse.h @@ -248,6 +248,8 @@ public: void attachRequest(HTTPServerRequest * request_) { request = request_; } + const Poco::Net::HTTPServerSession & getSession() const { return session; } + private: Poco::Net::HTTPServerSession & session; HTTPServerRequest * request = nullptr; diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp index e2098b284bf..2fcb66ae606 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp @@ -30,7 +30,7 @@ void WriteBufferFromHTTPServerResponse::startSendHeaders() if (add_cors_header) response.set("Access-Control-Allow-Origin", "*"); - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); std::stringstream header; //STYLE_CHECK_ALLOW_STD_STRING_STREAM response.beginWrite(header); @@ -119,12 +119,10 @@ void WriteBufferFromHTTPServerResponse::nextImpl() WriteBufferFromHTTPServerResponse::WriteBufferFromHTTPServerResponse( HTTPServerResponse & response_, bool is_http_method_head_, - UInt64 keep_alive_timeout_, const ProfileEvents::Event & write_event_) : HTTPWriteBuffer(response_.getSocket(), write_event_) , response(response_) , is_http_method_head(is_http_method_head_) - , keep_alive_timeout(keep_alive_timeout_) { } diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h index a3952b7c553..f0c80f24582 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h @@ -29,7 +29,6 @@ public: WriteBufferFromHTTPServerResponse( HTTPServerResponse & response_, bool is_http_method_head_, - UInt64 keep_alive_timeout_, const ProfileEvents::Event & write_event_ = ProfileEvents::end()); ~WriteBufferFromHTTPServerResponse() override; @@ -91,7 +90,6 @@ private: bool is_http_method_head; bool add_cors_header = false; - size_t keep_alive_timeout = 0; bool initialized = false; diff --git a/src/Server/HTTP/sendExceptionToHTTPClient.cpp b/src/Server/HTTP/sendExceptionToHTTPClient.cpp index 321c4866e8f..07a649dc396 100644 --- a/src/Server/HTTP/sendExceptionToHTTPClient.cpp +++ b/src/Server/HTTP/sendExceptionToHTTPClient.cpp @@ -29,7 +29,7 @@ void sendExceptionToHTTPClient( if (!out) { /// If nothing was sent yet. - WriteBufferFromHTTPServerResponse out_for_message{response, request.getMethod() == HTTPRequest::HTTP_HEAD, DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT}; + WriteBufferFromHTTPServerResponse out_for_message{response, request.getMethod() == HTTPRequest::HTTP_HEAD}; out_for_message.writeln(exception_message); out_for_message.finalize(); diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index c8a58527f2c..d2bc22e98cc 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -266,7 +266,6 @@ void HTTPHandler::processQuery( std::make_shared( response, request.getMethod() == HTTPRequest::HTTP_HEAD, - context->getServerSettings().keep_alive_timeout.totalSeconds(), write_event); used_output.out = used_output.out_holder; used_output.out_maybe_compressed = used_output.out_holder; @@ -558,7 +557,7 @@ try if (!used_output.out_holder && !used_output.exception_is_written) { /// If nothing was sent yet and we don't even know if we must compress the response. - WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD, DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT).writeln(s); + WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD).writeln(s); } else if (used_output.out_maybe_compressed) { diff --git a/src/Server/HTTPHandlerFactory.cpp b/src/Server/HTTPHandlerFactory.cpp index 2e7f16f59d7..fc31ad2874e 100644 --- a/src/Server/HTTPHandlerFactory.cpp +++ b/src/Server/HTTPHandlerFactory.cpp @@ -122,7 +122,8 @@ static inline auto createHandlersFactoryFromConfig( } else if (handler_type == "prometheus") { - main_handler_factory->addHandler(createPrometheusHandlerFactoryForHTTPRule(server, config, prefix + "." + key, async_metrics)); + main_handler_factory->addHandler( + createPrometheusHandlerFactoryForHTTPRule(server, config, prefix + "." + key, async_metrics)); } else if (handler_type == "replicas_status") { diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index e46021c8e68..59852c79139 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -87,9 +87,8 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe response.setChunkedTransferEncoding(true); Output used_output; - const auto keep_alive_timeout = server.context()->getServerSettings().keep_alive_timeout.totalSeconds(); used_output.out = std::make_shared( - response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout, write_event); + response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, write_event); auto finalize_output = [&] { diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index bbd9a978c91..ae1fb6d629e 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -95,7 +95,7 @@ public: class PrometheusRequestHandler::ImplWithContext : public Impl { public: - explicit ImplWithContext(PrometheusRequestHandler & parent) : Impl(parent), default_settings(parent.server.context()->getSettingsRef()) { } + explicit ImplWithContext(PrometheusRequestHandler & parent) : Impl(parent), default_settings(server().context()->getSettingsRef()) { } virtual void handlingRequestWithContext(HTTPServerRequest & request, HTTPServerResponse & response) = 0; @@ -353,7 +353,7 @@ void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPSe if (request.getVersion() == HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); - setResponseDefaultHeaders(response, config.keep_alive_timeout); + setResponseDefaultHeaders(response); impl->beforeHandlingRequest(request); impl->handleRequest(request, response); @@ -379,7 +379,7 @@ WriteBufferFromHTTPServerResponse & PrometheusRequestHandler::getOutputStream(HT if (write_buffer_from_response) return *write_buffer_from_response; write_buffer_from_response = std::make_unique( - response, http_method == HTTPRequest::HTTP_HEAD, config.keep_alive_timeout, write_event); + response, http_method == HTTPRequest::HTTP_HEAD, write_event); return *write_buffer_from_response; } @@ -399,7 +399,7 @@ void PrometheusRequestHandler::finalizeResponse(HTTPServerResponse & response) if (write_buffer_from_response) std::exchange(write_buffer_from_response, {})->finalize(); else - WriteBufferFromHTTPServerResponse{response, http_method == HTTPRequest::HTTP_HEAD, config.keep_alive_timeout, write_event}.finalize(); + WriteBufferFromHTTPServerResponse{response, http_method == HTTPRequest::HTTP_HEAD, write_event}.finalize(); } chassert(response_finalized && !write_buffer_from_response); } diff --git a/src/Server/PrometheusRequestHandler.h b/src/Server/PrometheusRequestHandler.h index 3a2dac0b7ad..281ecf5260e 100644 --- a/src/Server/PrometheusRequestHandler.h +++ b/src/Server/PrometheusRequestHandler.h @@ -15,8 +15,11 @@ class WriteBufferFromHTTPServerResponse; class PrometheusRequestHandler : public HTTPRequestHandler { public: - PrometheusRequestHandler(IServer & server_, const PrometheusRequestHandlerConfig & config_, - const AsynchronousMetrics & async_metrics_, std::shared_ptr metrics_writer_); + PrometheusRequestHandler( + IServer & server_, + const PrometheusRequestHandlerConfig & config_, + const AsynchronousMetrics & async_metrics_, + std::shared_ptr metrics_writer_); ~PrometheusRequestHandler() override; void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event_) override; diff --git a/src/Server/ReplicasStatusHandler.cpp b/src/Server/ReplicasStatusHandler.cpp index f43357db0a8..419ad635d0d 100644 --- a/src/Server/ReplicasStatusHandler.cpp +++ b/src/Server/ReplicasStatusHandler.cpp @@ -89,8 +89,7 @@ void ReplicasStatusHandler::handleRequest(HTTPServerRequest & request, HTTPServe } } - const auto & server_settings = getContext()->getServerSettings(); - setResponseDefaultHeaders(response, server_settings.keep_alive_timeout.totalSeconds()); + setResponseDefaultHeaders(response); if (!ok) { diff --git a/src/Server/StaticRequestHandler.cpp b/src/Server/StaticRequestHandler.cpp index f3981dea9fb..d8c0765bca4 100644 --- a/src/Server/StaticRequestHandler.cpp +++ b/src/Server/StaticRequestHandler.cpp @@ -35,10 +35,9 @@ namespace ErrorCodes extern const int INVALID_CONFIG_PARAMETER; } -static inline std::unique_ptr -responseWriteBuffer(HTTPServerRequest & request, HTTPServerResponse & response, UInt64 keep_alive_timeout) +static inline std::unique_ptr responseWriteBuffer(HTTPServerRequest & request, HTTPServerResponse & response) { - auto buf = std::unique_ptr(new WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD, keep_alive_timeout)); + auto buf = std::unique_ptr(new WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD)); /// The client can pass a HTTP header indicating supported compression method (gzip or deflate). String http_response_compression_methods = request.get("Accept-Encoding", ""); @@ -91,8 +90,7 @@ static inline void trySendExceptionToClient( void StaticRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & /*write_event*/) { - auto keep_alive_timeout = server.context()->getServerSettings().keep_alive_timeout.totalSeconds(); - auto out = responseWriteBuffer(request, response, keep_alive_timeout); + auto out = responseWriteBuffer(request, response); try { @@ -107,7 +105,7 @@ void StaticRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServer "The Transfer-Encoding is not chunked and there " "is no Content-Length header for POST request"); - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); response.setStatusAndReason(Poco::Net::HTTPResponse::HTTPStatus(status)); writeResponse(*out); } diff --git a/src/Server/WebUIRequestHandler.cpp b/src/Server/WebUIRequestHandler.cpp index a3d098014e7..c04d7a3f2a0 100644 --- a/src/Server/WebUIRequestHandler.cpp +++ b/src/Server/WebUIRequestHandler.cpp @@ -30,23 +30,20 @@ DashboardWebUIRequestHandler::DashboardWebUIRequestHandler(IServer & server_) : BinaryWebUIRequestHandler::BinaryWebUIRequestHandler(IServer & server_) : server(server_) {} JavaScriptWebUIRequestHandler::JavaScriptWebUIRequestHandler(IServer & server_) : server(server_) {} -static void handle(const IServer & server, HTTPServerRequest & request, HTTPServerResponse & response, std::string_view html) +static void handle(HTTPServerRequest & request, HTTPServerResponse & response, std::string_view html) { - auto keep_alive_timeout = server.context()->getServerSettings().keep_alive_timeout.totalSeconds(); - response.setContentType("text/html; charset=UTF-8"); if (request.getVersion() == HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_OK); - WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD, keep_alive_timeout).write(html.data(), html.size()); - + WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD).write(html.data(), html.size()); } void PlayWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { - handle(server, request, response, {reinterpret_cast(gresource_play_htmlData), gresource_play_htmlSize}); + handle(request, response, {reinterpret_cast(gresource_play_htmlData), gresource_play_htmlSize}); } void DashboardWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) @@ -64,23 +61,23 @@ void DashboardWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HT static re2::RE2 lz_string_url = R"(https://[^\s"'`]+lz-string[^\s"'`]*\.js)"; RE2::Replace(&html, lz_string_url, "/js/lz-string.js"); - handle(server, request, response, html); + handle(request, response, html); } void BinaryWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { - handle(server, request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); + handle(request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); } void JavaScriptWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { if (request.getURI() == "/js/uplot.js") { - handle(server, request, response, {reinterpret_cast(gresource_uplot_jsData), gresource_uplot_jsSize}); + handle(request, response, {reinterpret_cast(gresource_uplot_jsData), gresource_uplot_jsSize}); } else if (request.getURI() == "/js/lz-string.js") { - handle(server, request, response, {reinterpret_cast(gresource_lz_string_jsData), gresource_lz_string_jsSize}); + handle(request, response, {reinterpret_cast(gresource_lz_string_jsData), gresource_lz_string_jsSize}); } else { @@ -88,7 +85,7 @@ void JavaScriptWebUIRequestHandler::handleRequest(HTTPServerRequest & request, H *response.send() << "Not found.\n"; } - handle(server, request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); + handle(request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); } } diff --git a/src/Storages/IndicesDescription.cpp b/src/Storages/IndicesDescription.cpp index cef8fd85f97..753fbf1d635 100644 --- a/src/Storages/IndicesDescription.cpp +++ b/src/Storages/IndicesDescription.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -130,10 +131,15 @@ IndexDescription IndexDescription::getIndexFromAST(const ASTPtr & definition_ast { for (size_t i = 0; i < index_type->arguments->children.size(); ++i) { - const auto * argument = index_type->arguments->children[i]->as(); - if (!argument) + const auto & child = index_type->arguments->children[i]; + if (const auto * ast_literal = child->as(); ast_literal != nullptr) + /// E.g. INDEX index_name column_name TYPE vector_similarity('hnsw', 'f32') + result.arguments.emplace_back(ast_literal->value); + else if (const auto * ast_identifier = child->as(); ast_identifier != nullptr) + /// E.g. INDEX index_name column_name TYPE vector_similarity(hnsw, f32) + result.arguments.emplace_back(ast_identifier->name()); + else throw Exception(ErrorCodes::INCORRECT_QUERY, "Only literals can be skip index arguments"); - result.arguments.emplace_back(argument->value); } } diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp b/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp deleted file mode 100644 index d6a8af3238e..00000000000 --- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.cpp +++ /dev/null @@ -1,507 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; - extern const int INCORRECT_QUERY; -} - -namespace -{ - -template -void extractReferenceVectorFromLiteral(ApproximateNearestNeighborInformation::Embedding & reference_vector, Literal literal) -{ - Float64 float_element_of_reference_vector; - Int64 int_element_of_reference_vector; - - for (const auto & value : literal.value()) - { - if (value.tryGet(float_element_of_reference_vector)) - reference_vector.emplace_back(float_element_of_reference_vector); - else if (value.tryGet(int_element_of_reference_vector)) - reference_vector.emplace_back(static_cast(int_element_of_reference_vector)); - else - throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong type of elements in reference vector. Only float or int are supported."); - } -} - -ApproximateNearestNeighborInformation::Metric stringToMetric(std::string_view metric) -{ - if (metric == "L2Distance") - return ApproximateNearestNeighborInformation::Metric::L2; - else if (metric == "LpDistance") - return ApproximateNearestNeighborInformation::Metric::Lp; - else - return ApproximateNearestNeighborInformation::Metric::Unknown; -} - -} - -ApproximateNearestNeighborCondition::ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context) - : block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)) - , index_granularity(context->getMergeTreeSettings().index_granularity) - , max_limit_for_ann_queries(context->getSettingsRef().max_limit_for_ann_queries) - , index_is_useful(checkQueryStructure(query_info)) -{} - -bool ApproximateNearestNeighborCondition::alwaysUnknownOrTrue(String metric) const -{ - if (!index_is_useful) - return true; // Query isn't supported - // If query is supported, check metrics for match - return !(stringToMetric(metric) == query_information->metric); -} - -float ApproximateNearestNeighborCondition::getComparisonDistanceForWhereQuery() const -{ - if (index_is_useful && query_information.has_value() - && query_information->type == ApproximateNearestNeighborInformation::Type::Where) - return query_information->distance; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Not supported method for this query type"); -} - -UInt64 ApproximateNearestNeighborCondition::getLimit() const -{ - if (index_is_useful && query_information.has_value()) - return query_information->limit; - throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported"); -} - -std::vector ApproximateNearestNeighborCondition::getReferenceVector() const -{ - if (index_is_useful && query_information.has_value()) - return query_information->reference_vector; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Reference vector was requested for useless or uninitialized index."); -} - -size_t ApproximateNearestNeighborCondition::getDimensions() const -{ - if (index_is_useful && query_information.has_value()) - return query_information->reference_vector.size(); - throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of dimensions was requested for useless or uninitialized index."); -} - -String ApproximateNearestNeighborCondition::getColumnName() const -{ - if (index_is_useful && query_information.has_value()) - return query_information->column_name; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column name was requested for useless or uninitialized index."); -} - -ApproximateNearestNeighborInformation::Metric ApproximateNearestNeighborCondition::getMetricType() const -{ - if (index_is_useful && query_information.has_value()) - return query_information->metric; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Metric name was requested for useless or uninitialized index."); -} - -float ApproximateNearestNeighborCondition::getPValueForLpDistance() const -{ - if (index_is_useful && query_information.has_value()) - return query_information->p_for_lp_dist; - throw Exception(ErrorCodes::LOGICAL_ERROR, "P from LPDistance was requested for useless or uninitialized index."); -} - -ApproximateNearestNeighborInformation::Type ApproximateNearestNeighborCondition::getQueryType() const -{ - if (index_is_useful && query_information.has_value()) - return query_information->type; - throw Exception(ErrorCodes::LOGICAL_ERROR, "Query type was requested for useless or uninitialized index."); -} - -bool ApproximateNearestNeighborCondition::checkQueryStructure(const SelectQueryInfo & query) -{ - /// RPN-s for different sections of the query - RPN rpn_prewhere_clause; - RPN rpn_where_clause; - RPN rpn_order_by_clause; - RPNElement rpn_limit; - UInt64 limit; - - ApproximateNearestNeighborInformation prewhere_info; - ApproximateNearestNeighborInformation where_info; - ApproximateNearestNeighborInformation order_by_info; - - /// Build rpns for query sections - const auto & select = query.query->as(); - - /// If query has PREWHERE clause - if (select.prewhere()) - traverseAST(select.prewhere(), rpn_prewhere_clause); - - /// If query has WHERE clause - if (select.where()) - traverseAST(select.where(), rpn_where_clause); - - /// If query has LIMIT clause - if (select.limitLength()) - traverseAtomAST(select.limitLength(), rpn_limit); - - if (select.orderBy()) // If query has ORDERBY clause - traverseOrderByAST(select.orderBy(), rpn_order_by_clause); - - /// Reverse RPNs for conveniences during parsing - std::reverse(rpn_prewhere_clause.begin(), rpn_prewhere_clause.end()); - std::reverse(rpn_where_clause.begin(), rpn_where_clause.end()); - std::reverse(rpn_order_by_clause.begin(), rpn_order_by_clause.end()); - - /// Match rpns with supported types and extract information - const bool prewhere_is_valid = matchRPNWhere(rpn_prewhere_clause, prewhere_info); - const bool where_is_valid = matchRPNWhere(rpn_where_clause, where_info); - const bool order_by_is_valid = matchRPNOrderBy(rpn_order_by_clause, order_by_info); - const bool limit_is_valid = matchRPNLimit(rpn_limit, limit); - - /// Query without a LIMIT clause or with a limit greater than a restriction is not supported - if (!limit_is_valid || max_limit_for_ann_queries < limit) - return false; - - /// Search type query in both sections isn't supported - if (prewhere_is_valid && where_is_valid) - return false; - - /// Search type should be in WHERE or PREWHERE clause - if (prewhere_is_valid || where_is_valid) - query_information = std::move(prewhere_is_valid ? prewhere_info : where_info); - - if (order_by_is_valid) - { - /// Query with valid where and order by type is not supported - if (query_information.has_value()) - return false; - - query_information = std::move(order_by_info); - } - - if (query_information) - query_information->limit = limit; - - return query_information.has_value(); -} - -void ApproximateNearestNeighborCondition::traverseAST(const ASTPtr & node, RPN & rpn) -{ - // If the node is ASTFunction, it may have children nodes - if (const auto * func = node->as()) - { - const ASTs & children = func->arguments->children; - // Traverse children nodes - for (const auto& child : children) - traverseAST(child, rpn); - } - - RPNElement element; - /// Get the data behind node - if (!traverseAtomAST(node, element)) - element.function = RPNElement::FUNCTION_UNKNOWN; - - rpn.emplace_back(std::move(element)); -} - -bool ApproximateNearestNeighborCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out) -{ - /// Match Functions - if (const auto * function = node->as()) - { - /// Set the name - out.func_name = function->name; - - if (function->name == "L1Distance" || - function->name == "L2Distance" || - function->name == "LinfDistance" || - function->name == "cosineDistance" || - function->name == "dotProduct" || - function->name == "LpDistance") - out.function = RPNElement::FUNCTION_DISTANCE; - else if (function->name == "tuple") - out.function = RPNElement::FUNCTION_TUPLE; - else if (function->name == "array") - out.function = RPNElement::FUNCTION_ARRAY; - else if (function->name == "less" || - function->name == "greater" || - function->name == "lessOrEquals" || - function->name == "greaterOrEquals") - out.function = RPNElement::FUNCTION_COMPARISON; - else if (function->name == "_CAST") - out.function = RPNElement::FUNCTION_CAST; - else - return false; - - return true; - } - /// Match identifier - else if (const auto * identifier = node->as()) - { - out.function = RPNElement::FUNCTION_IDENTIFIER; - out.identifier.emplace(identifier->name()); - out.func_name = "column identifier"; - - return true; - } - - /// Check if we have constants behind the node - return tryCastToConstType(node, out); -} - -bool ApproximateNearestNeighborCondition::tryCastToConstType(const ASTPtr & node, RPNElement & out) -{ - Field const_value; - DataTypePtr const_type; - - if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type)) - { - /// Check for constant types - if (const_value.getType() == Field::Types::Float64) - { - out.function = RPNElement::FUNCTION_FLOAT_LITERAL; - out.float_literal.emplace(const_value.safeGet()); - out.func_name = "Float literal"; - return true; - } - - if (const_value.getType() == Field::Types::UInt64) - { - out.function = RPNElement::FUNCTION_INT_LITERAL; - out.int_literal.emplace(const_value.safeGet()); - out.func_name = "Int literal"; - return true; - } - - if (const_value.getType() == Field::Types::Int64) - { - out.function = RPNElement::FUNCTION_INT_LITERAL; - out.int_literal.emplace(const_value.safeGet()); - out.func_name = "Int literal"; - return true; - } - - if (const_value.getType() == Field::Types::Tuple) - { - out.function = RPNElement::FUNCTION_LITERAL_TUPLE; - out.tuple_literal = const_value.safeGet(); - out.func_name = "Tuple literal"; - return true; - } - - if (const_value.getType() == Field::Types::Array) - { - out.function = RPNElement::FUNCTION_LITERAL_ARRAY; - out.array_literal = const_value.safeGet(); - out.func_name = "Array literal"; - return true; - } - - if (const_value.getType() == Field::Types::String) - { - out.function = RPNElement::FUNCTION_STRING_LITERAL; - out.func_name = const_value.safeGet(); - return true; - } - } - - return false; -} - -void ApproximateNearestNeighborCondition::traverseOrderByAST(const ASTPtr & node, RPN & rpn) -{ - if (const auto * expr_list = node->as()) - if (const auto * order_by_element = expr_list->children.front()->as()) - traverseAST(order_by_element->children.front(), rpn); -} - -/// Returns true and stores ApproximateNearestNeighborInformation if the query has valid WHERE clause -bool ApproximateNearestNeighborCondition::matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info) -{ - /// Fill query type field - ann_info.type = ApproximateNearestNeighborInformation::Type::Where; - - /// WHERE section must have at least 5 expressions - /// Operator->Distance(float)->DistanceFunc->Column->Tuple(Array)Func(ReferenceVector(floats)) - if (rpn.size() < 5) - return false; - - auto iter = rpn.begin(); - - /// Query starts from operator less - if (iter->function != RPNElement::FUNCTION_COMPARISON) - return false; - - const bool greater_case = iter->func_name == "greater" || iter->func_name == "greaterOrEquals"; - const bool less_case = iter->func_name == "less" || iter->func_name == "lessOrEquals"; - - ++iter; - - if (less_case) - { - if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL) - return false; - - ann_info.distance = getFloatOrIntLiteralOrPanic(iter); - if (ann_info.distance < 0) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", ann_info.distance); - - ++iter; - - } - else if (!greater_case) - return false; - - auto end = rpn.end(); - if (!matchMainParts(iter, end, ann_info)) - return false; - - if (greater_case) - { - if (ann_info.reference_vector.size() < 2) - return false; - ann_info.distance = ann_info.reference_vector.back(); - if (ann_info.distance < 0) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance can't be negative. Got {}", ann_info.distance); - ann_info.reference_vector.pop_back(); - } - - /// query is ok - return true; -} - -/// Returns true and stores ANNExpr if the query has valid ORDERBY clause -bool ApproximateNearestNeighborCondition::matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info) -{ - /// Fill query type field - ann_info.type = ApproximateNearestNeighborInformation::Type::OrderBy; - - // ORDER BY clause must have at least 3 expressions - if (rpn.size() < 3) - return false; - - auto iter = rpn.begin(); - auto end = rpn.end(); - - return ApproximateNearestNeighborCondition::matchMainParts(iter, end, ann_info); -} - -/// Returns true and stores Length if we have valid LIMIT clause in query -bool ApproximateNearestNeighborCondition::matchRPNLimit(RPNElement & rpn, UInt64 & limit) -{ - if (rpn.function == RPNElement::FUNCTION_INT_LITERAL) - { - limit = rpn.int_literal.value(); - return true; - } - - return false; -} - -/// Matches dist function, referencer vector, column name -bool ApproximateNearestNeighborCondition::matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info) -{ - bool identifier_found = false; - - /// Matches DistanceFunc->[Column]->[Tuple(array)Func]->ReferenceVector(floats)->[Column] - if (iter->function != RPNElement::FUNCTION_DISTANCE) - return false; - - ann_info.metric = stringToMetric(iter->func_name); - ++iter; - - if (ann_info.metric == ApproximateNearestNeighborInformation::Metric::Lp) - { - if (iter->function != RPNElement::FUNCTION_FLOAT_LITERAL && - iter->function != RPNElement::FUNCTION_INT_LITERAL) - return false; - ann_info.p_for_lp_dist = getFloatOrIntLiteralOrPanic(iter); - ++iter; - } - - if (iter->function == RPNElement::FUNCTION_IDENTIFIER) - { - identifier_found = true; - ann_info.column_name = std::move(iter->identifier.value()); - ++iter; - } - - if (iter->function == RPNElement::FUNCTION_TUPLE || iter->function == RPNElement::FUNCTION_ARRAY) - ++iter; - - if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) - { - extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal); - ++iter; - } - - if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) - { - extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal); - ++iter; - } - - /// further conditions are possible if there is no tuple or array, or no identifier is found - /// the tuple or array can be inside a cast function. For other cases, see the loop after this condition - if (iter != end && iter->function == RPNElement::FUNCTION_CAST) - { - ++iter; - /// Cast should be made to array or tuple - if (!iter->func_name.starts_with("Array") && !iter->func_name.starts_with("Tuple")) - return false; - ++iter; - if (iter->function == RPNElement::FUNCTION_LITERAL_TUPLE) - { - extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->tuple_literal); - ++iter; - } - else if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) - { - extractReferenceVectorFromLiteral(ann_info.reference_vector, iter->array_literal); - ++iter; - } - else - return false; - } - - while (iter != end) - { - if (iter->function == RPNElement::FUNCTION_FLOAT_LITERAL || - iter->function == RPNElement::FUNCTION_INT_LITERAL) - ann_info.reference_vector.emplace_back(getFloatOrIntLiteralOrPanic(iter)); - else if (iter->function == RPNElement::FUNCTION_IDENTIFIER) - { - if (identifier_found) - return false; - ann_info.column_name = std::move(iter->identifier.value()); - identifier_found = true; - } - else - return false; - - ++iter; - } - - /// Final checks of correctness - return identifier_found && !ann_info.reference_vector.empty(); -} - -/// Gets float or int from AST node -float ApproximateNearestNeighborCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter) -{ - if (iter->float_literal.has_value()) - return iter->float_literal.value(); - if (iter->int_literal.has_value()) - return static_cast(iter->int_literal.value()); - throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong parsed AST in buildRPN\n"); -} - -} diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index a6ef0063069..59f3a299c99 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -48,7 +49,6 @@ #include #include -#include namespace CurrentMetrics { @@ -1406,11 +1406,10 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( if (index_mark != index_range.begin || !granule || last_index_mark != index_range.begin) reader.read(granule); - auto ann_condition = std::dynamic_pointer_cast(condition); - if (ann_condition != nullptr) + if (index_helper->isVectorSimilarityIndex()) { /// An array of indices of useful ranges. - auto result = ann_condition->getUsefulRanges(granule); + auto result = condition->getUsefulRanges(granule); for (auto range : result) { diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.cpp b/src/Storages/MergeTree/MergeTreeIOSettings.cpp index 58c3bd28d6a..24cb25afe47 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeIOSettings.cpp @@ -27,7 +27,6 @@ MergeTreeWriterSettings::MergeTreeWriterSettings( , rewrite_primary_key(rewrite_primary_key_) , blocks_are_granules_size(blocks_are_granules_size_) , query_write_settings(query_write_settings_) - , max_threads_for_annoy_index_creation(global_settings.max_threads_for_annoy_index_creation) , low_cardinality_max_dictionary_size(global_settings.low_cardinality_max_dictionary_size) , low_cardinality_use_single_dictionary_for_part(global_settings.low_cardinality_use_single_dictionary_for_part != 0) , use_compact_variant_discriminators_serialization(storage_settings->use_compact_variant_discriminators_serialization) diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index c79ca1e66ee..47b174b2e29 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -77,8 +77,6 @@ struct MergeTreeWriterSettings bool blocks_are_granules_size; WriteSettings query_write_settings; - size_t max_threads_for_annoy_index_creation; - size_t low_cardinality_max_dictionary_size; bool low_cardinality_use_single_dictionary_for_part; bool use_compact_variant_discriminators_serialization; diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp deleted file mode 100644 index b68e48eeb3a..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ /dev/null @@ -1,416 +0,0 @@ -#ifdef ENABLE_ANNOY - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int ILLEGAL_COLUMN; - extern const int INCORRECT_DATA; - extern const int INCORRECT_NUMBER_OF_COLUMNS; - extern const int INCORRECT_QUERY; - extern const int LOGICAL_ERROR; - extern const int NOT_IMPLEMENTED; -} - -template -AnnoyIndexWithSerialization::AnnoyIndexWithSerialization(size_t dimensions) - : Base::AnnoyIndex(static_cast(dimensions)) -{ -} - -template -void AnnoyIndexWithSerialization::serialize(WriteBuffer & ostr) const -{ - chassert(Base::_built); - writeIntBinary(Base::_s, ostr); - writeIntBinary(Base::_n_items, ostr); - writeIntBinary(Base::_n_nodes, ostr); - writeIntBinary(Base::_nodes_size, ostr); - writeIntBinary(Base::_K, ostr); - writeIntBinary(Base::_seed, ostr); - writeVectorBinary(Base::_roots, ostr); - ostr.write(reinterpret_cast(Base::_nodes), Base::_s * Base::_n_nodes); -} - -template -void AnnoyIndexWithSerialization::deserialize(ReadBuffer & istr) -{ - chassert(!Base::_built); - readIntBinary(Base::_s, istr); - readIntBinary(Base::_n_items, istr); - readIntBinary(Base::_n_nodes, istr); - readIntBinary(Base::_nodes_size, istr); - readIntBinary(Base::_K, istr); - readIntBinary(Base::_seed, istr); - readVectorBinary(Base::_roots, istr); - Base::_nodes = realloc(Base::_nodes, Base::_s * Base::_n_nodes); - istr.readStrict(reinterpret_cast(Base::_nodes), Base::_s * Base::_n_nodes); - - Base::_fd = 0; - // set flags - Base::_loaded = false; - Base::_verbose = false; - Base::_on_disk = false; - Base::_built = true; -} - -template -size_t AnnoyIndexWithSerialization::getDimensions() const -{ - return Base::get_f(); -} - - -template -MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_) - : index_name(index_name_) - , index_sample_block(index_sample_block_) - , index(nullptr) -{} - -template -MergeTreeIndexGranuleAnnoy::MergeTreeIndexGranuleAnnoy( - const String & index_name_, - const Block & index_sample_block_, - AnnoyIndexWithSerializationPtr index_) - : index_name(index_name_) - , index_sample_block(index_sample_block_) - , index(std::move(index_)) -{} - -template -void MergeTreeIndexGranuleAnnoy::serializeBinary(WriteBuffer & ostr) const -{ - /// Number of dimensions is required in the index constructor, - /// so it must be written and read separately from the other part - writeIntBinary(static_cast(index->getDimensions()), ostr); // write dimension - index->serialize(ostr); -} - -template -void MergeTreeIndexGranuleAnnoy::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/) -{ - UInt64 dimension; - readIntBinary(dimension, istr); - index = std::make_shared>(dimension); - index->deserialize(istr); -} - -template -MergeTreeIndexAggregatorAnnoy::MergeTreeIndexAggregatorAnnoy( - const String & index_name_, - const Block & index_sample_block_, - UInt64 trees_, - size_t max_threads_for_creation_) - : index_name(index_name_) - , index_sample_block(index_sample_block_) - , trees(trees_) - , max_threads_for_creation(max_threads_for_creation_) -{} - -template -MergeTreeIndexGranulePtr MergeTreeIndexAggregatorAnnoy::getGranuleAndReset() -{ - int threads = (max_threads_for_creation == 0) ? -1 : static_cast(max_threads_for_creation); - /// clang-tidy reports a false positive: it considers %p with an outdated pointer in fprintf() (used by logging which we don't do) dereferencing - index->build(static_cast(trees), threads); - auto granule = std::make_shared>(index_name, index_sample_block, index); - index = nullptr; - return granule; -} - -template -void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t * pos, size_t limit) -{ - if (*pos >= block.rows()) - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "The provided position is not less than the number of block rows. Position: {}, Block rows: {}.", - *pos, block.rows()); - - size_t rows_read = std::min(limit, block.rows() - *pos); - - if (rows_read == 0) - return; - - if (rows_read > std::numeric_limits::max()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Index granularity is too big: more than 4B rows per index granule."); - - if (index_sample_block.columns() > 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column"); - - const String & index_column_name = index_sample_block.getByPosition(0).name; - ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); - - if (const auto & column_array = typeid_cast(column_cut.get())) - { - const auto & column_array_data = column_array->getData(); - const auto & column_array_data_float = typeid_cast(column_array_data); - const auto & column_array_data_float_data = column_array_data_float.getData(); - - const auto & column_array_offsets = column_array->getOffsets(); - const size_t num_rows = column_array_offsets.size(); - - if (column_array->empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty"); - - /// The Annoy algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays - /// are INSERTed into an Annoy-indexed column or if no value was specified at all in which case the arrays take on their default - /// value which is also empty. - if (column_array->isDefaultAt(0)) - throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name); - - /// Check all sizes are the same - size_t dimension = column_array_offsets[0]; - for (size_t i = 0; i < num_rows - 1; ++i) - if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension) - throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); - - /// Also check that previously inserted blocks have the same size as this block. - /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across - /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42. - if (index && index->getDimensions() != dimension) - throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); - - if (!index) - index = std::make_shared>(dimension); - - /// Add all rows of block - index->add_item(index->get_n_items(), column_array_data_float_data.data()); - for (size_t current_row = 1; current_row < num_rows; ++current_row) - index->add_item(index->get_n_items(), &column_array_data_float_data[column_array_offsets[current_row - 1]]); - } - else if (const auto & column_tuple = typeid_cast(column_cut.get())) - { - const auto & column_tuple_columns = column_tuple->getColumns(); - - /// TODO check if calling index->add_item() directly on the block's tuples is faster than materializing everything - std::vector> data(column_tuple->size(), std::vector()); - for (const auto & column : column_tuple_columns) - { - const auto & pod_array = typeid_cast(column.get())->getData(); - for (size_t i = 0; i < pod_array.size(); ++i) - data[i].push_back(pod_array[i]); - } - - if (data.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read); - - if (!index) - index = std::make_shared>(data[0].size()); - - for (const auto & item : data) - index->add_item(index->get_n_items(), item.data()); - } - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array or Tuple column"); - - *pos += rows_read; -} - - -MergeTreeIndexConditionAnnoy::MergeTreeIndexConditionAnnoy( - const IndexDescription & /*index_description*/, - const SelectQueryInfo & query, - const String & distance_function_, - ContextPtr context) - : ann_condition(query, context) - , distance_function(distance_function_) - , search_k(context->getSettingsRef().annoy_index_search_k_nodes) -{} - -bool MergeTreeIndexConditionAnnoy::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const -{ - throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes"); -} - -bool MergeTreeIndexConditionAnnoy::alwaysUnknownOrTrue() const -{ - return ann_condition.alwaysUnknownOrTrue(distance_function); -} - -std::vector MergeTreeIndexConditionAnnoy::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const -{ - if (distance_function == DISTANCE_FUNCTION_L2) - return getUsefulRangesImpl(idx_granule); - else if (distance_function == DISTANCE_FUNCTION_COSINE) - return getUsefulRangesImpl(idx_granule); - std::unreachable(); -} - -template -std::vector MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const -{ - const UInt64 limit = ann_condition.getLimit(); - const UInt64 index_granularity = ann_condition.getIndexGranularity(); - const std::optional comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where - ? std::optional(ann_condition.getComparisonDistanceForWhereQuery()) - : std::nullopt; - - if (comparison_distance && comparison_distance.value() < 0) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance"); - - const std::vector reference_vector = ann_condition.getReferenceVector(); - - const auto granule = std::dynamic_pointer_cast>(idx_granule); - if (granule == nullptr) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); - - const AnnoyIndexWithSerializationPtr annoy = granule->index; - - if (ann_condition.getDimensions() != annoy->getDimensions()) - throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) " - "does not match the dimension in the index ({})", - ann_condition.getDimensions(), annoy->getDimensions()); - - std::vector neighbors; /// indexes of dots which were closest to the reference vector - std::vector distances; - neighbors.reserve(limit); - distances.reserve(limit); - - annoy->get_nns_by_vector(reference_vector.data(), limit, static_cast(search_k), &neighbors, &distances); - - chassert(neighbors.size() == distances.size()); - - std::vector granules; - granules.reserve(neighbors.size()); - for (size_t i = 0; i < neighbors.size(); ++i) - { - if (comparison_distance && distances[i] > comparison_distance) - continue; - granules.push_back(neighbors[i] / index_granularity); - } - - /// make unique - std::sort(granules.begin(), granules.end()); - granules.erase(std::unique(granules.begin(), granules.end()), granules.end()); - - return granules; -} - -MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_) - : IMergeTreeIndex(index_) - , trees(trees_) - , distance_function(distance_function_) -{} - -MergeTreeIndexGranulePtr MergeTreeIndexAnnoy::createIndexGranule() const -{ - if (distance_function == DISTANCE_FUNCTION_L2) - return std::make_shared>(index.name, index.sample_block); - else if (distance_function == DISTANCE_FUNCTION_COSINE) - return std::make_shared>(index.name, index.sample_block); - std::unreachable(); -} - -MergeTreeIndexAggregatorPtr MergeTreeIndexAnnoy::createIndexAggregator(const MergeTreeWriterSettings & settings) const -{ - /// TODO: Support more metrics. Available metrics: https://github.com/spotify/annoy/blob/master/src/annoymodule.cc#L151-L171 - if (distance_function == DISTANCE_FUNCTION_L2) - return std::make_shared>(index.name, index.sample_block, trees, settings.max_threads_for_annoy_index_creation); - else if (distance_function == DISTANCE_FUNCTION_COSINE) - return std::make_shared>(index.name, index.sample_block, trees, settings.max_threads_for_annoy_index_creation); - std::unreachable(); -} - -MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const -{ - return std::make_shared(index, query, distance_function, context); -}; - -MergeTreeIndexConditionPtr MergeTreeIndexAnnoy::createIndexCondition(const ActionsDAG *, ContextPtr) const -{ - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG"); -} - -MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index) -{ - static constexpr auto DEFAULT_DISTANCE_FUNCTION = DISTANCE_FUNCTION_L2; - String distance_function = DEFAULT_DISTANCE_FUNCTION; - if (!index.arguments.empty()) - distance_function = index.arguments[0].safeGet(); - - static constexpr auto DEFAULT_TREES = 100uz; - UInt64 trees = DEFAULT_TREES; - if (index.arguments.size() > 1) - trees = index.arguments[1].safeGet(); - - return std::make_shared(index, trees, distance_function); -} - -void annoyIndexValidator(const IndexDescription & index, bool /* attach */) -{ - /// Check number and type of Annoy index arguments: - - if (index.arguments.size() > 2) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index must not have more than two parameters"); - - if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance function argument of Annoy index must be of type String"); - - if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::UInt64) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Number of trees argument of Annoy index must be of type UInt64"); - - /// Check that the index is created on a single column - - if (index.column_names.size() != 1 || index.data_types.size() != 1) - throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Annoy indexes must be created on a single column"); - - /// Check that a supported metric was passed as first argument - - if (!index.arguments.empty()) - { - String distance_name = index.arguments[0].safeGet(); - if (distance_name != DISTANCE_FUNCTION_L2 && distance_name != DISTANCE_FUNCTION_COSINE) - throw Exception(ErrorCodes::INCORRECT_DATA, "Annoy index only supports distance functions '{}' and '{}'", DISTANCE_FUNCTION_L2, DISTANCE_FUNCTION_COSINE); - } - - /// Check data type of indexed column: - - auto throw_unsupported_underlying_column_exception = []() - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])"); - }; - - DataTypePtr data_type = index.sample_block.getDataTypes()[0]; - - if (const auto * data_type_array = typeid_cast(data_type.get())) - { - TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); - if (!WhichDataType(nested_type_index).isFloat32()) - throw_unsupported_underlying_column_exception(); - } - else if (const auto * data_type_tuple = typeid_cast(data_type.get())) - { - const DataTypes & inner_types = data_type_tuple->getElements(); - for (const auto & inner_type : inner_types) - { - TypeIndex nested_type_index = inner_type->getTypeId(); - if (!WhichDataType(nested_type_index).isFloat32()) - throw_unsupported_underlying_column_exception(); - } - } - else - throw_unsupported_underlying_column_exception(); -} - -} - -#endif diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h deleted file mode 100644 index 282920c608e..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ /dev/null @@ -1,112 +0,0 @@ -#pragma once - -#ifdef ENABLE_ANNOY - -#include - -#include -#include - -namespace DB -{ - -template -class AnnoyIndexWithSerialization : public Annoy::AnnoyIndex -{ - using Base = Annoy::AnnoyIndex; - -public: - explicit AnnoyIndexWithSerialization(size_t dimensions); - void serialize(WriteBuffer & ostr) const; - void deserialize(ReadBuffer & istr); - size_t getDimensions() const; -}; - -template -using AnnoyIndexWithSerializationPtr = std::shared_ptr>; - - -template -struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule -{ - MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_); - MergeTreeIndexGranuleAnnoy(const String & index_name_, const Block & index_sample_block_, AnnoyIndexWithSerializationPtr index_); - - ~MergeTreeIndexGranuleAnnoy() override = default; - - void serializeBinary(WriteBuffer & ostr) const override; - void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override; - - bool empty() const override { return !index.get(); } - - const String index_name; - const Block index_sample_block; - AnnoyIndexWithSerializationPtr index; -}; - - -template -struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator -{ - MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, UInt64 trees, size_t max_threads_for_creation); - ~MergeTreeIndexAggregatorAnnoy() override = default; - - bool empty() const override { return !index || index->get_n_items() == 0; } - MergeTreeIndexGranulePtr getGranuleAndReset() override; - void update(const Block & block, size_t * pos, size_t limit) override; - - const String index_name; - const Block index_sample_block; - const UInt64 trees; - const size_t max_threads_for_creation; - AnnoyIndexWithSerializationPtr index; -}; - - -class MergeTreeIndexConditionAnnoy final : public IMergeTreeIndexConditionApproximateNearestNeighbor -{ -public: - MergeTreeIndexConditionAnnoy( - const IndexDescription & index_description, - const SelectQueryInfo & query, - const String & distance_function, - ContextPtr context); - - ~MergeTreeIndexConditionAnnoy() override = default; - - bool alwaysUnknownOrTrue() const override; - bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override; - std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override; - -private: - template - std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const; - - const ApproximateNearestNeighborCondition ann_condition; - const String distance_function; - const Int64 search_k; -}; - - -class MergeTreeIndexAnnoy final : public IMergeTreeIndex -{ -public: - - MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_); - - ~MergeTreeIndexAnnoy() override = default; - - MergeTreeIndexGranulePtr createIndexGranule() const override; - MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override; - MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const; - MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override; - bool isVectorSearch() const override { return true; } - -private: - const UInt64 trees; - const String distance_function; -}; - -} - -#endif diff --git a/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp new file mode 100644 index 00000000000..29de109d4fc --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp @@ -0,0 +1,45 @@ +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_INDEX; +} + +MergeTreeIndexLegacyVectorSimilarity::MergeTreeIndexLegacyVectorSimilarity(const IndexDescription & index_) + : IMergeTreeIndex(index_) +{ +} + +MergeTreeIndexGranulePtr MergeTreeIndexLegacyVectorSimilarity::createIndexGranule() const +{ + throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'"); +} + +MergeTreeIndexAggregatorPtr MergeTreeIndexLegacyVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings &) const +{ + throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'"); +} + +MergeTreeIndexConditionPtr MergeTreeIndexLegacyVectorSimilarity::createIndexCondition(const SelectQueryInfo &, ContextPtr) const +{ + throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'"); +}; + +MergeTreeIndexConditionPtr MergeTreeIndexLegacyVectorSimilarity::createIndexCondition(const ActionsDAG *, ContextPtr) const +{ + throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'"); +} + +MergeTreeIndexPtr legacyVectorSimilarityIndexCreator(const IndexDescription & index) +{ + return std::make_shared(index); +} + +void legacyVectorSimilarityIndexValidator(const IndexDescription &, bool) +{ +} + +} diff --git a/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h new file mode 100644 index 00000000000..1015401823d --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +/// Walking corpse implementation for removed skipping index of type "annoy" and "usearch". +/// Its only purpose is to allow loading old tables with indexes of these types. +/// Data insertion and index usage/search will throw an exception, suggesting to migrate to "vector_similarity" indexes. + +namespace DB +{ + +class MergeTreeIndexLegacyVectorSimilarity : public IMergeTreeIndex +{ +public: + explicit MergeTreeIndexLegacyVectorSimilarity(const IndexDescription & index_); + ~MergeTreeIndexLegacyVectorSimilarity() override = default; + + MergeTreeIndexGranulePtr createIndexGranule() const override; + MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings &) const override; + MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo &, ContextPtr) const; + MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override; + + bool isVectorSimilarityIndex() const override { return true; } +}; + +} diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp deleted file mode 100644 index efd9bb754e1..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp +++ /dev/null @@ -1,463 +0,0 @@ -#ifdef ENABLE_USEARCH - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wpass-failed" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ProfileEvents -{ - extern const Event USearchAddCount; - extern const Event USearchAddVisitedMembers; - extern const Event USearchAddComputedDistances; - extern const Event USearchSearchCount; - extern const Event USearchSearchVisitedMembers; - extern const Event USearchSearchComputedDistances; -} - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_ALLOCATE_MEMORY; - extern const int ILLEGAL_COLUMN; - extern const int INCORRECT_DATA; - extern const int INCORRECT_NUMBER_OF_COLUMNS; - extern const int INCORRECT_QUERY; - extern const int LOGICAL_ERROR; - extern const int NOT_IMPLEMENTED; -} - -namespace -{ - -std::unordered_map nameToScalarKind = { - {"f64", unum::usearch::scalar_kind_t::f64_k}, - {"f32", unum::usearch::scalar_kind_t::f32_k}, - {"f16", unum::usearch::scalar_kind_t::f16_k}, - {"i8", unum::usearch::scalar_kind_t::i8_k}}; - -} - -template -USearchIndexWithSerialization::USearchIndexWithSerialization(size_t dimensions, unum::usearch::scalar_kind_t scalar_kind) - : Base(Base::make(unum::usearch::metric_punned_t(dimensions, Metric, scalar_kind))) -{ -} - -template -void USearchIndexWithSerialization::serialize(WriteBuffer & ostr) const -{ - auto callback = [&ostr](void * from, size_t n) - { - ostr.write(reinterpret_cast(from), n); - return true; - }; - - Base::save_to_stream(callback); -} - -template -void USearchIndexWithSerialization::deserialize(ReadBuffer & istr) -{ - auto callback = [&istr](void * from, size_t n) - { - istr.readStrict(reinterpret_cast(from), n); - return true; - }; - - Base::load_from_stream(callback); -} - -template -size_t USearchIndexWithSerialization::getDimensions() const -{ - return Base::dimensions(); -} - -template -MergeTreeIndexGranuleUSearch::MergeTreeIndexGranuleUSearch( - const String & index_name_, - const Block & index_sample_block_, - unum::usearch::scalar_kind_t scalar_kind_) - : index_name(index_name_) - , index_sample_block(index_sample_block_) - , scalar_kind(scalar_kind_) - , index(nullptr) -{ -} - -template -MergeTreeIndexGranuleUSearch::MergeTreeIndexGranuleUSearch( - const String & index_name_, - const Block & index_sample_block_, - unum::usearch::scalar_kind_t scalar_kind_, - USearchIndexWithSerializationPtr index_) - : index_name(index_name_) - , index_sample_block(index_sample_block_) - , scalar_kind(scalar_kind_) - , index(std::move(index_)) -{ -} - -template -void MergeTreeIndexGranuleUSearch::serializeBinary(WriteBuffer & ostr) const -{ - /// Number of dimensions is required in the index constructor, - /// so it must be written and read separately from the other part - writeIntBinary(static_cast(index->getDimensions()), ostr); // write dimension - index->serialize(ostr); -} - -template -void MergeTreeIndexGranuleUSearch::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/) -{ - UInt64 dimension; - readIntBinary(dimension, istr); - index = std::make_shared>(dimension, scalar_kind); - index->deserialize(istr); -} - -template -MergeTreeIndexAggregatorUSearch::MergeTreeIndexAggregatorUSearch( - const String & index_name_, - const Block & index_sample_block_, - unum::usearch::scalar_kind_t scalar_kind_) - : index_name(index_name_) - , index_sample_block(index_sample_block_) - , scalar_kind(scalar_kind_) -{ -} - -template -MergeTreeIndexGranulePtr MergeTreeIndexAggregatorUSearch::getGranuleAndReset() -{ - auto granule = std::make_shared>(index_name, index_sample_block, scalar_kind, index); - index = nullptr; - return granule; -} - -template -void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t * pos, size_t limit) -{ - if (*pos >= block.rows()) - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "The provided position is not less than the number of block rows. Position: {}, Block rows: {}.", - *pos, - block.rows()); - - size_t rows_read = std::min(limit, block.rows() - *pos); - - if (rows_read == 0) - return; - - if (rows_read > std::numeric_limits::max()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Index granularity is too big: more than 4B rows per index granule."); - - if (index_sample_block.columns() > 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column"); - - const String & index_column_name = index_sample_block.getByPosition(0).name; - ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); - - if (const auto & column_array = typeid_cast(column_cut.get())) - { - const auto & column_array_data = column_array->getData(); - const auto & column_array_data_float = typeid_cast(column_array_data); - const auto & column_array_data_float_data = column_array_data_float.getData(); - - const auto & column_array_offsets = column_array->getOffsets(); - const size_t num_rows = column_array_offsets.size(); - - if (column_array->empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty"); - - /// The Usearch algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays - /// are INSERTed into an Usearch-indexed column or if no value was specified at all in which case the arrays take on their default - /// values which is also empty. - if (column_array->isDefaultAt(0)) - throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name); - - /// Check all sizes are the same - size_t dimension = column_array_offsets[0]; - for (size_t i = 0; i < num_rows - 1; ++i) - if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension) - throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); - - /// Also check that previously inserted blocks have the same size as this block. - /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across - /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42. - if (index && index->getDimensions() != dimension) - throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); - - if (!index) - index = std::make_shared>(dimension, scalar_kind); - - /// Add all rows of block - if (!index->reserve(unum::usearch::ceil2(index->size() + num_rows))) - throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for usearch index"); - - for (size_t current_row = 0; current_row < num_rows; ++current_row) - { - auto rc = index->add(static_cast(index->size()), &column_array_data_float_data[column_array_offsets[current_row - 1]]); - if (!rc) - throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, rc.error.release()); - - ProfileEvents::increment(ProfileEvents::USearchAddCount); - ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members); - ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, rc.computed_distances); - } - } - else if (const auto & column_tuple = typeid_cast(column_cut.get())) - { - const auto & column_tuple_columns = column_tuple->getColumns(); - std::vector> data(column_tuple->size(), std::vector()); - for (const auto & column : column_tuple_columns) - { - const auto & pod_array = typeid_cast(column.get())->getData(); - for (size_t i = 0; i < pod_array.size(); ++i) - data[i].push_back(pod_array[i]); - } - - if (data.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read); - - if (!index) - index = std::make_shared>(data[0].size(), scalar_kind); - - if (!index->reserve(unum::usearch::ceil2(index->size() + data.size()))) - throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for usearch index"); - - for (const auto & item : data) - { - auto rc = index->add(static_cast(index->size()), item.data()); - if (!rc) - throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, rc.error.release()); - - ProfileEvents::increment(ProfileEvents::USearchAddCount); - ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members); - ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, rc.computed_distances); - } - } - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array or Tuple column"); - - *pos += rows_read; -} - -MergeTreeIndexConditionUSearch::MergeTreeIndexConditionUSearch( - const IndexDescription & /*index_description*/, - const SelectQueryInfo & query, - const String & distance_function_, - ContextPtr context) - : ann_condition(query, context) - , distance_function(distance_function_) -{ -} - -bool MergeTreeIndexConditionUSearch::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const -{ - throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes"); -} - -bool MergeTreeIndexConditionUSearch::alwaysUnknownOrTrue() const -{ - return ann_condition.alwaysUnknownOrTrue(distance_function); -} - -std::vector MergeTreeIndexConditionUSearch::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const -{ - if (distance_function == DISTANCE_FUNCTION_L2) - return getUsefulRangesImpl(idx_granule); - else if (distance_function == DISTANCE_FUNCTION_COSINE) - return getUsefulRangesImpl(idx_granule); - std::unreachable(); -} - -template -std::vector MergeTreeIndexConditionUSearch::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const -{ - const UInt64 limit = ann_condition.getLimit(); - const UInt64 index_granularity = ann_condition.getIndexGranularity(); - const std::optional comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where - ? std::optional(ann_condition.getComparisonDistanceForWhereQuery()) - : std::nullopt; - - if (comparison_distance && comparison_distance.value() < 0) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance"); - - const std::vector reference_vector = ann_condition.getReferenceVector(); - - const auto granule = std::dynamic_pointer_cast>(idx_granule); - if (granule == nullptr) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); - - const USearchIndexWithSerializationPtr index = granule->index; - - if (ann_condition.getDimensions() != index->dimensions()) - throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) " - "does not match the dimension in the index ({})", - ann_condition.getDimensions(), index->dimensions()); - - auto result = index->search(reference_vector.data(), limit); - - ProfileEvents::increment(ProfileEvents::USearchSearchCount); - ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, result.visited_members); - ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, result.computed_distances); - - std::vector neighbors(result.size()); /// indexes of dots which were closest to the reference vector - std::vector distances(result.size()); - result.dump_to(neighbors.data(), distances.data()); - - std::vector granules; - granules.reserve(neighbors.size()); - for (size_t i = 0; i < neighbors.size(); ++i) - { - if (comparison_distance && distances[i] > comparison_distance) - continue; - granules.push_back(neighbors[i] / index_granularity); - } - - /// make unique - std::sort(granules.begin(), granules.end()); - granules.erase(std::unique(granules.begin(), granules.end()), granules.end()); - - return granules; -} - -MergeTreeIndexUSearch::MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_, unum::usearch::scalar_kind_t scalar_kind_) - : IMergeTreeIndex(index_) - , distance_function(distance_function_) - , scalar_kind(scalar_kind_) -{ -} - -MergeTreeIndexGranulePtr MergeTreeIndexUSearch::createIndexGranule() const -{ - if (distance_function == DISTANCE_FUNCTION_L2) - return std::make_shared>(index.name, index.sample_block, scalar_kind); - else if (distance_function == DISTANCE_FUNCTION_COSINE) - return std::make_shared>(index.name, index.sample_block, scalar_kind); - std::unreachable(); -} - -MergeTreeIndexAggregatorPtr MergeTreeIndexUSearch::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const -{ - if (distance_function == DISTANCE_FUNCTION_L2) - return std::make_shared>(index.name, index.sample_block, scalar_kind); - else if (distance_function == DISTANCE_FUNCTION_COSINE) - return std::make_shared>(index.name, index.sample_block, scalar_kind); - std::unreachable(); -} - -MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const -{ - return std::make_shared(index, query, distance_function, context); -}; - -MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const ActionsDAG *, ContextPtr) const -{ - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG"); -} - -MergeTreeIndexPtr usearchIndexCreator(const IndexDescription & index) -{ - static constexpr auto default_distance_function = DISTANCE_FUNCTION_L2; - String distance_function = default_distance_function; - if (!index.arguments.empty()) - distance_function = index.arguments[0].safeGet(); - - static constexpr auto default_scalar_kind = unum::usearch::scalar_kind_t::f16_k; - auto scalar_kind = default_scalar_kind; - if (index.arguments.size() > 1) - scalar_kind = nameToScalarKind.at(index.arguments[1].safeGet()); - - return std::make_shared(index, distance_function, scalar_kind); -} - -void usearchIndexValidator(const IndexDescription & index, bool /* attach */) -{ - /// Check number and type of USearch index arguments: - - if (index.arguments.size() > 2) - throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index must not have more than one parameters"); - - if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of USearch index (distance function) must be of type String"); - if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of USearch index (scalar type) must be of type String"); - - /// Check that the index is created on a single column - - if (index.column_names.size() != 1 || index.data_types.size() != 1) - throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "USearch indexes must be created on a single column"); - - /// Check that a supported metric was passed as first argument - - if (!index.arguments.empty()) - { - String distance_name = index.arguments[0].safeGet(); - if (distance_name != DISTANCE_FUNCTION_L2 && distance_name != DISTANCE_FUNCTION_COSINE) - throw Exception(ErrorCodes::INCORRECT_DATA, "USearch index only supports distance functions '{}' and '{}'", DISTANCE_FUNCTION_L2, DISTANCE_FUNCTION_COSINE); - } - - /// Check that a supported kind was passed as a second argument - - if (index.arguments.size() > 1 && !nameToScalarKind.contains(index.arguments[1].safeGet())) - { - String supported_kinds; - for (const auto & [name, kind] : nameToScalarKind) - { - if (!supported_kinds.empty()) - supported_kinds += ", "; - supported_kinds += name; - } - throw Exception(ErrorCodes::INCORRECT_DATA, "Unrecognized scalar kind (second argument) for USearch index. Supported kinds are: {}", supported_kinds); - } - - /// Check data type of indexed column: - - auto throw_unsupported_underlying_column_exception = []() - { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "USearch can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])"); - }; - - DataTypePtr data_type = index.sample_block.getDataTypes()[0]; - - if (const auto * data_type_array = typeid_cast(data_type.get())) - { - TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); - if (!WhichDataType(nested_type_index).isFloat32()) - throw_unsupported_underlying_column_exception(); - } - else if (const auto * data_type_tuple = typeid_cast(data_type.get())) - { - const DataTypes & inner_types = data_type_tuple->getElements(); - for (const auto & inner_type : inner_types) - { - TypeIndex nested_type_index = inner_type->getTypeId(); - if (!WhichDataType(nested_type_index).isFloat32()) - throw_unsupported_underlying_column_exception(); - } - } - else - throw_unsupported_underlying_column_exception(); -} - -} - -#endif diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.h b/src/Storages/MergeTree/MergeTreeIndexUSearch.h deleted file mode 100644 index 41de94402c9..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexUSearch.h +++ /dev/null @@ -1,116 +0,0 @@ -#pragma once - -#ifdef ENABLE_USEARCH - -#include - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wpass-failed" -#include -#pragma clang diagnostic pop - -namespace DB -{ - -using USearchImplType = unum::usearch::index_dense_gt; - -template -class USearchIndexWithSerialization : public USearchImplType -{ - using Base = USearchImplType; - -public: - USearchIndexWithSerialization(size_t dimensions, unum::usearch::scalar_kind_t scalar_kind); - void serialize(WriteBuffer & ostr) const; - void deserialize(ReadBuffer & istr); - size_t getDimensions() const; -}; - -template -using USearchIndexWithSerializationPtr = std::shared_ptr>; - - -template -struct MergeTreeIndexGranuleUSearch final : public IMergeTreeIndexGranule -{ - MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_, unum::usearch::scalar_kind_t scalar_kind_); - MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_, unum::usearch::scalar_kind_t scalar_kind_, USearchIndexWithSerializationPtr index_); - - ~MergeTreeIndexGranuleUSearch() override = default; - - void serializeBinary(WriteBuffer & ostr) const override; - void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override; - - bool empty() const override { return !index.get(); } - - const String index_name; - const Block index_sample_block; - const unum::usearch::scalar_kind_t scalar_kind; - USearchIndexWithSerializationPtr index; -}; - - -template -struct MergeTreeIndexAggregatorUSearch final : IMergeTreeIndexAggregator -{ - MergeTreeIndexAggregatorUSearch(const String & index_name_, const Block & index_sample_block, unum::usearch::scalar_kind_t scalar_kind_); - ~MergeTreeIndexAggregatorUSearch() override = default; - - bool empty() const override { return !index || index->size() == 0; } - MergeTreeIndexGranulePtr getGranuleAndReset() override; - void update(const Block & block, size_t * pos, size_t limit) override; - - const String index_name; - const Block index_sample_block; - const unum::usearch::scalar_kind_t scalar_kind; - USearchIndexWithSerializationPtr index; -}; - - -class MergeTreeIndexConditionUSearch final : public IMergeTreeIndexConditionApproximateNearestNeighbor -{ -public: - MergeTreeIndexConditionUSearch( - const IndexDescription & index_description, - const SelectQueryInfo & query, - const String & distance_function, - ContextPtr context); - - ~MergeTreeIndexConditionUSearch() override = default; - - bool alwaysUnknownOrTrue() const override; - bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override; - std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override; - -private: - template - std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const; - - const ApproximateNearestNeighborCondition ann_condition; - const String distance_function; -}; - - -class MergeTreeIndexUSearch : public IMergeTreeIndex -{ -public: - MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_, unum::usearch::scalar_kind_t scalar_kind_); - - ~MergeTreeIndexUSearch() override = default; - - MergeTreeIndexGranulePtr createIndexGranule() const override; - MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override; - MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const; - MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override; - bool isVectorSearch() const override { return true; } - -private: - const String distance_function; - const unum::usearch::scalar_kind_t scalar_kind; -}; - -} - - -#endif - diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp new file mode 100644 index 00000000000..5b0793fa0c8 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -0,0 +1,492 @@ +#include + +#if USE_USEARCH + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpass-failed" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event USearchAddCount; + extern const Event USearchAddVisitedMembers; + extern const Event USearchAddComputedDistances; + extern const Event USearchSearchCount; + extern const Event USearchSearchVisitedMembers; + extern const Event USearchSearchComputedDistances; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int FORMAT_VERSION_TOO_OLD; + extern const int ILLEGAL_COLUMN; + extern const int INCORRECT_DATA; + extern const int INCORRECT_NUMBER_OF_COLUMNS; + extern const int INCORRECT_QUERY; + extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; +} + +namespace +{ + +/// The only indexing method currently supported by USearch +std::set methods = {"hnsw"}; + +/// Maps from user-facing name to internal name +std::unordered_map distanceFunctionToMetricKind = { + {"L2Distance", unum::usearch::metric_kind_t::l2sq_k}, + {"cosineDistance", unum::usearch::metric_kind_t::cos_k}}; + +/// Maps from user-facing name to internal name +std::unordered_map quantizationToScalarKind = { + {"f32", unum::usearch::scalar_kind_t::f32_k}, + {"f16", unum::usearch::scalar_kind_t::f16_k}, + {"i8", unum::usearch::scalar_kind_t::i8_k}}; + +template +concept is_set = std::same_as>; + +template +concept is_unordered_map = std::same_as>; + +template +String joinByComma(const T & t) +{ + if constexpr (is_set) + { + return fmt::format("{}", fmt::join(t, ", ")); + } + else if constexpr (is_unordered_map) + { + String joined_keys; + for (const auto & [k, _] : t) + { + if (!joined_keys.empty()) + joined_keys += ", "; + joined_keys += k; + } + return joined_keys; + } + /// TODO once our libcxx is recent enough, replace above by + /// return fmt::format("{}", fmt::join(std::views::keys(t)), ", ")); + std::unreachable(); +} + +} + +USearchIndexWithSerialization::USearchIndexWithSerialization( + size_t dimensions, + unum::usearch::metric_kind_t metric_kind, + unum::usearch::scalar_kind_t scalar_kind, + UsearchHnswParams usearch_hnsw_params) + : Base(Base::make(unum::usearch::metric_punned_t(dimensions, metric_kind, scalar_kind), + unum::usearch::index_dense_config_t(usearch_hnsw_params.m, usearch_hnsw_params.ef_construction, usearch_hnsw_params.ef_search))) +{ +} + +void USearchIndexWithSerialization::serialize(WriteBuffer & ostr) const +{ + auto callback = [&ostr](void * from, size_t n) + { + ostr.write(reinterpret_cast(from), n); + return true; + }; + + auto result = Base::save_to_stream(callback); + if (result.error) + throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not save vector similarity index, error: " + String(result.error.release())); +} + +void USearchIndexWithSerialization::deserialize(ReadBuffer & istr) +{ + auto callback = [&istr](void * from, size_t n) + { + istr.readStrict(reinterpret_cast(from), n); + return true; + }; + + auto result = Base::load_from_stream(callback); + if (result.error) + /// See the comment in MergeTreeIndexGranuleVectorSimilarity::deserializeBinary why we throw here + throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not load vector similarity index, error: " + String(result.error.release()) + " Please drop the index and create it again."); +} + +USearchIndexWithSerialization::Statistics USearchIndexWithSerialization::getStatistics() const +{ + Statistics statistics = { + .max_level = max_level(), + .connectivity = connectivity(), + .size = size(), /// number of vectors + .capacity = capacity(), /// number of vectors reserved + .memory_usage = memory_usage(), /// in bytes, the value is not exact + .bytes_per_vector = bytes_per_vector(), + .scalar_words = scalar_words(), + .statistics = stats()}; + return statistics; +} + +MergeTreeIndexGranuleVectorSimilarity::MergeTreeIndexGranuleVectorSimilarity( + const String & index_name_, + const Block & index_sample_block_, + unum::usearch::metric_kind_t metric_kind_, + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_) + : MergeTreeIndexGranuleVectorSimilarity(index_name_, index_sample_block_, metric_kind_, scalar_kind_, usearch_hnsw_params_, nullptr) +{ +} + +MergeTreeIndexGranuleVectorSimilarity::MergeTreeIndexGranuleVectorSimilarity( + const String & index_name_, + const Block & index_sample_block_, + unum::usearch::metric_kind_t metric_kind_, + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_, + USearchIndexWithSerializationPtr index_) + : index_name(index_name_) + , index_sample_block(index_sample_block_) + , metric_kind(metric_kind_) + , scalar_kind(scalar_kind_) + , usearch_hnsw_params(usearch_hnsw_params_) + , index(std::move(index_)) +{ +} + +void MergeTreeIndexGranuleVectorSimilarity::serializeBinary(WriteBuffer & ostr) const +{ + if (empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to write empty minmax index {}", backQuote(index_name)); + + writeIntBinary(FILE_FORMAT_VERSION, ostr); + + /// Number of dimensions is required in the index constructor, + /// so it must be written and read separately from the other part + writeIntBinary(static_cast(index->dimensions()), ostr); + + index->serialize(ostr); + + auto statistics = index->getStatistics(); + LOG_TRACE(logger, "Wrote vector similarity index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}", + statistics.max_level, statistics.connectivity, statistics.size, statistics.capacity, ReadableSize(statistics.memory_usage)); +} + +void MergeTreeIndexGranuleVectorSimilarity::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/) +{ + UInt64 file_version; + readIntBinary(file_version, istr); + if (file_version != FILE_FORMAT_VERSION) + throw Exception( + ErrorCodes::FORMAT_VERSION_TOO_OLD, + "Vector similarity index could not be loaded because its version is too old (current version: {}, persisted version: {}). Please drop the index and create it again.", + FILE_FORMAT_VERSION, file_version); + /// More fancy error handling would be: Set a flag on the index that it failed to load. During usage return all granules, i.e. + /// behave as if the index does not exist. Since format changes are expected to happen only rarely and it is "only" an index, keep it simple for now. + + UInt64 dimension; + readIntBinary(dimension, istr); + index = std::make_shared(dimension, metric_kind, scalar_kind, usearch_hnsw_params); + + index->deserialize(istr); + + auto statistics = index->getStatistics(); + LOG_TRACE(logger, "Loaded vector similarity index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}", + statistics.max_level, statistics.connectivity, statistics.size, statistics.capacity, ReadableSize(statistics.memory_usage)); +} + +MergeTreeIndexAggregatorVectorSimilarity::MergeTreeIndexAggregatorVectorSimilarity( + const String & index_name_, + const Block & index_sample_block_, + unum::usearch::metric_kind_t metric_kind_, + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_) + : index_name(index_name_) + , index_sample_block(index_sample_block_) + , metric_kind(metric_kind_) + , scalar_kind(scalar_kind_) + , usearch_hnsw_params(usearch_hnsw_params_) +{ +} + +MergeTreeIndexGranulePtr MergeTreeIndexAggregatorVectorSimilarity::getGranuleAndReset() +{ + auto granule = std::make_shared(index_name, index_sample_block, metric_kind, scalar_kind, usearch_hnsw_params, index); + index = nullptr; + return granule; +} + +void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_t * pos, size_t limit) +{ + if (*pos >= block.rows()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "The provided position is not less than the number of block rows. Position: {}, Block rows: {}.", + *pos, + block.rows()); + + size_t rows_read = std::min(limit, block.rows() - *pos); + + if (rows_read == 0) + return; + + if (rows_read > std::numeric_limits::max()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Index granularity is too big: more than {} rows per index granule.", std::numeric_limits::max()); + + if (index_sample_block.columns() > 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column"); + + const String & index_column_name = index_sample_block.getByPosition(0).name; + ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); + + if (const auto & column_array = typeid_cast(column_cut.get())) + { + const auto & column_array_data = column_array->getData(); + const auto & column_array_data_float = typeid_cast(column_array_data); + const auto & column_array_data_float_data = column_array_data_float.getData(); + + const auto & column_array_offsets = column_array->getOffsets(); + const size_t num_rows = column_array_offsets.size(); + + if (column_array->empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty"); + + /// The vector similarity algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays + /// are INSERTed into an vector-similarity-indexed column or if no value was specified at all in which case the arrays take on their default + /// values which is also empty. + if (column_array->isDefaultAt(0)) + throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name); + + /// Check all sizes are the same + const size_t dimensions = column_array_offsets[0]; + for (size_t i = 0; i < num_rows - 1; ++i) + if (column_array_offsets[i + 1] - column_array_offsets[i] != dimensions) + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); + + /// Also check that previously inserted blocks have the same size as this block. + /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across + /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42. + if (index && index->dimensions() != dimensions) + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); + + if (!index) + index = std::make_shared(dimensions, metric_kind, scalar_kind, usearch_hnsw_params); + + /// Reserving space is mandatory + if (!index->reserve(roundUpToPowerOfTwoOrZero(index->size() + num_rows))) + throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index"); + + for (size_t row = 0; row < num_rows; ++row) + { + auto rc = index->add(static_cast(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]); + if (!rc) + throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index, error: " + String(rc.error.release())); + + ProfileEvents::increment(ProfileEvents::USearchAddCount); + ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members); + ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, rc.computed_distances); + } + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array(Float32) column"); + + *pos += rows_read; +} + +MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity( + const IndexDescription & /*index_description*/, + const SelectQueryInfo & query, + unum::usearch::metric_kind_t metric_kind_, + ContextPtr context) + : vector_similarity_condition(query, context) + , metric_kind(metric_kind_) +{ +} + +bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const +{ + throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes"); +} + +bool MergeTreeIndexConditionVectorSimilarity::alwaysUnknownOrTrue() const +{ + String index_distance_function; + switch (metric_kind) + { + case unum::usearch::metric_kind_t::l2sq_k: index_distance_function = "L2Distance"; break; + case unum::usearch::metric_kind_t::cos_k: index_distance_function = "cosineDistance"; break; + default: std::unreachable(); + } + return vector_similarity_condition.alwaysUnknownOrTrue(index_distance_function); +} + +std::vector MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(MergeTreeIndexGranulePtr granule_) const +{ + const UInt64 limit = vector_similarity_condition.getLimit(); + const UInt64 index_granularity = vector_similarity_condition.getIndexGranularity(); + + const auto granule = std::dynamic_pointer_cast(granule_); + if (granule == nullptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); + + const USearchIndexWithSerializationPtr index = granule->index; + + if (vector_similarity_condition.getDimensions() != index->dimensions()) + throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) " + "does not match the dimension in the index ({})", + vector_similarity_condition.getDimensions(), index->dimensions()); + + const std::vector reference_vector = vector_similarity_condition.getReferenceVector(); + + auto result = index->search(reference_vector.data(), limit); + if (result.error) + throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index, error: " + String(result.error.release())); + + ProfileEvents::increment(ProfileEvents::USearchSearchCount); + ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, result.visited_members); + ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, result.computed_distances); + + std::vector neighbors(result.size()); /// indexes of dots which were closest to the reference vector + std::vector distances(result.size()); + result.dump_to(neighbors.data(), distances.data()); + + std::vector granules; + granules.reserve(neighbors.size()); + for (auto neighbor : neighbors) + granules.push_back(neighbor / index_granularity); + + /// make unique + std::sort(granules.begin(), granules.end()); + granules.erase(std::unique(granules.begin(), granules.end()), granules.end()); + + return granules; +} + +MergeTreeIndexVectorSimilarity::MergeTreeIndexVectorSimilarity( + const IndexDescription & index_, + unum::usearch::metric_kind_t metric_kind_, + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_) + : IMergeTreeIndex(index_) + , metric_kind(metric_kind_) + , scalar_kind(scalar_kind_) + , usearch_hnsw_params(usearch_hnsw_params_) +{ +} + +MergeTreeIndexGranulePtr MergeTreeIndexVectorSimilarity::createIndexGranule() const +{ + return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params); +} + +MergeTreeIndexAggregatorPtr MergeTreeIndexVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const +{ + return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params); +} + +MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const +{ + return std::make_shared(index, query, metric_kind, context); +}; + +MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const ActionsDAG *, ContextPtr) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG"); +} + +MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index) +{ + const bool has_six_args = (index.arguments.size() == 6); + + unum::usearch::metric_kind_t metric_kind = distanceFunctionToMetricKind.at(index.arguments[1].safeGet()); + + /// use defaults for the other parameters + unum::usearch::scalar_kind_t scalar_kind = unum::usearch::scalar_kind_t::f32_k; + UsearchHnswParams usearch_hnsw_params; + + if (has_six_args) + { + scalar_kind = quantizationToScalarKind.at(index.arguments[2].safeGet()); + usearch_hnsw_params = {.m = index.arguments[3].safeGet(), + .ef_construction = index.arguments[4].safeGet(), + .ef_search = index.arguments[5].safeGet()}; + } + + return std::make_shared(index, metric_kind, scalar_kind, usearch_hnsw_params); +} + +void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* attach */) +{ + const bool has_two_args = (index.arguments.size() == 2); + const bool has_six_args = (index.arguments.size() == 6); + + /// Check number and type of arguments + if (!has_two_args && !has_six_args) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index must have two or six arguments"); + if (index.arguments[0].getType() != Field::Types::String) + throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of vector similarity index (method) must be of type String"); + if (index.arguments[1].getType() != Field::Types::String) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of vector similarity index (metric) must be of type String"); + if (has_six_args) + { + if (index.arguments[2].getType() != Field::Types::String) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Third argument of vector similarity index (quantization) must be of type String"); + if (index.arguments[3].getType() != Field::Types::UInt64) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Fourth argument of vector similarity index (M) must be of type UInt64"); + if (index.arguments[4].getType() != Field::Types::UInt64) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Fifth argument of vector similarity index (ef_construction) must be of type UInt64"); + if (index.arguments[5].getType() != Field::Types::UInt64) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Sixth argument of vector similarity index (ef_search) must be of type UInt64"); + } + + /// Check that passed arguments are supported + if (!methods.contains(index.arguments[0].safeGet())) + throw Exception(ErrorCodes::INCORRECT_DATA, "First argument (method) of vector similarity index is not supported. Supported methods are: {}", joinByComma(methods)); + if (!distanceFunctionToMetricKind.contains(index.arguments[1].safeGet())) + throw Exception(ErrorCodes::INCORRECT_DATA, "Second argument (distance function) of vector similarity index is not supported. Supported distance function are: {}", joinByComma(distanceFunctionToMetricKind)); + if (has_six_args) + { + if (!quantizationToScalarKind.contains(index.arguments[2].safeGet())) + throw Exception(ErrorCodes::INCORRECT_DATA, "Third argument (quantization) of vector similarity index is not supported. Supported quantizations are: {}", joinByComma(quantizationToScalarKind)); + if (index.arguments[3].safeGet() < 2) + throw Exception(ErrorCodes::INCORRECT_DATA, "Fourth argument (M) of vector similarity index must be > 1"); + if (index.arguments[4].safeGet() < 1) + throw Exception(ErrorCodes::INCORRECT_DATA, "Fifth argument (ef_construction) of vector similarity index must be > 0"); + if (index.arguments[5].safeGet() < 1) + throw Exception(ErrorCodes::INCORRECT_DATA, "Sixth argument (ef_search) of vector similarity index must be > 0"); + } + + /// Check that the index is created on a single column + if (index.column_names.size() != 1 || index.data_types.size() != 1) + throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Vector similarity indexes must be created on a single column"); + + /// Check data type of the indexed column: + DataTypePtr data_type = index.sample_block.getDataTypes()[0]; + if (const auto * data_type_array = typeid_cast(data_type.get())) + { + TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); + if (!WhichDataType(nested_type_index).isFloat32()) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)"); + } + else + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)"); + } +} + +} + +#endif diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h new file mode 100644 index 00000000000..f7098c1626c --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -0,0 +1,172 @@ +#pragma once + +#include "config.h" + +#if USE_USEARCH + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpass-failed" +# include +# include +# include +#pragma clang diagnostic pop + +namespace DB +{ + +struct UsearchHnswParams +{ + size_t m = unum::usearch::default_connectivity(); + size_t ef_construction = unum::usearch::default_expansion_add(); + size_t ef_search = unum::usearch::default_expansion_search(); +}; + +using USearchIndex = unum::usearch::index_dense_gt; + +class USearchIndexWithSerialization : public USearchIndex +{ + using Base = USearchIndex; + +public: + USearchIndexWithSerialization( + size_t dimensions, + unum::usearch::metric_kind_t metric_kind, + unum::usearch::scalar_kind_t scalar_kind, + UsearchHnswParams usearch_hnsw_params); + + void serialize(WriteBuffer & ostr) const; + void deserialize(ReadBuffer & istr); + + struct Statistics + { + size_t max_level; + size_t connectivity; + size_t size; + size_t capacity; + size_t memory_usage; + /// advanced stats: + size_t bytes_per_vector; + size_t scalar_words; + Base::stats_t statistics; + }; + + Statistics getStatistics() const; +}; + +using USearchIndexWithSerializationPtr = std::shared_ptr; + + +struct MergeTreeIndexGranuleVectorSimilarity final : public IMergeTreeIndexGranule +{ + MergeTreeIndexGranuleVectorSimilarity( + const String & index_name_, + const Block & index_sample_block_, + unum::usearch::metric_kind_t metric_kind_, + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_); + + MergeTreeIndexGranuleVectorSimilarity( + const String & index_name_, + const Block & index_sample_block_, + unum::usearch::metric_kind_t metric_kind_, + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_, + USearchIndexWithSerializationPtr index_); + + ~MergeTreeIndexGranuleVectorSimilarity() override = default; + + void serializeBinary(WriteBuffer & ostr) const override; + void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override; + + bool empty() const override { return !index || index->size() == 0; } + + const String index_name; + const Block index_sample_block; + const unum::usearch::metric_kind_t metric_kind; + const unum::usearch::scalar_kind_t scalar_kind; + const UsearchHnswParams usearch_hnsw_params; + USearchIndexWithSerializationPtr index; + + LoggerPtr logger = getLogger("VectorSimilarityIndex"); + +private: + /// The version of the persistence format of USearch index. Increment whenever you change the format. + /// Note: USearch prefixes the serialized data with its own version header. We can't rely on that because 1. the index in ClickHouse + /// is (at least in theory) agnostic of specific vector search libraries, and 2. additional data (e.g. the number of dimensions) + /// outside USearch exists which we should version separately. + static constexpr UInt64 FILE_FORMAT_VERSION = 1; +}; + + +struct MergeTreeIndexAggregatorVectorSimilarity final : IMergeTreeIndexAggregator +{ + MergeTreeIndexAggregatorVectorSimilarity( + const String & index_name_, + const Block & index_sample_block, + unum::usearch::metric_kind_t metric_kind_, + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_); + + ~MergeTreeIndexAggregatorVectorSimilarity() override = default; + + bool empty() const override { return !index || index->size() == 0; } + MergeTreeIndexGranulePtr getGranuleAndReset() override; + void update(const Block & block, size_t * pos, size_t limit) override; + + const String index_name; + const Block index_sample_block; + const unum::usearch::metric_kind_t metric_kind; + const unum::usearch::scalar_kind_t scalar_kind; + const UsearchHnswParams usearch_hnsw_params; + USearchIndexWithSerializationPtr index; +}; + + +class MergeTreeIndexConditionVectorSimilarity final : public IMergeTreeIndexCondition +{ +public: + MergeTreeIndexConditionVectorSimilarity( + const IndexDescription & index_description, + const SelectQueryInfo & query, + unum::usearch::metric_kind_t metric_kind_, + ContextPtr context); + + ~MergeTreeIndexConditionVectorSimilarity() override = default; + + bool alwaysUnknownOrTrue() const override; + bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const override; + std::vector getUsefulRanges(MergeTreeIndexGranulePtr granule) const override; + +private: + const VectorSimilarityCondition vector_similarity_condition; + const unum::usearch::metric_kind_t metric_kind; +}; + + +class MergeTreeIndexVectorSimilarity : public IMergeTreeIndex +{ +public: + MergeTreeIndexVectorSimilarity( + const IndexDescription & index_, + unum::usearch::metric_kind_t metric_kind_, + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_); + + ~MergeTreeIndexVectorSimilarity() override = default; + + MergeTreeIndexGranulePtr createIndexGranule() const override; + MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override; + MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const; + MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override; + bool isVectorSimilarityIndex() const override { return true; } + +private: + const unum::usearch::metric_kind_t metric_kind; + const unum::usearch::scalar_kind_t scalar_kind; + const UsearchHnswParams usearch_hnsw_params; +}; + +} + + +#endif diff --git a/src/Storages/MergeTree/MergeTreeIndices.cpp b/src/Storages/MergeTree/MergeTreeIndices.cpp index bded961db8e..d2fc0e84b56 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.cpp +++ b/src/Storages/MergeTree/MergeTreeIndices.cpp @@ -127,15 +127,21 @@ MergeTreeIndexFactory::MergeTreeIndexFactory() registerCreator("hypothesis", hypothesisIndexCreator); registerValidator("hypothesis", hypothesisIndexValidator); -#ifdef ENABLE_ANNOY - registerCreator("annoy", annoyIndexCreator); - registerValidator("annoy", annoyIndexValidator); -#endif -#ifdef ENABLE_USEARCH - registerCreator("usearch", usearchIndexCreator); - registerValidator("usearch", usearchIndexValidator); +#if USE_USEARCH + registerCreator("vector_similarity", vectorSimilarityIndexCreator); + registerValidator("vector_similarity", vectorSimilarityIndexValidator); #endif + /// ------ + /// TODO: remove this block at the end of 2024. + /// Index types 'annoy' and 'usearch' are no longer supported as of June 2024. Their successor is index type 'vector_similarity'. + /// To support loading tables with old indexes during a transition period, register dummy indexes which allow load/attaching but + /// throw an exception when the user attempts to use them. + registerCreator("annoy", legacyVectorSimilarityIndexCreator); + registerValidator("annoy", legacyVectorSimilarityIndexValidator); + registerCreator("usearch", legacyVectorSimilarityIndexCreator); + registerValidator("usearch", legacyVectorSimilarityIndexValidator); + /// ------ registerCreator("inverted", fullTextIndexCreator); registerValidator("inverted", fullTextIndexValidator); diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h index 1be73e1c811..c52d7ffe131 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.h +++ b/src/Storages/MergeTree/MergeTreeIndices.h @@ -15,6 +15,7 @@ #include #include +#include "config.h" constexpr auto INDEX_FILE_PREFIX = "skp_idx_"; @@ -92,6 +93,13 @@ public: virtual bool alwaysUnknownOrTrue() const = 0; virtual bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const = 0; + + /// Special stuff for vector similarity indexes + /// - Returns vector of indexes of ranges in granule which are useful for query. + virtual std::vector getUsefulRanges(MergeTreeIndexGranulePtr) const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for non-vector-similarity indexes."); + } }; using MergeTreeIndexConditionPtr = std::shared_ptr; @@ -169,7 +177,7 @@ struct IMergeTreeIndex virtual MergeTreeIndexConditionPtr createIndexCondition( const ActionsDAG * filter_actions_dag, ContextPtr context) const = 0; - virtual bool isVectorSearch() const { return false; } + virtual bool isVectorSimilarityIndex() const { return false; } virtual MergeTreeIndexMergedConditionPtr createIndexMergedCondition( const SelectQueryInfo & /*query_info*/, StorageMetadataPtr /*storage_metadata*/) const @@ -230,17 +238,15 @@ void bloomFilterIndexValidator(const IndexDescription & index, bool attach); MergeTreeIndexPtr hypothesisIndexCreator(const IndexDescription & index); void hypothesisIndexValidator(const IndexDescription & index, bool attach); -#ifdef ENABLE_ANNOY -MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index); -void annoyIndexValidator(const IndexDescription & index, bool attach); +#if USE_USEARCH +MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index); +void vectorSimilarityIndexValidator(const IndexDescription & index, bool attach); #endif -#ifdef ENABLE_USEARCH -MergeTreeIndexPtr usearchIndexCreator(const IndexDescription& index); -void usearchIndexValidator(const IndexDescription& index, bool attach); -#endif +MergeTreeIndexPtr legacyVectorSimilarityIndexCreator(const IndexDescription & index); +void legacyVectorSimilarityIndexValidator(const IndexDescription & index, bool attach); -MergeTreeIndexPtr fullTextIndexCreator(const IndexDescription& index); -void fullTextIndexValidator(const IndexDescription& index, bool attach); +MergeTreeIndexPtr fullTextIndexCreator(const IndexDescription & index); +void fullTextIndexValidator(const IndexDescription & index, bool attach); } diff --git a/src/Storages/MergeTree/VectorSimilarityCondition.cpp b/src/Storages/MergeTree/VectorSimilarityCondition.cpp new file mode 100644 index 00000000000..2e53b4ecb3a --- /dev/null +++ b/src/Storages/MergeTree/VectorSimilarityCondition.cpp @@ -0,0 +1,350 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int INCORRECT_QUERY; +} + +namespace +{ + +template +void extractReferenceVectorFromLiteral(std::vector & reference_vector, Literal literal) +{ + Float64 float_element_of_reference_vector; + Int64 int_element_of_reference_vector; + + for (const auto & value : literal.value()) + { + if (value.tryGet(float_element_of_reference_vector)) + reference_vector.emplace_back(float_element_of_reference_vector); + else if (value.tryGet(int_element_of_reference_vector)) + reference_vector.emplace_back(static_cast(int_element_of_reference_vector)); + else + throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong type of elements in reference vector. Only float or int are supported."); + } +} + +VectorSimilarityCondition::Info::DistanceFunction stringToDistanceFunction(std::string_view distance_function) +{ + if (distance_function == "L2Distance") + return VectorSimilarityCondition::Info::DistanceFunction::L2; + else + return VectorSimilarityCondition::Info::DistanceFunction::Unknown; +} + +} + +VectorSimilarityCondition::VectorSimilarityCondition(const SelectQueryInfo & query_info, ContextPtr context) + : block_with_constants(KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)) + , index_granularity(context->getMergeTreeSettings().index_granularity) + , max_limit_for_ann_queries(context->getSettingsRef().max_limit_for_ann_queries) + , index_is_useful(checkQueryStructure(query_info)) +{} + +bool VectorSimilarityCondition::alwaysUnknownOrTrue(String distance_function) const +{ + if (!index_is_useful) + return true; /// query isn't supported + /// If query is supported, check if distance function of index is the same as distance function in query + return !(stringToDistanceFunction(distance_function) == query_information->distance_function); +} + +UInt64 VectorSimilarityCondition::getLimit() const +{ + if (index_is_useful && query_information.has_value()) + return query_information->limit; + throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported"); +} + +std::vector VectorSimilarityCondition::getReferenceVector() const +{ + if (index_is_useful && query_information.has_value()) + return query_information->reference_vector; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Reference vector was requested for useless or uninitialized index."); +} + +size_t VectorSimilarityCondition::getDimensions() const +{ + if (index_is_useful && query_information.has_value()) + return query_information->reference_vector.size(); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of dimensions was requested for useless or uninitialized index."); +} + +String VectorSimilarityCondition::getColumnName() const +{ + if (index_is_useful && query_information.has_value()) + return query_information->column_name; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Column name was requested for useless or uninitialized index."); +} + +VectorSimilarityCondition::Info::DistanceFunction VectorSimilarityCondition::getDistanceFunction() const +{ + if (index_is_useful && query_information.has_value()) + return query_information->distance_function; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Distance function was requested for useless or uninitialized index."); +} + +bool VectorSimilarityCondition::checkQueryStructure(const SelectQueryInfo & query) +{ + Info order_by_info; + + /// Build rpns for query sections + const auto & select = query.query->as(); + + RPN rpn_order_by; + RPNElement rpn_limit; + UInt64 limit; + + if (select.limitLength()) + traverseAtomAST(select.limitLength(), rpn_limit); + + if (select.orderBy()) + traverseOrderByAST(select.orderBy(), rpn_order_by); + + /// Reverse RPNs for conveniences during parsing + std::reverse(rpn_order_by.begin(), rpn_order_by.end()); + + const bool order_by_is_valid = matchRPNOrderBy(rpn_order_by, order_by_info); + const bool limit_is_valid = matchRPNLimit(rpn_limit, limit); + + if (!limit_is_valid || limit > max_limit_for_ann_queries) + return false; + + if (order_by_is_valid) + { + query_information = std::move(order_by_info); + query_information->limit = limit; + return true; + } + + return false; +} + +void VectorSimilarityCondition::traverseAST(const ASTPtr & node, RPN & rpn) +{ + /// If the node is ASTFunction, it may have children nodes + if (const auto * func = node->as()) + { + const ASTs & children = func->arguments->children; + /// Traverse children nodes + for (const auto& child : children) + traverseAST(child, rpn); + } + + RPNElement element; + /// Get the data behind node + if (!traverseAtomAST(node, element)) + element.function = RPNElement::FUNCTION_UNKNOWN; + + rpn.emplace_back(std::move(element)); +} + +bool VectorSimilarityCondition::traverseAtomAST(const ASTPtr & node, RPNElement & out) +{ + /// Match Functions + if (const auto * function = node->as()) + { + /// Set the name + out.func_name = function->name; + + if (function->name == "L1Distance" || + function->name == "L2Distance" || + function->name == "LinfDistance" || + function->name == "cosineDistance" || + function->name == "dotProduct") + out.function = RPNElement::FUNCTION_DISTANCE; + else if (function->name == "array") + out.function = RPNElement::FUNCTION_ARRAY; + else if (function->name == "_CAST") + out.function = RPNElement::FUNCTION_CAST; + else + return false; + + return true; + } + /// Match identifier + else if (const auto * identifier = node->as()) + { + out.function = RPNElement::FUNCTION_IDENTIFIER; + out.identifier.emplace(identifier->name()); + out.func_name = "column identifier"; + + return true; + } + + /// Check if we have constants behind the node + return tryCastToConstType(node, out); +} + +bool VectorSimilarityCondition::tryCastToConstType(const ASTPtr & node, RPNElement & out) +{ + Field const_value; + DataTypePtr const_type; + + if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type)) + { + /// Check for constant types + if (const_value.getType() == Field::Types::Float64) + { + out.function = RPNElement::FUNCTION_FLOAT_LITERAL; + out.float_literal.emplace(const_value.safeGet()); + out.func_name = "Float literal"; + return true; + } + + if (const_value.getType() == Field::Types::UInt64) + { + out.function = RPNElement::FUNCTION_INT_LITERAL; + out.int_literal.emplace(const_value.safeGet()); + out.func_name = "Int literal"; + return true; + } + + if (const_value.getType() == Field::Types::Int64) + { + out.function = RPNElement::FUNCTION_INT_LITERAL; + out.int_literal.emplace(const_value.safeGet()); + out.func_name = "Int literal"; + return true; + } + + if (const_value.getType() == Field::Types::Array) + { + out.function = RPNElement::FUNCTION_LITERAL_ARRAY; + out.array_literal = const_value.safeGet(); + out.func_name = "Array literal"; + return true; + } + + if (const_value.getType() == Field::Types::String) + { + out.function = RPNElement::FUNCTION_STRING_LITERAL; + out.func_name = const_value.safeGet(); + return true; + } + } + + return false; +} + +void VectorSimilarityCondition::traverseOrderByAST(const ASTPtr & node, RPN & rpn) +{ + if (const auto * expr_list = node->as()) + if (const auto * order_by_element = expr_list->children.front()->as()) + traverseAST(order_by_element->children.front(), rpn); +} + +/// Returns true and stores ANNExpr if the query has valid ORDERBY clause +bool VectorSimilarityCondition::matchRPNOrderBy(RPN & rpn, Info & info) +{ + /// ORDER BY clause must have at least 3 expressions + if (rpn.size() < 3) + return false; + + auto iter = rpn.begin(); + auto end = rpn.end(); + + bool identifier_found = false; + + /// Matches DistanceFunc->[Column]->[ArrayFunc]->ReferenceVector(floats)->[Column] + if (iter->function != RPNElement::FUNCTION_DISTANCE) + return false; + + info.distance_function = stringToDistanceFunction(iter->func_name); + ++iter; + + if (iter->function == RPNElement::FUNCTION_IDENTIFIER) + { + identifier_found = true; + info.column_name = std::move(iter->identifier.value()); + ++iter; + } + + if (iter->function == RPNElement::FUNCTION_ARRAY) + ++iter; + + if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) + { + extractReferenceVectorFromLiteral(info.reference_vector, iter->array_literal); + ++iter; + } + + /// further conditions are possible if there is no array, or no identifier is found + /// the array can be inside a cast function. For other cases, see the loop after this condition + if (iter != end && iter->function == RPNElement::FUNCTION_CAST) + { + ++iter; + /// Cast should be made to array + if (!iter->func_name.starts_with("Array")) + return false; + ++iter; + if (iter->function == RPNElement::FUNCTION_LITERAL_ARRAY) + { + extractReferenceVectorFromLiteral(info.reference_vector, iter->array_literal); + ++iter; + } + else + return false; + } + + while (iter != end) + { + if (iter->function == RPNElement::FUNCTION_FLOAT_LITERAL || + iter->function == RPNElement::FUNCTION_INT_LITERAL) + info.reference_vector.emplace_back(getFloatOrIntLiteralOrPanic(iter)); + else if (iter->function == RPNElement::FUNCTION_IDENTIFIER) + { + if (identifier_found) + return false; + info.column_name = std::move(iter->identifier.value()); + identifier_found = true; + } + else + return false; + + ++iter; + } + + /// Final checks of correctness + return identifier_found && !info.reference_vector.empty(); +} + +/// Returns true and stores Length if we have valid LIMIT clause in query +bool VectorSimilarityCondition::matchRPNLimit(RPNElement & rpn, UInt64 & limit) +{ + if (rpn.function == RPNElement::FUNCTION_INT_LITERAL) + { + limit = rpn.int_literal.value(); + return true; + } + + return false; +} + +/// Gets float or int from AST node +float VectorSimilarityCondition::getFloatOrIntLiteralOrPanic(const RPN::iterator& iter) +{ + if (iter->float_literal.has_value()) + return iter->float_literal.value(); + if (iter->int_literal.has_value()) + return static_cast(iter->int_literal.value()); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Wrong parsed AST in buildRPN\n"); +} + +} diff --git a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h b/src/Storages/MergeTree/VectorSimilarityCondition.h similarity index 54% rename from src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h rename to src/Storages/MergeTree/VectorSimilarityCondition.h index 5da2a714b02..fd339ed715d 100644 --- a/src/Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h +++ b/src/Storages/MergeTree/VectorSimilarityCondition.h @@ -9,52 +9,9 @@ namespace DB { -static constexpr auto DISTANCE_FUNCTION_L2 = "L2Distance"; -static constexpr auto DISTANCE_FUNCTION_COSINE = "cosineDistance"; - -/// Approximate Nearest Neighbour queries have a similar structure: -/// - reference vector from which all distances are calculated -/// - metric name (e.g L2Distance, LpDistance, etc.) -/// - name of column with embeddings -/// - type of query -/// - maximum number of returned elements (LIMIT) -/// -/// And two optional parameters: -/// - p for LpDistance function -/// - distance to compare with (only for where queries) -/// -/// This struct holds all these components. -struct ApproximateNearestNeighborInformation -{ - using Embedding = std::vector; - Embedding reference_vector; - - enum class Metric : uint8_t - { - Unknown, - L2, - Lp - }; - Metric metric; - - String column_name; - UInt64 limit; - - enum class Type : uint8_t - { - OrderBy, - Where - }; - Type type; - - float p_for_lp_dist = -1.0; - float distance = -1.0; -}; - - -// Class ANNCondition, is responsible for recognizing if the query is an ANN queries which can utilize ANN indexes. It parses the SQL query -/// and checks if it matches ANNIndexes. Method alwaysUnknownOrTrue returns false if we can speed up the query, and true otherwise. It has -/// only one argument, the name of the metric with which index was built. Two main patterns of queries are supported +/// Class VectorSimilarityCondition is responsible for recognizing if the query can utilize vector similarity indexes. +/// Method alwaysUnknownOrTrue returns false if we can speed up the query, and true otherwise. It has +/// only one argument, the name of the distance function with which index was built. Two main patterns of queries are supported /// /// - 1. WHERE queries: /// SELECT * FROM * WHERE DistanceFunc(column, reference_vector) < floatLiteral LIMIT count @@ -64,14 +21,14 @@ struct ApproximateNearestNeighborInformation /// /// Queries without LIMIT count are not supported /// If the query is both of type 1. and 2., than we can't use the index and alwaysUnknownOrTrue returns true. -/// reference_vector should have float coordinates, e.g. (0.2, 0.1, .., 0.5) +/// reference_vector should have float coordinates, e.g. [0.2, 0.1, .., 0.5] /// -/// If the query matches one of these two types, then this class extracts the main information needed for ANN indexes from the query. +/// If the query matches one of these two types, then this class extracts the main information needed for vector similarity indexes from the +/// query. /// /// From matching query it extracts /// - referenceVector -/// - metricName(DistanceFunction) -/// - dimension size if query uses LpDistance +/// - distance function /// - distance to compare(ONLY for search types, otherwise you get exception) /// - spaceDimension(which is referenceVector's components count) /// - column @@ -79,35 +36,45 @@ struct ApproximateNearestNeighborInformation /// - queryHasOrderByClause and queryHasWhereClause return true if query matches the type /// /// Search query type is also recognized for PREWHERE clause -class ApproximateNearestNeighborCondition +class VectorSimilarityCondition { public: - ApproximateNearestNeighborCondition(const SelectQueryInfo & query_info, ContextPtr context); + VectorSimilarityCondition(const SelectQueryInfo & query_info, ContextPtr context); + + /// Approximate nearest neighbour (ANN) / vector similarity queries have a similar structure: + /// - reference vector from which all distances are calculated + /// - distance function, e.g L2Distance + /// - name of column with embeddings + /// - type of query + /// - maximum number of returned elements (LIMIT) + /// + /// And one optional parameter: + /// - distance to compare with (only for where queries) + /// + /// This struct holds all these components. + struct Info + { + enum class DistanceFunction : uint8_t + { + Unknown, + L2 + }; + + std::vector reference_vector; + DistanceFunction distance_function; + String column_name; + UInt64 limit; + float distance = -1.0; + }; /// Returns false if query can be speeded up by an ANN index, true otherwise. - bool alwaysUnknownOrTrue(String metric) const; + bool alwaysUnknownOrTrue(String distance_function) const; - /// Returns the distance to compare with for search query - float getComparisonDistanceForWhereQuery() const; - - /// Distance should be calculated regarding to referenceVector std::vector getReferenceVector() const; - - /// Reference vector's dimension count size_t getDimensions() const; - String getColumnName() const; - - ApproximateNearestNeighborInformation::Metric getMetricType() const; - - /// The P- value if the metric is 'LpDistance' - float getPValueForLpDistance() const; - - ApproximateNearestNeighborInformation::Type getQueryType() const; - + Info::DistanceFunction getDistanceFunction() const; UInt64 getIndexGranularity() const { return index_granularity; } - - /// Length's value from LIMIT clause UInt64 getLimit() const; private: @@ -118,9 +85,6 @@ private: /// DistanceFunctions FUNCTION_DISTANCE, - //tuple(0.1, ..., 0.1) - FUNCTION_TUPLE, - //array(0.1, ..., 0.1) FUNCTION_ARRAY, @@ -139,9 +103,6 @@ private: /// Unknown, can be any value FUNCTION_UNKNOWN, - /// (0.1, ...., 0.1) vector without word 'tuple' - FUNCTION_LITERAL_TUPLE, - /// [0.1, ...., 0.1] vector without word 'array' FUNCTION_LITERAL_ARRAY, @@ -154,19 +115,14 @@ private: explicit RPNElement(Function function_ = FUNCTION_UNKNOWN) : function(function_) - , func_name("Unknown") - , float_literal(std::nullopt) - , identifier(std::nullopt) {} Function function; - String func_name; + String func_name = "Unknown"; std::optional float_literal; std::optional identifier; std::optional int_literal; - - std::optional tuple_literal; std::optional array_literal; UInt32 dim = 0; @@ -186,16 +142,16 @@ private: void traverseOrderByAST(const ASTPtr & node, RPN & rpn); /// Returns true and stores ANNExpr if the query has valid WHERE section - static bool matchRPNWhere(RPN & rpn, ApproximateNearestNeighborInformation & ann_info); + static bool matchRPNWhere(RPN & rpn, Info & info); /// Returns true and stores ANNExpr if the query has valid ORDERBY section - static bool matchRPNOrderBy(RPN & rpn, ApproximateNearestNeighborInformation & ann_info); + static bool matchRPNOrderBy(RPN & rpn, Info & info); /// Returns true and stores Length if we have valid LIMIT clause in query static bool matchRPNLimit(RPNElement & rpn, UInt64 & limit); - /* Matches dist function, reference vector, column name */ - static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, ApproximateNearestNeighborInformation & ann_info); + /// Matches dist function, reference vector, column name + static bool matchMainParts(RPN::iterator & iter, const RPN::iterator & end, Info & info); /// Gets float or int from AST node static float getFloatOrIntLiteralOrPanic(const RPN::iterator& iter); @@ -203,7 +159,7 @@ private: Block block_with_constants; /// true if we have one of two supported query types - std::optional query_information; + std::optional query_information; // Get from settings ANNIndex parameters const UInt64 index_granularity; @@ -214,13 +170,4 @@ private: bool index_is_useful = false; }; - -/// Common interface of ANN indexes. -class IMergeTreeIndexConditionApproximateNearestNeighbor : public IMergeTreeIndexCondition -{ -public: - /// Returns vector of indexes of ranges in granule which are useful for query. - virtual std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const = 0; -}; - } diff --git a/src/Storages/fuzzers/CMakeLists.txt b/src/Storages/fuzzers/CMakeLists.txt index ec56b853666..f67552716a2 100644 --- a/src/Storages/fuzzers/CMakeLists.txt +++ b/src/Storages/fuzzers/CMakeLists.txt @@ -4,4 +4,4 @@ clickhouse_add_executable (mergetree_checksum_fuzzer mergetree_checksum_fuzzer.c target_link_libraries (mergetree_checksum_fuzzer PRIVATE dbms clickhouse_functions) clickhouse_add_executable (columns_description_fuzzer columns_description_fuzzer.cpp) -target_link_libraries (columns_description_fuzzer PRIVATE dbms clickhouse_functions) +target_link_libraries (columns_description_fuzzer PRIVATE clickhouse_functions) diff --git a/src/Storages/fuzzers/columns_description_fuzzer.cpp b/src/Storages/fuzzers/columns_description_fuzzer.cpp index 469bfe2fb38..e39afccd1f9 100644 --- a/src/Storages/fuzzers/columns_description_fuzzer.cpp +++ b/src/Storages/fuzzers/columns_description_fuzzer.cpp @@ -1,4 +1,5 @@ #include +#include #include diff --git a/src/configure_config.cmake b/src/configure_config.cmake index 5b24f79ef6e..5a1aa179e01 100644 --- a/src/configure_config.cmake +++ b/src/configure_config.cmake @@ -164,6 +164,9 @@ endif() if (TARGET ch_contrib::bcrypt) set(USE_BCRYPT 1) endif() +if (TARGET ch_contrib::usearch) + set(USE_USEARCH 1) +endif() if (TARGET ch_contrib::ssh) set(USE_SSH 1) endif() diff --git a/tests/ci/libfuzzer_test_check.py b/tests/ci/libfuzzer_test_check.py index d9e33229932..8f19dd7d023 100644 --- a/tests/ci/libfuzzer_test_check.py +++ b/tests/ci/libfuzzer_test_check.py @@ -75,7 +75,7 @@ def get_run_command( f"--volume={result_path}:/test_output " "--security-opt seccomp=unconfined " # required to issue io_uring sys-calls f"--cap-add=SYS_PTRACE {env_str} {additional_options_str} {image} " - "python3 ./utils/runner.py" + "python3 /usr/share/clickhouse-test/fuzz/runner.py" ) diff --git a/utils/libfuzzer/runner.py b/tests/fuzz/runner.py similarity index 75% rename from utils/libfuzzer/runner.py rename to tests/fuzz/runner.py index bbe648dbbc2..44259228f60 100644 --- a/utils/libfuzzer/runner.py +++ b/tests/fuzz/runner.py @@ -11,7 +11,7 @@ FUZZER_ARGS = os.getenv("FUZZER_ARGS", "") def run_fuzzer(fuzzer: str): - logging.info(f"Running fuzzer {fuzzer}...") + logging.info("Running fuzzer %s...", fuzzer) corpus_dir = f"{fuzzer}.in" with Path(corpus_dir) as path: @@ -29,28 +29,27 @@ def run_fuzzer(fuzzer: str): if parser.has_section("asan"): os.environ["ASAN_OPTIONS"] = ( - f"{os.environ['ASAN_OPTIONS']}:{':'.join('%s=%s' % (key, value) for key, value in parser['asan'].items())}" + f"{os.environ['ASAN_OPTIONS']}:{':'.join(f'{key}={value}' for key, value in parser['asan'].items())}" ) if parser.has_section("msan"): os.environ["MSAN_OPTIONS"] = ( - f"{os.environ['MSAN_OPTIONS']}:{':'.join('%s=%s' % (key, value) for key, value in parser['msan'].items())}" + f"{os.environ['MSAN_OPTIONS']}:{':'.join(f'{key}={value}' for key, value in parser['msan'].items())}" ) if parser.has_section("ubsan"): os.environ["UBSAN_OPTIONS"] = ( - f"{os.environ['UBSAN_OPTIONS']}:{':'.join('%s=%s' % (key, value) for key, value in parser['ubsan'].items())}" + f"{os.environ['UBSAN_OPTIONS']}:{':'.join(f'{key}={value}' for key, value in parser['ubsan'].items())}" ) if parser.has_section("libfuzzer"): custom_libfuzzer_options = " ".join( - "-%s=%s" % (key, value) - for key, value in parser["libfuzzer"].items() + f"-{key}={value}" for key, value in parser["libfuzzer"].items() ) if parser.has_section("fuzzer_arguments"): fuzzer_arguments = " ".join( - ("%s" % key) if value == "" else ("%s=%s" % (key, value)) + (f"{key}") if value == "" else (f"{key}={value}") for key, value in parser["fuzzer_arguments"].items() ) @@ -65,7 +64,7 @@ def run_fuzzer(fuzzer: str): cmd_line += " < /dev/null" - logging.info(f"...will execute: {cmd_line}") + logging.info("...will execute: %s", cmd_line) subprocess.check_call(cmd_line, shell=True) diff --git a/tests/fuzz/tcp_protocol_fuzzer.options b/tests/fuzz/tcp_protocol_fuzzer.options new file mode 100644 index 00000000000..4885669d91d --- /dev/null +++ b/tests/fuzz/tcp_protocol_fuzzer.options @@ -0,0 +1,4 @@ +[fuzzer_arguments] +--log-file=tcp_protocol_fuzzer.log +--= +--logging.terminal=0 diff --git a/tests/integration/test_server_keep_alive/__init__.py b/tests/integration/test_server_keep_alive/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_server_keep_alive/configs/keep_alive_settings.xml b/tests/integration/test_server_keep_alive/configs/keep_alive_settings.xml new file mode 100644 index 00000000000..06e68044817 --- /dev/null +++ b/tests/integration/test_server_keep_alive/configs/keep_alive_settings.xml @@ -0,0 +1,4 @@ + + 3600 + 5 + diff --git a/tests/integration/test_server_keep_alive/test.py b/tests/integration/test_server_keep_alive/test.py new file mode 100644 index 00000000000..e550319b6df --- /dev/null +++ b/tests/integration/test_server_keep_alive/test.py @@ -0,0 +1,55 @@ +import logging +import pytest +import random +import requests + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance("node", main_configs=["configs/keep_alive_settings.xml"]) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def test_max_keep_alive_requests_on_user_side(start_cluster): + # In this test we have `keep_alive_timeout` set to one hour to never trigger connection reset by timeout, `max_keep_alive_requests` is set to 5. + # We expect server to close connection after each 5 requests. We detect connection reset by change in src port. + # So the first 5 requests should come from the same port, the following 5 requests should come from another port. + + log_comments = [] + for _ in range(10): + rand_id = random.randint(0, 1000000) + log_comment = f"test_requests_with_keep_alive_{rand_id}" + log_comments.append(log_comment) + log_comments = sorted(log_comments) + + session = requests.Session() + for i in range(10): + session.get( + f"http://{node.ip_address}:8123/?query=select%201&log_comment={log_comments[i]}" + ) + + ports = node.query( + f""" + SYSTEM FLUSH LOGS; + + SELECT port + FROM system.query_log + WHERE log_comment IN ({", ".join(f"'{comment}'" for comment in log_comments)}) AND type = 'QueryFinish' + ORDER BY log_comment + """ + ).split("\n")[:-1] + + expected = 5 * [ports[0]] + [ports[5]] * 5 + + assert ports == expected diff --git a/tests/queries/0_stateless/00408_http_keep_alive.reference b/tests/queries/0_stateless/00408_http_keep_alive.reference index 17a7fd690a8..5402036bfd7 100644 --- a/tests/queries/0_stateless/00408_http_keep_alive.reference +++ b/tests/queries/0_stateless/00408_http_keep_alive.reference @@ -1,6 +1,6 @@ < Connection: Keep-Alive -< Keep-Alive: timeout=10 +< Keep-Alive: timeout=10, max=? < Connection: Keep-Alive -< Keep-Alive: timeout=10 +< Keep-Alive: timeout=10, max=? < Connection: Keep-Alive -< Keep-Alive: timeout=10 +< Keep-Alive: timeout=10, max=? diff --git a/tests/queries/0_stateless/00408_http_keep_alive.sh b/tests/queries/0_stateless/00408_http_keep_alive.sh index 4bd0e494eb8..4a1cb4ed712 100755 --- a/tests/queries/0_stateless/00408_http_keep_alive.sh +++ b/tests/queries/0_stateless/00408_http_keep_alive.sh @@ -6,9 +6,10 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) URL="${CLICKHOUSE_PORT_HTTP_PROTO}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_HTTP}/" -${CLICKHOUSE_CURL} -vsS "${URL}" --data-binary @- <<< "SELECT 1" 2>&1 | perl -lnE 'print if /Keep-Alive/'; -${CLICKHOUSE_CURL} -vsS "${URL}" --data-binary @- <<< " error here " 2>&1 | perl -lnE 'print if /Keep-Alive/'; -${CLICKHOUSE_CURL} -vsS "${URL}"ping 2>&1 | perl -lnE 'print if /Keep-Alive/'; +# the sed command here replaces the real number of left requests with a question mark, because it can vary and we don't really have control over it +${CLICKHOUSE_CURL} -vsS "${URL}" --data-binary @- <<< "SELECT 1" 2>&1 | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I' | grep -i 'keep-alive'; +${CLICKHOUSE_CURL} -vsS "${URL}" --data-binary @- <<< " error here " 2>&1 | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I' | grep -i 'keep-alive'; +${CLICKHOUSE_CURL} -vsS "${URL}"ping 2>&1 | perl -lnE 'print if /Keep-Alive/' | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I' | grep -i 'keep-alive'; # no keep-alive: ${CLICKHOUSE_CURL} -vsS "${URL}"404/not/found/ 2>&1 | perl -lnE 'print if /Keep-Alive/'; diff --git a/tests/queries/0_stateless/00501_http_head.reference b/tests/queries/0_stateless/00501_http_head.reference index 8351327b356..807bcd4922e 100644 --- a/tests/queries/0_stateless/00501_http_head.reference +++ b/tests/queries/0_stateless/00501_http_head.reference @@ -2,11 +2,11 @@ HTTP/1.1 200 OK Connection: Keep-Alive Content-Type: text/tab-separated-values; charset=UTF-8 Transfer-Encoding: chunked -Keep-Alive: timeout=10 +Keep-Alive: timeout=10, max=? HTTP/1.1 200 OK Connection: Keep-Alive Content-Type: text/tab-separated-values; charset=UTF-8 Transfer-Encoding: chunked -Keep-Alive: timeout=10 +Keep-Alive: timeout=10, max=? diff --git a/tests/queries/0_stateless/00501_http_head.sh b/tests/queries/0_stateless/00501_http_head.sh index 60283f26833..30da64c31f0 100755 --- a/tests/queries/0_stateless/00501_http_head.sh +++ b/tests/queries/0_stateless/00501_http_head.sh @@ -4,8 +4,9 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -( ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=SELECT%201"; - ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=select+*+from+system.numbers+limit+1000000" ) | grep -v "Date:" | grep -v "X-ClickHouse-Server-Display-Name:" | grep -v "X-ClickHouse-Query-Id:" | grep -v "X-ClickHouse-Format:" | grep -v "X-ClickHouse-Timezone:" +# the sed command here replaces the real number of left requests with a question mark, because it can vary and we don't really have control over it +( ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=SELECT%201" | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I'; + ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=select+*+from+system.numbers+limit+1000000" ) | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I' | grep -v "Date:" | grep -v "X-ClickHouse-Server-Display-Name:" | grep -v "X-ClickHouse-Query-Id:" | grep -v "X-ClickHouse-Format:" | grep -v "X-ClickHouse-Timezone:" if [[ $(${CLICKHOUSE_CURL} -sS -X POST -I "${CLICKHOUSE_URL}&query=SELECT+1" | grep -c '411 Length Required') -ne 1 ]]; then echo FAIL diff --git a/tests/queries/0_stateless/02354_vector_search_bugs.reference b/tests/queries/0_stateless/02354_vector_search_bugs.reference index d2c2d7e2fb7..a27b086e118 100644 --- a/tests/queries/0_stateless/02354_vector_search_bugs.reference +++ b/tests/queries/0_stateless/02354_vector_search_bugs.reference @@ -1,17 +1,5 @@ Issue #52258: Empty Arrays or Arrays with default values are rejected -- Annoy -- Usearch It is possible to create parts with different Array vector sizes but there will be an error at query time -- Annoy -- Usearch Correctness of index with > 1 mark -- Annoy 1 [1,0] 0 9000 [9000,0] 0 -1 (1,0) 0 -9000 (9000,0) 0 -- Usearch -1 [1,0] 0 -9000 [9000,0] 0 -1 (1,0) 0 -9000 (9000,0) 0 diff --git a/tests/queries/0_stateless/02354_vector_search_bugs.sql b/tests/queries/0_stateless/02354_vector_search_bugs.sql index f03c36f6550..7c66b4b8e45 100644 --- a/tests/queries/0_stateless/02354_vector_search_bugs.sql +++ b/tests/queries/0_stateless/02354_vector_search_bugs.sql @@ -1,61 +1,22 @@ --- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database, no-asan +-- Tags: no-fasttest, no-ordinary-database --- Tests vector search in ClickHouse, i.e. Annoy and Usearch indexes. Both index types share similarities in implementation and usage, --- therefore they are tested in a single file. - --- This file contains tests for various bugs and special cases - -SET allow_experimental_annoy_index = 1; -SET allow_experimental_usearch_index = 1; +-- Tests various bugs and special cases for vector indexes. +SET allow_experimental_vector_similarity_index = 1; SET enable_analyzer = 1; -- 0 vs. 1 produce slightly different error codes, make it future-proof DROP TABLE IF EXISTS tab; SELECT 'Issue #52258: Empty Arrays or Arrays with default values are rejected'; -SELECT '- Annoy'; - -CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree() ORDER BY (id); +CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree() ORDER BY id; INSERT INTO tab VALUES (1, []); -- { serverError INCORRECT_DATA } INSERT INTO tab (id) VALUES (1); -- { serverError INCORRECT_DATA } DROP TABLE tab; -CREATE TABLE tab (id UInt64, vec Tuple(Float32, Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree() ORDER BY (id); -INSERT INTO tab (id) VALUES (1); -- works fine, takes on default tuple (0.0, 0.0) -DROP TABLE tab; - -SELECT '- Usearch'; - -CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree() ORDER BY (id); -INSERT INTO tab VALUES (1, []); -- { serverError INCORRECT_DATA } -INSERT INTO tab (id) VALUES (1); -- { serverError INCORRECT_DATA } -DROP TABLE tab; - -CREATE TABLE tab (id UInt64, vec Tuple(Float32, Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree() ORDER BY (id); -INSERT INTO tab (id) VALUES (1); -- works fine, takes on default tuple (0.0, 0.0) -DROP TABLE tab; - SELECT 'It is possible to create parts with different Array vector sizes but there will be an error at query time'; -SELECT '- Annoy'; - -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -SYSTEM STOP MERGES tab; -INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2]); -INSERT INTO tab values (2, [2.2, 2.3, 2.4]) (3, [3.1, 3.2, 3.3]); - -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab -ORDER BY L2Distance(vec, reference_vec) -LIMIT 3; -- { serverError SIZES_OF_ARRAYS_DONT_MATCH } - -DROP TABLE tab; - -SELECT '- Usearch'; - -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; SYSTEM STOP MERGES tab; INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2]); INSERT INTO tab values (2, [2.2, 2.3, 2.4]) (3, [3.1, 3.2, 3.3]); @@ -70,9 +31,7 @@ DROP TABLE tab; SELECT 'Correctness of index with > 1 mark'; -SELECT '- Annoy'; - -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity=8192; -- disable adaptive granularity due to bug +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity = 8192; -- disable adaptive granularity due to bug INSERT INTO tab SELECT number, [toFloat32(number), 0.0] from numbers(10000); WITH [1.0, 0.0] AS reference_vec @@ -88,58 +47,3 @@ ORDER BY L2Distance(vec, reference_vec) LIMIT 1; DROP TABLE tab; - --- same, but with Tuples -CREATE TABLE tab(id Int32, vec Tuple(Float32, Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity=8192; -- disable adaptive granularity due to bug -INSERT INTO tab SELECT number, (toFloat32(number), 0.0) from numbers(10000); - -WITH (1.0, 0.0) AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab -ORDER BY L2Distance(vec, reference_vec) -LIMIT 1; - -WITH (9000.0, 0.0) AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab -ORDER BY L2Distance(vec, reference_vec) -LIMIT 1; - -DROP TABLE tab; - -SELECT '- Usearch'; - -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity=8192; -- disable adaptive granularity due to bug -INSERT INTO tab SELECT number, [toFloat32(number), 0.0] from numbers(10000); - -WITH [1.0, 0.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab -ORDER BY L2Distance(vec, reference_vec) -LIMIT 1; - -WITH [9000.0, 0.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab -ORDER BY L2Distance(vec, reference_vec) -LIMIT 1; - -DROP TABLE tab; - --- same, but with Tuples -CREATE TABLE tab(id Int32, vec Tuple(Float32, Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity=8192; -- disable adaptive granularity due to bug -INSERT INTO tab SELECT number, (toFloat32(number), 0.0) from numbers(10000); - -WITH (1.0, 0.0) AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab -ORDER BY L2Distance(vec, reference_vec) -LIMIT 1; - -WITH (9000.0, 0.0) AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab -ORDER BY L2Distance(vec, reference_vec) -LIMIT 1; - -DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_default_granularity.reference b/tests/queries/0_stateless/02354_vector_search_default_granularity.reference index 2f97ed72c52..ab3cc71ff6b 100644 --- a/tests/queries/0_stateless/02354_vector_search_default_granularity.reference +++ b/tests/queries/0_stateless/02354_vector_search_default_granularity.reference @@ -1,7 +1,2 @@ -Test the default index granularity for vector search indexes (CREATE TABLE AND ALTER TABLE), should be 100 million for Annoy and USearch -- Annoy -100000000 -100000000 -- Usearch 100000000 100000000 diff --git a/tests/queries/0_stateless/02354_vector_search_default_granularity.sql b/tests/queries/0_stateless/02354_vector_search_default_granularity.sql index f15554505f8..acb69cb6ff8 100644 --- a/tests/queries/0_stateless/02354_vector_search_default_granularity.sql +++ b/tests/queries/0_stateless/02354_vector_search_default_granularity.sql @@ -1,35 +1,18 @@ --- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database, no-asan +-- Tags: no-fasttest, no-ordinary-database --- Tests vector search in ClickHouse, i.e. Annoy and Usearch indexes. Both index types share similarities in implementation and usage, --- therefore they are tested in a single file. +-- Tests that vector search indexes use a (non-standard) index granularity of 100 mio by default. --- This file contains tests for the non-standard default granularity of vector search indexes. - -SET allow_experimental_annoy_index = 1; -SET allow_experimental_usearch_index = 1; - -SELECT 'Test the default index granularity for vector search indexes (CREATE TABLE AND ALTER TABLE), should be 100 million for Annoy and USearch'; - -SELECT '- Annoy'; +SET allow_experimental_vector_similarity_index = 1; +-- After CREATE TABLE DROP TABLE IF EXISTS tab; -CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE annoy) ENGINE=MergeTree ORDER BY id; -SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx'; - -DROP TABLE tab; -CREATE TABLE tab (id Int32, vec Array(Float32)) ENGINE=MergeTree ORDER BY id; -ALTER TABLE tab ADD INDEX idx(vec) TYPE annoy; -SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx'; - -SELECT '- Usearch'; - -DROP TABLE tab; -CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE usearch) ENGINE=MergeTree ORDER BY id; -SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx'; - -DROP TABLE tab; -CREATE TABLE tab (id Int32, vec Array(Float32)) ENGINE=MergeTree ORDER BY id; -ALTER TABLE tab ADD INDEX idx(vec) TYPE usearch; +CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; +SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx'; + +-- After ALTER TABLE +DROP TABLE tab; +CREATE TABLE tab (id Int32, vec Array(Float32)) ENGINE = MergeTree ORDER BY id; +ALTER TABLE tab ADD INDEX idx(vec) TYPE vector_similarity('hnsw', 'L2Distance'); SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx'; DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_detach_attach.reference b/tests/queries/0_stateless/02354_vector_search_detach_attach.reference new file mode 100644 index 00000000000..80eb091922e --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_detach_attach.reference @@ -0,0 +1,3 @@ +5 [0,2] 0 +6 [0,2.1] 0.09999990463256836 +7 [0,2.2] 0.20000004768371582 diff --git a/tests/queries/0_stateless/02354_vector_search_detach_attach.sql b/tests/queries/0_stateless/02354_vector_search_detach_attach.sql new file mode 100644 index 00000000000..f92eaddbbed --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_detach_attach.sql @@ -0,0 +1,20 @@ +-- Tags: no-fasttest, no-ordinary-database + +-- Tests that vector similarity indexes can be detached/attached. + +SET allow_experimental_vector_similarity_index = 1; + +DROP TABLE IF EXISTS tab; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; +INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); + +DETACH TABLE tab SYNC; +ATTACH TABLE tab; + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, L2Distance(vec, reference_vec) +FROM tab +ORDER BY L2Distance(vec, reference_vec) +LIMIT 3; + +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference index 43bc49e8adc..b6d034208d0 100644 --- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference +++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference @@ -1,8 +1,10 @@ -At most two index arguments -1st argument (distance function) must be String -Rejects unsupported distance functions -2nd argument (Annoy: number of trees, USearch: scalar kind) must be UInt64 (Annoy) / String (Usearch) -Rejects unsupported scalar kinds (only Usearch) +Two or six index arguments +1st argument (method) must be String and hnsw +2nd argument (distance function) must be String and L2Distance or cosineDistance +3nd argument (quantization), if given, must be String and f32, f16, ... +4nd argument (M), if given, must be UInt64 and > 1 +5nd argument (ef_construction), if given, must be UInt64 and > 0 +6nd argument (ef_search), if given, must be UInt64 and > 0 Must be created on single column -Must be created on Array(Float32) or Tuple(Float32, Float, ...) columns +Must be created on Array(Float32) columns Rejects INSERTs of Arrays with different sizes diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql index 6a4d6448629..7c2ddfe81fc 100644 --- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql +++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql @@ -1,63 +1,51 @@ --- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database, no-asan +-- Tags: no-fasttest, no-ordinary-database --- Tests vector search in ClickHouse, i.e. Annoy and Usearch indexes. Both index types share similarities in implementation and usage, --- therefore they are tested in a single file. +-- Tests that various conditions are checked during creation of vector search indexes. --- This file tests that various conditions are checked during creation of vector search indexes. - -SET allow_experimental_annoy_index = 1; -SET allow_experimental_usearch_index = 1; +SET allow_experimental_vector_similarity_index = 1; DROP TABLE IF EXISTS tab; -SELECT 'At most two index arguments'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('too', 'many', 'args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +SELECT 'Two or six index arguments'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant_have_one_arg')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'three_args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'more', 'than', 'six', 'args', '!')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -SELECT '1st argument (distance function) must be String'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +SELECT '1st argument (method) must be String and hnsw'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity(3, 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('not_hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } -SELECT 'Rejects unsupported distance functions'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy('invalidDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('invalidDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '2nd argument (distance function) must be String and L2Distance or cosineDistance'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'invalid_distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } -SELECT '2nd argument (Annoy: number of trees, USearch: scalar kind) must be UInt64 (Annoy) / String (Usearch)'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy('L2Distance', 'not an UInt64')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } - -SELECT 'Rejects unsupported scalar kinds (only Usearch)'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('L2Distance', 'invalidKind')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '3nd argument (quantization), if given, must be String and f32, f16, ...'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 1, 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'invalid', 2, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '4nd argument (M), if given, must be UInt64 and > 1'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 'invalid', 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '5nd argument (ef_construction), if given, must be UInt64 and > 0'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 'invalid', 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 0, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '6nd argument (ef_search), if given, must be UInt64 and > 0'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 1, 'invalid')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 1, 0)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } SELECT 'Must be created on single column'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } - -SELECT 'Must be created on Array(Float32) or Tuple(Float32, Float, ...) columns'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } +SELECT 'Must be created on Array(Float32) columns'; SET allow_suspicious_low_cardinality_types = 1; - -CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } - -CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } - -CREATE TABLE tab(id Int32, vec Tuple(Float64), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec Tuple(Float64), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } - -CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } - -CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec UInt64, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } SELECT 'Rejects INSERTs of Arrays with different sizes'; - -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; -INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA } -DROP TABLE tab; - -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA } DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.reference b/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.reference new file mode 100644 index 00000000000..030bfa9b1bd --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.reference @@ -0,0 +1,2 @@ +Annoy +Usearch diff --git a/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.sql b/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.sql new file mode 100644 index 00000000000..0889aa74f7a --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.sql @@ -0,0 +1,43 @@ +-- Indexes of type 'annoy' or 'usearch' are no longer supported. +-- Test what happens when ClickHouse encounters tables with the old index type. + +DROP TABLE IF EXISTS tab; + +SELECT 'Annoy'; + +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX vec_idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; + +INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); -- { serverError ILLEGAL_INDEX } + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, L2Distance(vec, reference_vec) +FROM tab +ORDER BY L2Distance(vec, reference_vec) +LIMIT 3; +-- (*) The search succeeds because the index contains no data (i.e. some shortcut) +-- If it had data (can't really test in SQL tests ...), this statement would also return an error, trust me. + +-- Detach and attach should work. +DETACH TABLE tab; +ATTACH TABLE tab; + +DROP TABLE tab; + +SELECT 'Usearch'; + +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX vec_idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; + +INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); -- { serverError ILLEGAL_INDEX } + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, L2Distance(vec, reference_vec) +FROM tab +ORDER BY L2Distance(vec, reference_vec) +LIMIT 3; +-- see above: (*) + +-- Detach and attach should work. +DETACH TABLE tab; +ATTACH TABLE tab; + +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_multiple_indexes.reference b/tests/queries/0_stateless/02354_vector_search_multiple_indexes.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02354_vector_search_multiple_indexes.sql b/tests/queries/0_stateless/02354_vector_search_multiple_indexes.sql new file mode 100644 index 00000000000..f1cfc041233 --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_multiple_indexes.sql @@ -0,0 +1,14 @@ +-- Tags: no-fasttest, no-ordinary-database + +-- Tests that multiple vector similarity indexes can be created on the same column (even if that makes no sense) + +SET allow_experimental_vector_similarity_index = 1; + +DROP TABLE IF EXISTS tab; +CREATE TABLE tab (id Int32, vec Array(Float32), PRIMARY KEY id, INDEX vec_idx(vec) TYPE vector_similarity('hnsw', 'L2Distance')); + +ALTER TABLE tab ADD INDEX idx(vec) TYPE minmax; +ALTER TABLE tab ADD INDEX vec_idx1(vec) TYPE vector_similarity('hnsw', 'cosineDistance'); +ALTER TABLE tab ADD INDEX vec_idx2(vec) TYPE vector_similarity('hnsw', 'L2Distance'); -- silly but creating the same index also works for non-vector indexes ... + +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference index 41c1915ecc3..7c8e4c0ca59 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.reference +++ b/tests/queries/0_stateless/02354_vector_search_queries.reference @@ -1,41 +1,14 @@ -ARRAY, 10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block -- Annoy: WHERE-type +10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block +- ORDER-BY-type 5 [0,2] 0 6 [0,2.1] 0.09999990463256836 7 [0,2.2] 0.20000004768371582 -- Annoy: ORDER-BY-type -5 [0,2] 0 -6 [0,2.1] 0.09999990463256836 -7 [0,2.2] 0.20000004768371582 -- Usearch: WHERE-type -5 [0,2] 0 -6 [0,2.1] 0.09999990463256836 -7 [0,2.2] 0.20000004768371582 -- Usearch: ORDER-BY-type -5 [0,2] 0 -6 [0,2.1] 0.09999990463256836 -7 [0,2.2] 0.20000004768371582 -- Annoy: WHERE-type, EXPLAIN -Expression ((Projection + Before ORDER BY)) - Limit (preliminary LIMIT (without OFFSET)) - Expression - ReadFromMergeTree (default.tab_annoy) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 1/1 - Skip - Name: idx - Description: annoy GRANULARITY 100000000 - Parts: 1/1 - Granules: 1/1 -- Annoy: ORDER-BY-type, EXPLAIN +- ORDER-BY-type, EXPLAIN Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_annoy) + ReadFromMergeTree (default.tab) Indexes: PrimaryKey Condition: true @@ -43,78 +16,20 @@ Expression (Projection) Granules: 1/1 Skip Name: idx - Description: annoy GRANULARITY 100000000 + Description: vector_similarity GRANULARITY 100000000 Parts: 1/1 Granules: 1/1 -- Usearch: WHERE-type, EXPLAIN -Expression ((Projection + Before ORDER BY)) - Limit (preliminary LIMIT (without OFFSET)) - Expression - ReadFromMergeTree (default.tab_usearch) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 1/1 - Skip - Name: idx - Description: usearch GRANULARITY 100000000 - Parts: 1/1 - Granules: 1/1 -- Usearch: ORDER-BY-type, EXPLAIN +12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block +- ORDER-BY-type +6 [0,2] 0 +7 [0,2.1] 0.09999990463256836 +8 [0,2.2] 0.20000004768371582 +- ORDER-BY-type, EXPLAIN Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_usearch) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 1/1 - Skip - Name: idx - Description: usearch GRANULARITY 100000000 - Parts: 1/1 - Granules: 1/1 -ARRAY vectors, 12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block -- Annoy: WHERE-type -6 [0,2] 0 -7 [0,2.1] 0.09999990463256836 -8 [0,2.2] 0.20000004768371582 -- Annoy: ORDER-BY-type -6 [0,2] 0 -7 [0,2.1] 0.09999990463256836 -8 [0,2.2] 0.20000004768371582 -- Usearch: WHERE-type -6 [0,2] 0 -7 [0,2.1] 0.09999990463256836 -8 [0,2.2] 0.20000004768371582 -- Usearch: ORDER-BY-type -6 [0,2] 0 -7 [0,2.1] 0.09999990463256836 -8 [0,2.2] 0.20000004768371582 -- Annoy: WHERE-type, EXPLAIN -Expression ((Projection + Before ORDER BY)) - Limit (preliminary LIMIT (without OFFSET)) - Expression - ReadFromMergeTree (default.tab_annoy) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 4/4 - Skip - Name: idx - Description: annoy GRANULARITY 2 - Parts: 1/1 - Granules: 1/4 -- Annoy: ORDER-BY-type, EXPLAIN -Expression (Projection) - Limit (preliminary LIMIT (without OFFSET)) - Sorting (Sorting for ORDER BY) - Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_annoy) + ReadFromMergeTree (default.tab) Indexes: PrimaryKey Condition: true @@ -122,77 +37,20 @@ Expression (Projection) Granules: 4/4 Skip Name: idx - Description: annoy GRANULARITY 2 + Description: vector_similarity GRANULARITY 2 Parts: 1/1 Granules: 2/4 -- Usearch: WHERE-type, EXPLAIN -Expression ((Projection + Before ORDER BY)) - Limit (preliminary LIMIT (without OFFSET)) - Expression - ReadFromMergeTree (default.tab_usearch) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 4/4 - Skip - Name: idx - Description: usearch GRANULARITY 2 - Parts: 1/1 - Granules: 1/4 -- Usearch: ORDER-BY-type, EXPLAIN -Expression (Projection) - Limit (preliminary LIMIT (without OFFSET)) - Sorting (Sorting for ORDER BY) - Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_usearch) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 4/4 - Skip - Name: idx - Description: usearch GRANULARITY 2 - Parts: 1/1 - Granules: 2/4 -TUPLE vectors and special cases -- Annoy: WHERE-type -0 (4.6,2.3) 0.5527864045000421 -1 (2,3.2) 0.15200169244542905 -2 (4.2,3.4) 0.37080174340866845 -- Annoy: ORDER-BY-type -6 (1,9.3) 0.005731362878640178 -1 (2,3.2) 0.15200169244542905 -7 (5.5,4.7) 0.3503476876550442 -- Usearch: WHERE-type -0 (4.6,2.3) 0.5527864045000421 -1 (2,3.2) 0.15200169244542905 -2 (4.2,3.4) 0.37080174340866845 -- Usearch: ORDER-BY-type -6 (1,9.3) 0.005731362878640178 -1 (2,3.2) 0.15200169244542905 -7 (5.5,4.7) 0.3503476876550442 -- Special case: MaximumDistance is negative -- Special case: MaximumDistance is negative -- Special case: setting "annoy_index_search_k_nodes" +Special cases +- ORDER-BY-type +6 [1,9.3] 0.005731362878640178 +1 [2,3.2] 0.15200169244542905 +7 [5.5,4.7] 0.3503476876550442 - Special case: setting "max_limit_for_ann_queries" Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_annoy) - Indexes: - PrimaryKey - Condition: true - Parts: 1/1 - Granules: 4/4 -- Special case: setting "max_limit_for_ann_queries" -Expression (Projection) - Limit (preliminary LIMIT (without OFFSET)) - Sorting (Sorting for ORDER BY) - Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_usearch) + ReadFromMergeTree (default.tab) Indexes: PrimaryKey Condition: true diff --git a/tests/queries/0_stateless/02354_vector_search_queries.sql b/tests/queries/0_stateless/02354_vector_search_queries.sql index 64051aa8544..dbf0fca32ab 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.sql +++ b/tests/queries/0_stateless/02354_vector_search_queries.sql @@ -1,238 +1,80 @@ --- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database, no-asan +-- Tags: no-fasttest, no-ordinary-database --- Tests vector search in ClickHouse, i.e. Annoy and Usearch indexes. Both index types share similarities in implementation and usage, --- therefore they are tested in a single file. +-- Tests various simple approximate nearest neighborhood (ANN) queries that utilize vector search indexes. --- This file tests various simple approximate nearest neighborhood (ANN) queries that utilize vector search indexes. +SET allow_experimental_vector_similarity_index = 1; -SET allow_experimental_annoy_index = 1; -SET allow_experimental_usearch_index = 1; +SET enable_analyzer = 0; -SELECT 'ARRAY, 10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block'; +SELECT '10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block'; -DROP TABLE IF EXISTS tab_annoy; -DROP TABLE IF EXISTS tab_usearch; +DROP TABLE IF EXISTS tab; -CREATE TABLE tab_annoy(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; -INSERT INTO tab_annoy VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); - -CREATE TABLE tab_usearch(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; -INSERT INTO tab_usearch VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; +INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); -SELECT '- Annoy: WHERE-type'; +SELECT '- ORDER-BY-type'; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_annoy -WHERE L2Distance(vec, reference_vec) < 1.0 -LIMIT 3; - -SELECT '- Annoy: ORDER-BY-type'; -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_annoy +FROM tab ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -SELECT '- Usearch: WHERE-type'; +SELECT '- ORDER-BY-type, EXPLAIN'; +EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_usearch -WHERE L2Distance(vec, reference_vec) < 1.0 -LIMIT 3; - -SELECT '- Usearch: ORDER-BY-type'; -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_usearch +FROM tab ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -SELECT '- Annoy: WHERE-type, EXPLAIN'; -EXPLAIN indexes=1 -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_annoy -WHERE L2Distance(vec, reference_vec) < 1.0 -LIMIT 3; +DROP TABLE tab; -SELECT '- Annoy: ORDER-BY-type, EXPLAIN'; -EXPLAIN indexes=1 + +SELECT '12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block'; + +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]); + +SELECT '- ORDER-BY-type'; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_annoy +FROM tab ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -SELECT '- Usearch: WHERE-type, EXPLAIN'; -EXPLAIN indexes=1 +SELECT '- ORDER-BY-type, EXPLAIN'; +EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_usearch -WHERE L2Distance(vec, reference_vec) < 1.0 -LIMIT 3; - -SELECT '- Usearch: ORDER-BY-type, EXPLAIN'; -EXPLAIN indexes=1 -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_usearch +FROM tab ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -DROP TABLE tab_annoy; -DROP TABLE tab_usearch; +DROP TABLE tab; -SELECT 'ARRAY vectors, 12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block'; +SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen. +-- Test with non-default metric, M, ef_construction, ef_search -CREATE TABLE tab_annoy(id Int32, vec Array(Float32), INDEX idx vec TYPE annoy() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -INSERT INTO tab_annoy VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]); +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99, 66) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -CREATE TABLE tab_usearch(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -INSERT INTO tab_usearch VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]); - -SELECT '- Annoy: WHERE-type'; +SELECT '- ORDER-BY-type'; WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_annoy -WHERE L2Distance(vec, reference_vec) < 1.0 -LIMIT 3; - -SELECT '- Annoy: ORDER-BY-type'; -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_annoy -ORDER BY L2Distance(vec, reference_vec) -LIMIT 3; - -SELECT '- Usearch: WHERE-type'; -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_usearch -WHERE L2Distance(vec, reference_vec) < 1.0 -LIMIT 3; - -SELECT '- Usearch: ORDER-BY-type'; -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_usearch -ORDER BY L2Distance(vec, reference_vec) -LIMIT 3; - -SELECT '- Annoy: WHERE-type, EXPLAIN'; -EXPLAIN indexes=1 -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_annoy -WHERE L2Distance(vec, reference_vec) < 1.0 -LIMIT 3; - -SELECT '- Annoy: ORDER-BY-type, EXPLAIN'; -EXPLAIN indexes=1 -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_annoy -ORDER BY L2Distance(vec, reference_vec) -LIMIT 3; - -SELECT '- Usearch: WHERE-type, EXPLAIN'; -EXPLAIN indexes=1 -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_usearch -WHERE L2Distance(vec, reference_vec) < 1.0 -LIMIT 3; - -SELECT '- Usearch: ORDER-BY-type, EXPLAIN'; -EXPLAIN indexes=1 -WITH [0.0, 2.0] AS reference_vec -SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_usearch -ORDER BY L2Distance(vec, reference_vec) -LIMIT 3; - -DROP TABLE tab_annoy; -DROP TABLE tab_usearch; - - -SELECT 'TUPLE vectors and special cases'; --- Not a systematic test, just to check that no bad things happen. --- Just for jun, use metric = 'cosineDistance' (Annoy/Usearch), tree_count = 200 (Annoy), scalarKind = 'f64' (Usearch) - -CREATE TABLE tab_annoy(id Int32, vec Tuple(Float32, Float32), INDEX idx vec TYPE annoy('cosineDistance', 200) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -INSERT INTO tab_annoy VALUES (0, (4.6, 2.3)), (1, (2.0, 3.2)), (2, (4.2, 3.4)), (3, (5.3, 2.9)), (4, (2.4, 5.2)), (5, (5.3, 2.3)), (6, (1.0, 9.3)), (7, (5.5, 4.7)), (8, (6.4, 3.5)), (9, (5.3, 2.5)), (10, (6.4, 3.4)), (11, (6.4, 3.2)); - -CREATE TABLE tab_usearch(id Int32, vec Tuple(Float32, Float32), INDEX idx vec TYPE usearch('cosineDistance', 'f64') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -INSERT INTO tab_usearch VALUES (0, (4.6, 2.3)), (1, (2.0, 3.2)), (2, (4.2, 3.4)), (3, (5.3, 2.9)), (4, (2.4, 5.2)), (5, (5.3, 2.3)), (6, (1.0, 9.3)), (7, (5.5, 4.7)), (8, (6.4, 3.5)), (9, (5.3, 2.5)), (10, (6.4, 3.4)), (11, (6.4, 3.2)); - -SELECT '- Annoy: WHERE-type'; -WITH (0.0, 2.0) AS reference_vec SELECT id, vec, cosineDistance(vec, reference_vec) -FROM tab_annoy -WHERE cosineDistance(vec, reference_vec) < 1.0 -LIMIT 3; - -SELECT '- Annoy: ORDER-BY-type'; -WITH (0.0, 2.0) AS reference_vec -SELECT id, vec, cosineDistance(vec, reference_vec) -FROM tab_annoy +FROM tab ORDER BY cosineDistance(vec, reference_vec) LIMIT 3; -SELECT '- Usearch: WHERE-type'; -WITH (0.0, 2.0) AS reference_vec -SELECT id, vec, cosineDistance(vec, reference_vec) -FROM tab_usearch -WHERE cosineDistance(vec, reference_vec) < 1.0 -LIMIT 3; - -SELECT '- Usearch: ORDER-BY-type'; -WITH (0.0, 2.0) AS reference_vec -SELECT id, vec, cosineDistance(vec, reference_vec) -FROM tab_usearch -ORDER BY cosineDistance(vec, reference_vec) -LIMIT 3; - -SELECT '- Special case: MaximumDistance is negative'; -WITH (0.0, 2.0) as reference_vec -SELECT id, vec, cosineDistance(vec, reference_vec) -FROM tab_annoy -WHERE cosineDistance(vec, reference_vec) < -1.0 -LIMIT 3; -- { serverError INCORRECT_QUERY } - -SELECT '- Special case: MaximumDistance is negative'; -WITH (0.0, 2.0) as reference_vec -SELECT id, vec, cosineDistance(vec, reference_vec) -FROM tab_usearch -WHERE cosineDistance(vec, reference_vec) < -1.0 -LIMIT 3; -- { serverError INCORRECT_QUERY } - -SELECT '- Special case: setting "annoy_index_search_k_nodes"'; -WITH (0.0, 2.0) as reference_vec -SELECT id, vec, cosineDistance(vec, reference_vec) -FROM tab_annoy -ORDER BY cosineDistance(vec, reference_vec) -LIMIT 3 -SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results - SELECT '- Special case: setting "max_limit_for_ann_queries"'; EXPLAIN indexes=1 -WITH (0.0, 2.0) as reference_vec +WITH [0.0, 2.0] as reference_vec SELECT id, vec, cosineDistance(vec, reference_vec) -FROM tab_annoy +FROM tab ORDER BY cosineDistance(vec, reference_vec) LIMIT 3 -SETTINGS max_limit_for_ann_queries=2; -- LIMIT 3 > 2 --> don't use the ann index +SETTINGS max_limit_for_ann_queries = 2; -- LIMIT 3 > 2 --> don't use the ann index -SELECT '- Special case: setting "max_limit_for_ann_queries"'; -EXPLAIN indexes=1 -WITH (0.0, 2.0) as reference_vec -SELECT id, vec, cosineDistance(vec, reference_vec) -FROM tab_usearch -ORDER BY cosineDistance(vec, reference_vec) -LIMIT 3 -SETTINGS max_limit_for_ann_queries=2; -- LIMIT 3 > 2 --> don't use the ann index - -DROP TABLE tab_annoy; -DROP TABLE tab_usearch; +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.reference b/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.sql b/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.sql new file mode 100644 index 00000000000..da6494bf831 --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.sql @@ -0,0 +1,23 @@ +-- Tags: no-fasttest, no-ordinary-database + +SET allow_experimental_vector_similarity_index = 1; + +-- Tests that quoted and unquoted parameters can be passed to vector search indexes. + +DROP TABLE IF EXISTS tab1; +DROP TABLE IF EXISTS tab2; + +CREATE TABLE tab1 (id Int32, vec Array(Float32), PRIMARY KEY id, INDEX vec_idx(vec) TYPE vector_similarity('hnsw', 'L2Distance')); +CREATE TABLE tab2 (id Int32, vec Array(Float32), PRIMARY KEY id, INDEX vec_idx(vec) TYPE vector_similarity(hnsw, L2Distance)); + +DROP TABLE tab1; +DROP TABLE tab2; + +CREATE TABLE tab1 (id Int32, vec Array(Float32), PRIMARY KEY id); +CREATE TABLE tab2 (id Int32, vec Array(Float32), PRIMARY KEY id); + +ALTER TABLE tab1 ADD INDEX idx1(vec) TYPE vector_similarity('hnsw', 'L2Distance'); +ALTER TABLE tab2 ADD INDEX idx2(vec) TYPE vector_similarity(hnsw, L2Distance); + +DROP TABLE tab1; +DROP TABLE tab2;