diff --git a/.gitmodules b/.gitmodules index 86fd7832dd9..36721723371 100644 --- a/.gitmodules +++ b/.gitmodules @@ -347,3 +347,6 @@ [submodule "contrib/incbin"] path = contrib/incbin url = https://github.com/graphitemaster/incbin.git +[submodule "contrib/usearch"] + path = contrib/usearch + url = https://github.com/unum-cloud/usearch.git diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 16135351cce..abecfcb30c8 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -196,6 +196,7 @@ if (ARCH_S390X) add_contrib(crc32-s390x-cmake crc32-s390x) endif() add_contrib (annoy-cmake annoy) +add_contrib (usearch-cmake usearch) add_contrib (xxHash-cmake xxHash) add_contrib (libbcrypt-cmake libbcrypt) diff --git a/contrib/usearch b/contrib/usearch new file mode 160000 index 00000000000..387b78b28b1 --- /dev/null +++ b/contrib/usearch @@ -0,0 +1 @@ +Subproject commit 387b78b28b17b8954024ffc81e97cbcfa10d1f30 diff --git a/contrib/usearch-cmake/CMakeLists.txt b/contrib/usearch-cmake/CMakeLists.txt new file mode 100644 index 00000000000..f4ed6a9adca --- /dev/null +++ b/contrib/usearch-cmake/CMakeLists.txt @@ -0,0 +1,15 @@ +option(ENABLE_USEARCH "Enable USearch (Approximate Neighborhood Search, HNSW) support" ${ENABLE_LIBRARIES}) + +if (NOT ENABLE_USEARCH) + message (STATUS "Not using usearch") + return() +endif() + +set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch") +set(USEARCH_SOURCE_DIR "${USEARCH_PROJECT_DIR}/include") + +add_library(_usearch INTERFACE) +target_include_directories(_usearch SYSTEM INTERFACE ${USEARCH_PROJECT_DIR}/fp16/include ${USEARCH_PROJECT_DIR}/robin-map/include ${USEARCH_PROJECT_DIR}/simsimd/include ${USEARCH_SOURCE_DIR}) + +add_library(ch_contrib::usearch ALIAS _usearch) +target_compile_definitions(_usearch INTERFACE ENABLE_USEARCH) \ No newline at end of file diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 81c69215472..4fd1d0d17db 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -142,6 +142,8 @@ was specified for ANN indexes, the default value is 100 million. - [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy) +- [USearch](/docs/en/engines/table-engines/mergetree-family/annindexes.md#usearch-usearch) + ## Annoy {#annoy} Annoy indexes are currently experimental, to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently @@ -216,3 +218,42 @@ ORDER BY L2Distance(vectors, Point) LIMIT N SETTINGS annoy_index_search_k_nodes=100; ``` + + +## USearch {#usearch} + +USearch indexes are currently experimental, to use them you first need to `SET allow_experimental_usearch_index = 1`. + +This type of ANN index implements [the HNSW algorithm](https://github.com/unum-cloud/usearch). + +Syntax to create an USearch index over an [Array](../../../sql-reference/data-types/array.md) column: + +```sql +CREATE TABLE table_with_usearch_index +( + id Int64, + vectors Array(Float32), + INDEX [ann_index_name] vectors TYPE usearch([Distance]) [GRANULARITY N] +) +ENGINE = MergeTree +ORDER BY id; +``` + +Syntax to create an ANN index over a [Tuple](../../../sql-reference/data-types/tuple.md) column: + +```sql +CREATE TABLE table_with_usearch_index +( + id Int64, + vectors Tuple(Float32[, Float32[, ...]]), + INDEX [ann_index_name] vectors TYPE usearch([Distance]) [GRANULARITY N] +) +ENGINE = MergeTree +ORDER BY id; +``` + +USearch currently supports two distance functions: +- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space + ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)). +- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors + ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ddb6fcebd23..e4e9fd19cff 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -599,6 +599,10 @@ if (TARGET ch_contrib::annoy) dbms_target_link_libraries(PUBLIC ch_contrib::annoy) endif() +if (TARGET ch_contrib::usearch) + dbms_target_link_libraries(PUBLIC ch_contrib::usearch) +endif() + if (TARGET ch_rust::skim) dbms_target_include_directories(PRIVATE $) dbms_target_link_libraries(PUBLIC ch_rust::skim) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 714ac17a15d..14714981c00 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -772,6 +772,7 @@ class IColumn; M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ + M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ diff --git a/src/Storages/MergeTree/MergeTreeIndexHnsw.cpp b/src/Storages/MergeTree/MergeTreeIndexHnsw.cpp new file mode 100644 index 00000000000..67726f41e8c --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeIndexHnsw.cpp @@ -0,0 +1,351 @@ +#ifdef ENABLE_USEARCH + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int INCORRECT_DATA; + extern const int INCORRECT_NUMBER_OF_COLUMNS; + extern const int INCORRECT_QUERY; + extern const int LOGICAL_ERROR; +} + + +template +USearchIndexWithSerialization::USearchIndexWithSerialization(size_t dimensions) + : Base(Base::make(unum::usearch::metric_punned_t(dimensions, Metric))) +{ +} + +template +void USearchIndexWithSerialization::serialize([[maybe_unused]] WriteBuffer & ostr) const +{ + auto callback = [&ostr](void * from, size_t n) + { + ostr.write(reinterpret_cast(from), n); + return true; + }; + + Base::stream(callback); +} + +template +void USearchIndexWithSerialization::deserialize([[maybe_unused]] ReadBuffer & istr) +{ + BufferBase::Position & pos = istr.position(); + unum::usearch::memory_mapped_file_t memory_map(pos, istr.buffer().size() - istr.count()); + Base::view(std::move(memory_map)); + pos += Base::stream_length(); + + auto copy = Base::copy(); + if (!copy) + throw std::runtime_error("Can't copy index"); + Base::swap(copy.index); +} + +template +size_t USearchIndexWithSerialization::getDimensions() const +{ + return Base::dimensions(); +} + + +template +MergeTreeIndexGranuleUSearch::MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_) + : index_name(index_name_), index_sample_block(index_sample_block_), index(nullptr) +{ +} + +template +MergeTreeIndexGranuleUSearch::MergeTreeIndexGranuleUSearch( + const String & index_name_, const Block & index_sample_block_, USearchIndexWithSerializationPtr index_) + : index_name(index_name_), index_sample_block(index_sample_block_), index(std::move(index_)) +{ +} + +template +void MergeTreeIndexGranuleUSearch::serializeBinary(WriteBuffer & ostr) const +{ + /// Number of dimensions is required in the index constructor, + /// so it must be written and read separately from the other part + writeIntBinary(static_cast(index->getDimensions()), ostr); // write dimension + index->serialize(ostr); +} + +template +void MergeTreeIndexGranuleUSearch::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/) +{ + UInt64 dimension; + readIntBinary(dimension, istr); + index = std::make_shared>(dimension); + index->deserialize(istr); +} + +template +MergeTreeIndexAggregatorUSearch::MergeTreeIndexAggregatorUSearch(const String & index_name_, const Block & index_sample_block_) + : index_name(index_name_), index_sample_block(index_sample_block_) +{ +} + +template +MergeTreeIndexGranulePtr MergeTreeIndexAggregatorUSearch::getGranuleAndReset() +{ + auto granule = std::make_shared>(index_name, index_sample_block, index); + index = nullptr; + return granule; +} + +template +void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t * pos, size_t limit) +{ + if (*pos >= block.rows()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "The provided position is not less than the number of block rows. Position: {}, Block rows: {}.", + *pos, + block.rows()); + + size_t rows_read = std::min(limit, block.rows() - *pos); + if (rows_read == 0) + return; + + if (index_sample_block.columns() > 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column"); + + const String & index_column_name = index_sample_block.getByPosition(0).name; + ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read); + + if (const auto & column_array = typeid_cast(column_cut.get())) + { + const auto & data = column_array->getData(); + const auto & array = typeid_cast(data).getData(); + + if (array.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read); + + const auto & offsets = column_array->getOffsets(); + const size_t num_rows = offsets.size(); + + + /// Check all sizes are the same + size_t size = offsets[0]; + for (size_t i = 0; i < num_rows - 1; ++i) + if (offsets[i + 1] - offsets[i] != size) + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name); + + + index = std::make_shared>(size); + + /// Add all rows of block + index->reserve(unum::usearch::ceil2(index->size() + num_rows + 1)); + index->add(index->size(), array.data()); + for (size_t current_row = 1; current_row < num_rows; ++current_row) + index->add(index->size(), &array[offsets[current_row - 1]]); + } + else if (const auto & column_tuple = typeid_cast(column_cut.get())) + { + const auto & columns = column_tuple->getColumns(); + std::vector> data{column_tuple->size(), std::vector()}; + for (const auto & column : columns) + { + const auto & pod_array = typeid_cast(column.get())->getData(); + for (size_t i = 0; i < pod_array.size(); ++i) + data[i].push_back(pod_array[i]); + } + + if (data.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read); + + index = std::make_shared>(data[0].size()); + index->reserve(unum::usearch::ceil2(index->size() + data.size())); + for (const auto & item : data) + index->add(index->size(), item.data()); + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array or Tuple column"); + + *pos += rows_read; +} + + +MergeTreeIndexConditionUSearch::MergeTreeIndexConditionUSearch( + const IndexDescription & /*index_description*/, const SelectQueryInfo & query, const String & distance_function_, ContextPtr context) + : ann_condition(query, context), distance_function(distance_function_) +{ +} + +bool MergeTreeIndexConditionUSearch::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const +{ + throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes"); +} + +bool MergeTreeIndexConditionUSearch::alwaysUnknownOrTrue() const +{ + return ann_condition.alwaysUnknownOrTrue(distance_function); +} + +std::vector MergeTreeIndexConditionUSearch::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const +{ + if (distance_function == "L2Distance") + return getUsefulRangesImpl(idx_granule); + else if (distance_function == "cosineDistance") + return getUsefulRangesImpl(idx_granule); + std::unreachable(); +} + +template +std::vector MergeTreeIndexConditionUSearch::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const +{ + const UInt64 limit = ann_condition.getLimit(); + const UInt64 index_granularity = ann_condition.getIndexGranularity(); + const std::optional comparison_distance = ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where + ? std::optional(ann_condition.getComparisonDistanceForWhereQuery()) + : std::nullopt; + + if (comparison_distance && comparison_distance.value() < 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance"); + + const std::vector reference_vector = ann_condition.getReferenceVector(); + const auto granule = std::dynamic_pointer_cast>(idx_granule); + if (granule == nullptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); + + const USearchIndexWithSerializationPtr index = granule->index; + if (ann_condition.getDimensions() != index->dimensions()) + throw Exception( + ErrorCodes::INCORRECT_QUERY, + "The dimension of the space in the request ({}) " + "does not match the dimension in the index ({})", + ann_condition.getDimensions(), + index->dimensions()); + + auto result = index->search(reference_vector.data(), limit); + std::vector neighbors(result.size()); /// indexes of dots which were closest to the reference vector + std::vector distances(result.size()); + result.dump_to(neighbors.data(), distances.data()); + + std::vector granule_numbers; + granule_numbers.reserve(neighbors.size()); + for (size_t i = 0; i < neighbors.size(); ++i) + { + if (comparison_distance && distances[i] > comparison_distance) + continue; + granule_numbers.push_back(neighbors[i] / index_granularity); + } + + /// make unique + std::sort(granule_numbers.begin(), granule_numbers.end()); + granule_numbers.erase(std::unique(granule_numbers.begin(), granule_numbers.end()), granule_numbers.end()); + + return granule_numbers; +} + +MergeTreeIndexUSearch::MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_) + : IMergeTreeIndex(index_), distance_function(distance_function_) +{ +} + + +MergeTreeIndexGranulePtr MergeTreeIndexUSearch::createIndexGranule() const +{ + if (distance_function == "L2Distance") + return std::make_shared>(index.name, index.sample_block); + else if (distance_function == "cosineDistance") + return std::make_shared>(index.name, index.sample_block); + std::unreachable(); +} + +MergeTreeIndexAggregatorPtr MergeTreeIndexUSearch::createIndexAggregator() const +{ + if (distance_function == "L2Distance") + return std::make_shared>(index.name, index.sample_block); + else if (distance_function == "cosineDistance") + return std::make_shared>(index.name, index.sample_block); + std::unreachable(); +} + +MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const +{ + return std::make_shared(index, query, distance_function, context); +}; + +MergeTreeIndexPtr usearchIndexCreator(const IndexDescription & index) +{ + static constexpr auto default_distance_function = "L2Distance"; + String distance_function = default_distance_function; + if (!index.arguments.empty()) + distance_function = index.arguments[0].get(); + + return std::make_shared(index, distance_function); +} + +void usearchIndexValidator(const IndexDescription & index, bool /* attach */) +{ + /// Check number and type of USearch index arguments: + + if (index.arguments.size() > 1) + throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index must not have more than one parameters"); + + if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance function argument of USearch index must be of type String"); + + /// Check that the index is created on a single column + + if (index.column_names.size() != 1 || index.data_types.size() != 1) + throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "USearch indexes must be created on a single column"); + + /// Check that a supported metric was passed as first argument + + if (!index.arguments.empty()) + { + String distance_name = index.arguments[0].get(); + if (distance_name != "L2Distance" && distance_name != "cosineDistance") + throw Exception(ErrorCodes::INCORRECT_DATA, "USearch index only supports distance functions 'L2Distance' and 'cosineDistance'"); + } + + /// Check data type of indexed column: + + auto throw_unsupported_underlying_column_exception = []() + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, "USearch indexes can only be created on columns of type Array(Float32) and Tuple(Float32)"); + }; + + DataTypePtr data_type = index.sample_block.getDataTypes()[0]; + + if (const auto * data_type_array = typeid_cast(data_type.get())) + { + TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); + if (!WhichDataType(nested_type_index).isFloat32()) + throw_unsupported_underlying_column_exception(); + } + else if (const auto * data_type_tuple = typeid_cast(data_type.get())) + { + const DataTypes & inner_types = data_type_tuple->getElements(); + for (const auto & inner_type : inner_types) + { + TypeIndex nested_type_index = inner_type->getTypeId(); + if (!WhichDataType(nested_type_index).isFloat32()) + throw_unsupported_underlying_column_exception(); + } + } + else + throw_unsupported_underlying_column_exception(); +} +} + +#endif diff --git a/src/Storages/MergeTree/MergeTreeIndexHnsw.h b/src/Storages/MergeTree/MergeTreeIndexHnsw.h new file mode 100644 index 00000000000..6be4a649ebe --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeIndexHnsw.h @@ -0,0 +1,103 @@ +#pragma once + +#ifdef ENABLE_USEARCH + +#include + +#include + +namespace DB +{ + +template +class USearchIndexWithSerialization : public unum::usearch::index_dense_t +{ + using Base = unum::usearch::index_dense_t; + +public: + explicit USearchIndexWithSerialization(size_t dimensions); + void serialize(WriteBuffer & ostr) const; + void deserialize(ReadBuffer & istr); + size_t getDimensions() const; +}; + +template +using USearchIndexWithSerializationPtr = std::shared_ptr>; + +template +struct MergeTreeIndexGranuleUSearch final : public IMergeTreeIndexGranule +{ + MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_); + MergeTreeIndexGranuleUSearch( + const String & index_name_, const Block & index_sample_block_, USearchIndexWithSerializationPtr index_); + + ~MergeTreeIndexGranuleUSearch() override = default; + + void serializeBinary(WriteBuffer & ostr) const override; + void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override; + + bool empty() const override { return !index.get(); } + + const String index_name; + const Block index_sample_block; + USearchIndexWithSerializationPtr index; +}; + +template +struct MergeTreeIndexAggregatorUSearch final : IMergeTreeIndexAggregator +{ + MergeTreeIndexAggregatorUSearch(const String & index_name_, const Block & index_sample_block); + ~MergeTreeIndexAggregatorUSearch() override = default; + + bool empty() const override { return !index || index->size() == 0; } + MergeTreeIndexGranulePtr getGranuleAndReset() override; + void update(const Block & block, size_t * pos, size_t limit) override; + + const String index_name; + const Block index_sample_block; + USearchIndexWithSerializationPtr index; +}; + + +class MergeTreeIndexConditionUSearch final : public IMergeTreeIndexConditionApproximateNearestNeighbor +{ +public: + MergeTreeIndexConditionUSearch( + const IndexDescription & index_description, const SelectQueryInfo & query, const String & distance_function, ContextPtr context); + + ~MergeTreeIndexConditionUSearch() override = default; + + bool alwaysUnknownOrTrue() const override; + bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override; + std::vector getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override; + +private: + template + std::vector getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const; + + const ApproximateNearestNeighborCondition ann_condition; + const String distance_function; +}; + + +class MergeTreeIndexUSearch : public IMergeTreeIndex +{ +public: + MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_); + + ~MergeTreeIndexUSearch() override = default; + + MergeTreeIndexGranulePtr createIndexGranule() const override; + MergeTreeIndexAggregatorPtr createIndexAggregator() const override; + MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const override; + + bool mayBenefitFromIndexForIn(const ASTPtr & /*node*/) const override { return false; } + +private: + const String distance_function; +}; + + +} + +#endif diff --git a/src/Storages/MergeTree/MergeTreeIndices.cpp b/src/Storages/MergeTree/MergeTreeIndices.cpp index 6ae96d00171..322cdd35afe 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.cpp +++ b/src/Storages/MergeTree/MergeTreeIndices.cpp @@ -132,6 +132,11 @@ MergeTreeIndexFactory::MergeTreeIndexFactory() registerValidator("annoy", annoyIndexValidator); #endif +#ifdef ENABLE_USEARCH + registerCreator("usearch", usearchIndexCreator); + registerValidator("usearch", usearchIndexValidator); +#endif + registerCreator("inverted", invertedIndexCreator); registerValidator("inverted", invertedIndexValidator); diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h index 1ad6b082223..40128bab9d0 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.h +++ b/src/Storages/MergeTree/MergeTreeIndices.h @@ -238,6 +238,11 @@ MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index); void annoyIndexValidator(const IndexDescription & index, bool attach); #endif +#ifdef ENABLE_USEARCH +MergeTreeIndexPtr usearchIndexCreator(const IndexDescription& index); +void usearchIndexValidator(const IndexDescription& index, bool attach); +#endif + MergeTreeIndexPtr invertedIndexCreator(const IndexDescription& index); void invertedIndexValidator(const IndexDescription& index, bool attach); diff --git a/tests/queries/0_stateless/02354_hnsw_index.reference b/tests/queries/0_stateless/02354_hnsw_index.reference new file mode 100644 index 00000000000..102edf6d026 --- /dev/null +++ b/tests/queries/0_stateless/02354_hnsw_index.reference @@ -0,0 +1,143 @@ +--- Negative tests --- +--- Test default GRANULARITY (should be 100 mio. for usearch)--- +CREATE TABLE default.tab\n(\n `id` Int32,\n `vector` Array(Float32),\n INDEX usearch_index vector TYPE usearch GRANULARITY 1\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192 +CREATE TABLE default.tab\n(\n `id` Int32,\n `vector` Array(Float32),\n INDEX usearch_index vector TYPE usearch GRANULARITY 1\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192 +--- Test with Array, GRANULARITY = 1, index_granularity = 5 --- +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: usearch_index + Description: usearch GRANULARITY 1 + Parts: 1/1 + Granules: 1/3 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: usearch_index + Description: usearch GRANULARITY 1 + Parts: 1/1 + Granules: 3/3 +Reference ARRAYs with non-matching dimension are rejected +Special case: MaximumDistance is negative +WHERE type, L2Distance +Special case: setting max_limit_for_ann_queries +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 +--- Test with Tuple, GRANULARITY = 1, index_granularity = 5 --- +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: usearch_index + Description: usearch GRANULARITY 1 + Parts: 1/1 + Granules: 1/3 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 3/3 + Skip + Name: usearch_index + Description: usearch GRANULARITY 1 + Parts: 1/1 + Granules: 3/3 +--- Test non-default metric (cosine distance) --- +--- Test with Array, GRANULARITY = 2, index_granularity = 4 --- +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: usearch_index + Description: usearch GRANULARITY 2 + Parts: 0/1 + Granules: 2/4 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: usearch_index + Description: usearch GRANULARITY 2 + Parts: 1/1 + Granules: 4/4 +--- Test with Array, GRANULARITY = 4, index_granularity = 4 --- +WHERE type, L2Distance, check that index is used +Expression ((Projection + Before ORDER BY)) + Limit (preliminary LIMIT (without OFFSET)) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: usearch_index + Description: usearch GRANULARITY 4 + Parts: 0/1 + Granules: 3/4 +ORDER BY type, L2Distance, check that index is used +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: usearch_index + Description: usearch GRANULARITY 4 + Parts: 1/1 + Granules: 4/4 diff --git a/tests/queries/0_stateless/02354_hnsw_index.sql b/tests/queries/0_stateless/02354_hnsw_index.sql new file mode 100644 index 00000000000..059705254d8 --- /dev/null +++ b/tests/queries/0_stateless/02354_hnsw_index.sql @@ -0,0 +1,229 @@ +-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check + +SET allow_experimental_usearch_index = 1; +SET allow_experimental_analyzer = 0; + +SELECT '--- Negative tests ---'; + +DROP TABLE IF EXISTS tab; + +-- must have at most 2 arguments +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } + +-- first argument (distance_function) must be String +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } + +-- must be created on single column +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index (vector, id) TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } + +-- reject unsupported distance functions +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch('wormholeDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } + +-- must be created on Array/Tuple(Float32) columns +SET allow_suspicious_low_cardinality_types = 1; +CREATE TABLE tab(id Int32, vector Float32, INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vector Array(Float64), INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vector Tuple(Float64), INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vector LowCardinality(Float32), INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vector Nullable(Float32), INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } + +SELECT '--- Test default GRANULARITY (should be 100 mio. for usearch)---'; + +CREATE TABLE tab (id Int32, vector Array(Float32), INDEX usearch_index(vector) TYPE usearch) ENGINE=MergeTree ORDER BY id; +SHOW CREATE TABLE tab; +DROP TABLE tab; + +CREATE TABLE tab (id Int32, vector Array(Float32)) ENGINE=MergeTree ORDER BY id; +ALTER TABLE tab ADD INDEX usearch_index(vector) TYPE usearch; +SHOW CREATE TABLE tab; + +DROP TABLE tab; + + +SELECT '--- Test with Array, GRANULARITY = 1, index_granularity = 5 ---'; + +DROP TABLE IF EXISTS tab; +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch() GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); + +-- rows = 15, index_granularity = 5, GRANULARITY = 1 gives 3 usearch-indexed blocks (each comprising a single granule) +-- condition 'L2Distance(vector, reference_vector) < 1.0' ensures that only one usearch-indexed block produces results --> "Granules: 1/3" + +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0 +-- LIMIT 3; + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0 +LIMIT 3; + +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, [0.0, 0.0, 10.0]) +-- LIMIT 3; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, [0.0, 0.0, 10.0]) +LIMIT 3; + +-- Test special cases. Corresponding special case tests are omitted from later tests. + +SELECT 'Reference ARRAYs with non-matching dimension are rejected'; +SELECT * +FROM tab +ORDER BY L2Distance(vector, [0.0, 0.0]) +LIMIT 3; -- { serverError INCORRECT_QUERY } + +SELECT 'Special case: MaximumDistance is negative'; +SELECT 'WHERE type, L2Distance'; +SELECT * +FROM tab +WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < -1.0 +LIMIT 3; -- { serverError INCORRECT_QUERY } + +SELECT 'Special case: setting max_limit_for_ann_queries'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, [5.3, 7.3, 2.1]) +LIMIT 3 +SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index + +DROP TABLE tab; + +-- Test Tuple embeddings. Triggers different logic than Array inside MergeTreeIndexUSearch but the same logic as Array above MergeTreeIndexUSearch. +-- Therefore test Tuple case just once. + +SELECT '--- Test with Tuple, GRANULARITY = 1, index_granularity = 5 ---'; + +CREATE TABLE tab(id Int32, vector Tuple(Float32, Float32, Float32), INDEX usearch_index vector TYPE usearch() GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5; +INSERT INTO tab VALUES (1, (0.0, 0.0, 10.0)), (2, (0.0, 0.0, 10.5)), (3, (0.0, 0.0, 9.5)), (4, (0.0, 0.0, 9.7)), (5, (0.0, 0.0, 10.2)), (6, (10.0, 0.0, 0.0)), (7, (9.5, 0.0, 0.0)), (8, (9.7, 0.0, 0.0)), (9, (10.2, 0.0, 0.0)), (10, (10.5, 0.0, 0.0)), (11, (0.0, 10.0, 0.0)), (12, (0.0, 9.5, 0.0)), (13, (0.0, 9.7, 0.0)), (14, (0.0, 10.2, 0.0)), (15, (0.0, 10.5, 0.0)); + +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, (0.0, 0.0, 10.0)) < 1.0 +-- LIMIT 3; + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(vector, (0.0, 0.0, 10.0)) < 1.0 +LIMIT 3; + +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, (0.0, 0.0, 10.0)) +-- LIMIT 3; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, (0.0, 0.0, 10.0)) +LIMIT 3; + +DROP TABLE tab; + +-- Not a systematic test, just to make sure no bad things happen +SELECT '--- Test non-default metric (cosine distance) ---'; + +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch('cosineDistance') GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]); + +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0 +-- LIMIT 3; + +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, [0.0, 0.0, 10.0]) +-- LIMIT 3; + +DROP TABLE tab; + +SELECT '--- Test with Array, GRANULARITY = 2, index_granularity = 4 ---'; + +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0, 0.0]), (2, [0.0, 0.0, 10.5, 0.0]), (3, [0.0, 0.0, 9.5, 0.0]), (4, [0.0, 0.0, 9.7, 0.0]), (5, [10.0, 0.0, 0.0, 0.0]), (6, [9.5, 0.0, 0.0, 0.0]), (7, [9.7, 0.0, 0.0, 0.0]), (8, [10.2, 0.0, 0.0, 0.0]), (9, [0.0, 10.0, 0.0, 0.0]), (10, [0.0, 9.5, 0.0, 0.0]), (11, [0.0, 9.7, 0.0, 0.0]), (12, [0.0, 9.7, 0.0, 0.0]), (13, [0.0, 0.0, 0.0, 10.3]), (14, [0.0, 0.0, 0.0, 9.5]), (15, [0.0, 0.0, 0.0, 10.0]), (16, [0.0, 0.0, 0.0, 10.5]); + +-- rows = 16, index_granularity = 4, GRANULARITY = 2 gives 2 usearch-indexed blocks (each comprising two granules) +-- condition 'L2Distance(vector, reference_vector) < 1.0' ensures that only one usearch-indexed block produces results --> "Granules: 2/4" + +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0 +-- LIMIT 3; + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0 +LIMIT 3; + +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) +-- LIMIT 3; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) +LIMIT 3; + +DROP TABLE tab; + +SELECT '--- Test with Array, GRANULARITY = 4, index_granularity = 4 ---'; + +CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch() GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4; +INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0, 0.0]), (2, [0.0, 0.0, 10.5, 0.0]), (3, [0.0, 0.0, 9.5, 0.0]), (4, [0.0, 0.0, 9.7, 0.0]), (5, [10.0, 0.0, 0.0, 0.0]), (6, [9.5, 0.0, 0.0, 0.0]), (7, [9.7, 0.0, 0.0, 0.0]), (8, [10.2, 0.0, 0.0, 0.0]), (9, [0.0, 10.0, 0.0, 0.0]), (10, [0.0, 9.5, 0.0, 0.0]), (11, [0.0, 9.7, 0.0, 0.0]), (12, [0.0, 9.7, 0.0, 0.0]), (13, [0.0, 0.0, 0.0, 10.3]), (14, [0.0, 0.0, 0.0, 9.5]), (15, [0.0, 0.0, 0.0, 10.0]), (16, [0.0, 0.0, 0.0, 10.5]); + +-- rows = 16, index_granularity = 4, GRANULARITY = 4 gives a single usearch-indexed block (comprising all granules) +-- no two matches happen to be located in the same granule, so with LIMIT = 3, we'll get "Granules: 2/4" + +-- SELECT 'WHERE type, L2Distance'; +-- SELECT * +-- FROM tab +-- WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0 +-- LIMIT 3; + +SELECT 'WHERE type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0 +LIMIT 3; + +-- SELECT 'ORDER BY type, L2Distance'; +-- SELECT * +-- FROM tab +-- ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) +-- LIMIT 3; + +SELECT 'ORDER BY type, L2Distance, check that index is used'; +EXPLAIN indexes=1 +SELECT * +FROM tab +ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) +LIMIT 3; + +DROP TABLE tab;