Add: USearch

This commit is contained in:
Davit Vardanyan 2023-08-15 16:00:27 +04:00
parent 33763e7e50
commit 48c62fd75e
13 changed files with 902 additions and 0 deletions

3
.gitmodules vendored
View File

@ -347,3 +347,6 @@
[submodule "contrib/incbin"]
path = contrib/incbin
url = https://github.com/graphitemaster/incbin.git
[submodule "contrib/usearch"]
path = contrib/usearch
url = https://github.com/unum-cloud/usearch.git

View File

@ -196,6 +196,7 @@ if (ARCH_S390X)
add_contrib(crc32-s390x-cmake crc32-s390x)
endif()
add_contrib (annoy-cmake annoy)
add_contrib (usearch-cmake usearch)
add_contrib (xxHash-cmake xxHash)
add_contrib (libbcrypt-cmake libbcrypt)

1
contrib/usearch vendored Submodule

@ -0,0 +1 @@
Subproject commit 387b78b28b17b8954024ffc81e97cbcfa10d1f30

View File

@ -0,0 +1,15 @@
# Build glue for the USearch contrib (HNSW approximate nearest neighbour search).
#
# Defines the INTERFACE target `_usearch` (alias `ch_contrib::usearch`). The target has no sources
# of its own; it only carries the include directories found inside the usearch checkout and the
# ENABLE_USEARCH compile definition for its consumers.
option(ENABLE_USEARCH "Enable USearch (Approximate Neighborhood Search, HNSW) support" ${ENABLE_LIBRARIES})

if (NOT ENABLE_USEARCH)
    message (STATUS "Not using usearch")
    return()
endif()

set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
set(USEARCH_SOURCE_DIR "${USEARCH_PROJECT_DIR}/include")

add_library(_usearch INTERFACE)

# Every path is quoted so that it is always passed as exactly one argument, even if the source
# directory contains characters (e.g. `;`) that would otherwise split it into a list.
target_include_directories(_usearch SYSTEM INTERFACE
    "${USEARCH_PROJECT_DIR}/fp16/include"
    "${USEARCH_PROJECT_DIR}/robin-map/include"
    "${USEARCH_PROJECT_DIR}/simsimd/include"
    "${USEARCH_SOURCE_DIR}")

# Consumers see ENABLE_USEARCH and can guard their USearch-specific code with it.
target_compile_definitions(_usearch INTERFACE ENABLE_USEARCH)

add_library(ch_contrib::usearch ALIAS _usearch)

View File

@ -142,6 +142,8 @@ was specified for ANN indexes, the default value is 100 million.
- [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy)
- [USearch](/docs/en/engines/table-engines/mergetree-family/annindexes.md#usearch-usearch)
## Annoy {#annoy}
Annoy indexes are currently experimental, to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently
@ -216,3 +218,42 @@ ORDER BY L2Distance(vectors, Point)
LIMIT N
SETTINGS annoy_index_search_k_nodes=100;
```
## USearch {#usearch}
USearch indexes are currently experimental. To use them, you first need to `SET allow_experimental_usearch_index = 1`.
This type of ANN index implements [the HNSW algorithm](https://github.com/unum-cloud/usearch).
Syntax to create a USearch index over an [Array](../../../sql-reference/data-types/array.md) column:
```sql
CREATE TABLE table_with_usearch_index
(
id Int64,
vectors Array(Float32),
INDEX [ann_index_name] vectors TYPE usearch([Distance]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```
Syntax to create a USearch index over a [Tuple](../../../sql-reference/data-types/tuple.md) column:
```sql
CREATE TABLE table_with_usearch_index
(
id Int64,
vectors Tuple(Float32[, Float32[, ...]]),
INDEX [ann_index_name] vectors TYPE usearch([Distance]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```
USearch currently supports two distance functions:
- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
- `cosineDistance` is one minus the cosine similarity, where the cosine similarity is the cosine of the angle between two (non-zero) vectors
([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).

View File

@ -599,6 +599,10 @@ if (TARGET ch_contrib::annoy)
dbms_target_link_libraries(PUBLIC ch_contrib::annoy)
endif()
# Link USearch into dbms only when the ch_contrib::usearch target was defined
# (the contrib build file returns before defining it when ENABLE_USEARCH is off).
if (TARGET ch_contrib::usearch)
dbms_target_link_libraries(PUBLIC ch_contrib::usearch)
endif()
if (TARGET ch_rust::skim)
dbms_target_include_directories(PRIVATE $<TARGET_PROPERTY:ch_rust::skim,INTERFACE_INCLUDE_DIRECTORIES>)
dbms_target_link_libraries(PUBLIC ch_rust::skim)

View File

@ -772,6 +772,7 @@ class IColumn;
M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \
M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \
M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \
M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \
M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \

View File

@ -0,0 +1,351 @@
#ifdef ENABLE_USEARCH
#include <Storages/MergeTree/MergeTreeIndexHnsw.h>
#include <Columns/ColumnArray.h>
#include <Common/typeid_cast.h>
#include <Core/Field.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <Interpreters/castColumn.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int INCORRECT_DATA;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int INCORRECT_QUERY;
extern const int LOGICAL_ERROR;
}
/// Creates an empty index for vectors with `dimensions` components.
/// The distance metric is taken from the `Metric` template argument.
template <unum::usearch::metric_kind_t Metric>
USearchIndexWithSerialization<Metric>::USearchIndexWithSerialization(size_t dimensions)
    : Base(Base::make(unum::usearch::metric_punned_t(dimensions, Metric)))
{
}
/// Writes the index into `ostr`.
/// The underlying index is streamed through a callback which forwards every chunk it receives
/// to the write buffer.
template <unum::usearch::metric_kind_t Metric>
void USearchIndexWithSerialization<Metric>::serialize([[maybe_unused]] WriteBuffer & ostr) const
{
    auto write_chunk = [&ostr](void * from, size_t n)
    {
        const char * bytes = reinterpret_cast<const char *>(from);
        ostr.write(bytes, n);
        /// Always reports `true` to the caller (presumably "chunk accepted, continue" - TODO confirm against USearch).
        return true;
    };

    Base::stream(write_chunk);
}
/// Restores the index from `istr`.
///
/// The serialized index is first viewed in place on top of the read buffer's memory, the buffer
/// position is advanced by the length of the serialized index, and the viewed index is then copied
/// and swapped into `*this` (presumably so that the index owns its data instead of referring to the
/// buffer's memory - TODO confirm).
///
/// Throws Exception(INCORRECT_DATA) if the copy fails.
template <unum::usearch::metric_kind_t Metric>
void USearchIndexWithSerialization<Metric>::deserialize([[maybe_unused]] ReadBuffer & istr)
{
    BufferBase::Position & pos = istr.position();

    /// NOTE(review): the length passed here is `buffer().size() - count()`. This looks like it assumes that the
    /// whole serialized index is available in the current buffer - confirm against the callers.
    unum::usearch::memory_mapped_file_t memory_map(pos, istr.buffer().size() - istr.count());
    Base::view(std::move(memory_map));
    pos += Base::stream_length();

    auto copy = Base::copy();
    if (!copy)
        /// Use the project's exception type (with an error code) like the rest of this file does.
        throw Exception(ErrorCodes::INCORRECT_DATA, "Can't copy index");
    Base::swap(copy.index);
}
/// Returns the number of dimensions of the indexed vectors, as reported by the underlying index.
template <unum::usearch::metric_kind_t Metric>
size_t USearchIndexWithSerialization<Metric>::getDimensions() const
{
    const size_t num_dimensions = Base::dimensions();
    return num_dimensions;
}
/// Creates a granule without an index (`index` is null until it is deserialized).
template <unum::usearch::metric_kind_t Metric>
MergeTreeIndexGranuleUSearch<Metric>::MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_)
    : index_name(index_name_)
    , index_sample_block(index_sample_block_)
    , index(nullptr)
{
}
/// Creates a granule which takes over the already built index `index_`.
template <unum::usearch::metric_kind_t Metric>
MergeTreeIndexGranuleUSearch<Metric>::MergeTreeIndexGranuleUSearch(
    const String & index_name_,
    const Block & index_sample_block_,
    USearchIndexWithSerializationPtr<Metric> index_)
    : index_name(index_name_)
    , index_sample_block(index_sample_block_)
    , index(std::move(index_))
{
}
/// Serializes the granule as: number of dimensions (UInt64), followed by the index itself.
template <unum::usearch::metric_kind_t Metric>
void MergeTreeIndexGranuleUSearch<Metric>::serializeBinary(WriteBuffer & ostr) const
{
    /// The index constructor needs the number of dimensions, therefore it is stored
    /// in front of the index data and read back separately on deserialization.
    const UInt64 num_dimensions = static_cast<UInt64>(index->getDimensions());
    writeIntBinary(num_dimensions, ostr);

    index->serialize(ostr);
}
/// Reads back what serializeBinary() wrote: first the number of dimensions, then the index.
/// The index version argument is not used.
template <unum::usearch::metric_kind_t Metric>
void MergeTreeIndexGranuleUSearch<Metric>::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
{
    UInt64 num_dimensions;
    readIntBinary(num_dimensions, istr);

    /// A fresh index of the right dimensionality is created first, then filled from the buffer.
    index = std::make_shared<USearchIndexWithSerialization<Metric>>(num_dimensions);
    index->deserialize(istr);
}
/// Creates an aggregator for the index `index_name_`; the index itself is not created here.
template <unum::usearch::metric_kind_t Metric>
MergeTreeIndexAggregatorUSearch<Metric>::MergeTreeIndexAggregatorUSearch(const String & index_name_, const Block & index_sample_block_)
    : index_name(index_name_)
    , index_sample_block(index_sample_block_)
{
}
/// Wraps the index accumulated so far into a granule and leaves the aggregator without an index.
template <unum::usearch::metric_kind_t Metric>
MergeTreeIndexGranulePtr MergeTreeIndexAggregatorUSearch<Metric>::getGranuleAndReset()
{
    using Granule = MergeTreeIndexGranuleUSearch<Metric>;

    auto result = std::make_shared<Granule>(index_name, index_sample_block, std::move(index));
    index.reset();
    return result;
}
/// Adds up to `limit` rows of `block`, starting at row `*pos`, to the index and advances `*pos`
/// by the number of rows consumed.
///
/// The indexed column (the single column of the index sample block) must be either an
/// Array(Float32) column, whose arrays must all have the same length, or a Tuple of Float32 columns.
///
/// The index is created lazily on the first call and kept across calls, so that several calls
/// between two getGranuleAndReset() calls accumulate into the same index. Rows are added under
/// consecutive keys, continuing from the current size of the index.
///
/// Throws:
/// - LOGICAL_ERROR if `*pos` is not less than the number of rows, the sample block has more than one column,
///   the column contains no data, or the column is neither an Array nor a Tuple;
/// - INCORRECT_DATA if the arrays differ in length (within this call or compared to previously added rows).
template <unum::usearch::metric_kind_t Metric>
void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t * pos, size_t limit)
{
    if (*pos >= block.rows())
        throw Exception(
            ErrorCodes::LOGICAL_ERROR,
            "The provided position is not less than the number of block rows. Position: {}, Block rows: {}.",
            *pos,
            block.rows());

    const size_t rows_read = std::min(limit, block.rows() - *pos);
    if (rows_read == 0)
        return;

    if (index_sample_block.columns() > 1)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column");

    const String & index_column_name = index_sample_block.getByPosition(0).name;
    ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read);

    if (const auto * column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
    {
        const auto & data = column_array->getData();
        const auto & array = typeid_cast<const ColumnFloat32 &>(data).getData();

        if (array.empty())
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);

        const auto & offsets = column_array->getOffsets();
        const size_t num_rows = offsets.size();

        /// Check all sizes are the same
        const size_t size = offsets[0];
        for (size_t i = 0; i + 1 < num_rows; ++i)
            if (offsets[i + 1] - offsets[i] != size)
                throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);

        /// Create the index only once. Re-creating it on every call would throw away the rows added by
        /// previous calls for the same index granule.
        if (!index)
            index = std::make_shared<USearchIndexWithSerialization<Metric>>(size);
        else if (index->getDimensions() != size)
            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);

        /// Add all rows of block
        index->reserve(unum::usearch::ceil2(index->size() + num_rows + 1));

        /// The first row starts at the beginning of the flattened data, every following row at the previous offset.
        index->add(index->size(), array.data());
        for (size_t current_row = 1; current_row < num_rows; ++current_row)
            index->add(index->size(), &array[offsets[current_row - 1]]);
    }
    else if (const auto * column_tuple = typeid_cast<const ColumnTuple *>(column_cut.get()))
    {
        const auto & columns = column_tuple->getColumns();

        /// Transpose the column-wise tuple elements into one vector per row.
        std::vector<std::vector<Float32>> data(column_tuple->size());
        for (const auto & column : columns)
        {
            /// The reference form of typeid_cast is used (as for the Array case above) instead of
            /// dereferencing a pointer which would be null for a non-Float32 element column.
            const auto & pod_array = typeid_cast<const ColumnFloat32 &>(*column).getData();
            for (size_t i = 0; i < pod_array.size(); ++i)
                data[i].push_back(pod_array[i]);
        }

        if (data.empty())
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read);

        /// See above: the index must survive across calls.
        if (!index)
            index = std::make_shared<USearchIndexWithSerialization<Metric>>(data[0].size());

        index->reserve(unum::usearch::ceil2(index->size() + data.size()));

        for (const auto & item : data)
            index->add(index->size(), item.data());
    }
    else
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array or Tuple column");

    *pos += rows_read;
}
/// Builds the ANN condition from the query and remembers the distance function of the index.
/// The index description is not used.
MergeTreeIndexConditionUSearch::MergeTreeIndexConditionUSearch(
    const IndexDescription & /*index_description*/,
    const SelectQueryInfo & query,
    const String & distance_function_,
    ContextPtr context)
    : ann_condition(query, context)
    , distance_function(distance_function_)
{
}
/// Not supported for this index type: always throws LOGICAL_ERROR.
bool MergeTreeIndexConditionUSearch::mayBeTrueOnGranule(MergeTreeIndexGranulePtr /*idx_granule*/) const
{
    throw Exception(
        ErrorCodes::LOGICAL_ERROR,
        "mayBeTrueOnGranule is not supported for ANN skip indexes");
}
/// Delegates the decision to the ANN condition, passing the distance function of this index.
bool MergeTreeIndexConditionUSearch::alwaysUnknownOrTrue() const
{
    const bool always_unknown_or_true = ann_condition.alwaysUnknownOrTrue(distance_function);
    return always_unknown_or_true;
}
/// Dispatches to the implementation instantiated for the metric which corresponds to the
/// distance function of the index. Any other distance function name is treated as unreachable.
std::vector<size_t> MergeTreeIndexConditionUSearch::getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const
{
    using MetricKind = unum::usearch::metric_kind_t;

    if (distance_function == "L2Distance")
        return getUsefulRangesImpl<MetricKind::l2sq_k>(idx_granule);

    if (distance_function == "cosineDistance")
        return getUsefulRangesImpl<MetricKind::cos_k>(idx_granule);

    std::unreachable();
}
/// Searches the index of `idx_granule` for the neighbours of the query's reference vector and
/// returns the sorted, de-duplicated numbers of the granules which contain them.
///
/// At most `LIMIT` neighbours are requested from the index. For WHERE-type queries, neighbours
/// whose distance exceeds the comparison distance of the query are dropped.
///
/// Throws LOGICAL_ERROR for a negative comparison distance or a granule of the wrong type, and
/// INCORRECT_QUERY if the reference vector and the index differ in the number of dimensions.
template <unum::usearch::metric_kind_t Metric>
std::vector<size_t> MergeTreeIndexConditionUSearch::getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const
{
    const UInt64 limit = ann_condition.getLimit();
    const UInt64 index_granularity = ann_condition.getIndexGranularity();

    /// Only WHERE-type queries carry a comparison distance.
    std::optional<float> comparison_distance;
    if (ann_condition.getQueryType() == ApproximateNearestNeighborInformation::Type::Where)
        comparison_distance = std::optional<float>(ann_condition.getComparisonDistanceForWhereQuery());

    if (comparison_distance.has_value() && *comparison_distance < 0)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to optimize query with where without distance");

    const std::vector<float> reference_vector = ann_condition.getReferenceVector();

    const auto granule = std::dynamic_pointer_cast<MergeTreeIndexGranuleUSearch<Metric>>(idx_granule);
    if (!granule)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type");

    const USearchIndexWithSerializationPtr<Metric> index = granule->index;

    if (ann_condition.getDimensions() != index->dimensions())
        throw Exception(
            ErrorCodes::INCORRECT_QUERY,
            "The dimension of the space in the request ({}) "
            "does not match the dimension in the index ({})",
            ann_condition.getDimensions(),
            index->dimensions());

    auto result = index->search(reference_vector.data(), limit);

    /// Keys of the vectors closest to the reference vector, and their distances to it.
    std::vector<UInt64> neighbors(result.size());
    std::vector<Float32> distances(result.size());
    result.dump_to(neighbors.data(), distances.data());

    std::vector<size_t> granule_numbers;
    granule_numbers.reserve(neighbors.size());

    for (size_t i = 0; i < neighbors.size(); ++i)
    {
        const bool too_far = comparison_distance.has_value() && distances[i] > *comparison_distance;
        if (!too_far)
            granule_numbers.push_back(neighbors[i] / index_granularity);
    }

    /// Several neighbours may fall into the same granule; report each granule once.
    std::sort(granule_numbers.begin(), granule_numbers.end());
    const auto unique_end = std::unique(granule_numbers.begin(), granule_numbers.end());
    granule_numbers.erase(unique_end, granule_numbers.end());

    return granule_numbers;
}
/// Stores the index description (in the base class) and the name of the distance function.
MergeTreeIndexUSearch::MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_)
    : IMergeTreeIndex(index_)
    , distance_function(distance_function_)
{
}
/// Creates an empty granule of the type which matches the distance function of the index.
/// Any other distance function name is treated as unreachable.
MergeTreeIndexGranulePtr MergeTreeIndexUSearch::createIndexGranule() const
{
    using MetricKind = unum::usearch::metric_kind_t;

    if (distance_function == "L2Distance")
        return std::make_shared<MergeTreeIndexGranuleUSearch<MetricKind::l2sq_k>>(index.name, index.sample_block);

    if (distance_function == "cosineDistance")
        return std::make_shared<MergeTreeIndexGranuleUSearch<MetricKind::cos_k>>(index.name, index.sample_block);

    std::unreachable();
}
/// Creates an aggregator of the type which matches the distance function of the index.
/// Any other distance function name is treated as unreachable.
MergeTreeIndexAggregatorPtr MergeTreeIndexUSearch::createIndexAggregator() const
{
    using MetricKind = unum::usearch::metric_kind_t;

    if (distance_function == "L2Distance")
        return std::make_shared<MergeTreeIndexAggregatorUSearch<MetricKind::l2sq_k>>(index.name, index.sample_block);

    if (distance_function == "cosineDistance")
        return std::make_shared<MergeTreeIndexAggregatorUSearch<MetricKind::cos_k>>(index.name, index.sample_block);

    std::unreachable();
}
/// Creates the condition object which evaluates `query` against this index,
/// using the distance function the index was created with.
MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const
{
    return std::make_shared<MergeTreeIndexConditionUSearch>(index, query, distance_function, context);
}
/// Creates a USearch index from its description.
/// The distance function is the first index argument if there is one, otherwise "L2Distance".
MergeTreeIndexPtr usearchIndexCreator(const IndexDescription & index)
{
    static constexpr auto default_distance_function = "L2Distance";

    const String distance_function
        = index.arguments.empty() ? String(default_distance_function) : String(index.arguments[0].get<String>());

    return std::make_shared<MergeTreeIndexUSearch>(index, distance_function);
}
void usearchIndexValidator(const IndexDescription & index, bool /* attach */)
{
/// Check number and type of USearch index arguments:
if (index.arguments.size() > 1)
throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index must not have more than one parameters");
if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance function argument of USearch index must be of type String");
/// Check that the index is created on a single column
if (index.column_names.size() != 1 || index.data_types.size() != 1)
throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "USearch indexes must be created on a single column");
/// Check that a supported metric was passed as first argument
if (!index.arguments.empty())
{
String distance_name = index.arguments[0].get<String>();
if (distance_name != "L2Distance" && distance_name != "cosineDistance")
throw Exception(ErrorCodes::INCORRECT_DATA, "USearch index only supports distance functions 'L2Distance' and 'cosineDistance'");
}
/// Check data type of indexed column:
auto throw_unsupported_underlying_column_exception = []()
{
throw Exception(
ErrorCodes::ILLEGAL_COLUMN, "USearch indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
};
DataTypePtr data_type = index.sample_block.getDataTypes()[0];
if (const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get()))
{
TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
if (!WhichDataType(nested_type_index).isFloat32())
throw_unsupported_underlying_column_exception();
}
else if (const auto * data_type_tuple = typeid_cast<const DataTypeTuple *>(data_type.get()))
{
const DataTypes & inner_types = data_type_tuple->getElements();
for (const auto & inner_type : inner_types)
{
TypeIndex nested_type_index = inner_type->getTypeId();
if (!WhichDataType(nested_type_index).isFloat32())
throw_unsupported_underlying_column_exception();
}
}
else
throw_unsupported_underlying_column_exception();
}
}
#endif

View File

@ -0,0 +1,103 @@
#pragma once
#ifdef ENABLE_USEARCH
#include <Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h>
#include <usearch/index_dense.hpp>
namespace DB
{
template <unum::usearch::metric_kind_t metric>
class USearchIndexWithSerialization : public unum::usearch::index_dense_t
{
using Base = unum::usearch::index_dense_t;
public:
explicit USearchIndexWithSerialization(size_t dimensions);
void serialize(WriteBuffer & ostr) const;
void deserialize(ReadBuffer & istr);
size_t getDimensions() const;
};
template <unum::usearch::metric_kind_t metric>
using USearchIndexWithSerializationPtr = std::shared_ptr<USearchIndexWithSerialization<metric>>;
template <unum::usearch::metric_kind_t metric>
struct MergeTreeIndexGranuleUSearch final : public IMergeTreeIndexGranule
{
MergeTreeIndexGranuleUSearch(const String & index_name_, const Block & index_sample_block_);
MergeTreeIndexGranuleUSearch(
const String & index_name_, const Block & index_sample_block_, USearchIndexWithSerializationPtr<metric> index_);
~MergeTreeIndexGranuleUSearch() override = default;
void serializeBinary(WriteBuffer & ostr) const override;
void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
bool empty() const override { return !index.get(); }
const String index_name;
const Block index_sample_block;
USearchIndexWithSerializationPtr<metric> index;
};
template <unum::usearch::metric_kind_t metric>
struct MergeTreeIndexAggregatorUSearch final : IMergeTreeIndexAggregator
{
MergeTreeIndexAggregatorUSearch(const String & index_name_, const Block & index_sample_block);
~MergeTreeIndexAggregatorUSearch() override = default;
bool empty() const override { return !index || index->size() == 0; }
MergeTreeIndexGranulePtr getGranuleAndReset() override;
void update(const Block & block, size_t * pos, size_t limit) override;
const String index_name;
const Block index_sample_block;
USearchIndexWithSerializationPtr<metric> index;
};
class MergeTreeIndexConditionUSearch final : public IMergeTreeIndexConditionApproximateNearestNeighbor
{
public:
MergeTreeIndexConditionUSearch(
const IndexDescription & index_description, const SelectQueryInfo & query, const String & distance_function, ContextPtr context);
~MergeTreeIndexConditionUSearch() override = default;
bool alwaysUnknownOrTrue() const override;
bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
std::vector<size_t> getUsefulRanges(MergeTreeIndexGranulePtr idx_granule) const override;
private:
template <unum::usearch::metric_kind_t metric>
std::vector<size_t> getUsefulRangesImpl(MergeTreeIndexGranulePtr idx_granule) const;
const ApproximateNearestNeighborCondition ann_condition;
const String distance_function;
};
/// The USearch skip index: creates the granules, aggregators and conditions which belong to
/// the distance function the index was defined with.
class MergeTreeIndexUSearch : public IMergeTreeIndex
{
public:
    MergeTreeIndexUSearch(const IndexDescription & index_, const String & distance_function_);
    ~MergeTreeIndexUSearch() override = default;

    MergeTreeIndexGranulePtr createIndexGranule() const override;
    MergeTreeIndexAggregatorPtr createIndexAggregator() const override;
    MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const override;

    /// Always answers `false`.
    bool mayBenefitFromIndexForIn(const ASTPtr & /*node*/) const override { return false; }

private:
    const String distance_function;
};
}
#endif

View File

@ -132,6 +132,11 @@ MergeTreeIndexFactory::MergeTreeIndexFactory()
registerValidator("annoy", annoyIndexValidator);
#endif
#ifdef ENABLE_USEARCH
registerCreator("usearch", usearchIndexCreator);
registerValidator("usearch", usearchIndexValidator);
#endif
registerCreator("inverted", invertedIndexCreator);
registerValidator("inverted", invertedIndexValidator);

View File

@ -238,6 +238,11 @@ MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index);
void annoyIndexValidator(const IndexDescription & index, bool attach);
#endif
#ifdef ENABLE_USEARCH
MergeTreeIndexPtr usearchIndexCreator(const IndexDescription& index);
void usearchIndexValidator(const IndexDescription& index, bool attach);
#endif
MergeTreeIndexPtr invertedIndexCreator(const IndexDescription& index);
void invertedIndexValidator(const IndexDescription& index, bool attach);

View File

@ -0,0 +1,143 @@
--- Negative tests ---
--- Test default GRANULARITY (should be 100 mio. for usearch)---
CREATE TABLE default.tab\n(\n `id` Int32,\n `vector` Array(Float32),\n INDEX usearch_index vector TYPE usearch GRANULARITY 1\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192
CREATE TABLE default.tab\n(\n `id` Int32,\n `vector` Array(Float32),\n INDEX usearch_index vector TYPE usearch GRANULARITY 1\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192
--- Test with Array, GRANULARITY = 1, index_granularity = 5 ---
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: usearch_index
Description: usearch GRANULARITY 1
Parts: 1/1
Granules: 1/3
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: usearch_index
Description: usearch GRANULARITY 1
Parts: 1/1
Granules: 3/3
Reference ARRAYs with non-matching dimension are rejected
Special case: MaximumDistance is negative
WHERE type, L2Distance
Special case: setting max_limit_for_ann_queries
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
--- Test with Tuple, GRANULARITY = 1, index_granularity = 5 ---
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: usearch_index
Description: usearch GRANULARITY 1
Parts: 1/1
Granules: 1/3
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: usearch_index
Description: usearch GRANULARITY 1
Parts: 1/1
Granules: 3/3
--- Test non-default metric (cosine distance) ---
--- Test with Array, GRANULARITY = 2, index_granularity = 4 ---
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: usearch_index
Description: usearch GRANULARITY 2
Parts: 0/1
Granules: 2/4
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: usearch_index
Description: usearch GRANULARITY 2
Parts: 1/1
Granules: 4/4
--- Test with Array, GRANULARITY = 4, index_granularity = 4 ---
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: usearch_index
Description: usearch GRANULARITY 4
Parts: 0/1
Granules: 3/4
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: usearch_index
Description: usearch GRANULARITY 4
Parts: 1/1
Granules: 4/4

View File

@ -0,0 +1,229 @@
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check
SET allow_experimental_usearch_index = 1;
SET allow_experimental_analyzer = 0;
SELECT '--- Negative tests ---';
DROP TABLE IF EXISTS tab;
-- must have at most 1 argument
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
-- first argument (distance_function) must be String
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
-- must be created on single column
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index (vector, id) TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS }
-- reject unsupported distance functions
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch('wormholeDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
-- must be created on Array/Tuple(Float32) columns
SET allow_suspicious_low_cardinality_types = 1;
CREATE TABLE tab(id Int32, vector Float32, INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vector Array(Float64), INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vector Tuple(Float64), INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vector LowCardinality(Float32), INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vector Nullable(Float32), INDEX usearch_index vector TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
SELECT '--- Test default GRANULARITY (should be 100 mio. for usearch)---';
CREATE TABLE tab (id Int32, vector Array(Float32), INDEX usearch_index(vector) TYPE usearch) ENGINE=MergeTree ORDER BY id;
SHOW CREATE TABLE tab;
DROP TABLE tab;
CREATE TABLE tab (id Int32, vector Array(Float32)) ENGINE=MergeTree ORDER BY id;
ALTER TABLE tab ADD INDEX usearch_index(vector) TYPE usearch;
SHOW CREATE TABLE tab;
DROP TABLE tab;
SELECT '--- Test with Array, GRANULARITY = 1, index_granularity = 5 ---';
DROP TABLE IF EXISTS tab;
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch() GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]);
-- rows = 15, index_granularity = 5, GRANULARITY = 1 gives 3 usearch-indexed blocks (each comprising a single granule)
-- condition 'L2Distance(vector, reference_vector) < 1.0' ensures that only one usearch-indexed block produces results --> "Granules: 1/3"
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0
-- LIMIT 3;
SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0
LIMIT 3;
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, [0.0, 0.0, 10.0])
-- LIMIT 3;
SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, [0.0, 0.0, 10.0])
LIMIT 3;
-- Test special cases. Corresponding special case tests are omitted from later tests.
SELECT 'Reference ARRAYs with non-matching dimension are rejected';
SELECT *
FROM tab
ORDER BY L2Distance(vector, [0.0, 0.0])
LIMIT 3; -- { serverError INCORRECT_QUERY }
SELECT 'Special case: MaximumDistance is negative';
SELECT 'WHERE type, L2Distance';
SELECT *
FROM tab
WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < -1.0
LIMIT 3; -- { serverError INCORRECT_QUERY }
SELECT 'Special case: setting max_limit_for_ann_queries';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, [5.3, 7.3, 2.1])
LIMIT 3
SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index
DROP TABLE tab;
-- Test Tuple embeddings. Triggers different logic than Array inside MergeTreeIndexUSearch but the same logic as Array above MergeTreeIndexUSearch.
-- Therefore test Tuple case just once.
SELECT '--- Test with Tuple, GRANULARITY = 1, index_granularity = 5 ---';
CREATE TABLE tab(id Int32, vector Tuple(Float32, Float32, Float32), INDEX usearch_index vector TYPE usearch() GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5;
INSERT INTO tab VALUES (1, (0.0, 0.0, 10.0)), (2, (0.0, 0.0, 10.5)), (3, (0.0, 0.0, 9.5)), (4, (0.0, 0.0, 9.7)), (5, (0.0, 0.0, 10.2)), (6, (10.0, 0.0, 0.0)), (7, (9.5, 0.0, 0.0)), (8, (9.7, 0.0, 0.0)), (9, (10.2, 0.0, 0.0)), (10, (10.5, 0.0, 0.0)), (11, (0.0, 10.0, 0.0)), (12, (0.0, 9.5, 0.0)), (13, (0.0, 9.7, 0.0)), (14, (0.0, 10.2, 0.0)), (15, (0.0, 10.5, 0.0));
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, (0.0, 0.0, 10.0)) < 1.0
-- LIMIT 3;
SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(vector, (0.0, 0.0, 10.0)) < 1.0
LIMIT 3;
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, (0.0, 0.0, 10.0))
-- LIMIT 3;
SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, (0.0, 0.0, 10.0))
LIMIT 3;
DROP TABLE tab;
-- Not a systematic test, just to make sure no bad things happen
SELECT '--- Test non-default metric (cosine distance) ---';
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch('cosineDistance') GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]);
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0
-- LIMIT 3;
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, [0.0, 0.0, 10.0])
-- LIMIT 3;
DROP TABLE tab;
SELECT '--- Test with Array, GRANULARITY = 2, index_granularity = 4 ---';
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0, 0.0]), (2, [0.0, 0.0, 10.5, 0.0]), (3, [0.0, 0.0, 9.5, 0.0]), (4, [0.0, 0.0, 9.7, 0.0]), (5, [10.0, 0.0, 0.0, 0.0]), (6, [9.5, 0.0, 0.0, 0.0]), (7, [9.7, 0.0, 0.0, 0.0]), (8, [10.2, 0.0, 0.0, 0.0]), (9, [0.0, 10.0, 0.0, 0.0]), (10, [0.0, 9.5, 0.0, 0.0]), (11, [0.0, 9.7, 0.0, 0.0]), (12, [0.0, 9.7, 0.0, 0.0]), (13, [0.0, 0.0, 0.0, 10.3]), (14, [0.0, 0.0, 0.0, 9.5]), (15, [0.0, 0.0, 0.0, 10.0]), (16, [0.0, 0.0, 0.0, 10.5]);
-- rows = 16, index_granularity = 4, GRANULARITY = 2 gives 2 usearch-indexed blocks (each comprising two granules)
-- condition 'L2Distance(vector, reference_vector) < 1.0' ensures that only one usearch-indexed block produces results --> "Granules: 2/4"
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0
-- LIMIT 3;
SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0
LIMIT 3;
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0])
-- LIMIT 3;
SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0])
LIMIT 3;
DROP TABLE tab;
SELECT '--- Test with Array, GRANULARITY = 4, index_granularity = 4 ---';
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX usearch_index vector TYPE usearch() GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0, 0.0]), (2, [0.0, 0.0, 10.5, 0.0]), (3, [0.0, 0.0, 9.5, 0.0]), (4, [0.0, 0.0, 9.7, 0.0]), (5, [10.0, 0.0, 0.0, 0.0]), (6, [9.5, 0.0, 0.0, 0.0]), (7, [9.7, 0.0, 0.0, 0.0]), (8, [10.2, 0.0, 0.0, 0.0]), (9, [0.0, 10.0, 0.0, 0.0]), (10, [0.0, 9.5, 0.0, 0.0]), (11, [0.0, 9.7, 0.0, 0.0]), (12, [0.0, 9.7, 0.0, 0.0]), (13, [0.0, 0.0, 0.0, 10.3]), (14, [0.0, 0.0, 0.0, 9.5]), (15, [0.0, 0.0, 0.0, 10.0]), (16, [0.0, 0.0, 0.0, 10.5]);
-- rows = 16, index_granularity = 4, GRANULARITY = 4 gives a single usearch-indexed block (comprising all granules)
-- no two matches happen to be located in the same granule, so with LIMIT = 3, we'll get "Granules: 2/4"
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0
-- LIMIT 3;
SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0
LIMIT 3;
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0])
-- LIMIT 3;
SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0])
LIMIT 3;
DROP TABLE tab;