diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 8996133f667..d6ff7f23bb4 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -203,9 +203,10 @@ Parameter `NumTrees` is the number of trees which the algorithm creates (default more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes. :::note -Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use -[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1 -CHECK length(vectors) = 256`. +Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays must have same length. To avoid +errors, you can use a [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT +constraint_name_1 CHECK length(vectors) = 256`. Also, unspecified `Array` values in INSERT statements (i.e. default values) are not +supported. ::: Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger @@ -223,6 +224,7 @@ SETTINGS annoy_index_search_k_nodes=100; The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see [here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml. ::: + ## USearch {#usearch} This type of ANN index is based on the [the USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW diff --git a/docs/en/sql-reference/data-types/array.md b/docs/en/sql-reference/data-types/array.md index 20ce7d2ed52..0ee7c8de93c 100644 --- a/docs/en/sql-reference/data-types/array.md +++ b/docs/en/sql-reference/data-types/array.md @@ -4,7 +4,7 @@ sidebar_position: 52 sidebar_label: Array(T) --- -# Array(t) +# Array(T) An array of `T`-type items, with the starting array index as 1. `T` can be any data type, including an array. diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 3ad4f81716e..3eec8614dcd 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -154,36 +154,45 @@ void MergeTreeIndexAggregatorAnnoy::update(const Block & block, size_t if (const auto & column_array = typeid_cast(column_cut.get())) { - const auto & data = column_array->getData(); - const auto & array = typeid_cast(data).getData(); + const auto & column_array_data = column_array->getData(); + const auto & column_arary_data_float_data = typeid_cast(column_array_data).getData(); - if (array.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read); + const auto & column_array_offsets = column_array->getOffsets(); + const size_t num_rows = column_array_offsets.size(); - const auto & offsets = column_array->getOffsets(); - const size_t num_rows = offsets.size(); + /// The Annoy algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays + /// are INSERTed into an Annoy-indexed column or if no value was specified at all in which case the arrays take on their default + /// value which is also empty. + if (column_array->isDefaultAt(0)) + throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name); /// Check all sizes are the same - size_t size = offsets[0]; + size_t dimension = column_array_offsets[0]; for (size_t i = 0; i < num_rows - 1; ++i) - if (offsets[i + 1] - offsets[i] != size) - throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name); + if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension) + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); + + /// Also check that previously inserted blocks have the same size as this block. + /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across + /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42. + if (index && index->getDimensions() != dimension) + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); if (!index) - index = std::make_shared>(size); + index = std::make_shared>(dimension); /// Add all rows of block - index->add_item(index->get_n_items(), array.data()); + index->add_item(index->get_n_items(), column_arary_data_float_data.data()); for (size_t current_row = 1; current_row < num_rows; ++current_row) - index->add_item(index->get_n_items(), &array[offsets[current_row - 1]]); + index->add_item(index->get_n_items(), &column_arary_data_float_data[column_array_offsets[current_row - 1]]); } else if (const auto & column_tuple = typeid_cast(column_cut.get())) { - const auto & columns = column_tuple->getColumns(); + const auto & column_tuple_columns = column_tuple->getColumns(); /// TODO check if calling index->add_item() directly on the block's tuples is faster than materializing everything - std::vector> data{column_tuple->size(), std::vector()}; - for (const auto & column : columns) + std::vector> data(column_tuple->size(), std::vector()); + for (const auto & column : column_tuple_columns) { const auto & pod_array = typeid_cast(column.get())->getData(); for (size_t i = 0; i < pod_array.size(); ++i) @@ -363,7 +372,7 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */) { throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32)"); + "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])"); }; DataTypePtr data_type = index.sample_block.getDataTypes()[0]; diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp index 1ab85e6bbaf..009c004faea 100644 --- a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp @@ -173,23 +173,32 @@ void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t if (const auto & column_array = typeid_cast(column_cut.get())) { - const auto & data = column_array->getData(); - const auto & array = typeid_cast(data).getData(); + const auto & column_array_data = column_array->getData(); + const auto & column_array_data_float_data = typeid_cast(column_array_data).getData(); - if (array.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read); + const auto & column_array_offsets = column_array->getOffsets(); + const size_t num_rows = column_array_offsets.size(); - const auto & offsets = column_array->getOffsets(); - const size_t num_rows = offsets.size(); + /// The Usearch algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays + /// are INSERTed into an Usearch-indexed column or if no value was specified at all in which case the arrays take on their default + /// values which is also empty. + if (column_array->isDefaultAt(0)) + throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name); /// Check all sizes are the same - size_t size = offsets[0]; + size_t dimension = column_array_offsets[0]; for (size_t i = 0; i < num_rows - 1; ++i) - if (offsets[i + 1] - offsets[i] != size) - throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name); + if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension) + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); + + /// Also check that previously inserted blocks have the same size as this block. + /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across + /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42. + if (index && index->getDimensions() != dimension) + throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); if (!index) - index = std::make_shared>(size, scalar_kind); + index = std::make_shared>(dimension, scalar_kind); /// Add all rows of block if (!index->reserve(unum::usearch::ceil2(index->size() + num_rows))) @@ -197,7 +206,7 @@ void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t for (size_t current_row = 0; current_row < num_rows; ++current_row) { - auto rc = index->add(static_cast(index->size()), &array[offsets[current_row - 1]]); + auto rc = index->add(static_cast(index->size()), &column_array_data_float_data[column_array_offsets[current_row - 1]]); if (!rc) throw Exception(ErrorCodes::INCORRECT_DATA, rc.error.release()); @@ -208,9 +217,9 @@ void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t } else if (const auto & column_tuple = typeid_cast(column_cut.get())) { - const auto & columns = column_tuple->getColumns(); - std::vector> data{column_tuple->size(), std::vector()}; - for (const auto & column : columns) + const auto & column_tuple_columns = column_tuple->getColumns(); + std::vector> data(column_tuple->size(), std::vector()); + for (const auto & column : column_tuple_columns) { const auto & pod_array = typeid_cast(column.get())->getData(); for (size_t i = 0; i < pod_array.size(); ++i) @@ -413,7 +422,8 @@ void usearchIndexValidator(const IndexDescription & index, bool /* attach */) auto throw_unsupported_underlying_column_exception = []() { throw Exception( - ErrorCodes::ILLEGAL_COLUMN, "USearch indexes can only be created on columns of type Array(Float32) and Tuple(Float32)"); + ErrorCodes::ILLEGAL_COLUMN, + "USearch can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])"); }; DataTypePtr data_type = index.sample_block.getDataTypes()[0]; diff --git a/tests/queries/0_stateless/02354_annoy_index.reference b/tests/queries/0_stateless/02354_annoy_index.reference index a0ffb1e1f7f..81f2ff8aa59 100644 --- a/tests/queries/0_stateless/02354_annoy_index.reference +++ b/tests/queries/0_stateless/02354_annoy_index.reference @@ -147,3 +147,4 @@ Expression (Projection) 9000 [9000,0,0,0] 1 (1,0,0,0) 9000 (9000,0,0,0) +--- Bugs --- diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql index eab7a62c5f0..67ef64cc301 100644 --- a/tests/queries/0_stateless/02354_annoy_index.sql +++ b/tests/queries/0_stateless/02354_annoy_index.sql @@ -281,3 +281,15 @@ ORDER BY L2Distance(vector, (9000.0, 0.0, 0.0, 0.0)) LIMIT 1; DROP TABLE tab; + +SELECT '--- Bugs ---'; + +-- Arrays with default values are rejected, issue #52258 +CREATE TABLE tab (`uuid` String, `vector` Array(Float32), `version` UInt32, INDEX idx vector TYPE annoy()) ENGINE = MergeTree() ORDER BY (uuid); +INSERT INTO tab (uuid, version) VALUES ('1', 3); -- { serverError INCORRECT_DATA } +DROP TABLE tab; + +-- Tuples with default value work +CREATE TABLE tab (`uuid` String, `vector` Tuple(Float32, Float32), `version` UInt32, INDEX idx vector TYPE annoy()) ENGINE = MergeTree() ORDER BY (uuid); +INSERT INTO tab (uuid, version) VALUES ('1', 3); -- works fine +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_usearch_index.reference b/tests/queries/0_stateless/02354_usearch_index.reference index 893a092a386..c2791e99a54 100644 --- a/tests/queries/0_stateless/02354_usearch_index.reference +++ b/tests/queries/0_stateless/02354_usearch_index.reference @@ -150,3 +150,4 @@ Expression (Projection) 1 [0,0,10] 2 [0,0,10.5] 3 [0,0,9.5] +--- Bugs --- diff --git a/tests/queries/0_stateless/02354_usearch_index.sql b/tests/queries/0_stateless/02354_usearch_index.sql index e534c91b615..fc2954d6c5d 100644 --- a/tests/queries/0_stateless/02354_usearch_index.sql +++ b/tests/queries/0_stateless/02354_usearch_index.sql @@ -274,3 +274,17 @@ SELECT * FROM tab WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0 LIMIT 3; + +DROP TABLE tab; + +SELECT '--- Bugs ---'; + +-- Arrays with default values are rejected, issue #52258 +CREATE TABLE tab (`uuid` String, `vector` Array(Float32), `version` UInt32, INDEX idx vector TYPE usearch()) ENGINE = MergeTree() ORDER BY (uuid); +INSERT INTO tab (uuid, version) VALUES ('1', 3); -- { serverError INCORRECT_DATA } +DROP TABLE tab; + +-- Tuples with default value work +CREATE TABLE tab (`uuid` String, `vector` Tuple(Float32, Float32), `version` UInt32, INDEX idx vector TYPE usearch()) ENGINE = MergeTree() ORDER BY (uuid); +INSERT INTO tab (uuid, version) VALUES ('1', 3); -- works fine +DROP TABLE tab;