Annoy: Fix LOGICAL_ERROR with default values #52258

This commit is contained in:
Robert Schulze 2023-09-13 14:23:09 +00:00
parent c422a8f0dc
commit 945179be46
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
7 changed files with 56 additions and 11 deletions

View File

@ -203,9 +203,10 @@ Parameter `NumTrees` is the number of trees which the algorithm creates (default
more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes.
:::note
Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use
[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1
CHECK length(vectors) = 256`.
Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays must have same length. To avoid
errors, you can use a [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT
constraint_name_1 CHECK length(vectors) = 256`. Also, unspecified `Array` values in INSERT statements (i.e. default values) are not
supported.
:::
Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger
@ -223,6 +224,7 @@ SETTINGS annoy_index_search_k_nodes=100;
The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
:::
## USearch {#usearch}
This type of ANN index is based on the [the USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW

View File

@ -157,18 +157,25 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
const auto & column_array_data = column_array->getData();
const auto & column_arary_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();
if (column_arary_data_float_data.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
const auto & column_array_offsets = column_array->getOffsets();
const size_t num_rows = column_array_offsets.size();
/// The index dimension is inferred from the inserted arrays (array cardinality). If no value was specified in the INSERT statement
/// for the annoy-indexed column (i.e. default value), we have a problem. Reject such values.
if (column_array_offsets.empty() || column_array_offsets[0] == 0)
/// (The if condition is a bit weird but I have seen either with default values)
throw Exception(ErrorCodes::INCORRECT_DATA, "Tried to insert {} rows into Annoy index but there were no values to insert. Likely, the INSERT used default values - these are not supported for Annoy.", rows_read);
/// Check all sizes are the same
size_t dimension = column_array_offsets[0];
for (size_t i = 0; i < num_rows - 1; ++i)
if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
/// Also check that previously inserted blocks have the same size as this block
if (index && index->getDimensions() != dimension)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
if (!index)
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(dimension);
@ -363,7 +370,7 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
{
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
"Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
};
DataTypePtr data_type = index.sample_block.getDataTypes()[0];

View File

@ -176,18 +176,25 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
const auto & column_array_data = column_array->getData();
const auto & column_array_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();
if (column_array_data_float_data.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
const auto & column_array_offsets = column_array->getOffsets();
const size_t num_rows = column_array_offsets.size();
/// The index dimension is inferred from the inserted arrays (array cardinality). If no value was specified in the INSERT statement
/// for the usearch-indexed column (i.e. default value), we have a problem. Reject such values.
if (column_array_offsets.empty() || column_array_offsets[0] == 0)
/// (The if condition is a bit weird but I have seen either with default values)
throw Exception(ErrorCodes::INCORRECT_DATA, "Tried to insert {} rows into usearch index but there were no values to insert. Likely, the INSERT used default values - these are not supported for Annoy.", rows_read);
/// Check all sizes are the same
size_t dimension = column_array_offsets[0];
for (size_t i = 0; i < num_rows - 1; ++i)
if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
/// Also check that previously inserted blocks have the same size as this block
if (index && index->getDimensions() != dimension)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
if (!index)
index = std::make_shared<USearchIndexWithSerialization<Metric>>(dimension, scalar_kind);
@ -413,7 +420,8 @@ void usearchIndexValidator(const IndexDescription & index, bool /* attach */)
auto throw_unsupported_underlying_column_exception = []()
{
throw Exception(
ErrorCodes::ILLEGAL_COLUMN, "USearch indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
ErrorCodes::ILLEGAL_COLUMN,
"USearch can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
};
DataTypePtr data_type = index.sample_block.getDataTypes()[0];

View File

@ -147,3 +147,4 @@ Expression (Projection)
9000 [9000,0,0,0]
1 (1,0,0,0)
9000 (9000,0,0,0)
--- Bugs ---

View File

@ -281,3 +281,15 @@ ORDER BY L2Distance(vector, (9000.0, 0.0, 0.0, 0.0))
LIMIT 1;
DROP TABLE tab;
SELECT '--- Bugs ---';
-- Arrays with default values are rejected, issue #52258
CREATE TABLE tab (`uuid` String, `vector` Array(Float32), `version` UInt32, INDEX idx vector TYPE annoy()) ENGINE = MergeTree() ORDER BY (uuid);
INSERT INTO tab (uuid, version) VALUES ('1', 3); -- { serverError INCORRECT_DATA }
DROP TABLE tab;
-- Tuples with default value work
CREATE TABLE tab (`uuid` String, `vector` Tuple(Float32, Float32), `version` UInt32, INDEX idx vector TYPE annoy()) ENGINE = MergeTree() ORDER BY (uuid);
INSERT INTO tab (uuid, version) VALUES ('1', 3); -- works fine
DROP TABLE tab;

View File

@ -150,3 +150,4 @@ Expression (Projection)
1 [0,0,10]
2 [0,0,10.5]
3 [0,0,9.5]
--- Bugs ---

View File

@ -274,3 +274,17 @@ SELECT *
FROM tab
WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0
LIMIT 3;
DROP TABLE tab;
SELECT '--- Bugs ---';
-- Arrays with default values are rejected, issue #52258
CREATE TABLE tab (`uuid` String, `vector` Array(Float32), `version` UInt32, INDEX idx vector TYPE usearch()) ENGINE = MergeTree() ORDER BY (uuid);
INSERT INTO tab (uuid, version) VALUES ('1', 3); -- { serverError INCORRECT_DATA }
DROP TABLE tab;
-- Tuples with default value work
CREATE TABLE tab (`uuid` String, `vector` Tuple(Float32, Float32), `version` UInt32, INDEX idx vector TYPE usearch()) ENGINE = MergeTree() ORDER BY (uuid);
INSERT INTO tab (uuid, version) VALUES ('1', 3); -- works fine
DROP TABLE tab;