mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 00:30:49 +00:00
Merge pull request #54600 from rschu1ze/fix-annoy-crash
Annoy/Usearch index: Fix LOGICAL_ERROR during build-up with default values
This commit is contained in:
commit
2c91e52da1
@ -203,9 +203,10 @@ Parameter `NumTrees` is the number of trees which the algorithm creates (default
|
||||
more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes.
|
||||
|
||||
:::note
|
||||
Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use
|
||||
[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1
|
||||
CHECK length(vectors) = 256`.
|
||||
Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays must have same length. To avoid
|
||||
errors, you can use a [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT
|
||||
constraint_name_1 CHECK length(vectors) = 256`. Also, unspecified `Array` values in INSERT statements (i.e. default values) are not
|
||||
supported.
|
||||
:::
|
||||
|
||||
Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger
|
||||
@ -223,6 +224,7 @@ SETTINGS annoy_index_search_k_nodes=100;
|
||||
The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
|
||||
[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
|
||||
:::
|
||||
|
||||
## USearch {#usearch}
|
||||
|
||||
This type of ANN index is based on the [the USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
|
||||
|
@ -4,7 +4,7 @@ sidebar_position: 52
|
||||
sidebar_label: Array(T)
|
||||
---
|
||||
|
||||
# Array(t)
|
||||
# Array(T)
|
||||
|
||||
An array of `T`-type items, with the starting array index as 1. `T` can be any data type, including an array.
|
||||
|
||||
|
@ -154,36 +154,45 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
|
||||
|
||||
if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
|
||||
{
|
||||
const auto & data = column_array->getData();
|
||||
const auto & array = typeid_cast<const ColumnFloat32 &>(data).getData();
|
||||
const auto & column_array_data = column_array->getData();
|
||||
const auto & column_arary_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();
|
||||
|
||||
if (array.empty())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
|
||||
const auto & column_array_offsets = column_array->getOffsets();
|
||||
const size_t num_rows = column_array_offsets.size();
|
||||
|
||||
const auto & offsets = column_array->getOffsets();
|
||||
const size_t num_rows = offsets.size();
|
||||
/// The Annoy algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
|
||||
/// are INSERTed into an Annoy-indexed column or if no value was specified at all in which case the arrays take on their default
|
||||
/// value which is also empty.
|
||||
if (column_array->isDefaultAt(0))
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
|
||||
|
||||
/// Check all sizes are the same
|
||||
size_t size = offsets[0];
|
||||
size_t dimension = column_array_offsets[0];
|
||||
for (size_t i = 0; i < num_rows - 1; ++i)
|
||||
if (offsets[i + 1] - offsets[i] != size)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
|
||||
if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
|
||||
|
||||
/// Also check that previously inserted blocks have the same size as this block.
|
||||
/// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
|
||||
/// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
|
||||
if (index && index->getDimensions() != dimension)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
|
||||
|
||||
if (!index)
|
||||
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(size);
|
||||
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(dimension);
|
||||
|
||||
/// Add all rows of block
|
||||
index->add_item(index->get_n_items(), array.data());
|
||||
index->add_item(index->get_n_items(), column_arary_data_float_data.data());
|
||||
for (size_t current_row = 1; current_row < num_rows; ++current_row)
|
||||
index->add_item(index->get_n_items(), &array[offsets[current_row - 1]]);
|
||||
index->add_item(index->get_n_items(), &column_arary_data_float_data[column_array_offsets[current_row - 1]]);
|
||||
}
|
||||
else if (const auto & column_tuple = typeid_cast<const ColumnTuple *>(column_cut.get()))
|
||||
{
|
||||
const auto & columns = column_tuple->getColumns();
|
||||
const auto & column_tuple_columns = column_tuple->getColumns();
|
||||
|
||||
/// TODO check if calling index->add_item() directly on the block's tuples is faster than materializing everything
|
||||
std::vector<std::vector<Float32>> data{column_tuple->size(), std::vector<Float32>()};
|
||||
for (const auto & column : columns)
|
||||
std::vector<std::vector<Float32>> data(column_tuple->size(), std::vector<Float32>());
|
||||
for (const auto & column : column_tuple_columns)
|
||||
{
|
||||
const auto & pod_array = typeid_cast<const ColumnFloat32 *>(column.get())->getData();
|
||||
for (size_t i = 0; i < pod_array.size(); ++i)
|
||||
@ -363,7 +372,7 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
|
||||
{
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
|
||||
"Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
|
||||
};
|
||||
|
||||
DataTypePtr data_type = index.sample_block.getDataTypes()[0];
|
||||
|
@ -173,23 +173,32 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
|
||||
|
||||
if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
|
||||
{
|
||||
const auto & data = column_array->getData();
|
||||
const auto & array = typeid_cast<const ColumnFloat32 &>(data).getData();
|
||||
const auto & column_array_data = column_array->getData();
|
||||
const auto & column_array_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();
|
||||
|
||||
if (array.empty())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
|
||||
const auto & column_array_offsets = column_array->getOffsets();
|
||||
const size_t num_rows = column_array_offsets.size();
|
||||
|
||||
const auto & offsets = column_array->getOffsets();
|
||||
const size_t num_rows = offsets.size();
|
||||
/// The Usearch algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
|
||||
/// are INSERTed into an Usearch-indexed column or if no value was specified at all in which case the arrays take on their default
|
||||
/// values which is also empty.
|
||||
if (column_array->isDefaultAt(0))
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
|
||||
|
||||
/// Check all sizes are the same
|
||||
size_t size = offsets[0];
|
||||
size_t dimension = column_array_offsets[0];
|
||||
for (size_t i = 0; i < num_rows - 1; ++i)
|
||||
if (offsets[i + 1] - offsets[i] != size)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
|
||||
if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
|
||||
|
||||
/// Also check that previously inserted blocks have the same size as this block.
|
||||
/// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
|
||||
/// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
|
||||
if (index && index->getDimensions() != dimension)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
|
||||
|
||||
if (!index)
|
||||
index = std::make_shared<USearchIndexWithSerialization<Metric>>(size, scalar_kind);
|
||||
index = std::make_shared<USearchIndexWithSerialization<Metric>>(dimension, scalar_kind);
|
||||
|
||||
/// Add all rows of block
|
||||
if (!index->reserve(unum::usearch::ceil2(index->size() + num_rows)))
|
||||
@ -197,7 +206,7 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
|
||||
|
||||
for (size_t current_row = 0; current_row < num_rows; ++current_row)
|
||||
{
|
||||
auto rc = index->add(static_cast<uint32_t>(index->size()), &array[offsets[current_row - 1]]);
|
||||
auto rc = index->add(static_cast<uint32_t>(index->size()), &column_array_data_float_data[column_array_offsets[current_row - 1]]);
|
||||
if (!rc)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, rc.error.release());
|
||||
|
||||
@ -208,9 +217,9 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
|
||||
}
|
||||
else if (const auto & column_tuple = typeid_cast<const ColumnTuple *>(column_cut.get()))
|
||||
{
|
||||
const auto & columns = column_tuple->getColumns();
|
||||
std::vector<std::vector<Float32>> data{column_tuple->size(), std::vector<Float32>()};
|
||||
for (const auto & column : columns)
|
||||
const auto & column_tuple_columns = column_tuple->getColumns();
|
||||
std::vector<std::vector<Float32>> data(column_tuple->size(), std::vector<Float32>());
|
||||
for (const auto & column : column_tuple_columns)
|
||||
{
|
||||
const auto & pod_array = typeid_cast<const ColumnFloat32 *>(column.get())->getData();
|
||||
for (size_t i = 0; i < pod_array.size(); ++i)
|
||||
@ -413,7 +422,8 @@ void usearchIndexValidator(const IndexDescription & index, bool /* attach */)
|
||||
auto throw_unsupported_underlying_column_exception = []()
|
||||
{
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_COLUMN, "USearch indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
|
||||
ErrorCodes::ILLEGAL_COLUMN,
|
||||
"USearch can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
|
||||
};
|
||||
|
||||
DataTypePtr data_type = index.sample_block.getDataTypes()[0];
|
||||
|
@ -147,3 +147,4 @@ Expression (Projection)
|
||||
9000 [9000,0,0,0]
|
||||
1 (1,0,0,0)
|
||||
9000 (9000,0,0,0)
|
||||
--- Bugs ---
|
||||
|
@ -281,3 +281,15 @@ ORDER BY L2Distance(vector, (9000.0, 0.0, 0.0, 0.0))
|
||||
LIMIT 1;
|
||||
|
||||
DROP TABLE tab;
|
||||
|
||||
SELECT '--- Bugs ---';
|
||||
|
||||
-- Arrays with default values are rejected, issue #52258
|
||||
CREATE TABLE tab (`uuid` String, `vector` Array(Float32), `version` UInt32, INDEX idx vector TYPE annoy()) ENGINE = MergeTree() ORDER BY (uuid);
|
||||
INSERT INTO tab (uuid, version) VALUES ('1', 3); -- { serverError INCORRECT_DATA }
|
||||
DROP TABLE tab;
|
||||
|
||||
-- Tuples with default value work
|
||||
CREATE TABLE tab (`uuid` String, `vector` Tuple(Float32, Float32), `version` UInt32, INDEX idx vector TYPE annoy()) ENGINE = MergeTree() ORDER BY (uuid);
|
||||
INSERT INTO tab (uuid, version) VALUES ('1', 3); -- works fine
|
||||
DROP TABLE tab;
|
||||
|
@ -150,3 +150,4 @@ Expression (Projection)
|
||||
1 [0,0,10]
|
||||
2 [0,0,10.5]
|
||||
3 [0,0,9.5]
|
||||
--- Bugs ---
|
||||
|
@ -274,3 +274,17 @@ SELECT *
|
||||
FROM tab
|
||||
WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0
|
||||
LIMIT 3;
|
||||
|
||||
DROP TABLE tab;
|
||||
|
||||
SELECT '--- Bugs ---';
|
||||
|
||||
-- Arrays with default values are rejected, issue #52258
|
||||
CREATE TABLE tab (`uuid` String, `vector` Array(Float32), `version` UInt32, INDEX idx vector TYPE usearch()) ENGINE = MergeTree() ORDER BY (uuid);
|
||||
INSERT INTO tab (uuid, version) VALUES ('1', 3); -- { serverError INCORRECT_DATA }
|
||||
DROP TABLE tab;
|
||||
|
||||
-- Tuples with default value work
|
||||
CREATE TABLE tab (`uuid` String, `vector` Tuple(Float32, Float32), `version` UInt32, INDEX idx vector TYPE usearch()) ENGINE = MergeTree() ORDER BY (uuid);
|
||||
INSERT INTO tab (uuid, version) VALUES ('1', 3); -- works fine
|
||||
DROP TABLE tab;
|
||||
|
Loading…
Reference in New Issue
Block a user