From c422a8f0dc14269041aba360ea0ac645e2a635e1 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Wed, 13 Sep 2023 13:41:02 +0000
Subject: [PATCH 1/4] Cosmetics

---
 docs/en/sql-reference/data-types/array.md     |  2 +-
 .../MergeTree/MergeTreeIndexAnnoy.cpp         | 26 +++++++++----------
 .../MergeTree/MergeTreeIndexUSearch.cpp       | 24 ++++++++---------
 3 files changed, 26 insertions(+), 26 deletions(-)
diff --git a/docs/en/sql-reference/data-types/array.md b/docs/en/sql-reference/data-types/array.md
index 20ce7d2ed52..0ee7c8de93c 100644
--- a/docs/en/sql-reference/data-types/array.md
+++ b/docs/en/sql-reference/data-types/array.md
@@ -4,7 +4,7 @@ sidebar_position: 52
 sidebar_label: Array(T)
 ---
 
-# Array(t)
+# Array(T)
 
 An array of `T`-type items, with the starting array index as 1. `T` can be any data type, including an array.
 
diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
index 3ad4f81716e..15830513162 100644
--- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
@@ -154,36 +154,36 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
 
     if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
     {
-        const auto & data = column_array->getData();
-        const auto & array = typeid_cast<const ColumnFloat32 &>(data).getData();
+        const auto & column_array_data = column_array->getData();
+        const auto & column_arary_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();
 
-        if (array.empty())
+        if (column_arary_data_float_data.empty())
             throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
 
-        const auto & offsets = column_array->getOffsets();
-        const size_t num_rows = offsets.size();
+        const auto & column_array_offsets = column_array->getOffsets();
+        const size_t num_rows = column_array_offsets.size();
 
         /// Check all sizes are the same
-        size_t size = offsets[0];
+        size_t dimension = column_array_offsets[0];
         for (size_t i = 0; i < num_rows - 1; ++i)
-            if (offsets[i + 1] - offsets[i] != size)
+            if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
                 throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
 
         if (!index)
-            index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(size);
+            index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(dimension);
 
         /// Add all rows of block
-        index->add_item(index->get_n_items(), array.data());
+        index->add_item(index->get_n_items(), column_arary_data_float_data.data());
         for (size_t current_row = 1; current_row < num_rows; ++current_row)
-            index->add_item(index->get_n_items(), &array[offsets[current_row - 1]]);
+            index->add_item(index->get_n_items(), &column_arary_data_float_data[column_array_offsets[current_row - 1]]);
     }
     else if (const auto & column_tuple = typeid_cast<const ColumnTuple *>(column_cut.get()))
     {
-        const auto & columns = column_tuple->getColumns();
+        const auto & column_tuple_columns = column_tuple->getColumns();
 
         /// TODO check if calling index->add_item() directly on the block's tuples is faster than materializing everything
-        std::vector<std::vector<Float32>> data{column_tuple->size(), std::vector<Float32>()};
-        for (const auto & column : columns)
+        std::vector<std::vector<Float32>> data(column_tuple->size(), std::vector<Float32>());
+        for (const auto & column : column_tuple_columns)
         {
             const auto & pod_array = typeid_cast<const ColumnFloat32 *>(column.get())->getData();
             for (size_t i = 0; i < pod_array.size(); ++i)
diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
index 1ab85e6bbaf..de556eb7e07 100644
--- a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
@@ -173,23 +173,23 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
 
     if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
     {
-        const auto & data = column_array->getData();
-        const auto & array = typeid_cast<const ColumnFloat32 &>(data).getData();
+        const auto & column_array_data = column_array->getData();
+        const auto & column_array_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();
 
-        if (array.empty())
+        if (column_array_data_float_data.empty())
             throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
 
-        const auto & offsets = column_array->getOffsets();
-        const size_t num_rows = offsets.size();
+        const auto & column_array_offsets = column_array->getOffsets();
+        const size_t num_rows = column_array_offsets.size();
 
         /// Check all sizes are the same
-        size_t size = offsets[0];
+        size_t dimension = column_array_offsets[0];
         for (size_t i = 0; i < num_rows - 1; ++i)
-            if (offsets[i + 1] - offsets[i] != size)
+            if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
                 throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
 
         if (!index)
-            index = std::make_shared<USearchIndexWithSerialization<Metric>>(size, scalar_kind);
+            index = std::make_shared<USearchIndexWithSerialization<Metric>>(dimension, scalar_kind);
 
         /// Add all rows of block
         if (!index->reserve(unum::usearch::ceil2(index->size() + num_rows)))
@@ -197,7 +197,7 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
 
         for (size_t current_row = 0; current_row < num_rows; ++current_row)
         {
-            auto rc = index->add(static_cast<uint32_t>(index->size()), &array[offsets[current_row - 1]]);
+            auto rc = index->add(static_cast<uint32_t>(index->size()), &column_array_data_float_data[column_array_offsets[current_row - 1]]);
             if (!rc)
                 throw Exception(ErrorCodes::INCORRECT_DATA, rc.error.release());
 
@@ -208,9 +208,9 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
     }
     else if (const auto & column_tuple = typeid_cast<const ColumnTuple *>(column_cut.get()))
     {
-        const auto & columns = column_tuple->getColumns();
-        std::vector<std::vector<Float32>> data{column_tuple->size(), std::vector<Float32>()};
-        for (const auto & column : columns)
+        const auto & column_tuple_columns = column_tuple->getColumns();
+        std::vector<std::vector<Float32>> data(column_tuple->size(), std::vector<Float32>());
+        for (const auto & column : column_tuple_columns)
         {
             const auto & pod_array = typeid_cast<const ColumnFloat32 *>(column.get())->getData();
             for (size_t i = 0; i < pod_array.size(); ++i)

From 945179be46bcc1b07741d180a0fdaa64396994ef Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Wed, 13 Sep 2023 14:23:09 +0000
Subject: [PATCH 2/4] Annoy: Fix LOGICAL_ERROR with default values #52258

---
 .../table-engines/mergetree-family/annindexes.md |  8 +++++---
 src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp   | 15 +++++++++++----
 src/Storages/MergeTree/MergeTreeIndexUSearch.cpp | 16 ++++++++++++----
 .../0_stateless/02354_annoy_index.reference      |  1 +
 tests/queries/0_stateless/02354_annoy_index.sql  | 12 ++++++++++++
 .../0_stateless/02354_usearch_index.reference    |  1 +
 .../queries/0_stateless/02354_usearch_index.sql  | 14 ++++++++++++++
 7 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md
index 8996133f667..d6ff7f23bb4 100644
--- a/docs/en/engines/table-engines/mergetree-family/annindexes.md
+++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md
@@ -203,9 +203,10 @@ Parameter `NumTrees` is the number of trees which the algorithm creates (default
 more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes.
 
 :::note
-Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use
-[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1
-CHECK length(vectors) = 256`.
+Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays must have the same length. To avoid
+errors, you can use a [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT
+constraint_name_1 CHECK length(vectors) = 256`. Also, unspecified `Array` values in INSERT statements (i.e. default values) are not
+supported.
 :::
 
 Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger
@@ -223,6 +224,7 @@ SETTINGS annoy_index_search_k_nodes=100;
 The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
 [here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
 :::
+
 ## USearch {#usearch}
 
 This type of ANN index is based on the [the USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
index 15830513162..f00f11359e1 100644
--- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
@@ -157,18 +157,25 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
         const auto & column_array_data = column_array->getData();
         const auto & column_arary_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();
 
-        if (column_arary_data_float_data.empty())
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
-
         const auto & column_array_offsets = column_array->getOffsets();
         const size_t num_rows = column_array_offsets.size();
 
+        /// The index dimension is inferred from the inserted arrays (array cardinality). If no value was specified in the INSERT statement
+        /// for the annoy-indexed column (i.e. default value), we have a problem. Reject such values.
+        if (column_array_offsets.empty() || column_array_offsets[0] == 0)
+            /// (The if condition is a bit weird but I have seen either with default values)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Tried to insert {} rows into Annoy index but there were no values to insert. Likely, the INSERT used default values - these are not supported for Annoy.", rows_read);
+
         /// Check all sizes are the same
         size_t dimension = column_array_offsets[0];
         for (size_t i = 0; i < num_rows - 1; ++i)
             if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
                 throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
 
+        /// Also check that previously inserted blocks have the same size as this block
+        if (index && index->getDimensions() != dimension)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
+
         if (!index)
             index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(dimension);
 
@@ -363,7 +370,7 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
     {
         throw Exception(
             ErrorCodes::ILLEGAL_COLUMN,
-            "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
+            "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
     };
 
     DataTypePtr data_type = index.sample_block.getDataTypes()[0];
diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
index de556eb7e07..9531b9188bf 100644
--- a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
@@ -176,18 +176,25 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
         const auto & column_array_data = column_array->getData();
         const auto & column_array_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();
 
-        if (column_array_data_float_data.empty())
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
-
         const auto & column_array_offsets = column_array->getOffsets();
         const size_t num_rows = column_array_offsets.size();
 
+        /// The index dimension is inferred from the inserted arrays (array cardinality). If no value was specified in the INSERT statement
+        /// for the usearch-indexed column (i.e. default value), we have a problem. Reject such values.
+        if (column_array_offsets.empty() || column_array_offsets[0] == 0)
+            /// (The if condition is a bit weird but I have seen either with default values)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Tried to insert {} rows into usearch index but there were no values to insert. Likely, the INSERT used default values - these are not supported for Annoy.", rows_read);
+
         /// Check all sizes are the same
         size_t dimension = column_array_offsets[0];
         for (size_t i = 0; i < num_rows - 1; ++i)
             if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
                 throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
 
+        /// Also check that previously inserted blocks have the same size as this block
+        if (index && index->getDimensions() != dimension)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
+
         if (!index)
             index = std::make_shared<USearchIndexWithSerialization<Metric>>(dimension, scalar_kind);
 
@@ -413,7 +420,8 @@ void usearchIndexValidator(const IndexDescription & index, bool /* attach */)
     auto throw_unsupported_underlying_column_exception = []()
     {
         throw Exception(
-            ErrorCodes::ILLEGAL_COLUMN, "USearch indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
+            ErrorCodes::ILLEGAL_COLUMN,
+            "USearch can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
     };
 
     DataTypePtr data_type = index.sample_block.getDataTypes()[0];
diff --git a/tests/queries/0_stateless/02354_annoy_index.reference b/tests/queries/0_stateless/02354_annoy_index.reference
index a0ffb1e1f7f..81f2ff8aa59 100644
--- a/tests/queries/0_stateless/02354_annoy_index.reference
+++ b/tests/queries/0_stateless/02354_annoy_index.reference
@@ -147,3 +147,4 @@ Expression (Projection)
 9000	[9000,0,0,0]
 1	(1,0,0,0)
 9000	(9000,0,0,0)
+--- Bugs ---
diff --git a/tests/queries/0_stateless/02354_annoy_index.sql b/tests/queries/0_stateless/02354_annoy_index.sql
index eab7a62c5f0..67ef64cc301 100644
--- a/tests/queries/0_stateless/02354_annoy_index.sql
+++ b/tests/queries/0_stateless/02354_annoy_index.sql
@@ -281,3 +281,15 @@ ORDER BY L2Distance(vector, (9000.0, 0.0, 0.0, 0.0))
 LIMIT 1;
 
 DROP TABLE tab;
+
+SELECT '--- Bugs ---';
+
+-- Arrays with default values are rejected, issue #52258
+CREATE TABLE tab (`uuid` String, `vector` Array(Float32), `version` UInt32, INDEX idx vector TYPE annoy()) ENGINE = MergeTree() ORDER BY (uuid);
+INSERT INTO tab (uuid, version) VALUES ('1', 3); -- { serverError INCORRECT_DATA }
+DROP TABLE tab;
+
+-- Tuples with default value work
+CREATE TABLE tab (`uuid` String, `vector` Tuple(Float32, Float32), `version` UInt32, INDEX idx vector TYPE annoy()) ENGINE = MergeTree() ORDER BY (uuid);
+INSERT INTO tab (uuid, version) VALUES ('1', 3); -- works fine
+DROP TABLE tab;
diff --git a/tests/queries/0_stateless/02354_usearch_index.reference b/tests/queries/0_stateless/02354_usearch_index.reference
index 893a092a386..c2791e99a54 100644
--- a/tests/queries/0_stateless/02354_usearch_index.reference
+++ b/tests/queries/0_stateless/02354_usearch_index.reference
@@ -150,3 +150,4 @@ Expression (Projection)
 1	[0,0,10]
 2	[0,0,10.5]
 3	[0,0,9.5]
+--- Bugs ---
diff --git a/tests/queries/0_stateless/02354_usearch_index.sql b/tests/queries/0_stateless/02354_usearch_index.sql
index e534c91b615..fc2954d6c5d 100644
--- a/tests/queries/0_stateless/02354_usearch_index.sql
+++ b/tests/queries/0_stateless/02354_usearch_index.sql
@@ -274,3 +274,17 @@ SELECT *
 FROM tab
 WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0
 LIMIT 3;
+
+DROP TABLE tab;
+
+SELECT '--- Bugs ---';
+
+-- Arrays with default values are rejected, issue #52258
+CREATE TABLE tab (`uuid` String, `vector` Array(Float32), `version` UInt32, INDEX idx vector TYPE usearch()) ENGINE = MergeTree() ORDER BY (uuid);
+INSERT INTO tab (uuid, version) VALUES ('1', 3); -- { serverError INCORRECT_DATA }
+DROP TABLE tab;
+
+-- Tuples with default value work
+CREATE TABLE tab (`uuid` String, `vector` Tuple(Float32, Float32), `version` UInt32, INDEX idx vector TYPE usearch()) ENGINE = MergeTree() ORDER BY (uuid);
+INSERT INTO tab (uuid, version) VALUES ('1', 3); -- works fine
+DROP TABLE tab;

From 9f009cccd5d01be29ff8e8ab6063297ec2a73b46 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Sun, 17 Sep 2023 15:22:51 +0000
Subject: [PATCH 3/4] Incorporate review feedback

---
 src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 18 ++++++++++--------
 .../MergeTree/MergeTreeIndexUSearch.cpp        | 18 ++++++++++--------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
index f00f11359e1..d15d89ad6f9 100644
--- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
@@ -160,21 +160,23 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
         const auto & column_array_offsets = column_array->getOffsets();
         const size_t num_rows = column_array_offsets.size();
 
-        /// The index dimension is inferred from the inserted arrays (array cardinality). If no value was specified in the INSERT statement
-        /// for the annoy-indexed column (i.e. default value), we have a problem. Reject such values.
-        if (column_array_offsets.empty() || column_array_offsets[0] == 0)
-            /// (The if condition is a bit weird but I have seen either with default values)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "Tried to insert {} rows into Annoy index but there were no values to insert. Likely, the INSERT used default values - these are not supported for Annoy.", rows_read);
+        /// The Annoy algorithm naturally assumes that the indexed vectors have dimension >= 0. This condition is violated if empty arrays
+        /// are INSERTed into an Annoy-indexed column or if no value was specified at all in which case the arrays take on their default
+        /// value which is also an empty array.
+        if (column_array->isDefaultAt(0))
+            throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
 
         /// Check all sizes are the same
         size_t dimension = column_array_offsets[0];
         for (size_t i = 0; i < num_rows - 1; ++i)
             if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
-                throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
+                throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
 
-        /// Also check that previously inserted blocks have the same size as this block
+        /// Also check that previously inserted blocks have the same size as this block.
+        /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
+        /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
         if (index && index->getDimensions() != dimension)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
+            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
 
         if (!index)
             index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(dimension);
diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
index 9531b9188bf..a00cab6ca59 100644
--- a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
@@ -179,21 +179,23 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
         const auto & column_array_offsets = column_array->getOffsets();
         const size_t num_rows = column_array_offsets.size();
 
-        /// The index dimension is inferred from the inserted arrays (array cardinality). If no value was specified in the INSERT statement
-        /// for the usearch-indexed column (i.e. default value), we have a problem. Reject such values.
-        if (column_array_offsets.empty() || column_array_offsets[0] == 0)
-            /// (The if condition is a bit weird but I have seen either with default values)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "Tried to insert {} rows into usearch index but there were no values to insert. Likely, the INSERT used default values - these are not supported for Annoy.", rows_read);
+        /// The Usearch algorithm naturally assumes that the indexed vectors have dimension >= 0. This condition is violated if empty arrays
+        /// are INSERTed into an Usearch-indexed column or if no value was specified at all in which case the arrays take on their default
+        /// value which is also an empty array.
+        if (column_array->isDefaultAt(0))
+            throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
 
         /// Check all sizes are the same
         size_t dimension = column_array_offsets[0];
         for (size_t i = 0; i < num_rows - 1; ++i)
             if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
-                throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
+                throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
 
-        /// Also check that previously inserted blocks have the same size as this block
+        /// Also check that previously inserted blocks have the same size as this block.
+        /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
+        /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
         if (index && index->getDimensions() != dimension)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
+            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
 
         if (!index)
             index = std::make_shared<USearchIndexWithSerialization<Metric>>(dimension, scalar_kind);

From de4f22e20aa1c4d044a525474f5ed2d03e6c59a5 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Sun, 17 Sep 2023 15:26:36 +0000
Subject: [PATCH 4/4] Typo

---
 src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp   | 4 ++--
 src/Storages/MergeTree/MergeTreeIndexUSearch.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
index d15d89ad6f9..3eec8614dcd 100644
--- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
@@ -160,9 +160,9 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
         const auto & column_array_offsets = column_array->getOffsets();
         const size_t num_rows = column_array_offsets.size();
 
-        /// The Annoy algorithm naturally assumes that the indexed vectors have dimension >= 0. This condition is violated if empty arrays
+        /// The Annoy algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
         /// are INSERTed into an Annoy-indexed column or if no value was specified at all in which case the arrays take on their default
-        /// value which is also an empty array.
+        /// value which is also empty.
         if (column_array->isDefaultAt(0))
             throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
 
diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
index a00cab6ca59..009c004faea 100644
--- a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
@@ -179,9 +179,9 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
         const auto & column_array_offsets = column_array->getOffsets();
         const size_t num_rows = column_array_offsets.size();
 
-        /// The Usearch algorithm naturally assumes that the indexed vectors have dimension >= 0. This condition is violated if empty arrays
+        /// The Usearch algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
         /// are INSERTed into an Usearch-indexed column or if no value was specified at all in which case the arrays take on their default
-        /// value which is also an empty array.
+        /// value which is also empty.
         if (column_array->isDefaultAt(0))
             throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);