Merge branch 'master' into format-settings-parsing

This commit is contained in:
Alexey Milovidov 2024-08-16 04:15:48 +02:00
commit 77c8bbda43
34 changed files with 440 additions and 159 deletions

View File

@ -9,4 +9,14 @@ target_include_directories(_usearch SYSTEM INTERFACE
${SIMSIMD_PROJECT_DIR}/include
${USEARCH_PROJECT_DIR}/include)
target_compile_definitions(_usearch INTERFACE USEARCH_USE_FP16LIB)
# target_compile_definitions(_usearch INTERFACE USEARCH_USE_SIMSIMD)
# ^^ simsimd is not enabled at the moment. Reasons:
# - Vectorization is important for raw scans but not so much for HNSW. We use usearch only for HNSW.
# - Simsimd does compile-time dispatch (choice of SIMD kernels determined by capabilities of the build machine) or dynamic dispatch (SIMD
# kernels chosen at runtime based on cpuid instruction). Since current builds are limited to SSE 4.2 (x86) and NEON (ARM), the speedup of
# the former would be moderate compared to AVX-512 / SVE. The latter is at the moment too fragile with respect to portability across x86
# and ARM machines ... certain combinations of quantizations / distance functions / SIMD instructions are not implemented at the moment.
add_library(ch_contrib::usearch ALIAS _usearch)

View File

@ -129,6 +129,7 @@ configure
# Check that all new/changed settings were added in settings changes history.
# Some settings can be different for builds with sanitizers, so we check
# settings changes only for non-sanitizer builds.
# Also, the automatic value of 'max_threads' and similar was displayed as "'auto(...)'" in previous versions instead of "auto(...)".
IS_SANITIZED=$(clickhouse-local --query "SELECT value LIKE '%-fsanitize=%' FROM system.build_options WHERE name = 'CXX_FLAGS'")
if [ "${IS_SANITIZED}" -eq "0" ]
@ -145,7 +146,9 @@ then
old_settings.value AS old_value
FROM new_settings
LEFT JOIN old_settings ON new_settings.name = old_settings.name
WHERE (new_settings.value != old_settings.value) AND (name NOT IN (
WHERE (new_value != old_value)
AND NOT (startsWith(new_value, 'auto(') AND old_value LIKE '%auto(%')
AND (name NOT IN (
SELECT arrayJoin(tupleElement(changes, 'name'))
FROM
(
@ -177,7 +180,7 @@ then
if [ -s changed_settings.txt ]
then
mv changed_settings.txt /test_output/
echo -e "Changed settings are not reflected in settings changes history (see changed_settings.txt)$FAIL$(head_escaped /test_output/changed_settings.txt)" >> /test_output/test_results.tsv
echo -e "Changed settings are not reflected in the settings changes history (see changed_settings.txt)$FAIL$(head_escaped /test_output/changed_settings.txt)" >> /test_output/test_results.tsv
else
echo -e "There are no changed settings or they are reflected in settings changes history$OK" >> /test_output/test_results.tsv
fi

View File

@ -22,10 +22,10 @@ ORDER BY Distance(vectors, Point)
LIMIT N
```
`vectors` contains N-dimensional values of type [Array(Float32)](../../../sql-reference/data-types/array.md), for example embeddings.
Function `Distance` computes the distance between two vectors. Often, the Euclidean (L2) distance is chosen as distance function but [other
distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17,
0.33, ...)`, and `N` limits the number of search results.
`vectors` contains N-dimensional values of type [Array(Float32)](../../../sql-reference/data-types/array.md) or
[Array(Float64)](../../../sql-reference/data-types/array.md), for example embeddings. Function `Distance` computes the distance between two
vectors. Often, the Euclidean (L2) distance is chosen as the distance function, but
[other distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point,
e.g. `(0.17, 0.33, ...)`, and `N` limits the number of search results.
This query returns the top-`N` closest points to the reference point. Parameter `N` limits the number of returned values which is useful for
situations where `MaxDistance` is difficult to determine in advance.

View File

@ -93,7 +93,7 @@ namespace
break;
}
UUID id = parse<UUID>(line);
UUID id = parse<UUID>(line.substr(0, line.find('\t')));
line.clear();
String queries;

View File

@ -44,7 +44,7 @@ namespace ErrorCodes
namespace zkutil
{
/// Preferred size of multi() command (in number of ops)
/// Preferred size of multi command (in the number of operations)
constexpr size_t MULTI_BATCH_SIZE = 100;
struct ShuffleHost

View File

@ -79,11 +79,16 @@ std::vector<String> parseRemoteDescription(
/// Look for the corresponding closing bracket
for (m = i + 1; m < r; ++m)
{
if (description[m] == '{') ++cnt;
if (description[m] == '}') --cnt;
if (description[m] == '.' && description[m-1] == '.') last_dot = m;
if (description[m] == separator) have_splitter = true;
if (cnt == 0) break;
if (description[m] == '{')
++cnt;
if (description[m] == '}')
--cnt;
if (description[m] == '.' && description[m-1] == '.')
last_dot = m;
if (description[m] == separator)
have_splitter = true;
if (cnt == 0)
break;
}
if (cnt != 0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function '{}': incorrect brace sequence in first argument", func_name);

View File

@ -54,7 +54,7 @@ namespace
std::filesystem::path path(snapshot_path);
std::string filename = path.stem();
Strings name_parts;
splitInto<'_'>(name_parts, filename);
splitInto<'_', '.'>(name_parts, filename);
return parse<uint64_t>(name_parts[1]);
}

View File

@ -26,12 +26,16 @@ std::optional<RaftServerConfig> RaftServerConfig::parse(std::string_view server)
if (!with_id_endpoint && !with_server_type && !with_priority)
return std::nullopt;
const std::string_view id_str = parts[0];
std::string_view id_str = parts[0];
if (!id_str.starts_with("server."))
return std::nullopt;
id_str = id_str.substr(7);
if (auto eq_pos = id_str.find('='); std::string_view::npos != eq_pos)
id_str = id_str.substr(0, eq_pos);
Int32 id;
if (!tryParse(id, std::next(id_str.begin(), 7)))
if (!tryParse(id, id_str))
return std::nullopt;
if (id <= 0)
return std::nullopt;

View File

@ -24,9 +24,7 @@ void GTIDSet::tryMerge(size_t i)
void GTIDSets::parse(String gtid_format)
{
if (gtid_format.empty())
{
return;
}
std::vector<String> gtid_sets;
boost::split(gtid_sets, gtid_format, [](char c) { return c == ','; });

View File

@ -10,20 +10,19 @@ GTEST_TEST(GTIDSetsContains, Tests)
contained1, contained2, contained3, contained4, contained5,
not_contained1, not_contained2, not_contained3, not_contained4, not_contained5, not_contained6;
gtid_set.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
gtid_set.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:2-3:11:47-49");
contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:11");
contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:47-49:60");
contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:60");
contained4.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:2-16:47-49:60");
contained5.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:60");
not_contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-50, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
not_contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-50, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
not_contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:0-3:11:47-49");
not_contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:99");
not_contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:46-49:60");
not_contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:99");
not_contained6.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60, 00000000-0000-0000-0000-000000000000");
not_contained4.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:2-16:46-49:60");
not_contained5.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:99");
not_contained6.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60, 00000000-0000-0000-0000-000000000000");
ASSERT_TRUE(gtid_set.contains(contained1));
ASSERT_TRUE(gtid_set.contains(contained2));

View File

@ -237,7 +237,7 @@ SettingFieldMaxThreads & SettingFieldMaxThreads::operator=(const Field & f)
String SettingFieldMaxThreads::toString() const
{
if (is_auto)
return "'auto(" + ::DB::toString(value) + ")'";
return "auto(" + ::DB::toString(value) + ")";
else
return ::DB::toString(value);
}

View File

@ -153,7 +153,7 @@ struct SettingFieldMaxThreads
operator UInt64() const { return value; } /// NOLINT
explicit operator Field() const { return value; }
/// Writes "auto(<number>)" instead of simple "<number>" if `is_auto==true`.
/// Writes "auto(<number>)" instead of simple "<number>" if `is_auto == true`.
String toString() const;
void parseFromString(const String & str);

View File

@ -1,4 +1,5 @@
#pragma once
#include <map>
#include <mutex>
#include <unordered_map>

View File

@ -258,6 +258,20 @@ inline void readBoolText(bool & x, ReadBuffer & buf)
char tmp = '0';
readChar(tmp, buf);
x = tmp != '0';
if (!buf.eof() && isAlphaASCII(tmp))
{
if (tmp == 't' || tmp == 'T')
{
assertStringCaseInsensitive("rue", buf);
x = true;
}
else if (tmp == 'f' || tmp == 'F')
{
assertStringCaseInsensitive("alse", buf);
x = false;
}
}
}
template <typename ReturnType = void>
@ -1735,6 +1749,7 @@ inline T parse(const char * data, size_t size)
T res;
ReadBufferFromMemory buf(data, size);
readText(res, buf);
assertEOF(buf);
return res;
}
@ -1742,7 +1757,9 @@ template <typename T>
inline bool tryParse(T & res, const char * data, size_t size)
{
ReadBufferFromMemory buf(data, size);
return tryReadText(res, buf);
if (!tryReadText(res, buf))
return false;
return buf.eof();
}
template <typename T>

View File

@ -230,21 +230,37 @@ String Cluster::Address::toFullString(bool use_compact_format) const
}
}
Cluster::Address Cluster::Address::fromFullString(const String & full_string)
Cluster::Address Cluster::Address::fromFullString(std::string_view full_string)
{
const char * address_begin = full_string.data();
const char * address_end = address_begin + full_string.size();
const char * user_pw_end = strchr(full_string.data(), '@');
std::string_view user_password;
if (auto pos = full_string.find('@'); pos != std::string_view::npos)
user_password = full_string.substr(pos + 1);
/// parsing with the new shard{shard_index}[_replica{replica_index}] format
if (!user_pw_end && startsWith(full_string, "shard"))
if (user_password.empty() && full_string.starts_with("shard"))
{
const char * underscore = strchr(full_string.data(), '_');
Address address;
address.shard_index = parse<UInt32>(address_begin + strlen("shard"));
address.replica_index = underscore ? parse<UInt32>(underscore + strlen("_replica")) : 0;
if (auto underscore_pos = full_string.find('_'); underscore_pos != std::string_view::npos)
{
address.shard_index = parse<UInt32>(full_string.substr(0, underscore_pos).substr(strlen("shard")));
if (full_string.substr(underscore_pos + 1).starts_with("replica"))
{
address.replica_index = parse<UInt32>(full_string.substr(underscore_pos + 1 + strlen("replica")));
}
else if (full_string.substr(underscore_pos + 1).starts_with("all_replicas"))
{
address.replica_index = 0;
}
else
throw Exception(ErrorCodes::SYNTAX_ERROR, "Incorrect address '{}', should be in a form of `shardN_all_replicas` or `shardN_replicaM`", full_string);
}
else
{
address.shard_index = parse<UInt32>(full_string.substr(strlen("shard")));
address.replica_index = 0;
}
return address;
}
@ -255,9 +271,13 @@ Cluster::Address Cluster::Address::fromFullString(const String & full_string)
/// - credentials are exposed in file name;
/// - the file name can be too long.
const char * address_begin = full_string.data();
const char * address_end = address_begin + full_string.size();
const char * user_pw_end = strchr(address_begin, '@');
Protocol::Secure secure = Protocol::Secure::Disable;
const char * secure_tag = "+secure";
if (endsWith(full_string, secure_tag))
if (full_string.ends_with(secure_tag))
{
address_end -= strlen(secure_tag);
secure = Protocol::Secure::Enable;

View File

@ -168,7 +168,7 @@ public:
String toFullString(bool use_compact_format) const;
/// Returns address with only shard index and replica index or full address without shard index and replica index
static Address fromFullString(const String & address_full_string);
static Address fromFullString(std::string_view full_string);
/// Returns resolved address if it does resolve.
std::optional<Poco::Net::SocketAddress> getResolvedAddress() const;

View File

@ -888,13 +888,22 @@ static Field applyFunctionForField(
return (*col)[0];
}
/// applyFunction will execute the function with one `field` or the column which `field` refers to.
static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field)
{
chassert(func != nullptr);
/// Fallback for fields without block reference.
if (field.isExplicit())
return applyFunctionForField(func, current_type, field);
String result_name = "_" + func->getName() + "_" + toString(field.column_idx);
/// We cache the function result inside `field.columns`, because this function is called many times
/// for many fields of the same column. When the column is huge, for example when there are thousands of marks, we need a cache.
/// The cache key has the form `_[function_pointer]_[param_column_id]` and identifies a unique <function, param> pair.
WriteBufferFromOwnString buf;
writeText("_", buf);
writePointerHex(func.get(), buf);
writeText("_" + toString(field.column_idx), buf);
String result_name = buf.str();
const auto & columns = field.columns;
size_t result_idx = columns->size();
@ -906,6 +915,7 @@ static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr &
if (result_idx == columns->size())
{
/// When cache is missed, we calculate the whole column where the field comes from. This will avoid repeated calculation.
ColumnsWithTypeAndName args{(*columns)[field.column_idx]};
field.columns->emplace_back(ColumnWithTypeAndName {nullptr, func->getResultType(), result_name});
(*columns)[result_idx].column = func->execute(args, (*columns)[result_idx].type, columns->front().column->size());

View File

@ -55,6 +55,7 @@ const std::unordered_map<String, unum::usearch::scalar_kind_t> quantizationToSca
{"f32", unum::usearch::scalar_kind_t::f32_k},
{"f16", unum::usearch::scalar_kind_t::f16_k},
{"i8", unum::usearch::scalar_kind_t::i8_k}};
/// Usearch provides more quantizations but ^^ above ones seem the only ones comprehensively supported across all distance functions.
template<typename T>
concept is_set = std::same_as<T, std::set<typename T::key_type, typename T::key_compare, typename T::allocator_type>>;
@ -98,9 +99,6 @@ USearchIndexWithSerialization::USearchIndexWithSerialization(
unum::usearch::index_dense_config_t config(usearch_hnsw_params.m, usearch_hnsw_params.ef_construction, usearch_hnsw_params.ef_search);
config.enable_key_lookups = false; /// we don't do row-to-vector lookups
if (auto error = config.validate(); error) /// already called in vectorSimilarityIndexValidator, call again because usearch may change the config in-place
throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid parameters passed to vector similarity index. Error: {}", String(error.release()));
if (auto result = USearchIndex::make(metric, config); !result)
throw Exception(ErrorCodes::INCORRECT_DATA, "Could not create vector similarity index. Error: {}", String(result.error.release()));
else
@ -250,14 +248,47 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorVectorSimilarity::getGranuleAnd
return granule;
}
namespace
{
/// Adds `rows` vectors stored in `column_array` to the vector similarity index `index`.
///
/// `Column` is the concrete column type of the nested array data; `column_array->getData()` is cast to it.
/// `column_array_offsets` are the array offsets of `column_array` and `dimensions` is the expected length of every array.
///
/// Throws INCORRECT_DATA if two consecutive arrays differ in length or if the index rejects a vector,
/// and CANNOT_ALLOCATE_MEMORY if space for the new vectors cannot be reserved.
template <typename Column>
void updateImpl(const ColumnArray * column_array, const ColumnArray::Offsets & column_array_offsets, USearchIndexWithSerializationPtr & index, size_t dimensions, size_t rows)
{
    const auto & column_array_data = column_array->getData();
    const auto & column_array_data_float = typeid_cast<const Column &>(column_array_data);
    const auto & column_array_data_float_data = column_array_data_float.getData();

    /// Check all sizes are the same.
    /// The bound is written as `row + 1 < rows` (instead of `row < rows - 1`) so that it cannot underflow when `rows == 0`.
    for (size_t row = 0; row + 1 < rows; ++row)
        if (column_array_offsets[row + 1] - column_array_offsets[row] != dimensions)
            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column with vector similarity index must have equal length");

    /// Reserving space is mandatory
    if (!index->try_reserve(roundUpToPowerOfTwoOrZero(index->size() + rows)))
        throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index");

    for (size_t row = 0; row < rows; ++row)
    {
        /// The key of a new vector is the current size of the index.
        /// NOTE(review): for `row == 0` this reads `column_array_offsets[-1]`; presumably the offsets container guarantees that
        /// index -1 is readable and yields 0 - confirm against the container implementation.
        if (auto result = index->add(static_cast<USearchIndex::vector_key_t>(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]); !result)
            throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(result.error.release()));
        else
        {
            ProfileEvents::increment(ProfileEvents::USearchAddCount);
            ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, result.visited_members);
            ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, result.computed_distances);
        }
    }
}
}
void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_t * pos, size_t limit)
{
if (*pos >= block.rows())
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"The provided position is not less than the number of block rows. Position: {}, Block rows: {}.",
*pos,
block.rows());
*pos, block.rows());
size_t rows_read = std::min(limit, block.rows() - *pos);
@ -271,63 +302,53 @@ void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column");
const String & index_column_name = index_sample_block.getByPosition(0).name;
ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read);
const ColumnPtr & index_column = block.getByName(index_column_name).column;
ColumnPtr column_cut = index_column->cut(*pos, rows_read);
if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
{
const auto & column_array_data = column_array->getData();
const auto & column_array_data_float = typeid_cast<const ColumnFloat32 &>(column_array_data);
const auto & column_array_data_float_data = column_array_data_float.getData();
const auto * column_array = typeid_cast<const ColumnArray *>(column_cut.get());
if (!column_array)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array(Float*) column");
const auto & column_array_offsets = column_array->getOffsets();
const size_t num_rows = column_array_offsets.size();
if (column_array->empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty");
if (column_array->empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty");
/// The vector similarity algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
/// are INSERTed into an vector-similarity-indexed column or if no value was specified at all in which case the arrays take on their default
/// values which is also empty.
if (column_array->isDefaultAt(0))
throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
/// The vector similarity algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
/// are INSERTed into an vector-similarity-indexed column or if no value was specified at all in which case the arrays take on their default
/// values which is also empty.
if (column_array->isDefaultAt(0))
throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
const size_t rows = column_array->size();
/// Check all sizes are the same
const size_t dimensions = column_array_offsets[0];
for (size_t i = 0; i < num_rows - 1; ++i)
if (column_array_offsets[i + 1] - column_array_offsets[i] != dimensions)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
const auto & column_array_offsets = column_array->getOffsets();
const size_t dimensions = column_array_offsets[0];
/// Also check that previously inserted blocks have the same size as this block.
/// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
/// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
if (index && index->dimensions() != dimensions)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
if (!index)
index = std::make_shared<USearchIndexWithSerialization>(dimensions, metric_kind, scalar_kind, usearch_hnsw_params);
if (!index)
index = std::make_shared<USearchIndexWithSerialization>(dimensions, metric_kind, scalar_kind, usearch_hnsw_params);
/// Also check that previously inserted blocks have the same size as this block.
/// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
/// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
if (index->dimensions() != dimensions)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column with vector similarity index must have equal length");
/// We use Usearch's index_dense_t as index type which supports only 4 bio entries according to https://github.com/unum-cloud/usearch/tree/main/cpp
if (index->size() + num_rows > std::numeric_limits<UInt32>::max())
throw Exception(ErrorCodes::INCORRECT_DATA, "Size of vector similarity index in column {} would exceed 4 billion entries", index_column_name);
/// We use Usearch's index_dense_t as index type which supports only 4 bio entries according to https://github.com/unum-cloud/usearch/tree/main/cpp
if (index->size() + rows > std::numeric_limits<UInt32>::max())
throw Exception(ErrorCodes::INCORRECT_DATA, "Size of vector similarity index would exceed 4 billion entries");
/// Reserving space is mandatory
if (!index->try_reserve(roundUpToPowerOfTwoOrZero(index->size() + num_rows)))
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index");
DataTypePtr data_type = block.getDataTypes()[0];
const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get());
if (!data_type_array)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected data type Array(Float*)");
const TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
for (size_t row = 0; row < num_rows; ++row)
{
if (auto result = index->add(static_cast<UInt32>(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]); !result)
throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(result.error.release()));
else
{
ProfileEvents::increment(ProfileEvents::USearchAddCount);
ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, result.visited_members);
ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, result.computed_distances);
}
}
}
if (WhichDataType(nested_type_index).isFloat32())
updateImpl<ColumnFloat32>(column_array, column_array_offsets, index, dimensions, rows);
else if (WhichDataType(nested_type_index).isFloat64())
updateImpl<ColumnFloat64>(column_array, column_array_offsets, index, dimensions, rows);
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array(Float32) column");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected data type Array(Float*)");
*pos += rows_read;
}
@ -375,7 +396,7 @@ std::vector<size_t> MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(Mer
"does not match the dimension in the index ({})",
vector_similarity_condition.getDimensions(), index->dimensions());
const std::vector<float> reference_vector = vector_similarity_condition.getReferenceVector();
const std::vector<Float64> reference_vector = vector_similarity_condition.getReferenceVector();
auto search_result = index->search(reference_vector.data(), limit);
if (!search_result)
@ -486,7 +507,7 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta
if (!quantizationToScalarKind.contains(index.arguments[2].safeGet<String>()))
throw Exception(ErrorCodes::INCORRECT_DATA, "Third argument (quantization) of vector similarity index is not supported. Supported quantizations are: {}", joinByComma(quantizationToScalarKind));
/// Call Usearche's own parameter validation method for HNSW-specific parameters
/// Call Usearch's own parameter validation method for HNSW-specific parameters
UInt64 m = index.arguments[3].safeGet<UInt64>();
UInt64 ef_construction = index.arguments[4].safeGet<UInt64>();
UInt64 ef_search = index.arguments[5].safeGet<UInt64>();
@ -501,18 +522,14 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta
if (index.column_names.size() != 1 || index.data_types.size() != 1)
throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Vector similarity indexes must be created on a single column");
/// Check data type of the indexed column:
/// Check that the data type is Array(Float*)
DataTypePtr data_type = index.sample_block.getDataTypes()[0];
if (const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get()))
{
TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
if (!WhichDataType(nested_type_index).isFloat32())
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)");
}
else
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)");
}
const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get());
if (!data_type_array)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float*)");
TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
if (!WhichDataType(nested_type_index).isFloat())
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float*)");
}
}

View File

@ -24,7 +24,7 @@ namespace
{
template <typename Literal>
void extractReferenceVectorFromLiteral(std::vector<Float32> & reference_vector, Literal literal)
void extractReferenceVectorFromLiteral(std::vector<Float64> & reference_vector, Literal literal)
{
Float64 float_element_of_reference_vector;
Int64 int_element_of_reference_vector;
@ -72,7 +72,7 @@ UInt64 VectorSimilarityCondition::getLimit() const
throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported");
}
std::vector<float> VectorSimilarityCondition::getReferenceVector() const
std::vector<Float64> VectorSimilarityCondition::getReferenceVector() const
{
if (index_is_useful && query_information.has_value())
return query_information->reference_vector;

View File

@ -60,7 +60,7 @@ public:
L2
};
std::vector<Float32> reference_vector;
std::vector<Float64> reference_vector;
DistanceFunction distance_function;
String column_name;
UInt64 limit;
@ -70,7 +70,7 @@ public:
/// Returns false if query can be speeded up by an ANN index, true otherwise.
bool alwaysUnknownOrTrue(String distance_function) const;
std::vector<float> getReferenceVector() const;
std::vector<Float64> getReferenceVector() const;
size_t getDimensions() const;
String getColumnName() const;
Info::DistanceFunction getDistanceFunction() const;

View File

@ -332,6 +332,8 @@ struct DeltaLakeMetadataImpl
WhichDataType which(check_type->getTypeId());
if (which.isStringOrFixedString())
return value;
else if (isBool(check_type))
return parse<bool>(value);
else if (which.isInt8())
return parse<Int8>(value);
else if (which.isUInt8())

View File

@ -1,4 +1,4 @@
#include "StorageExternalDistributed.h"
#include <Storages/StorageExternalDistributed.h>
#include <Core/Settings.h>
#include <Storages/StorageFactory.h>
@ -6,6 +6,8 @@
#include <Interpreters/InterpreterSelectQuery.h>
#include <Core/PostgreSQL/PoolWithFailover.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Common/parseAddress.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <Common/parseRemoteDescription.h>
@ -112,14 +114,39 @@ void registerStorageExternalDistributed(StorageFactory & factory)
std::unordered_set<StoragePtr> shards;
ASTs inner_engine_args(engine_args.begin() + 1, engine_args.end());
ASTPtr * address_arg = nullptr;
/// If there is a named collection argument, named `addresses_expr`
for (auto & node : inner_engine_args)
{
if (ASTFunction * func = node->as<ASTFunction>(); func && func->name == "equals" && func->arguments)
{
if (ASTExpressionList * func_args = func->arguments->as<ASTExpressionList>(); func_args && func_args->children.size() == 2)
{
if (ASTIdentifier * arg_name = func_args->children[0]->as<ASTIdentifier>(); arg_name && arg_name->name() == "addresses_expr")
{
address_arg = &func_args->children[1];
break;
}
}
}
}
/// Otherwise it is the first argument.
if (!address_arg)
address_arg = &inner_engine_args.at(0);
String addresses_expr = checkAndGetLiteralArgument<String>(*address_arg, "addresses");
Strings shards_addresses = get_addresses(addresses_expr);
auto engine_name = checkAndGetLiteralArgument<String>(engine_args[0], "engine_name");
if (engine_name == "URL")
{
auto configuration = StorageURL::getConfiguration(inner_engine_args, context);
auto shards_addresses = get_addresses(configuration.addresses_expr);
auto format_settings = StorageURL::getFormatSettingsFromArgs(args);
for (const auto & shard_address : shards_addresses)
{
*address_arg = std::make_shared<ASTLiteral>(shard_address);
auto configuration = StorageURL::getConfiguration(inner_engine_args, context);
auto uri_options = parseRemoteDescription(shard_address, 0, shard_address.size(), '|', max_addresses);
if (uri_options.size() > 1)
{
@ -140,13 +167,12 @@ void registerStorageExternalDistributed(StorageFactory & factory)
else if (engine_name == "MySQL")
{
MySQLSettings mysql_settings;
auto configuration = StorageMySQL::getConfiguration(inner_engine_args, context, mysql_settings);
auto shards_addresses = get_addresses(configuration.addresses_expr);
for (const auto & shard_address : shards_addresses)
{
auto current_configuration{configuration};
current_configuration.addresses = parseRemoteDescriptionForExternalDatabase(shard_address, max_addresses, 3306);
auto pool = createMySQLPoolWithFailover(current_configuration, mysql_settings);
*address_arg = std::make_shared<ASTLiteral>(shard_address);
auto configuration = StorageMySQL::getConfiguration(inner_engine_args, context, mysql_settings);
configuration.addresses = parseRemoteDescriptionForExternalDatabase(shard_address, max_addresses, 3306);
auto pool = createMySQLPoolWithFailover(configuration, mysql_settings);
shards.insert(std::make_shared<StorageMySQL>(
args.table_id, std::move(pool), configuration.database, configuration.table,
/* replace_query = */ false, /* on_duplicate_clause = */ "",
@ -157,14 +183,13 @@ void registerStorageExternalDistributed(StorageFactory & factory)
#if USE_LIBPQXX
else if (engine_name == "PostgreSQL")
{
auto configuration = StoragePostgreSQL::getConfiguration(inner_engine_args, context);
auto shards_addresses = get_addresses(configuration.addresses_expr);
for (const auto & shard_address : shards_addresses)
{
auto current_configuration{configuration};
current_configuration.addresses = parseRemoteDescriptionForExternalDatabase(shard_address, max_addresses, 5432);
*address_arg = std::make_shared<ASTLiteral>(shard_address);
auto configuration = StoragePostgreSQL::getConfiguration(inner_engine_args, context);
configuration.addresses = parseRemoteDescriptionForExternalDatabase(shard_address, max_addresses, 5432);
auto pool = std::make_shared<postgres::PoolWithFailover>(
current_configuration,
configuration,
settings.postgresql_connection_pool_size,
settings.postgresql_connection_pool_wait_timeout,
settings.postgresql_connection_pool_retries,

View File

@ -46,7 +46,7 @@ def test_cgroup_cpu_limit():
"clickhouse local -q \"select value from system.settings where name='max_threads'\"",
num_cpus,
)
expect_output = (r"\'auto({})\'".format(math.ceil(num_cpus))).encode()
expect_output = (r"auto({})".format(math.ceil(num_cpus))).encode()
assert (
result.strip() == expect_output
), f"fail for cpu limit={num_cpus}, result={result.strip()}, expect={expect_output}"

View File

@ -6,41 +6,56 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
TEST_PREFIX=$RANDOM
TEST_PREFIX="${CLICKHOUSE_DATABASE}"
${CLICKHOUSE_CLIENT} -q "drop user if exists u_00600${TEST_PREFIX}"
${CLICKHOUSE_CLIENT} -q "create user u_00600${TEST_PREFIX} settings max_execution_time=60, readonly=1"
${CLICKHOUSE_CLIENT} -q "grant select on system.numbers to u_00600${TEST_PREFIX}"
# Block until a query with the given query_id shows up in system.processes.
#   $1 - the query_id to look for.
# Asks the server over HTTP ($CLICKHOUSE_CURL against $CLICKHOUSE_URL) for the number of
# matching entries and sleeps 0.1 s between attempts for as long as that number is 0.
# NOTE(review): the body contains two polling loops with the same condition (a one-line
# form and a multi-line form). This looks like the old and the new variant of a single
# loop ended up side by side - confirm that only one of them is intended.
function wait_for_query_to_start()
{
while [[ $($CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "SELECT count() FROM system.processes WHERE query_id = '$1'") == 0 ]]; do sleep 0.1; done
while [[ 0 -eq $($CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "SELECT count() FROM system.processes WHERE query_id = '$1'") ]]
do
sleep 0.1
done
}
# Block until system.processes reports no more queries for this test's database.
# Takes no arguments. Counts entries whose current_database equals ${CLICKHOUSE_DATABASE}
# and sleeps 0.1 s between attempts for as long as the count is non-zero.
# The "query NOT LIKE '%this query%'" filter skips any query whose text contains the
# literal "this query"; the polling query itself contains that literal (inside the LIKE
# pattern), so presumably this is what keeps it from counting itself - confirm.
function wait_for_queries_to_finish()
{
while [[ 0 -ne $($CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "SELECT count() FROM system.processes WHERE current_database = '${CLICKHOUSE_DATABASE}' AND query NOT LIKE '%this query%'") ]]
do
sleep 0.1
done
}
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=hello&replace_running_query=1" -d 'SELECT 1, count() FROM system.numbers' > /dev/null 2>&1 &
wait_for_query_to_start 'hello'
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=${CLICKHOUSE_DATABASE}hello&replace_running_query=1" -d 'SELECT 1, count() FROM system.numbers' > /dev/null 2>&1 &
wait_for_query_to_start "${CLICKHOUSE_DATABASE}hello"
# Replace it
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=hello&replace_running_query=1" -d 'SELECT 0'
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=${CLICKHOUSE_DATABASE}hello&replace_running_query=1" -d 'SELECT 0'
# Wait for it to be replaced
wait
wait_for_queries_to_finish
${CLICKHOUSE_CLIENT_BINARY} --user=u_00600${TEST_PREFIX} --query_id=42 --query='SELECT 2, count() FROM system.numbers' 2>&1 | grep -cF 'was cancelled' &
wait_for_query_to_start '42'
${CLICKHOUSE_CLIENT_BINARY} --user=u_00600${TEST_PREFIX} --query_id="${CLICKHOUSE_DATABASE}42" --query='SELECT 2, count() FROM system.numbers' 2>&1 | grep -cF 'QUERY_WAS_CANCELLED' &
wait_for_query_to_start "${CLICKHOUSE_DATABASE}42"
# Trying to run another query with the same query_id
${CLICKHOUSE_CLIENT} --query_id=42 --query='SELECT 43' 2>&1 | grep -cF 'is already running by user'
${CLICKHOUSE_CLIENT} --query_id="${CLICKHOUSE_DATABASE}42" --query='SELECT 43' 2>&1 | grep -cF 'is already running by user'
# Trying to replace query of a different user
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=42&replace_running_query=1" -d 'SELECT 1' | grep -cF 'is already running by user'
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=${CLICKHOUSE_DATABASE}42&replace_running_query=1" -d 'SELECT 1' | grep -cF 'is already running by user'
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "KILL QUERY WHERE query_id = '42' SYNC" > /dev/null
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "KILL QUERY WHERE query_id = '${CLICKHOUSE_DATABASE}42' SYNC" > /dev/null
wait
wait_for_queries_to_finish
${CLICKHOUSE_CLIENT} --query_id=42 --query='SELECT 3, count() FROM system.numbers' 2>&1 | grep -cF 'was cancelled' &
wait_for_query_to_start '42'
${CLICKHOUSE_CLIENT} --query_id=42 --replace_running_query=1 --replace_running_query_max_wait_ms=500 --query='SELECT 43' 2>&1 | grep -F "can't be stopped" > /dev/null
${CLICKHOUSE_CLIENT} --query_id="${CLICKHOUSE_DATABASE}42" --query='SELECT 3, count() FROM system.numbers' 2>&1 | grep -cF 'QUERY_WAS_CANCELLED' &
wait_for_query_to_start "${CLICKHOUSE_DATABASE}42"
${CLICKHOUSE_CLIENT} --query_id="${CLICKHOUSE_DATABASE}42" --replace_running_query=1 --replace_running_query_max_wait_ms=500 --query='SELECT 43' 2>&1 | grep -F "can't be stopped" > /dev/null
wait
${CLICKHOUSE_CLIENT} --query_id=42 --replace_running_query=1 --query='SELECT 44'
wait_for_queries_to_finish
${CLICKHOUSE_CLIENT} --query_id="${CLICKHOUSE_DATABASE}42" --replace_running_query=1 --query='SELECT 44'
${CLICKHOUSE_CLIENT} -q "drop user u_00600${TEST_PREFIX}"

View File

@ -1,3 +1,4 @@
Rejects INSERTs of Arrays with different sizes
Issue #52258: Empty Arrays or Arrays with default values are rejected
It is possible to create parts with different Array vector sizes but there will be an error at query time
Correctness of index with > 1 mark

View File

@ -7,6 +7,12 @@ SET enable_analyzer = 1; -- 0 vs. 1 produce slightly different error codes, make
DROP TABLE IF EXISTS tab;
SELECT 'Rejects INSERTs of Arrays with different sizes';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id;
INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA }
DROP TABLE tab;
SELECT 'Issue #52258: Empty Arrays or Arrays with default values are rejected';
CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree() ORDER BY id;

View File

@ -5,4 +5,3 @@ Two or six index arguments
4nd argument (M), if given, must be UInt64 and > 1
Must be created on single column
Must be created on Array(Float32) columns
Rejects INSERTs of Arrays with different sizes

View File

@ -35,11 +35,6 @@ SELECT 'Must be created on Array(Float32) columns';
SET allow_suspicious_low_cardinality_types = 1;
CREATE TABLE tab(id Int32, vec UInt64, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Array(UInt64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
SELECT 'Rejects INSERTs of Arrays with different sizes';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id;
INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA }
DROP TABLE tab;

View File

@ -1,9 +1,7 @@
10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block
- ORDER-BY-type
5 [0,2] 0
6 [0,2.1] 0.09999990463256836
7 [0,2.2] 0.20000004768371582
- ORDER-BY-type, EXPLAIN
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
@ -20,11 +18,9 @@ Expression (Projection)
Parts: 1/1
Granules: 1/1
12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block
- ORDER-BY-type
6 [0,2] 0
7 [0,2.1] 0.09999990463256836
8 [0,2.2] 0.20000004768371582
- ORDER-BY-type, EXPLAIN
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
@ -41,11 +37,11 @@ Expression (Projection)
Parts: 1/1
Granules: 2/4
Special cases
- ORDER-BY-type
-- Non-default metric, M, ef_construction, ef_search
6 [1,9.3] 0.005731362878640178
1 [2,3.2] 0.15200169244542905
7 [5.5,4.7] 0.3503476876550442
- Special case: setting "max_limit_for_ann_queries"
-- Setting "max_limit_for_ann_queries"
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
@ -56,3 +52,62 @@ Expression (Projection)
Condition: true
Parts: 1/1
Granules: 4/4
-- Non-default quantization
1 [2,3.2] 2.3323807824711897
2 [4.2,3.4] 4.427188573446585
0 [4.6,2.3] 4.609772130377966
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab_f32)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: idx
Description: vector_similarity GRANULARITY 2
Parts: 1/1
Granules: 2/4
1 [2,3.2] 2.3323807824711897
2 [4.2,3.4] 4.427188573446585
0 [4.6,2.3] 4.609772130377966
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab_f16)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: idx
Description: vector_similarity GRANULARITY 2
Parts: 1/1
Granules: 2/4
1 [2,3.2] 2.3323807824711897
2 [4.2,3.4] 4.427188573446585
0 [4.6,2.3] 4.609772130377966
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab_i8)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: idx
Description: vector_similarity GRANULARITY 2
Parts: 1/1
Granules: 2/4
-- Index on Array(Float64) column
6 [0,2] 0
7 [0,2.1] 0.10000000000000009
8 [0,2.2] 0.20000000000000018

View File

@ -14,14 +14,12 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]);
SELECT '- ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
SELECT '- ORDER-BY-type, EXPLAIN';
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
@ -37,14 +35,12 @@ SELECT '12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexe
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]);
SELECT '- ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
SELECT '- ORDER-BY-type, EXPLAIN';
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
@ -56,19 +52,18 @@ DROP TABLE tab;
SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen.
-- Test with non-default metric, M, ef_construction, ef_search
SELECT '-- Non-default metric, M, ef_construction, ef_search';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99, 66) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
SELECT '- ORDER-BY-type';
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
FROM tab
ORDER BY cosineDistance(vec, reference_vec)
LIMIT 3;
SELECT '- Special case: setting "max_limit_for_ann_queries"';
SELECT '-- Setting "max_limit_for_ann_queries"';
EXPLAIN indexes=1
WITH [0.0, 2.0] as reference_vec
SELECT id, vec, cosineDistance(vec, reference_vec)
@ -78,3 +73,66 @@ LIMIT 3
SETTINGS max_limit_for_ann_queries = 2; -- LIMIT 3 > 2 --> don't use the ann index
DROP TABLE tab;
-- Section: the same data and the same nearest-neighbour query are run against three tables
-- that differ only in the third vector_similarity argument ('f32', 'f16', 'i8').
SELECT '-- Non-default quantization';
-- The trailing index arguments are (0, 0, 0) in all three tables.
-- NOTE(review): presumably 0 means "use the default" for M / ef_construction / ef_search - confirm.
CREATE TABLE tab_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
-- Identical 12 rows (ids 0..11, two-dimensional vectors) go into each table.
INSERT INTO tab_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
INSERT INTO tab_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
INSERT INTO tab_i8 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
-- For each table: first the three rows closest to [0.0, 2.0] by L2Distance, then the
-- EXPLAIN of the very same query with index usage shown (indexes = 1).
-- f32: query result
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_f32
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
-- f32: query plan
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_f32
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
-- f16: query result
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_f16
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
-- f16: query plan
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_f16
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
-- i8: query result
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_i8
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
-- i8: query plan
EXPLAIN indexes = 1
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_i8
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
-- Clean up all three tables.
DROP TABLE tab_f32;
DROP TABLE tab_f16;
DROP TABLE tab_i8;
-- Section: a vector_similarity index declared on an Array(Float64) column
-- (only 'hnsw' and 'L2Distance' are passed; all other index arguments are omitted).
SELECT '-- Index on Array(Float64) column';
CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
-- 12 rows: ids 0..5 lie on the first axis, ids 6..11 on the second axis.
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]);
-- The three rows closest to [0.0, 2.0] by L2Distance; from the inserted data these are ids 6, 7 and 8.
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
DROP TABLE tab;

View File

@ -0,0 +1 @@
50

View File

@ -0,0 +1,11 @@
-- Test: count rows through two intDiv() predicates on the sorting-key column.
-- Single-column MergeTree table; `a` is both the only column and the ORDER BY key.
CREATE TABLE IF NOT EXISTS report_metrics_v2
(
`a` UInt64
) Engine = MergeTree()
ORDER BY a;
-- Fill the table with 50000 values read from system.numbers.
insert into report_metrics_v2 SELECT * FROM system.numbers LIMIT 50000;
-- Assuming the inserted values are 0..49999 (TODO confirm): intDiv(a, 50) = 200 selects
-- a in [10000, 10049] and intDiv(a, 50000) = 0 holds for every row, so the count should be 50.
SELECT count(*) from report_metrics_v2 WHERE (intDiv(a, 50) = 200) AND (intDiv(a, 50000) = 0);
DROP TABLE report_metrics_v2;

View File

@ -0,0 +1,29 @@
-- Test: a FULL OUTER JOIN query that is expected to be stopped with TIMEOUT_EXCEEDED.
SET enable_analyzer = 1;
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
-- Two MergeTree tables, each ordered by its first column.
CREATE TABLE t1 (`a` Int64, `b` Int64) ENGINE = MergeTree ORDER BY a;
CREATE TABLE t2 (`key` Int32, `val` Int64) ENGINE = MergeTree ORDER BY key;
-- 100000 rows each; both columns of a row carry the same value (number).
insert into t1 Select number, number from numbers(100000);
insert into t2 Select number, number from numbers(100000);
-- The ON clause ORs two equality conditions with the constant expression (1 * inf).
-- The statement runs with max_execution_time = 1. and its trailing test hint requires
-- the server to answer with TIMEOUT_EXCEEDED.
-- NOTE(review): the odd constants look machine-generated (fuzzer output?) - keep the query text verbatim.
SELECT
1 * 1000.0001,
(count(1.) = -2147483647) AND (count(a) = 1.1920928955078125e-7) AND (count(val) = 1048577) AND (sum(val) = ((NULL * 1048576) / -9223372036854775807)) AND (sum(a) = ((9223372036854775806 * 10000000000.) / 1048575))
FROM
(
SELECT
a,
val
FROM t1
FULL OUTER JOIN t2 ON (t1.a = t2.key) OR (1 * inf) OR (t1.b = t2.key)
)
GROUP BY '65537'
WITH CUBE
FORMAT Null
SETTINGS max_block_size = 100, join_use_nulls = 1, max_execution_time = 1., max_result_rows = 0, max_result_bytes = 0; -- { serverError TIMEOUT_EXCEEDED }
DROP TABLE t1;
DROP TABLE t2;