mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-19 16:20:50 +00:00
Merge branch 'master' into format-settings-parsing
This commit is contained in:
commit
77c8bbda43
@ -9,4 +9,14 @@ target_include_directories(_usearch SYSTEM INTERFACE
|
||||
${SIMSIMD_PROJECT_DIR}/include
|
||||
${USEARCH_PROJECT_DIR}/include)
|
||||
|
||||
target_compile_definitions(_usearch INTERFACE USEARCH_USE_FP16LIB)
|
||||
|
||||
# target_compile_definitions(_usearch INTERFACE USEARCH_USE_SIMSIMD)
|
||||
# ^^ simsimd is not enabled at the moment. Reasons:
|
||||
# - Vectorization is important for raw scans but not so much for HNSW. We use usearch only for HNSW.
|
||||
# - Simsimd does compile-time dispatch (choice of SIMD kernels determined by capabilities of the build machine) or dynamic dispatch (SIMD
|
||||
# kernels chosen at runtime based on cpuid instruction). Since current builds are limited to SSE 4.2 (x86) and NEON (ARM), the speedup of
|
||||
# the former would be moderate compared to AVX-512 / SVE. The latter is at the moment too fragile with respect to portability across x86
|
||||
# and ARM machines ... certain combinations of quantizations / distance functions / SIMD instructions are not implemented at the moment.
|
||||
|
||||
add_library(ch_contrib::usearch ALIAS _usearch)
|
||||
|
@ -129,6 +129,7 @@ configure
|
||||
|
||||
# Check that all new/changed settings were added to the settings changes history.
|
||||
# Some settings can be different for builds with sanitizers, so we check
|
||||
# Also the automatic value of 'max_threads' and similar was displayed as "'auto(...)'" in previous versions instead of "auto(...)".
|
||||
# settings changes only for non-sanitizer builds.
|
||||
IS_SANITIZED=$(clickhouse-local --query "SELECT value LIKE '%-fsanitize=%' FROM system.build_options WHERE name = 'CXX_FLAGS'")
|
||||
if [ "${IS_SANITIZED}" -eq "0" ]
|
||||
@ -145,7 +146,9 @@ then
|
||||
old_settings.value AS old_value
|
||||
FROM new_settings
|
||||
LEFT JOIN old_settings ON new_settings.name = old_settings.name
|
||||
WHERE (new_settings.value != old_settings.value) AND (name NOT IN (
|
||||
WHERE (new_value != old_value)
|
||||
AND NOT (startsWith(new_value, 'auto(') AND old_value LIKE '%auto(%')
|
||||
AND (name NOT IN (
|
||||
SELECT arrayJoin(tupleElement(changes, 'name'))
|
||||
FROM
|
||||
(
|
||||
@ -177,7 +180,7 @@ then
|
||||
if [ -s changed_settings.txt ]
|
||||
then
|
||||
mv changed_settings.txt /test_output/
|
||||
echo -e "Changed settings are not reflected in settings changes history (see changed_settings.txt)$FAIL$(head_escaped /test_output/changed_settings.txt)" >> /test_output/test_results.tsv
|
||||
echo -e "Changed settings are not reflected in the settings changes history (see changed_settings.txt)$FAIL$(head_escaped /test_output/changed_settings.txt)" >> /test_output/test_results.tsv
|
||||
else
|
||||
echo -e "There are no changed settings or they are reflected in settings changes history$OK" >> /test_output/test_results.tsv
|
||||
fi
|
||||
|
@ -22,10 +22,10 @@ ORDER BY Distance(vectors, Point)
|
||||
LIMIT N
|
||||
```
|
||||
|
||||
`vectors` contains N-dimensional values of type [Array(Float32)](../../../sql-reference/data-types/array.md), for example embeddings.
|
||||
Function `Distance` computes the distance between two vectors. Often, the Euclidean (L2) distance is chosen as distance function but [other
|
||||
distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17,
|
||||
0.33, ...)`, and `N` limits the number of search results.
|
||||
`vectors` contains N-dimensional values of type [Array(Float32)](../../../sql-reference/data-types/array.md) or Array(Float64), for example
|
||||
embeddings. Function `Distance` computes the distance between two vectors. Often, the Euclidean (L2) distance is chosen as distance function
|
||||
but [other distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point,
|
||||
e.g. `(0.17, 0.33, ...)`, and `N` limits the number of search results.
|
||||
|
||||
This query returns the top-`N` closest points to the reference point. Parameter `N` limits the number of returned values which is useful for
|
||||
situations where `MaxDistance` is difficult to determine in advance.
|
||||
|
@ -93,7 +93,7 @@ namespace
|
||||
break;
|
||||
}
|
||||
|
||||
UUID id = parse<UUID>(line);
|
||||
UUID id = parse<UUID>(line.substr(0, line.find('\t')));
|
||||
line.clear();
|
||||
|
||||
String queries;
|
||||
|
@ -44,7 +44,7 @@ namespace ErrorCodes
|
||||
namespace zkutil
|
||||
{
|
||||
|
||||
/// Preferred size of multi() command (in number of ops)
|
||||
/// Preferred size of multi command (in the number of operations)
|
||||
constexpr size_t MULTI_BATCH_SIZE = 100;
|
||||
|
||||
struct ShuffleHost
|
||||
|
@ -79,11 +79,16 @@ std::vector<String> parseRemoteDescription(
|
||||
/// Look for the corresponding closing bracket
|
||||
for (m = i + 1; m < r; ++m)
|
||||
{
|
||||
if (description[m] == '{') ++cnt;
|
||||
if (description[m] == '}') --cnt;
|
||||
if (description[m] == '.' && description[m-1] == '.') last_dot = m;
|
||||
if (description[m] == separator) have_splitter = true;
|
||||
if (cnt == 0) break;
|
||||
if (description[m] == '{')
|
||||
++cnt;
|
||||
if (description[m] == '}')
|
||||
--cnt;
|
||||
if (description[m] == '.' && description[m-1] == '.')
|
||||
last_dot = m;
|
||||
if (description[m] == separator)
|
||||
have_splitter = true;
|
||||
if (cnt == 0)
|
||||
break;
|
||||
}
|
||||
if (cnt != 0)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function '{}': incorrect brace sequence in first argument", func_name);
|
||||
|
@ -54,7 +54,7 @@ namespace
|
||||
std::filesystem::path path(snapshot_path);
|
||||
std::string filename = path.stem();
|
||||
Strings name_parts;
|
||||
splitInto<'_'>(name_parts, filename);
|
||||
splitInto<'_', '.'>(name_parts, filename);
|
||||
return parse<uint64_t>(name_parts[1]);
|
||||
}
|
||||
|
||||
|
@ -26,12 +26,16 @@ std::optional<RaftServerConfig> RaftServerConfig::parse(std::string_view server)
|
||||
if (!with_id_endpoint && !with_server_type && !with_priority)
|
||||
return std::nullopt;
|
||||
|
||||
const std::string_view id_str = parts[0];
|
||||
std::string_view id_str = parts[0];
|
||||
if (!id_str.starts_with("server."))
|
||||
return std::nullopt;
|
||||
|
||||
id_str = id_str.substr(7);
|
||||
if (auto eq_pos = id_str.find('='); std::string_view::npos != eq_pos)
|
||||
id_str = id_str.substr(0, eq_pos);
|
||||
|
||||
Int32 id;
|
||||
if (!tryParse(id, std::next(id_str.begin(), 7)))
|
||||
if (!tryParse(id, id_str))
|
||||
return std::nullopt;
|
||||
if (id <= 0)
|
||||
return std::nullopt;
|
||||
|
@ -24,9 +24,7 @@ void GTIDSet::tryMerge(size_t i)
|
||||
void GTIDSets::parse(String gtid_format)
|
||||
{
|
||||
if (gtid_format.empty())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<String> gtid_sets;
|
||||
boost::split(gtid_sets, gtid_format, [](char c) { return c == ','; });
|
||||
|
@ -10,20 +10,19 @@ GTEST_TEST(GTIDSetsContains, Tests)
|
||||
contained1, contained2, contained3, contained4, contained5,
|
||||
not_contained1, not_contained2, not_contained3, not_contained4, not_contained5, not_contained6;
|
||||
|
||||
gtid_set.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
|
||||
contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
|
||||
gtid_set.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
|
||||
contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
|
||||
contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:2-3:11:47-49");
|
||||
contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:11");
|
||||
contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:47-49:60");
|
||||
contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:60");
|
||||
contained4.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:2-16:47-49:60");
|
||||
contained5.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:60");
|
||||
|
||||
not_contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-50, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60");
|
||||
not_contained1.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-50, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60");
|
||||
not_contained2.parse("2174B383-5441-11E8-B90A-C80AA9429562:0-3:11:47-49");
|
||||
not_contained3.parse("2174B383-5441-11E8-B90A-C80AA9429562:99");
|
||||
not_contained4.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:2-16:46-49:60");
|
||||
not_contained5.parse("24DA167-0C0C-11E8-8442-00059A3C7B00:99");
|
||||
not_contained6.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, 24DA167-0C0C-11E8-8442-00059A3C7B00:1-19:47-49:60, 00000000-0000-0000-0000-000000000000");
|
||||
|
||||
not_contained4.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:2-16:46-49:60");
|
||||
not_contained5.parse("FBC30C64-F8C9-4DDF-8CDD-066208EB433B:99");
|
||||
not_contained6.parse("2174B383-5441-11E8-B90A-C80AA9429562:1-3:11:47-49, FBC30C64-F8C9-4DDF-8CDD-066208EB433B:1-19:47-49:60, 00000000-0000-0000-0000-000000000000");
|
||||
|
||||
ASSERT_TRUE(gtid_set.contains(contained1));
|
||||
ASSERT_TRUE(gtid_set.contains(contained2));
|
||||
|
@ -237,7 +237,7 @@ SettingFieldMaxThreads & SettingFieldMaxThreads::operator=(const Field & f)
|
||||
String SettingFieldMaxThreads::toString() const
|
||||
{
|
||||
if (is_auto)
|
||||
return "'auto(" + ::DB::toString(value) + ")'";
|
||||
return "auto(" + ::DB::toString(value) + ")";
|
||||
else
|
||||
return ::DB::toString(value);
|
||||
}
|
||||
|
@ -153,7 +153,7 @@ struct SettingFieldMaxThreads
|
||||
operator UInt64() const { return value; } /// NOLINT
|
||||
explicit operator Field() const { return value; }
|
||||
|
||||
/// Writes "auto(<number>)" instead of simple "<number>" if `is_auto==true`.
|
||||
/// Writes "auto(<number>)" instead of simple "<number>" if `is_auto == true`.
|
||||
String toString() const;
|
||||
void parseFromString(const String & str);
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
|
@ -258,6 +258,20 @@ inline void readBoolText(bool & x, ReadBuffer & buf)
|
||||
char tmp = '0';
|
||||
readChar(tmp, buf);
|
||||
x = tmp != '0';
|
||||
|
||||
if (!buf.eof() && isAlphaASCII(tmp))
|
||||
{
|
||||
if (tmp == 't' || tmp == 'T')
|
||||
{
|
||||
assertStringCaseInsensitive("rue", buf);
|
||||
x = true;
|
||||
}
|
||||
else if (tmp == 'f' || tmp == 'F')
|
||||
{
|
||||
assertStringCaseInsensitive("alse", buf);
|
||||
x = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ReturnType = void>
|
||||
@ -1735,6 +1749,7 @@ inline T parse(const char * data, size_t size)
|
||||
T res;
|
||||
ReadBufferFromMemory buf(data, size);
|
||||
readText(res, buf);
|
||||
assertEOF(buf);
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -1742,7 +1757,9 @@ template <typename T>
|
||||
inline bool tryParse(T & res, const char * data, size_t size)
|
||||
{
|
||||
ReadBufferFromMemory buf(data, size);
|
||||
return tryReadText(res, buf);
|
||||
if (!tryReadText(res, buf))
|
||||
return false;
|
||||
return buf.eof();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
@ -230,21 +230,37 @@ String Cluster::Address::toFullString(bool use_compact_format) const
|
||||
}
|
||||
}
|
||||
|
||||
Cluster::Address Cluster::Address::fromFullString(const String & full_string)
|
||||
Cluster::Address Cluster::Address::fromFullString(std::string_view full_string)
|
||||
{
|
||||
const char * address_begin = full_string.data();
|
||||
const char * address_end = address_begin + full_string.size();
|
||||
|
||||
const char * user_pw_end = strchr(full_string.data(), '@');
|
||||
std::string_view user_password;
|
||||
if (auto pos = full_string.find('@'); pos != std::string_view::npos)
|
||||
user_password = full_string.substr(pos + 1);
|
||||
|
||||
/// parsing with the new shard{shard_index}[_replica{replica_index}] format
|
||||
if (!user_pw_end && startsWith(full_string, "shard"))
|
||||
if (user_password.empty() && full_string.starts_with("shard"))
|
||||
{
|
||||
const char * underscore = strchr(full_string.data(), '_');
|
||||
|
||||
Address address;
|
||||
address.shard_index = parse<UInt32>(address_begin + strlen("shard"));
|
||||
address.replica_index = underscore ? parse<UInt32>(underscore + strlen("_replica")) : 0;
|
||||
|
||||
if (auto underscore_pos = full_string.find('_'); underscore_pos != std::string_view::npos)
|
||||
{
|
||||
address.shard_index = parse<UInt32>(full_string.substr(0, underscore_pos).substr(strlen("shard")));
|
||||
|
||||
if (full_string.substr(underscore_pos + 1).starts_with("replica"))
|
||||
{
|
||||
address.replica_index = parse<UInt32>(full_string.substr(underscore_pos + 1 + strlen("replica")));
|
||||
}
|
||||
else if (full_string.substr(underscore_pos + 1).starts_with("all_replicas"))
|
||||
{
|
||||
address.replica_index = 0;
|
||||
}
|
||||
else
|
||||
throw Exception(ErrorCodes::SYNTAX_ERROR, "Incorrect address '{}', should be in a form of `shardN_all_replicas` or `shardN_replicaM`", full_string);
|
||||
}
|
||||
else
|
||||
{
|
||||
address.shard_index = parse<UInt32>(full_string.substr(strlen("shard")));
|
||||
address.replica_index = 0;
|
||||
}
|
||||
|
||||
return address;
|
||||
}
|
||||
@ -255,9 +271,13 @@ Cluster::Address Cluster::Address::fromFullString(const String & full_string)
|
||||
/// - credentials are exposed in file name;
|
||||
/// - the file name can be too long.
|
||||
|
||||
const char * address_begin = full_string.data();
|
||||
const char * address_end = address_begin + full_string.size();
|
||||
const char * user_pw_end = strchr(address_begin, '@');
|
||||
|
||||
Protocol::Secure secure = Protocol::Secure::Disable;
|
||||
const char * secure_tag = "+secure";
|
||||
if (endsWith(full_string, secure_tag))
|
||||
if (full_string.ends_with(secure_tag))
|
||||
{
|
||||
address_end -= strlen(secure_tag);
|
||||
secure = Protocol::Secure::Enable;
|
||||
|
@ -168,7 +168,7 @@ public:
|
||||
String toFullString(bool use_compact_format) const;
|
||||
|
||||
/// Returns address with only shard index and replica index or full address without shard index and replica index
|
||||
static Address fromFullString(const String & address_full_string);
|
||||
static Address fromFullString(std::string_view full_string);
|
||||
|
||||
/// Returns resolved address if it does resolve.
|
||||
std::optional<Poco::Net::SocketAddress> getResolvedAddress() const;
|
||||
|
@ -888,13 +888,22 @@ static Field applyFunctionForField(
|
||||
return (*col)[0];
|
||||
}
|
||||
|
||||
/// applyFunction will execute the function with one `field` or the column which `field` refers to.
|
||||
static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field)
|
||||
{
|
||||
chassert(func != nullptr);
|
||||
/// Fallback for fields without block reference.
|
||||
if (field.isExplicit())
|
||||
return applyFunctionForField(func, current_type, field);
|
||||
|
||||
String result_name = "_" + func->getName() + "_" + toString(field.column_idx);
|
||||
/// We will cache the function result inside `field.columns`, because this function will be called many times
|
||||
/// for many fields from the same column. When the column is huge, for example when there are thousands of marks, we need a cache.
|
||||
/// The cache key is like `_[function_pointer]_[param_column_id]` to identify a unique <function, param> pair.
|
||||
WriteBufferFromOwnString buf;
|
||||
writeText("_", buf);
|
||||
writePointerHex(func.get(), buf);
|
||||
writeText("_" + toString(field.column_idx), buf);
|
||||
String result_name = buf.str();
|
||||
const auto & columns = field.columns;
|
||||
size_t result_idx = columns->size();
|
||||
|
||||
@ -906,6 +915,7 @@ static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr &
|
||||
|
||||
if (result_idx == columns->size())
|
||||
{
|
||||
/// When cache is missed, we calculate the whole column where the field comes from. This will avoid repeated calculation.
|
||||
ColumnsWithTypeAndName args{(*columns)[field.column_idx]};
|
||||
field.columns->emplace_back(ColumnWithTypeAndName {nullptr, func->getResultType(), result_name});
|
||||
(*columns)[result_idx].column = func->execute(args, (*columns)[result_idx].type, columns->front().column->size());
|
||||
|
@ -55,6 +55,7 @@ const std::unordered_map<String, unum::usearch::scalar_kind_t> quantizationToSca
|
||||
{"f32", unum::usearch::scalar_kind_t::f32_k},
|
||||
{"f16", unum::usearch::scalar_kind_t::f16_k},
|
||||
{"i8", unum::usearch::scalar_kind_t::i8_k}};
|
||||
/// Usearch provides more quantizations, but the ones above ^^ seem to be the only ones comprehensively supported across all distance functions.
|
||||
|
||||
template<typename T>
|
||||
concept is_set = std::same_as<T, std::set<typename T::key_type, typename T::key_compare, typename T::allocator_type>>;
|
||||
@ -98,9 +99,6 @@ USearchIndexWithSerialization::USearchIndexWithSerialization(
|
||||
unum::usearch::index_dense_config_t config(usearch_hnsw_params.m, usearch_hnsw_params.ef_construction, usearch_hnsw_params.ef_search);
|
||||
config.enable_key_lookups = false; /// we don't do row-to-vector lookups
|
||||
|
||||
if (auto error = config.validate(); error) /// already called in vectorSimilarityIndexValidator, call again because usearch may change the config in-place
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid parameters passed to vector similarity index. Error: {}", String(error.release()));
|
||||
|
||||
if (auto result = USearchIndex::make(metric, config); !result)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Could not create vector similarity index. Error: {}", String(result.error.release()));
|
||||
else
|
||||
@ -250,14 +248,47 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorVectorSimilarity::getGranuleAnd
|
||||
return granule;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
template <typename Column>
|
||||
void updateImpl(const ColumnArray * column_array, const ColumnArray::Offsets & column_array_offsets, USearchIndexWithSerializationPtr & index, size_t dimensions, size_t rows)
|
||||
{
|
||||
const auto & column_array_data = column_array->getData();
|
||||
const auto & column_array_data_float = typeid_cast<const Column &>(column_array_data);
|
||||
const auto & column_array_data_float_data = column_array_data_float.getData();
|
||||
|
||||
/// Check all sizes are the same
|
||||
for (size_t row = 0; row < rows - 1; ++row)
|
||||
if (column_array_offsets[row + 1] - column_array_offsets[row] != dimensions)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column with vector similarity index must have equal length");
|
||||
|
||||
/// Reserving space is mandatory
|
||||
if (!index->try_reserve(roundUpToPowerOfTwoOrZero(index->size() + rows)))
|
||||
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index");
|
||||
|
||||
for (size_t row = 0; row < rows; ++row)
|
||||
{
|
||||
if (auto result = index->add(static_cast<USearchIndex::vector_key_t>(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]); !result)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(result.error.release()));
|
||||
else
|
||||
{
|
||||
ProfileEvents::increment(ProfileEvents::USearchAddCount);
|
||||
ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, result.visited_members);
|
||||
ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, result.computed_distances);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_t * pos, size_t limit)
|
||||
{
|
||||
if (*pos >= block.rows())
|
||||
throw Exception(
|
||||
ErrorCodes::LOGICAL_ERROR,
|
||||
"The provided position is not less than the number of block rows. Position: {}, Block rows: {}.",
|
||||
*pos,
|
||||
block.rows());
|
||||
*pos, block.rows());
|
||||
|
||||
size_t rows_read = std::min(limit, block.rows() - *pos);
|
||||
|
||||
@ -271,63 +302,53 @@ void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected block with single column");
|
||||
|
||||
const String & index_column_name = index_sample_block.getByPosition(0).name;
|
||||
ColumnPtr column_cut = block.getByName(index_column_name).column->cut(*pos, rows_read);
|
||||
const ColumnPtr & index_column = block.getByName(index_column_name).column;
|
||||
ColumnPtr column_cut = index_column->cut(*pos, rows_read);
|
||||
|
||||
if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
|
||||
{
|
||||
const auto & column_array_data = column_array->getData();
|
||||
const auto & column_array_data_float = typeid_cast<const ColumnFloat32 &>(column_array_data);
|
||||
const auto & column_array_data_float_data = column_array_data_float.getData();
|
||||
const auto * column_array = typeid_cast<const ColumnArray *>(column_cut.get());
|
||||
if (!column_array)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array(Float*) column");
|
||||
|
||||
const auto & column_array_offsets = column_array->getOffsets();
|
||||
const size_t num_rows = column_array_offsets.size();
|
||||
if (column_array->empty())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty");
|
||||
|
||||
if (column_array->empty())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty");
|
||||
/// The vector similarity algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
|
||||
/// are INSERTed into a vector-similarity-indexed column or if no value was specified at all in which case the arrays take on their default
|
||||
/// values which is also empty.
|
||||
if (column_array->isDefaultAt(0))
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
|
||||
|
||||
/// The vector similarity algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
|
||||
/// are INSERTed into a vector-similarity-indexed column or if no value was specified at all in which case the arrays take on their default
|
||||
/// values which is also empty.
|
||||
if (column_array->isDefaultAt(0))
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);
|
||||
const size_t rows = column_array->size();
|
||||
|
||||
/// Check all sizes are the same
|
||||
const size_t dimensions = column_array_offsets[0];
|
||||
for (size_t i = 0; i < num_rows - 1; ++i)
|
||||
if (column_array_offsets[i + 1] - column_array_offsets[i] != dimensions)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
|
||||
const auto & column_array_offsets = column_array->getOffsets();
|
||||
const size_t dimensions = column_array_offsets[0];
|
||||
|
||||
/// Also check that previously inserted blocks have the same size as this block.
|
||||
/// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
|
||||
/// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
|
||||
if (index && index->dimensions() != dimensions)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
|
||||
if (!index)
|
||||
index = std::make_shared<USearchIndexWithSerialization>(dimensions, metric_kind, scalar_kind, usearch_hnsw_params);
|
||||
|
||||
if (!index)
|
||||
index = std::make_shared<USearchIndexWithSerialization>(dimensions, metric_kind, scalar_kind, usearch_hnsw_params);
|
||||
/// Also check that previously inserted blocks have the same size as this block.
|
||||
/// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
|
||||
/// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
|
||||
if (index->dimensions() != dimensions)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column with vector similarity index must have equal length");
|
||||
|
||||
/// We use Usearch's index_dense_t as index type which supports only 4 billion entries according to https://github.com/unum-cloud/usearch/tree/main/cpp
|
||||
if (index->size() + num_rows > std::numeric_limits<UInt32>::max())
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Size of vector similarity index in column {} would exceed 4 billion entries", index_column_name);
|
||||
/// We use Usearch's index_dense_t as index type which supports only 4 billion entries according to https://github.com/unum-cloud/usearch/tree/main/cpp
|
||||
if (index->size() + rows > std::numeric_limits<UInt32>::max())
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Size of vector similarity index would exceed 4 billion entries");
|
||||
|
||||
/// Reserving space is mandatory
|
||||
if (!index->try_reserve(roundUpToPowerOfTwoOrZero(index->size() + num_rows)))
|
||||
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index");
|
||||
DataTypePtr data_type = block.getDataTypes()[0];
|
||||
const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get());
|
||||
if (!data_type_array)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected data type Array(Float*)");
|
||||
const TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
|
||||
|
||||
for (size_t row = 0; row < num_rows; ++row)
|
||||
{
|
||||
if (auto result = index->add(static_cast<UInt32>(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]); !result)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(result.error.release()));
|
||||
else
|
||||
{
|
||||
ProfileEvents::increment(ProfileEvents::USearchAddCount);
|
||||
ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, result.visited_members);
|
||||
ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, result.computed_distances);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (WhichDataType(nested_type_index).isFloat32())
|
||||
updateImpl<ColumnFloat32>(column_array, column_array_offsets, index, dimensions, rows);
|
||||
else if (WhichDataType(nested_type_index).isFloat64())
|
||||
updateImpl<ColumnFloat64>(column_array, column_array_offsets, index, dimensions, rows);
|
||||
else
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected Array(Float32) column");
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected data type Array(Float*)");
|
||||
|
||||
|
||||
*pos += rows_read;
|
||||
}
|
||||
@ -375,7 +396,7 @@ std::vector<size_t> MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(Mer
|
||||
"does not match the dimension in the index ({})",
|
||||
vector_similarity_condition.getDimensions(), index->dimensions());
|
||||
|
||||
const std::vector<float> reference_vector = vector_similarity_condition.getReferenceVector();
|
||||
const std::vector<Float64> reference_vector = vector_similarity_condition.getReferenceVector();
|
||||
|
||||
auto search_result = index->search(reference_vector.data(), limit);
|
||||
if (!search_result)
|
||||
@ -486,7 +507,7 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta
|
||||
if (!quantizationToScalarKind.contains(index.arguments[2].safeGet<String>()))
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Third argument (quantization) of vector similarity index is not supported. Supported quantizations are: {}", joinByComma(quantizationToScalarKind));
|
||||
|
||||
/// Call Usearche's own parameter validation method for HNSW-specific parameters
|
||||
/// Call Usearch's own parameter validation method for HNSW-specific parameters
|
||||
UInt64 m = index.arguments[3].safeGet<UInt64>();
|
||||
UInt64 ef_construction = index.arguments[4].safeGet<UInt64>();
|
||||
UInt64 ef_search = index.arguments[5].safeGet<UInt64>();
|
||||
@ -501,18 +522,14 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta
|
||||
if (index.column_names.size() != 1 || index.data_types.size() != 1)
|
||||
throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Vector similarity indexes must be created on a single column");
|
||||
|
||||
/// Check data type of the indexed column:
|
||||
/// Check that the data type is Array(Float*)
|
||||
DataTypePtr data_type = index.sample_block.getDataTypes()[0];
|
||||
if (const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get()))
|
||||
{
|
||||
TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
|
||||
if (!WhichDataType(nested_type_index).isFloat32())
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)");
|
||||
}
|
||||
else
|
||||
{
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)");
|
||||
}
|
||||
const auto * data_type_array = typeid_cast<const DataTypeArray *>(data_type.get());
|
||||
if (!data_type_array)
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float*)");
|
||||
TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
|
||||
if (!WhichDataType(nested_type_index).isFloat())
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float*)");
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -24,7 +24,7 @@ namespace
|
||||
{
|
||||
|
||||
template <typename Literal>
|
||||
void extractReferenceVectorFromLiteral(std::vector<Float32> & reference_vector, Literal literal)
|
||||
void extractReferenceVectorFromLiteral(std::vector<Float64> & reference_vector, Literal literal)
|
||||
{
|
||||
Float64 float_element_of_reference_vector;
|
||||
Int64 int_element_of_reference_vector;
|
||||
@ -72,7 +72,7 @@ UInt64 VectorSimilarityCondition::getLimit() const
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "No LIMIT section in query, not supported");
|
||||
}
|
||||
|
||||
std::vector<float> VectorSimilarityCondition::getReferenceVector() const
|
||||
std::vector<Float64> VectorSimilarityCondition::getReferenceVector() const
|
||||
{
|
||||
if (index_is_useful && query_information.has_value())
|
||||
return query_information->reference_vector;
|
||||
|
@ -60,7 +60,7 @@ public:
|
||||
L2
|
||||
};
|
||||
|
||||
std::vector<Float32> reference_vector;
|
||||
std::vector<Float64> reference_vector;
|
||||
DistanceFunction distance_function;
|
||||
String column_name;
|
||||
UInt64 limit;
|
||||
@ -70,7 +70,7 @@ public:
|
||||
/// Returns false if query can be speeded up by an ANN index, true otherwise.
|
||||
bool alwaysUnknownOrTrue(String distance_function) const;
|
||||
|
||||
std::vector<float> getReferenceVector() const;
|
||||
std::vector<Float64> getReferenceVector() const;
|
||||
size_t getDimensions() const;
|
||||
String getColumnName() const;
|
||||
Info::DistanceFunction getDistanceFunction() const;
|
||||
|
@ -332,6 +332,8 @@ struct DeltaLakeMetadataImpl
|
||||
WhichDataType which(check_type->getTypeId());
|
||||
if (which.isStringOrFixedString())
|
||||
return value;
|
||||
else if (isBool(check_type))
|
||||
return parse<bool>(value);
|
||||
else if (which.isInt8())
|
||||
return parse<Int8>(value);
|
||||
else if (which.isUInt8())
|
||||
|
@ -1,4 +1,4 @@
|
||||
#include "StorageExternalDistributed.h"
|
||||
#include <Storages/StorageExternalDistributed.h>
|
||||
|
||||
#include <Core/Settings.h>
|
||||
#include <Storages/StorageFactory.h>
|
||||
@ -6,6 +6,8 @@
|
||||
#include <Interpreters/InterpreterSelectQuery.h>
|
||||
#include <Core/PostgreSQL/PoolWithFailover.h>
|
||||
#include <Parsers/ASTLiteral.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
#include <Parsers/ASTIdentifier.h>
|
||||
#include <Common/parseAddress.h>
|
||||
#include <Processors/QueryPlan/QueryPlan.h>
|
||||
#include <Common/parseRemoteDescription.h>
|
||||
@ -112,14 +114,39 @@ void registerStorageExternalDistributed(StorageFactory & factory)
|
||||
std::unordered_set<StoragePtr> shards;
|
||||
ASTs inner_engine_args(engine_args.begin() + 1, engine_args.end());
|
||||
|
||||
ASTPtr * address_arg = nullptr;
|
||||
|
||||
/// If there is a named collection argument, named `addresses_expr`
|
||||
for (auto & node : inner_engine_args)
|
||||
{
|
||||
if (ASTFunction * func = node->as<ASTFunction>(); func && func->name == "equals" && func->arguments)
|
||||
{
|
||||
if (ASTExpressionList * func_args = func->arguments->as<ASTExpressionList>(); func_args && func_args->children.size() == 2)
|
||||
{
|
||||
if (ASTIdentifier * arg_name = func_args->children[0]->as<ASTIdentifier>(); arg_name && arg_name->name() == "addresses_expr")
|
||||
{
|
||||
address_arg = &func_args->children[1];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Otherwise it is the first argument.
|
||||
if (!address_arg)
|
||||
address_arg = &inner_engine_args.at(0);
|
||||
|
||||
String addresses_expr = checkAndGetLiteralArgument<String>(*address_arg, "addresses");
|
||||
Strings shards_addresses = get_addresses(addresses_expr);
|
||||
|
||||
auto engine_name = checkAndGetLiteralArgument<String>(engine_args[0], "engine_name");
|
||||
if (engine_name == "URL")
|
||||
{
|
||||
auto configuration = StorageURL::getConfiguration(inner_engine_args, context);
|
||||
auto shards_addresses = get_addresses(configuration.addresses_expr);
|
||||
auto format_settings = StorageURL::getFormatSettingsFromArgs(args);
|
||||
for (const auto & shard_address : shards_addresses)
|
||||
{
|
||||
*address_arg = std::make_shared<ASTLiteral>(shard_address);
|
||||
auto configuration = StorageURL::getConfiguration(inner_engine_args, context);
|
||||
auto uri_options = parseRemoteDescription(shard_address, 0, shard_address.size(), '|', max_addresses);
|
||||
if (uri_options.size() > 1)
|
||||
{
|
||||
@ -140,13 +167,12 @@ void registerStorageExternalDistributed(StorageFactory & factory)
|
||||
else if (engine_name == "MySQL")
|
||||
{
|
||||
MySQLSettings mysql_settings;
|
||||
auto configuration = StorageMySQL::getConfiguration(inner_engine_args, context, mysql_settings);
|
||||
auto shards_addresses = get_addresses(configuration.addresses_expr);
|
||||
for (const auto & shard_address : shards_addresses)
|
||||
{
|
||||
auto current_configuration{configuration};
|
||||
current_configuration.addresses = parseRemoteDescriptionForExternalDatabase(shard_address, max_addresses, 3306);
|
||||
auto pool = createMySQLPoolWithFailover(current_configuration, mysql_settings);
|
||||
*address_arg = std::make_shared<ASTLiteral>(shard_address);
|
||||
auto configuration = StorageMySQL::getConfiguration(inner_engine_args, context, mysql_settings);
|
||||
configuration.addresses = parseRemoteDescriptionForExternalDatabase(shard_address, max_addresses, 3306);
|
||||
auto pool = createMySQLPoolWithFailover(configuration, mysql_settings);
|
||||
shards.insert(std::make_shared<StorageMySQL>(
|
||||
args.table_id, std::move(pool), configuration.database, configuration.table,
|
||||
/* replace_query = */ false, /* on_duplicate_clause = */ "",
|
||||
@ -157,14 +183,13 @@ void registerStorageExternalDistributed(StorageFactory & factory)
|
||||
#if USE_LIBPQXX
|
||||
else if (engine_name == "PostgreSQL")
|
||||
{
|
||||
auto configuration = StoragePostgreSQL::getConfiguration(inner_engine_args, context);
|
||||
auto shards_addresses = get_addresses(configuration.addresses_expr);
|
||||
for (const auto & shard_address : shards_addresses)
|
||||
{
|
||||
auto current_configuration{configuration};
|
||||
current_configuration.addresses = parseRemoteDescriptionForExternalDatabase(shard_address, max_addresses, 5432);
|
||||
*address_arg = std::make_shared<ASTLiteral>(shard_address);
|
||||
auto configuration = StoragePostgreSQL::getConfiguration(inner_engine_args, context);
|
||||
configuration.addresses = parseRemoteDescriptionForExternalDatabase(shard_address, max_addresses, 5432);
|
||||
auto pool = std::make_shared<postgres::PoolWithFailover>(
|
||||
current_configuration,
|
||||
configuration,
|
||||
settings.postgresql_connection_pool_size,
|
||||
settings.postgresql_connection_pool_wait_timeout,
|
||||
settings.postgresql_connection_pool_retries,
|
||||
|
@ -46,7 +46,7 @@ def test_cgroup_cpu_limit():
|
||||
"clickhouse local -q \"select value from system.settings where name='max_threads'\"",
|
||||
num_cpus,
|
||||
)
|
||||
expect_output = (r"\'auto({})\'".format(math.ceil(num_cpus))).encode()
|
||||
expect_output = (r"auto({})".format(math.ceil(num_cpus))).encode()
|
||||
assert (
|
||||
result.strip() == expect_output
|
||||
), f"fail for cpu limit={num_cpus}, result={result.strip()}, expect={expect_output}"
|
||||
|
@ -6,41 +6,56 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CURDIR"/../shell_config.sh
|
||||
|
||||
TEST_PREFIX=$RANDOM
|
||||
TEST_PREFIX="${CLICKHOUSE_DATABASE}"
|
||||
${CLICKHOUSE_CLIENT} -q "drop user if exists u_00600${TEST_PREFIX}"
|
||||
${CLICKHOUSE_CLIENT} -q "create user u_00600${TEST_PREFIX} settings max_execution_time=60, readonly=1"
|
||||
${CLICKHOUSE_CLIENT} -q "grant select on system.numbers to u_00600${TEST_PREFIX}"
|
||||
|
||||
function wait_for_query_to_start()
|
||||
{
|
||||
while [[ $($CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "SELECT count() FROM system.processes WHERE query_id = '$1'") == 0 ]]; do sleep 0.1; done
|
||||
while [[ 0 -eq $($CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "SELECT count() FROM system.processes WHERE query_id = '$1'") ]]
|
||||
do
|
||||
sleep 0.1
|
||||
done
|
||||
}
|
||||
|
||||
function wait_for_queries_to_finish()
|
||||
{
|
||||
while [[ 0 -ne $($CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "SELECT count() FROM system.processes WHERE current_database = '${CLICKHOUSE_DATABASE}' AND query NOT LIKE '%this query%'") ]]
|
||||
do
|
||||
sleep 0.1
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=hello&replace_running_query=1" -d 'SELECT 1, count() FROM system.numbers' > /dev/null 2>&1 &
|
||||
wait_for_query_to_start 'hello'
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=${CLICKHOUSE_DATABASE}hello&replace_running_query=1" -d 'SELECT 1, count() FROM system.numbers' > /dev/null 2>&1 &
|
||||
wait_for_query_to_start "${CLICKHOUSE_DATABASE}hello"
|
||||
|
||||
# Replace it
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=hello&replace_running_query=1" -d 'SELECT 0'
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=${CLICKHOUSE_DATABASE}hello&replace_running_query=1" -d 'SELECT 0'
|
||||
|
||||
# Wait for it to be replaced
|
||||
wait
|
||||
wait_for_queries_to_finish
|
||||
|
||||
${CLICKHOUSE_CLIENT_BINARY} --user=u_00600${TEST_PREFIX} --query_id=42 --query='SELECT 2, count() FROM system.numbers' 2>&1 | grep -cF 'was cancelled' &
|
||||
wait_for_query_to_start '42'
|
||||
${CLICKHOUSE_CLIENT_BINARY} --user=u_00600${TEST_PREFIX} --query_id="${CLICKHOUSE_DATABASE}42" --query='SELECT 2, count() FROM system.numbers' 2>&1 | grep -cF 'QUERY_WAS_CANCELLED' &
|
||||
wait_for_query_to_start "${CLICKHOUSE_DATABASE}42"
|
||||
|
||||
# Trying to run another query with the same query_id
|
||||
${CLICKHOUSE_CLIENT} --query_id=42 --query='SELECT 43' 2>&1 | grep -cF 'is already running by user'
|
||||
${CLICKHOUSE_CLIENT} --query_id="${CLICKHOUSE_DATABASE}42" --query='SELECT 43' 2>&1 | grep -cF 'is already running by user'
|
||||
|
||||
# Trying to replace query of a different user
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=42&replace_running_query=1" -d 'SELECT 1' | grep -cF 'is already running by user'
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL&query_id=${CLICKHOUSE_DATABASE}42&replace_running_query=1" -d 'SELECT 1' | grep -cF 'is already running by user'
|
||||
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "KILL QUERY WHERE query_id = '42' SYNC" > /dev/null
|
||||
$CLICKHOUSE_CURL -sS "$CLICKHOUSE_URL" -d "KILL QUERY WHERE query_id = '${CLICKHOUSE_DATABASE}42' SYNC" > /dev/null
|
||||
wait
|
||||
wait_for_queries_to_finish
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query_id=42 --query='SELECT 3, count() FROM system.numbers' 2>&1 | grep -cF 'was cancelled' &
|
||||
wait_for_query_to_start '42'
|
||||
${CLICKHOUSE_CLIENT} --query_id=42 --replace_running_query=1 --replace_running_query_max_wait_ms=500 --query='SELECT 43' 2>&1 | grep -F "can't be stopped" > /dev/null
|
||||
${CLICKHOUSE_CLIENT} --query_id="${CLICKHOUSE_DATABASE}42" --query='SELECT 3, count() FROM system.numbers' 2>&1 | grep -cF 'QUERY_WAS_CANCELLED' &
|
||||
wait_for_query_to_start "${CLICKHOUSE_DATABASE}42"
|
||||
${CLICKHOUSE_CLIENT} --query_id="${CLICKHOUSE_DATABASE}42" --replace_running_query=1 --replace_running_query_max_wait_ms=500 --query='SELECT 43' 2>&1 | grep -F "can't be stopped" > /dev/null
|
||||
wait
|
||||
${CLICKHOUSE_CLIENT} --query_id=42 --replace_running_query=1 --query='SELECT 44'
|
||||
wait_for_queries_to_finish
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query_id="${CLICKHOUSE_DATABASE}42" --replace_running_query=1 --query='SELECT 44'
|
||||
${CLICKHOUSE_CLIENT} -q "drop user u_00600${TEST_PREFIX}"
|
||||
|
@ -1,3 +1,4 @@
|
||||
Rejects INSERTs of Arrays with different sizes
|
||||
Issue #52258: Empty Arrays or Arrays with default values are rejected
|
||||
It is possible to create parts with different Array vector sizes but there will be an error at query time
|
||||
Correctness of index with > 1 mark
|
||||
|
@ -7,6 +7,12 @@ SET enable_analyzer = 1; -- 0 vs. 1 produce slightly different error codes, make
|
||||
|
||||
DROP TABLE IF EXISTS tab;
|
||||
|
||||
SELECT 'Rejects INSERTs of Arrays with different sizes';
|
||||
|
||||
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id;
|
||||
INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA }
|
||||
DROP TABLE tab;
|
||||
|
||||
SELECT 'Issue #52258: Empty Arrays or Arrays with default values are rejected';
|
||||
|
||||
CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree() ORDER BY id;
|
||||
|
@ -5,4 +5,3 @@ Two or six index arguments
|
||||
4nd argument (M), if given, must be UInt64 and > 1
|
||||
Must be created on single column
|
||||
Must be created on Array(Float32) columns
|
||||
Rejects INSERTs of Arrays with different sizes
|
||||
|
@ -35,11 +35,6 @@ SELECT 'Must be created on Array(Float32) columns';
|
||||
SET allow_suspicious_low_cardinality_types = 1;
|
||||
CREATE TABLE tab(id Int32, vec UInt64, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
|
||||
CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
|
||||
CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
|
||||
CREATE TABLE tab(id Int32, vec Array(UInt64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
|
||||
CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
|
||||
CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
|
||||
|
||||
SELECT 'Rejects INSERTs of Arrays with different sizes';
|
||||
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id;
|
||||
INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA }
|
||||
DROP TABLE tab;
|
||||
|
@ -1,9 +1,7 @@
|
||||
10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block
|
||||
- ORDER-BY-type
|
||||
5 [0,2] 0
|
||||
6 [0,2.1] 0.09999990463256836
|
||||
7 [0,2.2] 0.20000004768371582
|
||||
- ORDER-BY-type, EXPLAIN
|
||||
Expression (Projection)
|
||||
Limit (preliminary LIMIT (without OFFSET))
|
||||
Sorting (Sorting for ORDER BY)
|
||||
@ -20,11 +18,9 @@ Expression (Projection)
|
||||
Parts: 1/1
|
||||
Granules: 1/1
|
||||
12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block
|
||||
- ORDER-BY-type
|
||||
6 [0,2] 0
|
||||
7 [0,2.1] 0.09999990463256836
|
||||
8 [0,2.2] 0.20000004768371582
|
||||
- ORDER-BY-type, EXPLAIN
|
||||
Expression (Projection)
|
||||
Limit (preliminary LIMIT (without OFFSET))
|
||||
Sorting (Sorting for ORDER BY)
|
||||
@ -41,11 +37,11 @@ Expression (Projection)
|
||||
Parts: 1/1
|
||||
Granules: 2/4
|
||||
Special cases
|
||||
- ORDER-BY-type
|
||||
-- Non-default metric, M, ef_construction, ef_search
|
||||
6 [1,9.3] 0.005731362878640178
|
||||
1 [2,3.2] 0.15200169244542905
|
||||
7 [5.5,4.7] 0.3503476876550442
|
||||
- Special case: setting "max_limit_for_ann_queries"
|
||||
-- Setting "max_limit_for_ann_queries"
|
||||
Expression (Projection)
|
||||
Limit (preliminary LIMIT (without OFFSET))
|
||||
Sorting (Sorting for ORDER BY)
|
||||
@ -56,3 +52,62 @@ Expression (Projection)
|
||||
Condition: true
|
||||
Parts: 1/1
|
||||
Granules: 4/4
|
||||
-- Non-default quantization
|
||||
1 [2,3.2] 2.3323807824711897
|
||||
2 [4.2,3.4] 4.427188573446585
|
||||
0 [4.6,2.3] 4.609772130377966
|
||||
Expression (Projection)
|
||||
Limit (preliminary LIMIT (without OFFSET))
|
||||
Sorting (Sorting for ORDER BY)
|
||||
Expression (Before ORDER BY)
|
||||
ReadFromMergeTree (default.tab_f32)
|
||||
Indexes:
|
||||
PrimaryKey
|
||||
Condition: true
|
||||
Parts: 1/1
|
||||
Granules: 4/4
|
||||
Skip
|
||||
Name: idx
|
||||
Description: vector_similarity GRANULARITY 2
|
||||
Parts: 1/1
|
||||
Granules: 2/4
|
||||
1 [2,3.2] 2.3323807824711897
|
||||
2 [4.2,3.4] 4.427188573446585
|
||||
0 [4.6,2.3] 4.609772130377966
|
||||
Expression (Projection)
|
||||
Limit (preliminary LIMIT (without OFFSET))
|
||||
Sorting (Sorting for ORDER BY)
|
||||
Expression (Before ORDER BY)
|
||||
ReadFromMergeTree (default.tab_f16)
|
||||
Indexes:
|
||||
PrimaryKey
|
||||
Condition: true
|
||||
Parts: 1/1
|
||||
Granules: 4/4
|
||||
Skip
|
||||
Name: idx
|
||||
Description: vector_similarity GRANULARITY 2
|
||||
Parts: 1/1
|
||||
Granules: 2/4
|
||||
1 [2,3.2] 2.3323807824711897
|
||||
2 [4.2,3.4] 4.427188573446585
|
||||
0 [4.6,2.3] 4.609772130377966
|
||||
Expression (Projection)
|
||||
Limit (preliminary LIMIT (without OFFSET))
|
||||
Sorting (Sorting for ORDER BY)
|
||||
Expression (Before ORDER BY)
|
||||
ReadFromMergeTree (default.tab_i8)
|
||||
Indexes:
|
||||
PrimaryKey
|
||||
Condition: true
|
||||
Parts: 1/1
|
||||
Granules: 4/4
|
||||
Skip
|
||||
Name: idx
|
||||
Description: vector_similarity GRANULARITY 2
|
||||
Parts: 1/1
|
||||
Granules: 2/4
|
||||
-- Index on Array(Float64) column
|
||||
6 [0,2] 0
|
||||
7 [0,2.1] 0.10000000000000009
|
||||
8 [0,2.2] 0.20000000000000018
|
||||
|
@ -14,14 +14,12 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar
|
||||
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]);
|
||||
|
||||
|
||||
SELECT '- ORDER-BY-type';
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
SELECT '- ORDER-BY-type, EXPLAIN';
|
||||
EXPLAIN indexes = 1
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
@ -37,14 +35,12 @@ SELECT '12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexe
|
||||
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
|
||||
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]);
|
||||
|
||||
SELECT '- ORDER-BY-type';
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
SELECT '- ORDER-BY-type, EXPLAIN';
|
||||
EXPLAIN indexes = 1
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
@ -56,19 +52,18 @@ DROP TABLE tab;
|
||||
|
||||
|
||||
SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen.
|
||||
-- Test with non-default metric, M, ef_construction, ef_search
|
||||
|
||||
SELECT '-- Non-default metric, M, ef_construction, ef_search';
|
||||
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99, 66) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
|
||||
INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
|
||||
|
||||
SELECT '- ORDER-BY-type';
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, cosineDistance(vec, reference_vec)
|
||||
FROM tab
|
||||
ORDER BY cosineDistance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
SELECT '- Special case: setting "max_limit_for_ann_queries"';
|
||||
SELECT '-- Setting "max_limit_for_ann_queries"';
|
||||
EXPLAIN indexes=1
|
||||
WITH [0.0, 2.0] as reference_vec
|
||||
SELECT id, vec, cosineDistance(vec, reference_vec)
|
||||
@ -78,3 +73,66 @@ LIMIT 3
|
||||
SETTINGS max_limit_for_ann_queries = 2; -- LIMIT 3 > 2 --> don't use the ann index
|
||||
|
||||
DROP TABLE tab;
|
||||
|
||||
SELECT '-- Non-default quantization';
|
||||
CREATE TABLE tab_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
|
||||
CREATE TABLE tab_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
|
||||
CREATE TABLE tab_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
|
||||
INSERT INTO tab_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
|
||||
INSERT INTO tab_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
|
||||
INSERT INTO tab_i8 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
|
||||
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab_f32
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
EXPLAIN indexes = 1
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab_f32
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab_f16
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
EXPLAIN indexes = 1
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab_f16
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab_i8
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
EXPLAIN indexes = 1
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab_i8
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
DROP TABLE tab_f32;
|
||||
DROP TABLE tab_f16;
|
||||
DROP TABLE tab_i8;
|
||||
|
||||
SELECT '-- Index on Array(Float64) column';
|
||||
CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
|
||||
INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]);
|
||||
|
||||
WITH [0.0, 2.0] AS reference_vec
|
||||
SELECT id, vec, L2Distance(vec, reference_vec)
|
||||
FROM tab
|
||||
ORDER BY L2Distance(vec, reference_vec)
|
||||
LIMIT 3;
|
||||
|
||||
DROP TABLE tab;
|
||||
|
@ -0,0 +1 @@
|
||||
50
|
11
tests/queries/0_stateless/03221_key_condition_bug.sql
Normal file
11
tests/queries/0_stateless/03221_key_condition_bug.sql
Normal file
@ -0,0 +1,11 @@
|
||||
CREATE TABLE IF NOT EXISTS report_metrics_v2
|
||||
(
|
||||
`a` UInt64
|
||||
) Engine = MergeTree()
|
||||
ORDER BY a;
|
||||
|
||||
insert into report_metrics_v2 SELECT * FROM system.numbers LIMIT 50000;
|
||||
|
||||
SELECT count(*) from report_metrics_v2 WHERE (intDiv(a, 50) = 200) AND (intDiv(a, 50000) = 0);
|
||||
|
||||
DROP TABLE report_metrics_v2;
|
29
tests/queries/0_stateless/03223_analyzer_with_cube_fuzz.sql
Normal file
29
tests/queries/0_stateless/03223_analyzer_with_cube_fuzz.sql
Normal file
@ -0,0 +1,29 @@
|
||||
SET enable_analyzer = 1;
|
||||
|
||||
DROP TABLE IF EXISTS t1;
|
||||
DROP TABLE IF EXISTS t2;
|
||||
|
||||
CREATE TABLE t1 (`a` Int64, `b` Int64) ENGINE = MergeTree ORDER BY a;
|
||||
CREATE TABLE t2 (`key` Int32, `val` Int64) ENGINE = MergeTree ORDER BY key;
|
||||
insert into t1 Select number, number from numbers(100000);
|
||||
insert into t2 Select number, number from numbers(100000);
|
||||
|
||||
|
||||
SELECT
|
||||
1 * 1000.0001,
|
||||
(count(1.) = -2147483647) AND (count(a) = 1.1920928955078125e-7) AND (count(val) = 1048577) AND (sum(val) = ((NULL * 1048576) / -9223372036854775807)) AND (sum(a) = ((9223372036854775806 * 10000000000.) / 1048575))
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
a,
|
||||
val
|
||||
FROM t1
|
||||
FULL OUTER JOIN t2 ON (t1.a = t2.key) OR (1 * inf) OR (t1.b = t2.key)
|
||||
)
|
||||
GROUP BY '65537'
|
||||
WITH CUBE
|
||||
FORMAT Null
|
||||
SETTINGS max_block_size = 100, join_use_nulls = 1, max_execution_time = 1., max_result_rows = 0, max_result_bytes = 0; -- { serverError TIMEOUT_EXCEEDED }
|
||||
|
||||
DROP TABLE t1;
|
||||
DROP TABLE t2;
|
Loading…
Reference in New Issue
Block a user