Mirror of https://github.com/ClickHouse/ClickHouse.git, synced 2024-11-21 23:21:59 +00:00

Commit 4ae488dd7f: Merge remote-tracking branch 'origin/master' into pr-enable-local-plan

Changed: contrib/usearch (vendored), 2 changes
@@ -1 +1 @@
-Subproject commit d1d33eac94acd3b628e0b446c927ec3295ef63c7
+Subproject commit 1706420acafbd83d852c512dcf343af0a4059e48
@@ -42,13 +42,12 @@ ENV TZ=Etc/UTC
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

ENV DOCKER_CHANNEL stable
# Unpin the docker version after the release 24.0.3 is released
# https://github.com/moby/moby/issues/45770#issuecomment-1618255130
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - \
    && add-apt-repository "deb https://download.docker.com/linux/ubuntu $(lsb_release -c -s) ${DOCKER_CHANNEL}" \
    && apt-get update \
    && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \
-       docker-ce='5:23.*' docker-compose-plugin='2.29.*' \
+       docker-ce="5:27.0.3*" \
+       docker-compose-plugin='2.29.*' \
    && rm -rf \
        /var/lib/apt/lists/* \
        /var/cache/debconf \
@@ -42,6 +42,10 @@ brew install ccache cmake ninja libtool gettext llvm gcc binutils grep findutils

## Checkout ClickHouse Sources {#checkout-clickhouse-sources}

+:::note
+The ClickHouse build assumes a case-sensitive file system. Case-insensitive file systems may cause errors during the build process. If necessary, please follow [these instructions](https://brianboyko.medium.com/a-case-sensitive-src-folder-for-mac-programmers-176cc82a3830) to create a new disk image and checkout the code into it.
+:::
+
``` bash
git clone --recursive git@github.com:ClickHouse/ClickHouse.git
# ...alternatively, you can use https://github.com/ClickHouse/ClickHouse.git as the repo URL.
@@ -43,7 +43,7 @@ CREATE TABLE table
(
    id Int64,
    vectors Array(Float32),
-   INDEX index_name vectors TYPE vector_similarity(method, distance_function[, quantization, connectivity, expansion_add, expansion_search]) [GRANULARITY N]
+   INDEX index_name vectors TYPE vector_similarity(method, distance_function[, quantization, hnsw_max_connections_per_layer, hnsw_candidate_list_size_for_construction]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
@@ -55,11 +55,13 @@ Parameters:
line between two points in Euclidean space), or `cosineDistance` (the [cosine
distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance) - the angle between two non-zero vectors).
- `quantization`: either `f64`, `f32`, `f16`, `bf16`, or `i8` for storing the vector with reduced precision (optional, default: `bf16`)
-- `m`: the number of neighbors per graph node (optional, default: 16)
-- `ef_construction`: (optional, default: 128)
-- `ef_search`: (optional, default: 64)
+- `hnsw_max_connections_per_layer`: the number of neighbors per HNSW graph node, also known as `M` in the [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 16)
+- `hnsw_candidate_list_size_for_construction`: the size of the dynamic candidate list when constructing the HNSW graph, also known as `ef_construction` in the original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473) (optional, default: 128)

-Value 0 for parameters `m`, `ef_construction`, and `ef_search` refers to the default value.
+Value 0 for parameters `hnsw_max_connections_per_layer` and `hnsw_candidate_list_size_for_construction` means that the default values of
+these parameters are used.

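As an illustrative sketch only (not the documentation's own example, which follows), a definition using the renamed parameters could look like this; the table name and parameter values are arbitrary:

```sql
CREATE TABLE embeddings
(
    id Int64,
    vectors Array(Float32),
    -- 'bf16' = quantization, 64 = hnsw_max_connections_per_layer, 256 = hnsw_candidate_list_size_for_construction
    INDEX idx vectors TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 64, 256) GRANULARITY 4
)
ENGINE = MergeTree
ORDER BY id;
```
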
Example:
@@ -115,6 +117,11 @@ ANN indexes are built during column insertion and merge. As a result, `INSERT` a
tables. ANN indexes are ideally used only with immutable or rarely changed data, i.e. when there are far more read requests than write
requests.

+:::tip
+To reduce the cost of building vector similarity indexes, consider setting `materialize_skip_indexes_on_insert` which disables the
+construction of skipping indexes on newly inserted parts. Search would fall back to exact search, but as inserted parts are typically small
+compared to the total table size, the performance impact of that would be negligible.
+:::

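A hedged sketch of that tip in practice; the table name, columns, and values are placeholders, and the setting is applied per `INSERT`:

```sql
-- Do not build skipping indexes (including the vector similarity index) for this insert;
-- searches over the newly inserted parts fall back to exact search until a merge rebuilds the index.
INSERT INTO tab (id, vectors)
SETTINGS materialize_skip_indexes_on_insert = 0
VALUES (1, [0.1, 0.2, 0.3]);
```
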
ANN indexes support this type of query:

``` sql
@@ -124,6 +131,7 @@ FROM table
WHERE ... -- WHERE clause is optional
ORDER BY Distance(vectors, reference_vector)
LIMIT N
+SETTINGS enable_analyzer = 0; -- Temporary limitation, will be lifted
```

:::tip
@@ -135,6 +143,10 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Dista
```
:::

+To search using a different value of HNSW parameter `hnsw_candidate_list_size_for_search` (default: 64), also known as `ef_search` in the
+original [HNSW paper](https://doi.org/10.1109/TPAMI.2018.2889473), run the `SELECT` query with `SETTINGS hnsw_candidate_list_size_for_search
+= <value>`.

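For example (a sketch; the table, reference vector, and the value 100 are arbitrary, and `enable_analyzer = 0` mirrors the temporary limitation noted above):

```sql
SELECT id
FROM tab
ORDER BY L2Distance(vectors, [0.1, 0.2, 0.3])
LIMIT 10
SETTINGS hnsw_candidate_list_size_for_search = 100, enable_analyzer = 0;
```
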
**Restrictions**: Approximate algorithms used to determine the nearest neighbors require a limit, hence queries without `LIMIT` clause
cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting
`max_limit_for_ann_queries` (default: 1 million rows). This is a safeguard to prevent large memory allocations by external libraries for
@@ -49,7 +49,7 @@ Default value: 8192.

Maximum size of data granules in bytes.

-Default value: 10Mb.
+Default value: 10485760 (ca. 10 MiB).

To restrict the granule size only by number of rows, set to 0 (not recommended).

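For illustration, assuming the setting documented above is `index_granularity_bytes` (its name sits outside this excerpt), it can be overridden per table:

```sql
CREATE TABLE t
(
    id UInt64,
    payload String
)
ENGINE = MergeTree
ORDER BY id
-- the default shown above (ca. 10 MiB)
SETTINGS index_granularity_bytes = 10485760;
```
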
@@ -24,7 +24,7 @@ Returns a random UInt32 number with uniform distribution.

Uses a linear congruential generator with an initial state obtained from the system, which means that while it appears random, it's not truly random and can be predictable if the initial state is known. For scenarios where true randomness is crucial, consider using alternative methods like system-level calls or integrating with external libraries.

-### Syntax
+**Syntax**

```sql
rand()
@@ -32,15 +32,15 @@ rand()

Alias: `rand32`

-### Arguments
+**Arguments**

None.

-### Returned value
+**Returned value**

Returns a number of type UInt32.

-### Example
+**Example**

```sql
SELECT rand();
@@ -54,23 +54,23 @@ SELECT rand();

Returns a random UInt64 number.

-### Syntax
+**Syntax**

```sql
rand64()
```

-### Arguments
+**Arguments**

None.

-### Returned value
+**Returned value**

Returns a UInt64 number with uniform distribution.

Uses a linear congruential generator with an initial state obtained from the system, which means that while it appears random, it's not truly random and can be predictable if the initial state is known. For scenarios where true randomness is crucial, consider using alternative methods like system-level calls or integrating with external libraries.

-### Example
+**Example**

```sql
SELECT rand64();
@@ -84,21 +84,21 @@ SELECT rand64();

Returns a random Float64 number.

-### Syntax
+**Syntax**

```sql
randCanonical()
```

-### Arguments
+**Arguments**

None.

-### Returned value
+**Returned value**

Returns a Float64 value between 0 (inclusive) and 1 (exclusive).

-### Example
+**Example**

```sql
SELECT randCanonical();
@@ -112,25 +112,25 @@ SELECT randCanonical();

Generates a single constant column filled with a random value. Unlike `rand`, this function ensures the same random value appears in every row of the generated column, making it useful for scenarios requiring a consistent random seed across rows in a single query.

-### Syntax
+**Syntax**

```sql
randConstant([x]);
```

-### Arguments
+**Arguments**

- **[x] (Optional):** An optional expression that influences the generated random value. Even if provided, the resulting value will still be constant within the same query execution. Different queries using the same expression will likely generate different constant values.

-### Returned value
+**Returned value**

Returns a column of type UInt32 containing the same random value in each row.

-### Implementation details
+**Implementation details**

The actual output will be different for each query execution, even with the same optional expression. The optional parameter may not significantly change the generated value compared to using `randConstant` alone.

-### Examples
+**Examples**

```sql
SELECT randConstant() AS random_value;
@@ -156,22 +156,22 @@ SELECT randConstant(10) AS random_value;

Returns a random Float64 drawn uniformly from the interval [`min`, `max`].

-### Syntax
+**Syntax**

```sql
randUniform(min, max)
```

-### Arguments
+**Arguments**

- `min` - `Float64` - left boundary of the range,
- `max` - `Float64` - right boundary of the range.

-### Returned value
+**Returned value**

A random number of type [Float64](../data-types/float.md).

-### Example
+**Example**

```sql
SELECT randUniform(5.5, 10) FROM numbers(5)

@@ -1096,7 +1096,7 @@ convertCharset(s, from, to)

## base58Encode

-Encodes a String using [Base58](https://datatracker.ietf.org/doc/html/draft-msporny-base58) in the "Bitcoin" alphabet.
+Encodes a string using [Base58](https://datatracker.ietf.org/doc/html/draft-msporny-base58) in the "Bitcoin" alphabet.

**Syntax**

@@ -1110,7 +1110,7 @@ base58Encode(plaintext)

**Returned value**

-- A string containing the encoded value of the argument. [String](../data-types/string.md).
+- A string containing the encoded value of the argument. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md).

**Example**

@@ -1128,7 +1128,7 @@ Result:

## base58Decode

-Accepts a String and decodes it using [Base58](https://datatracker.ietf.org/doc/html/draft-msporny-base58) encoding scheme using "Bitcoin" alphabet.
+Accepts a string and decodes it using the [Base58](https://datatracker.ietf.org/doc/html/draft-msporny-base58) encoding scheme with the "Bitcoin" alphabet.

**Syntax**

@@ -1138,7 +1138,7 @@ base58Decode(encoded)

**Arguments**

-- `encoded` — [String](../data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, an exception is thrown.
+- `encoded` — [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). If the string is not a valid Base58-encoded value, an exception is thrown.

**Returned value**

@@ -1170,7 +1170,7 @@ tryBase58Decode(encoded)

**Parameters**

-- `encoded`: [String](../data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, returns an empty string in case of error.
+- `encoded`: [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). If the string is not a valid Base58-encoded value, an empty string is returned.

**Returned value**

@@ -121,6 +121,14 @@ However, you can delete old data using `ALTER TABLE ... DROP PARTITION`.

To insert a default value instead of `NULL` into a column with a non-nullable data type, enable the [insert_null_as_default](../../operations/settings/settings.md#insert_null_as_default) setting.

+`INSERT` also supports CTEs (common table expressions). For example, the following two statements are equivalent:
+
+``` sql
+INSERT INTO x WITH y AS (SELECT * FROM numbers(10)) SELECT * FROM y;
+WITH y AS (SELECT * FROM numbers(10)) INSERT INTO x SELECT * FROM y;
+```

## Inserting Data from a File

**Syntax**

@@ -18,9 +18,9 @@ generateRandom(['name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_str

- `name` — Name of corresponding column.
- `TypeName` — Type of corresponding column.
-- `max_array_length` — Maximum elements for all generated arrays or maps. Defaults to `10`.
-- `max_string_length` — Maximum string length for all generated strings. Defaults to `10`.
- `random_seed` — Specify random seed manually to produce stable results. If NULL — seed is randomly generated.
+- `max_string_length` — Maximum string length for all generated strings. Defaults to `10`.
+- `max_array_length` — Maximum elements for all generated arrays or maps. Defaults to `10`.

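Given the argument order documented above (seed before the length limits), a positional call might look like this; the schema and values are arbitrary:

```sql
-- 42 = random_seed, 5 = max_string_length, 3 = max_array_length
SELECT * FROM generateRandom('id UInt32, name String, tags Array(UInt8)', 42, 5, 3) LIMIT 3;
```
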
**Returned Value**

@@ -18,9 +18,9 @@ generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_stri

- `name` — name of the corresponding column.
- `TypeName` — type of the corresponding column.
-- `max_array_length` — maximum array length for all generated arrays. Defaults to `10`.
-- `max_string_length` — maximum string length for all generated strings. Defaults to `10`.
- `random_seed` — specify the state of the random number generator manually to get stable results. If the value is `NULL`, the generator is initialized with a random state.
+- `max_string_length` — maximum string length for all generated strings. Defaults to `10`.
+- `max_array_length` — maximum array length for all generated arrays. Defaults to `10`.

**Returned value**

@@ -18,9 +18,9 @@ generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_stri

- `name` — name of the corresponding column.
- `TypeName` — type of the corresponding column.
-- `max_array_length` — maximum length of generated arrays. Defaults to 10.
-- `max_string_length` — maximum length of generated strings. Defaults to 10.
- `random_seed` — specify the random seed manually to produce stable results. If NULL, the seed is randomly generated.
+- `max_string_length` — maximum length of generated strings. Defaults to 10.
+- `max_array_length` — maximum length of generated arrays. Defaults to 10.

**Returned value**

@ -1662,8 +1662,17 @@ try
|
||||
config().getString("path", DBMS_DEFAULT_PATH),
|
||||
std::move(main_config_zk_node_cache),
|
||||
main_config_zk_changed_event,
|
||||
[&](ConfigurationPtr config, bool initial_loading)
|
||||
[&, config_file = config().getString("config-file", "config.xml")](ConfigurationPtr config, bool initial_loading)
|
||||
{
|
||||
if (!initial_loading)
|
||||
{
|
||||
/// Add back "config-file" key which is absent in the reloaded config.
|
||||
config->setString("config-file", config_file);
|
||||
|
||||
/// Apply config updates in global context.
|
||||
global_context->setConfig(config);
|
||||
}
|
||||
|
||||
Settings::checkNoSettingNamesAtTopLevel(*config, config_path);
|
||||
|
||||
ServerSettings new_server_settings;
|
||||
|
@ -4453,8 +4453,9 @@ Optimize GROUP BY when all keys in block are constant
|
||||
M(Bool, legacy_column_name_of_tuple_literal, false, R"(
|
||||
List all names of element of large tuple literals in their column names instead of hash. This settings exists only for compatibility reasons. It makes sense to set to 'true', while doing rolling update of cluster from version lower than 21.7 to higher.
|
||||
)", 0) \
|
||||
M(Bool, enable_named_columns_in_function_tuple, true, R"(
|
||||
M(Bool, enable_named_columns_in_function_tuple, false, R"(
|
||||
Generate named tuples in function tuple() when all names are unique and can be treated as unquoted identifiers.
|
||||
Beware that this setting might currently result in broken queries. It's not recommended to use in production
|
||||
)", 0) \
|
||||
\
|
||||
M(Bool, query_plan_enable_optimizations, true, R"(
|
||||
@ -5556,7 +5557,10 @@ If it is set to true, allow to specify experimental compression codecs (but we d
|
||||
Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin
|
||||
)", 0) \
|
||||
M(UInt64, max_limit_for_ann_queries, 1'000'000, R"(
|
||||
SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.
|
||||
SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes.
|
||||
)", 0) \
|
||||
M(UInt64, hnsw_candidate_list_size_for_search, 0, R"(
|
||||
The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. 0 means USearch's default value (64).
|
||||
)", 0) \
|
||||
M(Bool, throw_on_unsupported_query_inside_transaction, true, R"(
|
||||
Throw exception if unsupported query is used inside transaction
|
||||
|
@ -84,6 +84,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
|
||||
{"input_format_binary_read_json_as_string", false, false, "Add new setting to read values of JSON type as JSON string in RowBinary input format"},
|
||||
{"min_free_disk_bytes_to_perform_insert", 0, 0, "New setting."},
|
||||
{"min_free_disk_ratio_to_perform_insert", 0.0, 0.0, "New setting."},
|
||||
{"enable_named_columns_in_function_tuple", false, false, "Force disable the setting since it breaks queries"},
|
||||
{"cloud_mode_database_engine", 1, 1, "A setting for ClickHouse Cloud"},
|
||||
{"allow_experimental_shared_set_join", 1, 1, "A setting for ClickHouse Cloud"},
|
||||
{"read_through_distributed_cache", 0, 0, "A setting for ClickHouse Cloud"},
|
||||
@ -104,6 +105,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
|
||||
{"input_format_orc_dictionary_as_low_cardinality", false, true, "Treat ORC dictionary encoded columns as LowCardinality columns while reading ORC files"},
|
||||
{"allow_experimental_refreshable_materialized_view", false, true, "Not experimental anymore"},
|
||||
{"max_parts_to_move", 1000, 1000, "New setting"},
|
||||
{"hnsw_candidate_list_size_for_search", 0, 0, "New setting"},
|
||||
{"allow_reorder_prewhere_conditions", false, true, "New setting"},
|
||||
{"input_format_parquet_bloom_filter_push_down", false, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata."},
|
||||
{"date_time_64_output_format_cut_trailing_zeros_align_to_groups_of_thousands", false, false, "Dynamically trim the trailing zeros of datetime64 values to adjust the output scale to (0, 3, 6), corresponding to 'seconds', 'milliseconds', and 'microseconds'."}
|
||||
@ -158,7 +160,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
|
||||
{"output_format_native_encode_types_in_binary_format", false, false, "Added new setting to allow to write type names in binary format in Native output format"},
|
||||
{"input_format_native_decode_types_in_binary_format", false, false, "Added new setting to allow to read type names in binary format in Native output format"},
|
||||
{"read_in_order_use_buffering", false, true, "Use buffering before merging while reading in order of primary key"},
|
||||
{"enable_named_columns_in_function_tuple", false, true, "Generate named tuples in function tuple() when all names are unique and can be treated as unquoted identifiers."},
|
||||
{"enable_named_columns_in_function_tuple", false, false, "Generate named tuples in function tuple() when all names are unique and can be treated as unquoted identifiers."},
|
||||
{"optimize_trivial_insert_select", true, false, "The optimization does not make sense in many cases."},
|
||||
{"dictionary_validate_primary_key_type", false, false, "Validate primary key type for dictionaries. By default id type for simple layouts will be implicitly converted to UInt64."},
|
||||
{"collect_hash_table_stats_during_joins", false, true, "New setting."},
|
||||
|
@ -1,6 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#include <Columns/ColumnConst.h>
|
||||
#include <Common/MemorySanitizer.h>
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
@ -16,23 +18,20 @@ namespace DB
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
struct Base58Encode
|
||||
{
|
||||
static constexpr auto name = "base58Encode";
|
||||
|
||||
static void process(const ColumnString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count)
|
||||
static void processString(const ColumnString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count)
|
||||
{
|
||||
auto & dst_data = dst_column->getChars();
|
||||
auto & dst_offsets = dst_column->getOffsets();
|
||||
|
||||
/// Base58 has efficiency of 73% (8/11) [https://monerodocs.org/cryptography/base58/],
|
||||
/// and we take double scale to avoid any reallocation.
|
||||
|
||||
size_t max_result_size = static_cast<size_t>(ceil(2 * src_column.getChars().size() + 1));
|
||||
|
||||
dst_data.resize(max_result_size);
|
||||
@ -61,6 +60,37 @@ struct Base58Encode
|
||||
|
||||
dst_data.resize(current_dst_offset);
|
||||
}
|
||||
|
||||
static void processFixedString(const ColumnFixedString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count)
|
||||
{
|
||||
auto & dst_data = dst_column->getChars();
|
||||
auto & dst_offsets = dst_column->getOffsets();
|
||||
|
||||
/// Base58 has efficiency of 73% (8/11) [https://monerodocs.org/cryptography/base58/],
|
||||
/// and we take double scale to avoid any reallocation.
|
||||
size_t max_result_size = static_cast<size_t>(ceil(2 * src_column.getChars().size() + 1));
|
||||
|
||||
dst_data.resize(max_result_size);
|
||||
dst_offsets.resize(input_rows_count);
|
||||
|
||||
const auto * src = src_column.getChars().data();
|
||||
auto * dst = dst_data.data();
|
||||
|
||||
size_t N = src_column.getN();
|
||||
size_t current_dst_offset = 0;
|
||||
|
||||
for (size_t row = 0; row < input_rows_count; ++row)
|
||||
{
|
||||
size_t encoded_size = encodeBase58(&src[row * N], N, &dst[current_dst_offset]);
|
||||
current_dst_offset += encoded_size;
|
||||
dst[current_dst_offset] = 0;
|
||||
++current_dst_offset;
|
||||
|
||||
dst_offsets[row] = current_dst_offset;
|
||||
}
|
||||
|
||||
dst_data.resize(current_dst_offset);
|
||||
}
|
||||
};
|
||||
|
||||
enum class Base58DecodeErrorHandling : uint8_t
|
||||
@ -74,14 +104,13 @@ struct Base58Decode
|
||||
{
|
||||
static constexpr auto name = Name::name;
|
||||
|
||||
static void process(const ColumnString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count)
|
||||
static void processString(const ColumnString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count)
|
||||
{
|
||||
auto & dst_data = dst_column->getChars();
|
||||
auto & dst_offsets = dst_column->getOffsets();
|
||||
|
||||
/// Base58 has efficiency of 73% (8/11) [https://monerodocs.org/cryptography/base58/],
|
||||
/// and decoded value will be no longer than source.
|
||||
|
||||
size_t max_result_size = src_column.getChars().size() + 1;
|
||||
|
||||
dst_data.resize(max_result_size);
|
||||
@ -118,6 +147,45 @@ struct Base58Decode
|
||||
|
||||
dst_data.resize(current_dst_offset);
|
||||
}
|
||||
|
||||
static void processFixedString(const ColumnFixedString & src_column, ColumnString::MutablePtr & dst_column, size_t input_rows_count)
|
||||
{
|
||||
auto & dst_data = dst_column->getChars();
|
||||
auto & dst_offsets = dst_column->getOffsets();
|
||||
|
||||
/// Base58 has efficiency of 73% (8/11) [https://monerodocs.org/cryptography/base58/],
|
||||
/// and decoded value will be no longer than source.
|
||||
size_t max_result_size = src_column.getChars().size() + 1;
|
||||
|
||||
dst_data.resize(max_result_size);
|
||||
dst_offsets.resize(input_rows_count);
|
||||
|
||||
const auto * src = src_column.getChars().data();
|
||||
auto * dst = dst_data.data();
|
||||
|
||||
size_t N = src_column.getN();
|
||||
size_t current_dst_offset = 0;
|
||||
|
||||
for (size_t row = 0; row < input_rows_count; ++row)
|
||||
{
|
||||
std::optional<size_t> decoded_size = decodeBase58(&src[row * N], N, &dst[current_dst_offset]);
|
||||
if (!decoded_size)
|
||||
{
|
||||
if constexpr (ErrorHandling == Base58DecodeErrorHandling::ThrowException)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid Base58 value, cannot be decoded");
|
||||
else
|
||||
decoded_size = 0;
|
||||
}
|
||||
|
||||
current_dst_offset += *decoded_size;
|
||||
dst[current_dst_offset] = 0;
|
||||
++current_dst_offset;
|
||||
|
||||
dst_offsets[row] = current_dst_offset;
|
||||
}
|
||||
|
||||
dst_data.resize(current_dst_offset);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Func>
|
||||
@ -135,14 +203,10 @@ public:
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
{
|
||||
if (arguments.size() != 1)
|
||||
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Wrong number of arguments for function {}: 1 expected.", getName());
|
||||
|
||||
if (!isString(arguments[0].type))
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
|
||||
"Illegal type {} of first argument of function {}. Must be String.",
|
||||
arguments[0].type->getName(), getName());
|
||||
FunctionArgumentDescriptors args{
|
||||
{"arg", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isStringOrFixedString), nullptr, "String or FixedString"}
|
||||
};
|
||||
validateFunctionArguments(*this, arguments, args);
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
}
|
||||
@ -154,19 +218,25 @@ public:
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
|
||||
{
|
||||
const ColumnPtr column_string = arguments[0].column;
|
||||
const ColumnString * input = checkAndGetColumn<ColumnString>(column_string.get());
|
||||
if (!input)
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Illegal column {} of first argument of function {}, must be String",
|
||||
arguments[0].column->getName(), getName());
|
||||
const ColumnPtr col = arguments[0].column;
|
||||
|
||||
auto dst_column = ColumnString::create();
|
||||
if (const ColumnString * col_string = checkAndGetColumn<ColumnString>(col.get()))
|
||||
{
|
||||
auto col_res = ColumnString::create();
|
||||
Func::processString(*col_string, col_res, input_rows_count);
|
||||
return col_res;
|
||||
}
|
||||
else if (const ColumnFixedString * col_fixed_string = checkAndGetColumn<ColumnFixedString>(col.get()))
|
||||
{
|
||||
auto col_res = ColumnString::create();
|
||||
Func::processFixedString(*col_fixed_string, col_res, input_rows_count);
|
||||
return col_res;
|
||||
}
|
||||
|
||||
Func::process(*input, dst_column, input_rows_count);
|
||||
|
||||
return dst_column;
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Illegal column {} of first argument of function {}, must be String or FixedString",
|
||||
arguments[0].column->getName(), getName());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -13,7 +13,6 @@ namespace DB
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
@ -205,7 +204,11 @@ struct ReplaceRegexpImpl
|
||||
size_t input_rows_count)
|
||||
{
|
||||
if (needle.empty())
|
||||
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
||||
{
|
||||
res_data.assign(haystack_data);
|
||||
res_offsets.assign(haystack_offsets);
|
||||
return;
|
||||
}
|
||||
|
||||
ColumnString::Offset res_offset = 0;
|
||||
res_data.reserve(haystack_data.size());
|
||||
@ -240,7 +243,7 @@ struct ReplaceRegexpImpl
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
size_t from = i > 0 ? haystack_offsets[i - 1] : 0;
|
||||
size_t from = haystack_offsets[i - 1];
|
||||
|
||||
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + from);
|
||||
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - from - 1);
|
||||
@ -271,17 +274,24 @@ struct ReplaceRegexpImpl
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
|
||||
size_t hs_from = haystack_offsets[i - 1];
|
||||
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + hs_from);
|
||||
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - hs_from - 1);
|
||||
|
||||
size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0;
|
||||
size_t ndl_from = needle_offsets[i - 1];
|
||||
const char * ndl_data = reinterpret_cast<const char *>(needle_data.data() + ndl_from);
|
||||
const size_t ndl_length = static_cast<unsigned>(needle_offsets[i] - ndl_from - 1);
|
||||
std::string_view needle(ndl_data, ndl_length);
|
||||
|
||||
if (needle.empty())
|
||||
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
||||
{
|
||||
res_data.insert(res_data.end(), hs_data, hs_data + hs_length);
|
||||
res_data.push_back(0);
|
||||
|
||||
res_offset += hs_length + 1;
|
||||
res_offsets[i] = res_offset;
|
||||
continue;
|
||||
}
|
||||
|
||||
re2::RE2 searcher(needle, regexp_options);
|
||||
if (!searcher.ok())
|
||||
@ -308,7 +318,11 @@ struct ReplaceRegexpImpl
|
||||
assert(haystack_offsets.size() == replacement_offsets.size());
|
||||
|
||||
if (needle.empty())
|
||||
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
||||
{
|
||||
res_data.assign(haystack_data);
|
||||
res_offsets.assign(haystack_offsets);
|
||||
return;
|
||||
}
|
||||
|
||||
ColumnString::Offset res_offset = 0;
|
||||
res_data.reserve(haystack_data.size());
|
||||
@ -325,11 +339,11 @@ struct ReplaceRegexpImpl
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
|
||||
size_t hs_from = haystack_offsets[i - 1];
|
||||
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + hs_from);
|
||||
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - hs_from - 1);
|
||||
|
||||
size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0;
|
||||
size_t repl_from = replacement_offsets[i - 1];
|
||||
const char * repl_data = reinterpret_cast<const char *>(replacement_data.data() + repl_from);
|
||||
const size_t repl_length = static_cast<unsigned>(replacement_offsets[i] - repl_from - 1);
|
||||
std::string_view replacement(repl_data, repl_length);
|
||||
@ -364,19 +378,25 @@ struct ReplaceRegexpImpl
|
||||
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
size_t hs_from = i > 0 ? haystack_offsets[i - 1] : 0;
|
||||
size_t hs_from = haystack_offsets[i - 1];
|
||||
const char * hs_data = reinterpret_cast<const char *>(haystack_data.data() + hs_from);
|
||||
const size_t hs_length = static_cast<unsigned>(haystack_offsets[i] - hs_from - 1);
|
||||
|
||||
size_t ndl_from = i > 0 ? needle_offsets[i - 1] : 0;
|
||||
size_t ndl_from = needle_offsets[i - 1];
|
||||
const char * ndl_data = reinterpret_cast<const char *>(needle_data.data() + ndl_from);
|
||||
const size_t ndl_length = static_cast<unsigned>(needle_offsets[i] - ndl_from - 1);
|
||||
std::string_view needle(ndl_data, ndl_length);
|
||||
|
||||
if (needle.empty())
|
||||
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
||||
{
|
||||
res_data.insert(res_data.end(), hs_data, hs_data + hs_length);
|
||||
res_data.push_back(0);
|
||||
res_offsets[i] = res_offsets[i - 1] + hs_length + 1;
|
||||
res_offset = res_offsets[i];
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t repl_from = i > 0 ? replacement_offsets[i - 1] : 0;
|
||||
size_t repl_from = replacement_offsets[i - 1];
|
||||
const char * repl_data = reinterpret_cast<const char *>(replacement_data.data() + repl_from);
|
||||
const size_t repl_length = static_cast<unsigned>(replacement_offsets[i] - repl_from - 1);
|
||||
std::string_view replacement(repl_data, repl_length);
|
||||
@ -403,7 +423,21 @@ struct ReplaceRegexpImpl
|
||||
size_t input_rows_count)
|
||||
{
|
||||
if (needle.empty())
|
||||
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
|
||||
{
|
||||
chassert(input_rows_count == haystack_data.size() / n);
|
||||
/// Since ColumnFixedString does not have a zero byte at the end, while ColumnString does,
|
||||
/// we need to split haystack_data into strings of length n, add 1 zero byte to the end of each string
|
||||
/// and then copy to res_data, ref: ColumnString.h and ColumnFixedString.h
|
||||
res_data.reserve(haystack_data.size() + input_rows_count);
|
||||
res_offsets.resize(input_rows_count);
|
||||
for (size_t i = 0; i < input_rows_count; ++i)
|
||||
{
|
||||
res_data.insert(res_data.end(), haystack_data.begin() + i * n, haystack_data.begin() + (i + 1) * n);
|
||||
res_data.push_back(0);
|
||||
res_offsets[i] = res_offsets[i - 1] + n + 1;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
ColumnString::Offset res_offset = 0;
|
||||
res_data.reserve(haystack_data.size());
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <Parsers/ASTIdentifier_fwd.h>
|
||||
#include <Parsers/ASTInsertQuery.h>
|
||||
#include <Parsers/ASTSelectQuery.h>
|
||||
#include <Parsers/ASTSelectWithUnionQuery.h>
|
||||
|
||||
#include <Parsers/CommonParsers.h>
|
||||
@ -7,6 +8,7 @@
|
||||
#include <Parsers/ExpressionListParsers.h>
|
||||
#include <Parsers/ParserSelectWithUnionQuery.h>
|
||||
#include <Parsers/ParserWatchQuery.h>
|
||||
#include <Parsers/ParserWithElement.h>
|
||||
#include <Parsers/ParserInsertQuery.h>
|
||||
#include <Parsers/ParserSetQuery.h>
|
||||
#include <Parsers/InsertQuerySettingsPushDownVisitor.h>
|
||||
@ -58,10 +60,20 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
|
||||
ASTPtr settings_ast;
|
||||
ASTPtr partition_by_expr;
|
||||
ASTPtr compression;
|
||||
ASTPtr with_expression_list;
|
||||
|
||||
/// Insertion data
|
||||
const char * data = nullptr;
|
||||
|
||||
if (s_with.ignore(pos, expected))
|
||||
{
|
||||
if (!ParserList(std::make_unique<ParserWithElement>(), std::make_unique<ParserToken>(TokenType::Comma))
|
||||
.parse(pos, with_expression_list, expected))
|
||||
return false;
|
||||
if (with_expression_list->children.empty())
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Check for key words `INSERT INTO`. If it isn't found, the query can't be parsed as insert query.
|
||||
if (!s_insert_into.ignore(pos, expected))
|
||||
return false;
|
||||
@ -162,7 +174,7 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
|
||||
|
||||
tryGetIdentifierNameInto(format, format_str);
|
||||
}
|
||||
else if (s_select.ignore(pos, expected) || s_with.ignore(pos,expected))
|
||||
else if (s_select.ignore(pos, expected) || s_with.ignore(pos, expected))
|
||||
{
|
||||
/// If SELECT is defined, return to position before select and parse
|
||||
/// rest of query as SELECT query.
|
||||
@ -170,6 +182,19 @@ bool ParserInsertQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
|
||||
ParserSelectWithUnionQuery select_p;
|
||||
select_p.parse(pos, select, expected);
|
||||
|
||||
if (with_expression_list)
|
||||
{
|
||||
const auto & children = select->as<ASTSelectWithUnionQuery>()->list_of_selects->children;
|
||||
for (const auto & child : children)
|
||||
{
|
||||
if (child->as<ASTSelectQuery>()->getExpression(ASTSelectQuery::Expression::WITH, false))
|
||||
throw Exception(ErrorCodes::SYNTAX_ERROR,
|
||||
"Only one WITH should be presented, either before INSERT or SELECT.");
|
||||
child->as<ASTSelectQuery>()->setExpression(ASTSelectQuery::Expression::WITH,
|
||||
std::move(with_expression_list));
|
||||
}
|
||||
}
|
||||
|
||||
/// FORMAT section is expected if we have input() in SELECT part
|
||||
if (s_format.ignore(pos, expected) && !name_p.parse(pos, format, expected))
|
||||
return false;
|
||||
|
@ -38,6 +38,7 @@ namespace DB
|
||||
{
|
||||
namespace Setting
|
||||
{
|
||||
extern const SettingsBool enable_named_columns_in_function_tuple;
|
||||
extern const SettingsBool transform_null_in;
|
||||
}
|
||||
|
||||
@ -182,6 +183,33 @@ public:
|
||||
break;
|
||||
}
|
||||
|
||||
if (planner_context.getQueryContext()->getSettingsRef()[Setting::enable_named_columns_in_function_tuple])
|
||||
{
|
||||
/// Function "tuple" which generates named tuple should use argument aliases to construct its name.
|
||||
if (function_node.getFunctionName() == "tuple")
|
||||
{
|
||||
if (const DataTypeTuple * type_tuple = typeid_cast<const DataTypeTuple *>(function_node.getResultType().get()))
|
||||
{
|
||||
if (type_tuple->haveExplicitNames())
|
||||
{
|
||||
const auto & names = type_tuple->getElementNames();
|
||||
size_t size = names.size();
|
||||
WriteBufferFromOwnString s;
|
||||
s << "tuple(";
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
if (i != 0)
|
||||
s << ", ";
|
||||
s << backQuoteIfNeed(names[i]);
|
||||
}
|
||||
s << ")";
|
||||
result = s.str();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
String in_function_second_argument_node_name;
|
||||
|
||||
if (isNameOfInFunction(function_node.getFunctionName()))
|
||||
|
@ -369,6 +369,7 @@ template <typename TColumn>
|
||||
void ParquetLeafColReader<TColumn>::readPage()
|
||||
{
|
||||
// refer to: ColumnReaderImplBase::ReadNewPage in column_reader.cc
|
||||
// this is where decompression happens
|
||||
auto cur_page = parquet_page_reader->NextPage();
|
||||
switch (cur_page->type())
|
||||
{
|
||||
@ -408,46 +409,13 @@ void ParquetLeafColReader<TColumn>::readPage()
|
||||
}
|
||||
|
||||
template <typename TColumn>
|
||||
void ParquetLeafColReader<TColumn>::readPageV1(const parquet::DataPageV1 & page)
|
||||
void ParquetLeafColReader<TColumn>::initDataReader(
|
||||
parquet::Encoding::type enconding_type,
|
||||
const uint8_t * buffer,
|
||||
std::size_t max_size,
|
||||
std::unique_ptr<RleValuesReader> && def_level_reader)
|
||||
{
|
||||
static parquet::LevelDecoder repetition_level_decoder;
|
||||
|
||||
cur_page_values = page.num_values();
|
||||
|
||||
// refer to: VectorizedColumnReader::readPageV1 in Spark and LevelDecoder::SetData in column_reader.cc
|
||||
if (page.definition_level_encoding() != parquet::Encoding::RLE && col_descriptor.max_definition_level() != 0)
|
||||
{
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unsupported encoding: {}", page.definition_level_encoding());
|
||||
}
|
||||
const auto * buffer = page.data();
|
||||
auto max_size = page.size();
|
||||
|
||||
if (col_descriptor.max_repetition_level() > 0)
|
||||
{
|
||||
auto rep_levels_bytes = repetition_level_decoder.SetData(
|
||||
page.repetition_level_encoding(), col_descriptor.max_repetition_level(), 0, buffer, max_size);
|
||||
buffer += rep_levels_bytes;
|
||||
max_size -= rep_levels_bytes;
|
||||
}
|
||||
|
||||
assert(col_descriptor.max_definition_level() >= 0);
|
||||
std::unique_ptr<RleValuesReader> def_level_reader;
|
||||
if (col_descriptor.max_definition_level() > 0)
|
||||
{
|
||||
auto bit_width = arrow::bit_util::Log2(col_descriptor.max_definition_level() + 1);
|
||||
auto num_bytes = ::arrow::util::SafeLoadAs<int32_t>(buffer);
|
||||
auto bit_reader = std::make_unique<arrow::bit_util::BitReader>(buffer + 4, num_bytes);
|
||||
num_bytes += 4;
|
||||
buffer += num_bytes;
|
||||
max_size -= num_bytes;
|
||||
def_level_reader = std::make_unique<RleValuesReader>(std::move(bit_reader), bit_width);
|
||||
}
|
||||
else
|
||||
{
|
||||
def_level_reader = std::make_unique<RleValuesReader>(page.num_values());
|
||||
}
|
||||
|
||||
switch (page.encoding())
|
||||
switch (enconding_type)
|
||||
{
|
||||
case parquet::Encoding::PLAIN:
|
||||
{
|
||||
@ -488,17 +456,144 @@ void ParquetLeafColReader<TColumn>::readPageV1(const parquet::DataPageV1 & page)
|
||||
case parquet::Encoding::DELTA_BINARY_PACKED:
|
||||
case parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY:
|
||||
case parquet::Encoding::DELTA_BYTE_ARRAY:
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unsupported encoding: {}", page.encoding());
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unsupported encoding: {}", enconding_type);
|
||||
|
||||
default:
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unknown encoding type: {}", page.encoding());
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unknown encoding type: {}", enconding_type);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TColumn>
|
||||
void ParquetLeafColReader<TColumn>::readPageV2(const parquet::DataPageV2 & /*page*/)
|
||||
void ParquetLeafColReader<TColumn>::readPageV1(const parquet::DataPageV1 & page)
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "read page V2 is not implemented yet");
|
||||
cur_page_values = page.num_values();
|
||||
|
||||
// refer to: VectorizedColumnReader::readPageV1 in Spark and LevelDecoder::SetData in column_reader.cc
|
||||
if (page.definition_level_encoding() != parquet::Encoding::RLE && col_descriptor.max_definition_level() != 0)
|
||||
{
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Unsupported encoding: {}", page.definition_level_encoding());
|
||||
}
|
||||
|
||||
const auto * buffer = page.data();
|
||||
auto max_size = static_cast<std::size_t>(page.size());
|
||||
|
||||
if (col_descriptor.max_repetition_level() > 0)
|
||||
{
|
||||
if (max_size < sizeof(int32_t))
|
||||
{
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Not enough bytes in parquet page buffer, corrupt?");
|
||||
}
|
||||
|
||||
auto num_bytes = ::arrow::util::SafeLoadAs<int32_t>(buffer);
|
||||
|
||||
if (num_bytes < 0)
|
||||
{
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Number of bytes for dl is negative, corrupt?");
|
||||
}
|
||||
|
||||
if (num_bytes + 4u > max_size)
|
||||
{
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Not enough bytes in parquet page buffer, corrupt?");
|
||||
}
|
||||
|
||||
// not constructing level reader because we are not using it atm
|
||||
num_bytes += 4;
|
||||
buffer += num_bytes;
|
||||
max_size -= num_bytes;
|
||||
}
|
||||
|
||||
assert(col_descriptor.max_definition_level() >= 0);
|
||||
std::unique_ptr<RleValuesReader> def_level_reader;
|
||||
if (col_descriptor.max_definition_level() > 0)
|
||||
{
|
||||
auto bit_width = arrow::bit_util::Log2(col_descriptor.max_definition_level() + 1);
|
||||
|
||||
if (max_size < sizeof(int32_t))
|
||||
{
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Not enough bytes in parquet page buffer, corrupt?");
|
||||
}
|
||||
|
||||
auto num_bytes = ::arrow::util::SafeLoadAs<int32_t>(buffer);
|
||||
|
||||
if (num_bytes < 0)
|
||||
{
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Number of bytes for dl is negative, corrupt?");
|
||||
}
|
||||
|
||||
if (num_bytes + 4u > max_size)
|
||||
{
|
||||
throw Exception(ErrorCodes::PARQUET_EXCEPTION, "Not enough bytes in parquet page buffer, corrupt?");
|
||||
}
|
||||
|
||||
auto bit_reader = std::make_unique<arrow::bit_util::BitReader>(buffer + 4, num_bytes);
|
||||
num_bytes += 4;
|
||||
buffer += num_bytes;
|
||||
max_size -= num_bytes;
|
||||
def_level_reader = std::make_unique<RleValuesReader>(std::move(bit_reader), bit_width);
|
||||
}
|
||||
else
|
||||
{
|
||||
def_level_reader = std::make_unique<RleValuesReader>(page.num_values());
|
||||
}
|
||||
|
||||
initDataReader(page.encoding(), buffer, max_size, std::move(def_level_reader));
|
||||
}
|
||||
|
||||
/*
|
||||
* As far as I understand, the difference between page v1 and page v2 lies primarily on the below:
|
||||
* 1. repetition and definition levels are not compressed;
|
||||
* 2. size of repetition and definition levels is present in the header;
|
||||
* 3. the encoding is always RLE
|
||||
*
|
||||
* Therefore, this method leverages the existing `parquet::LevelDecoder::SetDataV2` method to build the repetition level decoder.
|
||||
* The data buffer is "offset-ed" by rl bytes length and then dl decoder is built using RLE decoder. Since dl bytes length was present in the header,
|
||||
* there is no need to read it and apply an offset like in page v1.
|
||||
* */
|
||||
template <typename TColumn>
|
||||
void ParquetLeafColReader<TColumn>::readPageV2(const parquet::DataPageV2 & page)
|
||||
{
|
||||
cur_page_values = page.num_values();
|
||||
|
||||
const auto * buffer = page.data();
|
||||
|
||||
if (page.repetition_levels_byte_length() < 0 || page.definition_levels_byte_length() < 0)
|
||||
{
|
||||
throw Exception(
|
||||
ErrorCodes::PARQUET_EXCEPTION, "Either RL or DL is negative, this should not happen. Most likely corrupt file or parsing issue");
|
||||
}
|
||||
|
||||
const int64_t total_levels_length =
|
||||
static_cast<int64_t>(page.repetition_levels_byte_length()) +
|
||||
page.definition_levels_byte_length();
|
||||
|
||||
if (total_levels_length > page.size())
|
||||
{
|
||||
throw Exception(
|
||||
ErrorCodes::BAD_ARGUMENTS, "Data page too small for levels (corrupt header?)");
|
||||
}
|
||||
|
||||
// ARROW-17453: Even if max_rep_level_ is 0, there may still be
|
||||
// repetition level bytes written and/or reported in the header by
|
||||
// some writers (e.g. Athena)
|
||||
buffer += page.repetition_levels_byte_length();
|
||||
|
||||
assert(col_descriptor.max_definition_level() >= 0);
|
||||
std::unique_ptr<RleValuesReader> def_level_reader;
|
||||
if (col_descriptor.max_definition_level() > 0)
|
||||
{
|
||||
auto bit_width = arrow::bit_util::Log2(col_descriptor.max_definition_level() + 1);
|
||||
auto num_bytes = page.definition_levels_byte_length();
|
||||
auto bit_reader = std::make_unique<arrow::bit_util::BitReader>(buffer, num_bytes);
|
||||
def_level_reader = std::make_unique<RleValuesReader>(std::move(bit_reader), bit_width);
|
||||
}
|
||||
else
|
||||
{
|
||||
def_level_reader = std::make_unique<RleValuesReader>(page.num_values());
|
||||
}
|
||||
|
||||
buffer += page.definition_levels_byte_length();
|
||||
|
||||
initDataReader(page.encoding(), buffer, page.size() - total_levels_length, std::move(def_level_reader));
|
||||
}
|
||||
|
||||
template <typename TColumn>
|
||||
|
@ -54,6 +54,10 @@ private:
|
||||
void readPage();
|
||||
void readPageV1(const parquet::DataPageV1 & page);
|
||||
void readPageV2(const parquet::DataPageV2 & page);
|
||||
void initDataReader(parquet::Encoding::type enconding_type,
|
||||
const uint8_t * buffer,
|
||||
std::size_t max_size,
|
||||
std::unique_ptr<RleValuesReader> && def_level_reader);
|
||||
|
||||
std::unique_ptr<ParquetDataValuesReader> createDictReader(
|
||||
std::unique_ptr<RleValuesReader> def_level_reader, std::unique_ptr<RleValuesReader> rle_data_reader);
|
||||
|
@ -159,6 +159,14 @@ bool IndicesDescription::has(const String & name) const
|
||||
return false;
|
||||
}
|
||||
|
||||
bool IndicesDescription::hasType(const String & type) const
|
||||
{
|
||||
for (const auto & index : *this)
|
||||
if (index.type == type)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
String IndicesDescription::toString() const
|
||||
{
|
||||
if (empty())
|
||||
|
@ -65,6 +65,8 @@ struct IndicesDescription : public std::vector<IndexDescription>, IHints<>
|
||||
{
|
||||
/// Index with name exists
|
||||
bool has(const String & name) const;
|
||||
/// Index with type exists
|
||||
bool hasType(const String & type) const;
|
||||
/// Convert description to string
|
||||
String toString() const;
|
||||
/// Parse description from string
|
||||
|
@ -261,6 +261,7 @@ namespace ErrorCodes
|
||||
extern const int SUPPORT_IS_DISABLED;
|
||||
extern const int TOO_MANY_SIMULTANEOUS_QUERIES;
|
||||
extern const int INCORRECT_QUERY;
|
||||
extern const int INVALID_SETTING_VALUE;
|
||||
extern const int CANNOT_RESTORE_TABLE;
|
||||
extern const int ZERO_COPY_REPLICATION_ERROR;
|
||||
extern const int NOT_INITIALIZED;
|
||||
@ -759,6 +760,16 @@ void MergeTreeData::checkProperties(
|
||||
}
|
||||
}
|
||||
|
||||
/// If adaptive index granularity is disabled, certain vector search queries with PREWHERE run into LOGICAL_ERRORs.
|
||||
/// SET allow_experimental_vector_similarity_index = 1;
|
||||
/// CREATE TABLE tab (`id` Int32, `vec` Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 100000000) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0;
|
||||
/// INSERT INTO tab SELECT number, [toFloat32(number), 0.] FROM numbers(10000);
|
||||
/// WITH [1., 0.] AS reference_vec SELECT id, L2Distance(vec, reference_vec) FROM tab PREWHERE toLowCardinality(10) ORDER BY L2Distance(vec, reference_vec) ASC LIMIT 100;
|
||||
/// As a workaround, force enabled adaptive index granularity for now (it is the default anyways).
|
||||
if (new_metadata.secondary_indices.hasType("vector_similarity") && (*getSettings())[MergeTreeSetting::index_granularity_bytes] == 0)
|
||||
throw Exception(ErrorCodes::INVALID_SETTING_VALUE,
|
||||
"Experimental vector similarity index can only be used with MergeTree setting 'index_granularity_bytes' != 0");
|
||||
|
||||
if (!new_metadata.projections.empty())
|
||||
{
|
||||
std::unordered_set<String> projections_names;
|
||||
@ -3310,6 +3321,16 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
|
||||
"Experimental vector similarity index is disabled (turn on setting 'allow_experimental_vector_similarity_index')");
|
||||
|
||||
/// If adaptive index granularity is disabled, certain vector search queries with PREWHERE run into LOGICAL_ERRORs.
|
||||
/// SET allow_experimental_vector_similarity_index = 1;
|
||||
/// CREATE TABLE tab (`id` Int32, `vec` Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 100000000) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0;
|
||||
/// INSERT INTO tab SELECT number, [toFloat32(number), 0.] FROM numbers(10000);
|
||||
/// WITH [1., 0.] AS reference_vec SELECT id, L2Distance(vec, reference_vec) FROM tab PREWHERE toLowCardinality(10) ORDER BY L2Distance(vec, reference_vec) ASC LIMIT 100;
|
||||
/// As a workaround, force enabled adaptive index granularity for now (it is the default anyways).
|
||||
if (AlterCommands::hasVectorSimilarityIndex(new_metadata) && (*getSettings())[MergeTreeSetting::index_granularity_bytes] == 0)
|
||||
throw Exception(ErrorCodes::INVALID_SETTING_VALUE,
|
||||
"Experimental vector similarity index can only be used with MergeTree setting 'index_granularity_bytes' != 0");
|
||||
|
||||
for (const auto & disk : getDisks())
|
||||
if (!disk->supportsHardLinks() && !commands.isSettingsAlter() && !commands.isCommentAlter())
|
||||
throw Exception(
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include <Common/typeid_cast.h>
|
||||
#include <Core/Field.h>
|
||||
#include <Core/ServerSettings.h>
|
||||
#include <Core/Settings.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
@ -45,6 +46,11 @@ namespace ErrorCodes
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
namespace Setting
|
||||
{
|
||||
extern const SettingsUInt64 hnsw_candidate_list_size_for_search;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
@ -104,7 +110,7 @@ USearchIndexWithSerialization::USearchIndexWithSerialization(
|
||||
{
|
||||
USearchIndex::metric_t metric(dimensions, metric_kind, scalar_kind);
|
||||
|
||||
unum::usearch::index_dense_config_t config(usearch_hnsw_params.m, usearch_hnsw_params.ef_construction, usearch_hnsw_params.ef_search);
|
||||
unum::usearch::index_dense_config_t config(usearch_hnsw_params.connectivity, usearch_hnsw_params.expansion_add, unum::usearch::default_expansion_search());
|
||||
config.enable_key_lookups = false; /// we don't do row-to-vector lookups
|
||||
|
||||
auto result = USearchIndex::make(metric, config);
|
||||
@ -399,6 +405,7 @@ MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity
|
||||
ContextPtr context)
|
||||
: vector_similarity_condition(query, context)
|
||||
, metric_kind(metric_kind_)
|
||||
, expansion_search(context->getSettingsRef()[Setting::hnsw_candidate_list_size_for_search])
|
||||
{
|
||||
}
|
||||
|
||||
@ -430,13 +437,17 @@ std::vector<UInt64> MergeTreeIndexConditionVectorSimilarity::calculateApproximat
|
||||
const USearchIndexWithSerializationPtr index = granule->index;
|
||||
|
||||
if (vector_similarity_condition.getDimensions() != index->dimensions())
|
||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) "
|
||||
"does not match the dimension in the index ({})",
|
||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) does not match the dimension in the index ({})",
|
||||
vector_similarity_condition.getDimensions(), index->dimensions());
|
||||
|
||||
const std::vector<Float64> reference_vector = vector_similarity_condition.getReferenceVector();
|
||||
|
||||
auto search_result = index->search(reference_vector.data(), limit);
|
||||
/// We want to run the search with the user-provided value for setting hnsw_candidate_list_size_for_search (aka. expansion_search).
|
||||
/// The way to do this in USearch is to call index_dense_gt::change_expansion_search. Unfortunately, this introduces a need to
|
||||
/// synchronize index access, see https://github.com/unum-cloud/usearch/issues/500. As a workaround, we extended USearch' search method
|
||||
/// to accept a custom expansion_add setting. The config value is only used on the fly, i.e. not persisted in the index.
|
||||
|
||||
auto search_result = index->search(reference_vector.data(), limit, USearchIndex::any_thread(), false, (expansion_search == 0) ? unum::usearch::default_expansion_search() : expansion_search);
|
||||
if (!search_result)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", String(search_result.error.release()));
|
||||
|
||||
@ -501,13 +512,12 @@ MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index)
|
||||
UsearchHnswParams usearch_hnsw_params;
|
||||
|
||||
/// Optional parameters:
|
||||
const bool has_six_args = (index.arguments.size() == 6);
|
||||
if (has_six_args)
|
||||
const bool has_five_args = (index.arguments.size() == 5);
|
||||
if (has_five_args)
|
||||
{
|
||||
scalar_kind = quantizationToScalarKind.at(index.arguments[2].safeGet<String>());
|
||||
usearch_hnsw_params = {.m = index.arguments[3].safeGet<UInt64>(),
|
||||
.ef_construction = index.arguments[4].safeGet<UInt64>(),
|
||||
.ef_search = index.arguments[5].safeGet<UInt64>()};
|
||||
usearch_hnsw_params = {.connectivity = index.arguments[3].safeGet<UInt64>(),
|
||||
.expansion_add = index.arguments[4].safeGet<UInt64>()};
|
||||
}
|
||||
|
||||
return std::make_shared<MergeTreeIndexVectorSimilarity>(index, metric_kind, scalar_kind, usearch_hnsw_params);
|
||||
@ -516,25 +526,23 @@ MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index)
void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* attach */)
{
const bool has_two_args = (index.arguments.size() == 2);
const bool has_six_args = (index.arguments.size() == 6);
const bool has_five_args = (index.arguments.size() == 5);

/// Check number and type of arguments
if (!has_two_args && !has_six_args)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index must have two or six arguments");
if (!has_two_args && !has_five_args)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index must have two or five arguments");
if (index.arguments[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of vector similarity index (method) must be of type String");
if (index.arguments[1].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of vector similarity index (metric) must be of type String");
if (has_six_args)
if (has_five_args)
{
if (index.arguments[2].getType() != Field::Types::String)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Third argument of vector similarity index (quantization) must be of type String");
if (index.arguments[3].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Fourth argument of vector similarity index (M) must be of type UInt64");
throw Exception(ErrorCodes::INCORRECT_QUERY, "Fourth argument of vector similarity index (hnsw_max_connections_per_layer) must be of type UInt64");
if (index.arguments[4].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Fifth argument of vector similarity index (ef_construction) must be of type UInt64");
if (index.arguments[5].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Sixth argument of vector similarity index (ef_search) must be of type UInt64");
throw Exception(ErrorCodes::INCORRECT_QUERY, "Fifth argument of vector similarity index (hnsw_candidate_list_size_for_construction) must be of type UInt64");
}

/// Check that passed arguments are supported
@ -542,18 +550,17 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta
throw Exception(ErrorCodes::INCORRECT_DATA, "First argument (method) of vector similarity index is not supported. Supported methods are: {}", joinByComma(methods));
if (!distanceFunctionToMetricKind.contains(index.arguments[1].safeGet<String>()))
throw Exception(ErrorCodes::INCORRECT_DATA, "Second argument (distance function) of vector similarity index is not supported. Supported distance function are: {}", joinByComma(distanceFunctionToMetricKind));
if (has_six_args)
if (has_five_args)
{
if (!quantizationToScalarKind.contains(index.arguments[2].safeGet<String>()))
throw Exception(ErrorCodes::INCORRECT_DATA, "Third argument (quantization) of vector similarity index is not supported. Supported quantizations are: {}", joinByComma(quantizationToScalarKind));

/// Call Usearch's own parameter validation method for HNSW-specific parameters
UInt64 m = index.arguments[3].safeGet<UInt64>();
UInt64 ef_construction = index.arguments[4].safeGet<UInt64>();
UInt64 ef_search = index.arguments[5].safeGet<UInt64>();

unum::usearch::index_dense_config_t config(m, ef_construction, ef_search);
UInt64 connectivity = index.arguments[3].safeGet<UInt64>();
UInt64 expansion_add = index.arguments[4].safeGet<UInt64>();
UInt64 expansion_search = unum::usearch::default_expansion_search();

unum::usearch::index_dense_config_t config(connectivity, expansion_add, expansion_search);
if (auto error = config.validate(); error)
throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid parameters passed to vector similarity index. Error: {}", String(error.release()));
}
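With this validator, the previously accepted six-argument form is rejected at CREATE time. A minimal illustration (the error matches the exception thrown above, values are arbitrary):

```sql
-- the trailing ef_search argument is no longer accepted as an index parameter
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 16, 128, 64))
ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
```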
@ -13,9 +13,8 @@ namespace DB

struct UsearchHnswParams
{
size_t m = unum::usearch::default_connectivity();
size_t ef_construction = unum::usearch::default_expansion_add();
size_t ef_search = unum::usearch::default_expansion_search();
size_t connectivity = unum::usearch::default_connectivity();
size_t expansion_add = unum::usearch::default_expansion_add();
};

using USearchIndex = unum::usearch::index_dense_t;
@ -142,6 +141,7 @@ public:
private:
const VectorSimilarityCondition vector_similarity_condition;
const unum::usearch::metric_kind_t metric_kind;
const size_t expansion_search;
};

@ -361,7 +361,7 @@ ObjectStorageQueueSource::ObjectStorageQueueSource(
|
||||
ObjectStoragePtr object_storage_,
|
||||
const ReadFromFormatInfo & read_from_format_info_,
|
||||
const std::optional<FormatSettings> & format_settings_,
|
||||
const ObjectStorageQueueSettings & queue_settings_,
|
||||
const CommitSettings & commit_settings_,
|
||||
std::shared_ptr<ObjectStorageQueueMetadata> files_metadata_,
|
||||
ContextPtr context_,
|
||||
size_t max_block_size_,
|
||||
@ -380,7 +380,7 @@ ObjectStorageQueueSource::ObjectStorageQueueSource(
|
||||
, object_storage(object_storage_)
|
||||
, read_from_format_info(read_from_format_info_)
|
||||
, format_settings(format_settings_)
|
||||
, queue_settings(queue_settings_)
|
||||
, commit_settings(commit_settings_)
|
||||
, files_metadata(files_metadata_)
|
||||
, max_block_size(max_block_size_)
|
||||
, shutdown_called(shutdown_called_)
|
||||
@ -565,8 +565,8 @@ Chunk ObjectStorageQueueSource::generateImpl()
|
||||
processed_rows_from_file = 0;
|
||||
processed_files.push_back(file_metadata);
|
||||
|
||||
if (queue_settings.max_processed_files_before_commit
|
||||
&& processed_files.size() == queue_settings.max_processed_files_before_commit)
|
||||
if (commit_settings.max_processed_files_before_commit
|
||||
&& processed_files.size() == commit_settings.max_processed_files_before_commit)
|
||||
{
|
||||
LOG_TRACE(log, "Number of max processed files before commit reached "
|
||||
"(rows: {}, bytes: {}, files: {})",
|
||||
@ -574,15 +574,15 @@ Chunk ObjectStorageQueueSource::generateImpl()
|
||||
break;
|
||||
}
|
||||
|
||||
if (queue_settings.max_processed_rows_before_commit
|
||||
&& total_processed_rows == queue_settings.max_processed_rows_before_commit)
|
||||
if (commit_settings.max_processed_rows_before_commit
|
||||
&& total_processed_rows == commit_settings.max_processed_rows_before_commit)
|
||||
{
|
||||
LOG_TRACE(log, "Number of max processed rows before commit reached "
|
||||
"(rows: {}, bytes: {}, files: {})",
|
||||
total_processed_rows, total_processed_bytes, processed_files.size());
|
||||
break;
|
||||
}
|
||||
if (queue_settings.max_processed_bytes_before_commit && total_processed_bytes == queue_settings.max_processed_bytes_before_commit)
|
||||
if (commit_settings.max_processed_bytes_before_commit && total_processed_bytes == commit_settings.max_processed_bytes_before_commit)
|
||||
{
|
||||
LOG_TRACE(
|
||||
log,
|
||||
@ -593,8 +593,8 @@ Chunk ObjectStorageQueueSource::generateImpl()
|
||||
processed_files.size());
|
||||
break;
|
||||
}
|
||||
if (queue_settings.max_processing_time_sec_before_commit
|
||||
&& total_stopwatch.elapsedSeconds() >= queue_settings.max_processing_time_sec_before_commit)
|
||||
if (commit_settings.max_processing_time_sec_before_commit
|
||||
&& total_stopwatch.elapsedSeconds() >= commit_settings.max_processing_time_sec_before_commit)
|
||||
{
|
||||
LOG_TRACE(
|
||||
log,
|
||||
@ -648,15 +648,9 @@ void ObjectStorageQueueSource::commit(bool success, const std::string & exceptio
|
||||
|
||||
void ObjectStorageQueueSource::applyActionAfterProcessing(const String & path)
|
||||
{
|
||||
switch (queue_settings.after_processing.value)
|
||||
if (files_metadata->getTableMetadata().after_processing == "delete")
|
||||
{
|
||||
case ObjectStorageQueueAction::DELETE:
|
||||
{
|
||||
object_storage->removeObject(StoredObject(path));
|
||||
break;
|
||||
}
|
||||
case ObjectStorageQueueAction::KEEP:
|
||||
break;
|
||||
object_storage->removeObject(StoredObject(path));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -93,6 +93,14 @@ public:
|
||||
bool hasKeysForProcessor(const Processor & processor) const;
|
||||
};
|
||||
|
||||
struct CommitSettings
|
||||
{
|
||||
size_t max_processed_files_before_commit;
|
||||
size_t max_processed_rows_before_commit;
|
||||
size_t max_processed_bytes_before_commit;
|
||||
size_t max_processing_time_sec_before_commit;
|
||||
};
|
||||
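These commit thresholds are plain table-level settings of the queue engines. A rough sketch of how they would be set on an S3Queue table (the URL, path, and values are illustrative, not taken from this change):

```sql
CREATE TABLE s3_queue_example (data String)
ENGINE = S3Queue('https://example-bucket.s3.amazonaws.com/data/*', 'CSVWithNames')
SETTINGS
    mode = 'ordered',
    keeper_path = '/clickhouse/s3queue/example',
    -- commit to Keeper after whichever threshold is hit first
    max_processed_files_before_commit = 100,
    max_processed_rows_before_commit = 1000000,
    max_processing_time_sec_before_commit = 60;
```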
|
||||
ObjectStorageQueueSource(
|
||||
String name_,
|
||||
size_t processor_id_,
|
||||
@ -101,7 +109,7 @@ public:
|
||||
ObjectStoragePtr object_storage_,
|
||||
const ReadFromFormatInfo & read_from_format_info_,
|
||||
const std::optional<FormatSettings> & format_settings_,
|
||||
const ObjectStorageQueueSettings & queue_settings_,
|
||||
const CommitSettings & commit_settings_,
|
||||
std::shared_ptr<ObjectStorageQueueMetadata> files_metadata_,
|
||||
ContextPtr context_,
|
||||
size_t max_block_size_,
|
||||
@ -130,7 +138,7 @@ private:
|
||||
const ObjectStoragePtr object_storage;
|
||||
ReadFromFormatInfo read_from_format_info;
|
||||
const std::optional<FormatSettings> format_settings;
|
||||
const ObjectStorageQueueSettings queue_settings;
|
||||
const CommitSettings commit_settings;
|
||||
const std::shared_ptr<ObjectStorageQueueMetadata> files_metadata;
|
||||
const size_t max_block_size;
|
||||
|
||||
|
@ -95,20 +95,20 @@ namespace
|
||||
std::shared_ptr<ObjectStorageQueueLog> getQueueLog(
|
||||
const ObjectStoragePtr & storage,
|
||||
const ContextPtr & context,
|
||||
const ObjectStorageQueueSettings & table_settings)
|
||||
bool enable_logging_to_queue_log)
|
||||
{
|
||||
const auto & settings = context->getSettingsRef();
|
||||
switch (storage->getType())
|
||||
{
|
||||
case DB::ObjectStorageType::S3:
|
||||
{
|
||||
if (table_settings.enable_logging_to_queue_log || settings[Setting::s3queue_enable_logging_to_s3queue_log])
|
||||
if (enable_logging_to_queue_log || settings[Setting::s3queue_enable_logging_to_s3queue_log])
|
||||
return context->getS3QueueLog();
|
||||
return nullptr;
|
||||
}
|
||||
case DB::ObjectStorageType::Azure:
|
||||
{
|
||||
if (table_settings.enable_logging_to_queue_log)
|
||||
if (enable_logging_to_queue_log)
|
||||
return context->getAzureQueueLog();
|
||||
return nullptr;
|
||||
}
|
||||
@ -131,11 +131,20 @@ StorageObjectStorageQueue::StorageObjectStorageQueue(
|
||||
LoadingStrictnessLevel mode)
|
||||
: IStorage(table_id_)
|
||||
, WithContext(context_)
|
||||
, queue_settings(std::move(queue_settings_))
|
||||
, zk_path(chooseZooKeeperPath(table_id_, context_->getSettingsRef(), *queue_settings))
|
||||
, zk_path(chooseZooKeeperPath(table_id_, context_->getSettingsRef(), *queue_settings_))
|
||||
, enable_logging_to_queue_log(queue_settings_->enable_logging_to_queue_log)
|
||||
, polling_min_timeout_ms(queue_settings_->polling_min_timeout_ms)
|
||||
, polling_max_timeout_ms(queue_settings_->polling_max_timeout_ms)
|
||||
, polling_backoff_ms(queue_settings_->polling_backoff_ms)
|
||||
, commit_settings(CommitSettings{
|
||||
.max_processed_files_before_commit = queue_settings_->max_processed_files_before_commit,
|
||||
.max_processed_rows_before_commit = queue_settings_->max_processed_rows_before_commit,
|
||||
.max_processed_bytes_before_commit = queue_settings_->max_processed_bytes_before_commit,
|
||||
.max_processing_time_sec_before_commit = queue_settings_->max_processing_time_sec_before_commit,
|
||||
})
|
||||
, configuration{configuration_}
|
||||
, format_settings(format_settings_)
|
||||
, reschedule_processing_interval_ms(queue_settings->polling_min_timeout_ms)
|
||||
, reschedule_processing_interval_ms(queue_settings_->polling_min_timeout_ms)
|
||||
, log(getLogger(fmt::format("Storage{}Queue ({})", configuration->getEngineName(), table_id_.getFullTableName())))
|
||||
{
|
||||
if (configuration->getPath().empty())
|
||||
@ -152,7 +161,7 @@ StorageObjectStorageQueue::StorageObjectStorageQueue(
|
||||
}
|
||||
|
||||
const bool is_attach = mode > LoadingStrictnessLevel::CREATE;
|
||||
validateSettings(*queue_settings, is_attach);
|
||||
validateSettings(*queue_settings_, is_attach);
|
||||
|
||||
object_storage = configuration->createObjectStorage(context_, /* is_readonly */true);
|
||||
FormatFactory::instance().checkFormatName(configuration->format);
|
||||
@ -173,10 +182,10 @@ StorageObjectStorageQueue::StorageObjectStorageQueue(
|
||||
LOG_INFO(log, "Using zookeeper path: {}", zk_path.string());
|
||||
|
||||
auto table_metadata = ObjectStorageQueueMetadata::syncWithKeeper(
|
||||
zk_path, *queue_settings, storage_metadata.getColumns(), configuration_->format, context_, is_attach, log);
|
||||
zk_path, *queue_settings_, storage_metadata.getColumns(), configuration_->format, context_, is_attach, log);
|
||||
|
||||
auto queue_metadata = std::make_unique<ObjectStorageQueueMetadata>(
|
||||
zk_path, std::move(table_metadata), queue_settings->cleanup_interval_min_ms, queue_settings->cleanup_interval_max_ms);
|
||||
zk_path, std::move(table_metadata), queue_settings_->cleanup_interval_min_ms, queue_settings_->cleanup_interval_max_ms);
|
||||
|
||||
files_metadata = ObjectStorageQueueMetadataFactory::instance().getOrCreate(zk_path, std::move(queue_metadata));
|
||||
|
||||
@ -317,7 +326,7 @@ void StorageObjectStorageQueue::read(
|
||||
void ReadFromObjectStorageQueue::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &)
|
||||
{
|
||||
Pipes pipes;
|
||||
const size_t adjusted_num_streams = storage->queue_settings->processing_threads_num;
|
||||
const size_t adjusted_num_streams = storage->getTableMetadata().processing_threads_num;
|
||||
|
||||
createIterator(nullptr);
|
||||
for (size_t i = 0; i < adjusted_num_streams; ++i)
|
||||
@ -351,9 +360,10 @@ std::shared_ptr<ObjectStorageQueueSource> StorageObjectStorageQueue::createSourc
|
||||
getName(), processor_id,
|
||||
file_iterator, configuration, object_storage,
|
||||
info, format_settings,
|
||||
*queue_settings, files_metadata,
|
||||
commit_settings,
|
||||
files_metadata,
|
||||
local_context, max_block_size, shutdown_called, table_is_being_dropped,
|
||||
getQueueLog(object_storage, local_context, *queue_settings),
|
||||
getQueueLog(object_storage, local_context, enable_logging_to_queue_log),
|
||||
getStorageID(), log, commit_once_processed);
|
||||
}
|
||||
|
||||
@ -400,12 +410,12 @@ void StorageObjectStorageQueue::threadFunc()
|
||||
if (streamToViews())
|
||||
{
|
||||
/// Reset the reschedule interval.
|
||||
reschedule_processing_interval_ms = queue_settings->polling_min_timeout_ms;
|
||||
reschedule_processing_interval_ms = polling_min_timeout_ms;
|
||||
}
|
||||
else
|
||||
{
|
||||
/// Increase the reschedule interval.
|
||||
reschedule_processing_interval_ms += queue_settings->polling_backoff_ms;
|
||||
reschedule_processing_interval_ms = std::min<size_t>(polling_max_timeout_ms, reschedule_processing_interval_ms + polling_backoff_ms);
|
||||
}
|
||||
|
||||
LOG_DEBUG(log, "Stopped streaming to {} attached views", dependencies_count);
|
||||
@ -446,6 +456,9 @@ bool StorageObjectStorageQueue::streamToViews()
|
||||
|
||||
auto file_iterator = createFileIterator(queue_context, nullptr);
|
||||
size_t total_rows = 0;
|
||||
const size_t processing_threads_num = getTableMetadata().processing_threads_num;
|
||||
|
||||
LOG_TEST(log, "Using {} processing threads", processing_threads_num);
|
||||
|
||||
while (!shutdown_called && !file_iterator->isFinished())
|
||||
{
|
||||
@ -466,10 +479,10 @@ bool StorageObjectStorageQueue::streamToViews()
|
||||
Pipes pipes;
|
||||
std::vector<std::shared_ptr<ObjectStorageQueueSource>> sources;
|
||||
|
||||
pipes.reserve(queue_settings->processing_threads_num);
|
||||
sources.reserve(queue_settings->processing_threads_num);
|
||||
pipes.reserve(processing_threads_num);
|
||||
sources.reserve(processing_threads_num);
|
||||
|
||||
for (size_t i = 0; i < queue_settings->processing_threads_num; ++i)
|
||||
for (size_t i = 0; i < processing_threads_num; ++i)
|
||||
{
|
||||
auto source = createSource(
|
||||
i/* processor_id */,
|
||||
@ -485,7 +498,7 @@ bool StorageObjectStorageQueue::streamToViews()
|
||||
auto pipe = Pipe::unitePipes(std::move(pipes));
|
||||
|
||||
block_io.pipeline.complete(std::move(pipe));
|
||||
block_io.pipeline.setNumThreads(queue_settings->processing_threads_num);
|
||||
block_io.pipeline.setNumThreads(processing_threads_num);
|
||||
block_io.pipeline.setConcurrencyControl(queue_context->getSettingsRef()[Setting::use_concurrency_control]);
|
||||
|
||||
std::atomic_size_t rows = 0;
|
||||
|
@ -54,9 +54,14 @@ public:
|
||||
private:
|
||||
friend class ReadFromObjectStorageQueue;
|
||||
using FileIterator = ObjectStorageQueueSource::FileIterator;
|
||||
using CommitSettings = ObjectStorageQueueSource::CommitSettings;
|
||||
|
||||
const std::unique_ptr<ObjectStorageQueueSettings> queue_settings;
|
||||
const fs::path zk_path;
|
||||
const bool enable_logging_to_queue_log;
|
||||
const size_t polling_min_timeout_ms;
|
||||
const size_t polling_max_timeout_ms;
|
||||
const size_t polling_backoff_ms;
|
||||
const CommitSettings commit_settings;
|
||||
|
||||
std::shared_ptr<ObjectStorageQueueMetadata> files_metadata;
|
||||
ConfigurationPtr configuration;
|
||||
@ -81,6 +86,7 @@ private:
|
||||
bool supportsSubcolumns() const override { return true; }
|
||||
bool supportsOptimizationToSubcolumns() const override { return false; }
|
||||
bool supportsDynamicSubcolumns() const override { return true; }
|
||||
const ObjectStorageQueueTableMetadata & getTableMetadata() const { return files_metadata->getTableMetadata(); }
|
||||
|
||||
std::shared_ptr<FileIterator> createFileIterator(ContextPtr local_context, const ActionsDAG::Node * predicate);
|
||||
std::shared_ptr<ObjectStorageQueueSource> createSource(
|
||||
|
@ -2046,3 +2046,50 @@ def test_bad_settings(started_cluster):
|
||||
assert False
|
||||
except Exception as e:
|
||||
assert "Ordered mode in cloud without either" in str(e)
|
||||
|
||||
|
||||
def test_processing_threads(started_cluster):
|
||||
node = started_cluster.instances["node1"]
|
||||
|
||||
table_name = f"test_processing_threads_{uuid.uuid4().hex[:8]}"
|
||||
dst_table_name = f"{table_name}_dst"
|
||||
# A unique path is necessary for repeatable tests
|
||||
keeper_path = f"/clickhouse/test_{table_name}"
|
||||
files_path = f"{table_name}_data"
|
||||
files_to_generate = 10
|
||||
|
||||
create_table(
|
||||
started_cluster,
|
||||
node,
|
||||
table_name,
|
||||
"ordered",
|
||||
files_path,
|
||||
additional_settings={
|
||||
"keeper_path": keeper_path,
|
||||
},
|
||||
)
|
||||
|
||||
assert '"processing_threads_num":16' in node.query(
|
||||
f"SELECT * FROM system.zookeeper WHERE path = '{keeper_path}'"
|
||||
)
|
||||
|
||||
total_values = generate_random_files(
|
||||
started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
|
||||
)
|
||||
|
||||
create_mv(node, table_name, dst_table_name)
|
||||
|
||||
def get_count():
|
||||
return int(node.query(f"SELECT count() FROM {dst_table_name}"))
|
||||
|
||||
expected_rows = 10
|
||||
for _ in range(20):
|
||||
if expected_rows == get_count():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
assert expected_rows == get_count()
|
||||
|
||||
assert node.contains_in_log(
|
||||
f"StorageS3Queue (default.{table_name}): Using 16 processing threads"
|
||||
)
|
||||
|
@ -40,3 +40,15 @@ foobar
|
||||
|
||||
1
|
||||
1
|
||||
32YCBjgZhV4AdCWHaCDNu
|
||||
foobar
|
||||
111
|
||||
bG7y
|
||||
bQZu
|
||||
bQbp
|
||||
\0\0\0
|
||||
f\0\0
|
||||
fo\0
|
||||
foo
|
||||
1
|
||||
1
|
||||
|
@ -1,15 +1,25 @@
|
||||
-- Tags: no-fasttest
|
||||
|
||||
SELECT base58Encode('Hold my beer...');
|
||||
|
||||
SELECT base58Encode('Hold my beer...', 'Second arg'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
|
||||
SELECT base58Decode('Hold my beer...'); -- { serverError BAD_ARGUMENTS }
|
||||
|
||||
SELECT base58Decode(encoded) FROM (SELECT base58Encode(val) as encoded FROM (select arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar', 'Hello world!']) val));
|
||||
SELECT tryBase58Decode(encoded) FROM (SELECT base58Encode(val) as encoded FROM (select arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar', 'Hello world!']) val));
|
||||
SELECT base58Decode(encoded) FROM (SELECT base58Encode(val) as encoded FROM (SELECT arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar', 'Hello world!']) val));
|
||||
SELECT tryBase58Decode(encoded) FROM (SELECT base58Encode(val) as encoded FROM (SELECT arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar', 'Hello world!']) val));
|
||||
SELECT tryBase58Decode(val) FROM (SELECT arrayJoin(['Hold my beer', 'Hold another beer', '3csAg9', 'And a wine', 'And another wine', 'And a lemonade', 't1Zv2yaZ', 'And another wine']) val);
|
||||
|
||||
SELECT base58Encode(val) FROM (select arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar']) val);
|
||||
SELECT base58Decode(val) FROM (select arrayJoin(['', '2m', '8o8', 'bQbp', '3csAg9', 'CZJRhmz', 't1Zv2yaZ', '']) val);
|
||||
SELECT base58Encode(val) FROM (SELECT arrayJoin(['', 'f', 'fo', 'foo', 'foob', 'fooba', 'foobar']) val);
|
||||
SELECT base58Decode(val) FROM (SELECT arrayJoin(['', '2m', '8o8', 'bQbp', '3csAg9', 'CZJRhmz', 't1Zv2yaZ', '']) val);
|
||||
|
||||
SELECT base58Encode(base58Decode('1BWutmTvYPwDtmw9abTkS4Ssr8no61spGAvW1X6NDix')) == '1BWutmTvYPwDtmw9abTkS4Ssr8no61spGAvW1X6NDix';
|
||||
select base58Encode('\x00\x0b\xe3\xe1\xeb\xa1\x7a\x47\x3f\x89\xb0\xf7\xe8\xe2\x49\x40\xf2\x0a\xeb\x8e\xbc\xa7\x1a\x88\xfd\xe9\x5d\x4b\x83\xb7\x1a\x09') == '1BWutmTvYPwDtmw9abTkS4Ssr8no61spGAvW1X6NDix';
|
||||
|
||||
SELECT base58Encode(toFixedString('Hold my beer...', 15));
|
||||
SELECT base58Decode(toFixedString('t1Zv2yaZ', 8));
|
||||
|
||||
SELECT base58Encode(val) FROM (SELECT arrayJoin([toFixedString('', 3), toFixedString('f', 3), toFixedString('fo', 3), toFixedString('foo', 3)]) val);
|
||||
SELECT base58Decode(val) FROM (SELECT arrayJoin([toFixedString('111', 3), toFixedString('bG7y', 4), toFixedString('bQZu', 4), toFixedString('bQbp', 4)]) val);
|
||||
|
||||
Select base58Encode(reinterpretAsFixedString(byteSwap(toUInt256('256')))) == '1111111111111111111111111111115R';
|
||||
Select base58Encode(reinterpretAsString(byteSwap(toUInt256('256')))) == '1111111111111111111111111111112'; -- { reinterpretAsString drops the last null byte hence, encoded value is different than the FixedString version above }
|
||||
|
@ -40,3 +40,4 @@ Expression (Projection)
Condition: true
Parts: 1/1
Granules: 4/4
index_granularity_bytes = 0 is disallowed

@ -37,7 +37,7 @@ DROP TABLE tab;

SELECT 'Correctness of index with > 1 mark';

CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity = 8192; -- disable adaptive granularity due to bug
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192;
INSERT INTO tab SELECT number, [toFloat32(number), 0.0] from numbers(10000);

WITH [1.0, 0.0] AS reference_vec
@ -56,7 +56,7 @@ DROP TABLE tab;

SELECT 'Issue #69085: Reference vector computed by a subquery';

CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f16', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);

-- works
@ -100,3 +100,20 @@ FROM tab
ORDER BY distance
LIMIT 1
SETTINGS enable_analyzer = 0;

DROP TABLE tab;

SELECT 'index_granularity_bytes = 0 is disallowed';

-- If adaptive index granularity is disabled, certain vector search queries with PREWHERE run into LOGICAL_ERRORs.
-- SET allow_experimental_vector_similarity_index = 1;
-- CREATE TABLE tab (`id` Int32, `vec` Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 100000000) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0;
-- INSERT INTO tab SELECT number, [toFloat32(number), 0.] FROM numbers(10000);
-- WITH [1., 0.] AS reference_vec SELECT id, L2Distance(vec, reference_vec) FROM tab PREWHERE toLowCardinality(10) ORDER BY L2Distance(vec, reference_vec) ASC LIMIT 100;
-- As a workaround, force enabled adaptive index granularity for now (it is the default anyways).
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0; -- { serverError INVALID_SETTING_VALUE }

CREATE TABLE tab(id Int32, vec Array(Float32)) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0;
ALTER TABLE tab ADD INDEX vec_idx1(vec) TYPE vector_similarity('hnsw', 'cosineDistance'); -- { serverError INVALID_SETTING_VALUE }

DROP TABLE tab;
@ -0,0 +1 @@
2
@ -0,0 +1,41 @@
-- Tags: no-fasttest, long, no-asan, no-asan, no-ubsan, no-debug
-- ^^ Disable test for slow builds: generating data takes time but a sufficiently large data set
-- is necessary for different hnsw_candidate_list_size_for_search settings to make a difference

-- Tests vector search with setting 'hnsw_candidate_list_size_for_search'

SET allow_experimental_vector_similarity_index = 1;
SET enable_analyzer = 0;

DROP TABLE IF EXISTS tab;

CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192;

-- Generate random values but with a fixed seed (conceptually), so that the data is deterministic.
-- Unfortunately, no random functions in ClickHouse accepts a seed. Instead, abuse the numbers table + hash functions to provide
-- deterministic randomness.
INSERT INTO tab SELECT number, [sipHash64(number)/18446744073709551615, wyHash64(number)/18446744073709551615] FROM numbers(370000); -- 18446744073709551615 is the biggest UInt64

DROP TABLE IF EXISTS results;
CREATE TABLE results(id Int32) ENGINE = Memory;

-- Standard vector search with default hnsw_candidate_list_size_for_search = 64
INSERT INTO results
SELECT id
FROM tab
ORDER BY L2Distance(vec, [0.5, 0.5])
LIMIT 1;

-- Vector search with custom hnsw_candidate_list_size_for_search
INSERT INTO results
SELECT id
FROM tab
ORDER BY L2Distance(vec, [0.5, 0.5])
LIMIT 1
SETTINGS hnsw_candidate_list_size_for_search = 1;

-- Expect that matches are different
SELECT count(distinct *) FROM results;

DROP TABLE results;
DROP TABLE tab;
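A quick way to confirm that such queries actually go through the vector similarity index rather than a brute-force scan is an EXPLAIN with index details; this is a generic ClickHouse facility, not something added by this change, so treat the exact output as indicative only:

```sql
EXPLAIN indexes = 1
SELECT id
FROM tab
ORDER BY L2Distance(vec, [0.5, 0.5])
LIMIT 1;
-- the skip-index section of the plan should list the 'idx' vector_similarity index
-- together with the number of granules it pruned
```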
@ -1,4 +1,4 @@
Two or six index arguments
Two or five index arguments
1st argument (method) must be String and hnsw
2nd argument (distance function) must be String and L2Distance or cosineDistance
3nd argument (quantization), if given, must be String and f32, f16, ...

@ -6,12 +6,12 @@ SET allow_experimental_vector_similarity_index = 1;

DROP TABLE IF EXISTS tab;

SELECT 'Two or six index arguments';
SELECT 'Two or five index arguments';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant_have_one_arg')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'three_args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'more', 'than', 'six', 'args', '!')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'more', 'than', 'five', 'args', '!')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }

SELECT '1st argument (method) must be String and hnsw';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity(3, 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
@ -22,11 +22,11 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'invalid_distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }

SELECT '3nd argument (quantization), if given, must be String and f32, f16, ...';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 1, 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'invalid', 2, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'invalid', 2, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
SELECT '4nd argument (M), if given, must be UInt64 and > 1';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 'invalid', 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }

SELECT 'Must be created on single column';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS }
@ -37,7 +37,7 @@ Expression (Projection)
Parts: 1/1
Granules: 2/4
Special cases
-- Non-default metric, M, ef_construction, ef_search
-- Non-default metric, hnsw_max_connections_per_layer, hnsw_candidate_list_size_for_construction
6 [1,9.3] 0.005731362878640178
4 [2.4,5.2] 0.09204062768384846
1 [2,3.2] 0.15200169244542905

@ -53,8 +53,8 @@ DROP TABLE tab;

SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen.

SELECT '-- Non-default metric, M, ef_construction, ef_search';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99, 66) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
SELECT '-- Non-default metric, hnsw_max_connections_per_layer, hnsw_candidate_list_size_for_construction';
CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);

WITH [0.0, 2.0] AS reference_vec
@ -82,11 +82,11 @@ SETTINGS max_limit_for_ann_queries = 2; -- LIMIT 3 > 2 --> don't use the ann ind
DROP TABLE tab;

SELECT '-- Non-default quantization';
CREATE TABLE tab_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f64', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f64', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
CREATE TABLE tab_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3;
INSERT INTO tab_f64 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
INSERT INTO tab_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
INSERT INTO tab_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]);
@ -134,15 +134,30 @@
|
||||
3 Hello World not_found x Hello World
|
||||
4 Hello World [eo] x Hxllo World
|
||||
5 Hello World . x xello World
|
||||
Check that whether an exception is thrown if the needle is empty
|
||||
- should not throw an exception if the needle is empty
|
||||
- non-const needle, const replacement
|
||||
Hexxo Worxd
|
||||
Hello World
|
||||
Hexlo World
|
||||
Hello World
|
||||
Hexxo Worxd
|
||||
Hello World
|
||||
Hexlo World
|
||||
Hello World
|
||||
- const needle, non-const replacement
|
||||
Hello World
|
||||
Hello World
|
||||
Hello World
|
||||
Hello World
|
||||
Hello World
|
||||
Hello World
|
||||
Hello World
|
||||
Hello World
|
||||
- non-const needle, non-const replacement
|
||||
Hexxo Worxd
|
||||
Hello World
|
||||
Hexlo World
|
||||
Hello World
|
||||
Hello World
|
||||
Hello World
|
||||
Hello World
|
||||
Hello World
|
||||
Hexxo Worxd
|
||||
Hello World
|
||||
Hexlo World
|
||||
|
@ -69,8 +69,7 @@ SELECT id, haystack, needle, replacement, replaceRegexpOne('Hello World', needle
|
||||
|
||||
DROP TABLE IF EXISTS test_tab;
|
||||
|
||||
|
||||
SELECT 'Check that whether an exception is thrown if the needle is empty';
|
||||
SELECT '- should not throw an exception if the needle is empty';
|
||||
|
||||
CREATE TABLE test_tab
|
||||
(id UInt32, haystack String, needle String, replacement String)
|
||||
@ -79,22 +78,22 @@ CREATE TABLE test_tab
|
||||
|
||||
INSERT INTO test_tab VALUES (1, 'Hello World', 'l', 'x') (2, 'Hello World', '', 'y');
|
||||
|
||||
-- needle: non-const, replacement: const
|
||||
SELECT '- non-const needle, const replacement';
|
||||
SELECT replaceAll(haystack, needle, 'x') FROM test_tab;
|
||||
SELECT replaceOne(haystack, needle, 'x') FROM test_tab;
|
||||
SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
|
||||
SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
|
||||
SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab;
|
||||
SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab;
|
||||
|
||||
-- needle: const, replacement: non-const
|
||||
SELECT '- const needle, non-const replacement';
|
||||
SELECT replaceAll(haystack, '', replacement) FROM test_tab;
|
||||
SELECT replaceOne(haystack, '', replacement) FROM test_tab;
|
||||
SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
|
||||
SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
|
||||
SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab;
|
||||
SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab;
|
||||
|
||||
-- needle: non-const, replacement: non-const
|
||||
SELECT '- non-const needle, non-const replacement';
|
||||
SELECT replaceAll(haystack, needle, replacement) FROM test_tab;
|
||||
SELECT replaceOne(haystack, needle, replacement) FROM test_tab;
|
||||
SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
|
||||
SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
|
||||
SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab;
|
||||
SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab;
|
||||
|
||||
DROP TABLE IF EXISTS test_tab;
|
||||
|
@ -1,5 +1,6 @@
|
||||
set use_variant_as_common_type = 1;
|
||||
set allow_experimental_variant_type = 1;
|
||||
SET enable_named_columns_in_function_tuple=1;
|
||||
|
||||
SELECT if(number % 2, tuple(number), tuple(toString(number))) as res, toTypeName(res) FROM numbers(5);
|
||||
SELECT if(number % 2, map(number, number), map(toString(number), toString(number))) as res, toTypeName(res) FROM numbers(5);
|
||||
|
@ -1,4 +1,5 @@
|
||||
SET enable_analyzer = 1;
|
||||
SET enable_named_columns_in_function_tuple=1;
|
||||
|
||||
SELECT JSONExtract('{"hello":[{"world":"wtf"}]}', 'Tuple(hello Array(Tuple(world String)))') AS x,
|
||||
x.hello, x.hello[1].world;
|
||||
|
@ -0,0 +1,3 @@
|
||||
1 ('dete','ok') ('dete','ok')
|
||||
{"id":1,"a":{"col_a":"dete","type":"ok"},"b":{"col_b":"dete","type":"ok"}}
|
||||
{"id":1,"a":{"col_a":"dete","type":"ok"},"b":{"col_b":"dete","type":"ok"}}
|
@ -0,0 +1,22 @@
|
||||
SET enable_analyzer = 1;
|
||||
SET enable_named_columns_in_function_tuple = 1;
|
||||
|
||||
DROP TABLE IF EXISTS src;
|
||||
DROP TABLE IF EXISTS dst;
|
||||
|
||||
CREATE TABLE src (id UInt32, type String, data String) ENGINE=MergeTree ORDER BY tuple();
|
||||
CREATE TABLE dst (id UInt32, a Tuple (col_a Nullable(String), type String), b Tuple (col_b Nullable(String), type String)) ENGINE = MergeTree ORDER BY id;
|
||||
|
||||
INSERT INTO src VALUES (1, 'ok', 'data');
|
||||
INSERT INTO dst (id, a, b) SELECT id, tuple(replaceAll(data, 'a', 'e') AS col_a, type) AS a, tuple(replaceAll(data, 'a', 'e') AS col_b, type) AS b FROM src;
|
||||
SELECT * FROM dst;
|
||||
|
||||
DROP TABLE src;
|
||||
DROP TABLE dst;
|
||||
|
||||
DROP TABLE IF EXISTS src;
|
||||
CREATE TABLE src (id UInt32, type String, data String) ENGINE=MergeTree ORDER BY tuple();
|
||||
INSERT INTO src VALUES (1, 'ok', 'data');
|
||||
SELECT id, tuple(replaceAll(data, 'a', 'e') AS col_a, type) AS a, tuple(replaceAll(data, 'a', 'e') AS col_b, type) AS b FROM cluster(test_cluster_two_shards, currentDatabase(), src) SETTINGS prefer_localhost_replica=0 FORMAT JSONEachRow;
|
||||
|
||||
DROP TABLE src;
|
@ -0,0 +1,5 @@
|
||||
4c36abda-8bd8-11eb-8204-005056aa8bf6 2021-03-24 01:04:27 1
|
||||
4c408902-8bd8-11eb-8204-005056aa8bf6 2021-03-24 01:04:27 1
|
||||
4c5bf20a-8bd8-11eb-8204-005056aa8bf6 2021-03-24 01:04:27 1
|
||||
4c61623a-8bd8-11eb-8204-005056aa8bf6 2021-03-24 01:04:27 1
|
||||
4c6efab2-8bd8-11eb-a952-005056aa8bf6 2021-03-24 01:04:27 1
|
@ -0,0 +1,18 @@
|
||||
DROP TABLE IF EXISTS table_3 SYNC;
|
||||
|
||||
CREATE TABLE table_3 (uid UUID, date DateTime('Asia/Kamchatka')) ENGINE = ReplicatedMergeTree('/pr_local_plan/{database}/table_3', 'r1') ORDER BY date;
|
||||
|
||||
INSERT INTO table_3 VALUES ('4c36abda-8bd8-11eb-8204-005056aa8bf6', '2021-03-24 01:04:27'), ('4c408902-8bd8-11eb-8204-005056aa8bf6', '2021-03-24 01:04:27'), ('4c5bf20a-8bd8-11eb-8204-005056aa8bf6', '2021-03-24 01:04:27'), ('4c61623a-8bd8-11eb-8204-005056aa8bf6', '2021-03-24 01:04:27'), ('4c6efab2-8bd8-11eb-a952-005056aa8bf6', '2021-03-24 01:04:27');
|
||||
|
||||
SELECT
|
||||
uid,
|
||||
date,
|
||||
toDate(date) = toDate('2021-03-24') AS res
|
||||
FROM table_3
|
||||
WHERE res = 1
|
||||
ORDER BY
|
||||
uid ASC,
|
||||
date ASC
|
||||
SETTINGS enable_parallel_replicas = 1, max_parallel_replicas = 3, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', parallel_replicas_local_plan = 1;
|
||||
|
||||
DROP TABLE table_3 SYNC;
|
20
tests/queries/0_stateless/03248_with_insert.reference
Normal file
@ -0,0 +1,20 @@
|
||||
0
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
0
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
2025-01-01
|
||||
2026-01-01
|
||||
2027-01-01
|
||||
2028-01-01
|
||||
2029-01-01
|
||||
2030-01-01
|
||||
2031-01-01
|
||||
2032-01-01
|
||||
2033-01-01
|
||||
2034-01-01
|
49
tests/queries/0_stateless/03248_with_insert.sql
Normal file
@ -0,0 +1,49 @@
|
||||
DROP TABLE IF EXISTS x;
|
||||
|
||||
CREATE TABLE x ENGINE = Log AS SELECT * FROM numbers(0);
|
||||
|
||||
SYSTEM STOP MERGES x;
|
||||
|
||||
WITH y AS
|
||||
(
|
||||
SELECT *
|
||||
FROM numbers(10)
|
||||
)
|
||||
INSERT INTO x
|
||||
SELECT *
|
||||
FROM y
|
||||
INTERSECT
|
||||
SELECT *
|
||||
FROM numbers(5);
|
||||
|
||||
WITH y AS
|
||||
(
|
||||
SELECT *
|
||||
FROM numbers(10)
|
||||
)
|
||||
INSERT INTO x
|
||||
SELECT *
|
||||
FROM numbers(5)
|
||||
INTERSECT
|
||||
SELECT *
|
||||
FROM y;
|
||||
|
||||
SELECT * FROM x;
|
||||
|
||||
DROP TABLE x;
|
||||
|
||||
CREATE TABLE x (d date) ENGINE = Log;
|
||||
|
||||
WITH y AS
|
||||
(
|
||||
SELECT
|
||||
number,
|
||||
date_add(YEAR, number, toDate('2025-01-01')) AS new_date
|
||||
FROM numbers(10)
|
||||
)
|
||||
INSERT INTO x
|
||||
SELECT y.new_date FROM y;
|
||||
|
||||
SELECT * FROM x;
|
||||
|
||||
DROP TABLE x;
|
@ -0,0 +1 @@
|
||||
Only one WITH should be presented, either before INSERT or SELECT
|
10
tests/queries/0_stateless/03248_with_insert_with.sh
Executable file
@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
$CLICKHOUSE_CLIENT -q 'DROP TABLE IF EXISTS x'
|
||||
$CLICKHOUSE_CLIENT -q 'CREATE TABLE x ENGINE = Log AS SELECT * FROM numbers(0)'
|
||||
$CLICKHOUSE_CLIENT -q 'WITH y AS ( SELECT * FROM numbers(10) ) INSERT INTO x WITH y2 AS ( SELECT * FROM numbers(10) ) SELECT * FROM y;' |& grep --max-count 1 -F --only-matching "Only one WITH should be presented, either before INSERT or SELECT"
|
||||
$CLICKHOUSE_CLIENT -q 'DROP TABLE x'
|
@ -0,0 +1,10 @@
|
||||
abc 2
|
||||
abc 2
|
||||
abc 3
|
||||
abc 4
|
||||
\N 5
|
||||
abc 2
|
||||
abc 2
|
||||
abc 3
|
||||
abc 4
|
||||
\N 5
|
23
tests/queries/0_stateless/03251_parquet_page_v2_native_reader.sh
Executable file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-ubsan, no-fasttest
|
||||
|
||||
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
|
||||
|
||||
WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}"
|
||||
|
||||
mkdir -p "${WORKING_DIR}"
|
||||
|
||||
DATA_FILE="${CUR_DIR}/data_parquet/datapage_v2.snappy.parquet"
|
||||
|
||||
DATA_FILE_USER_PATH="${WORKING_DIR}/datapage_v2.snappy.parquet"
|
||||
|
||||
cp ${DATA_FILE} ${DATA_FILE_USER_PATH}
|
||||
|
||||
# Not reading all columns because some data types and encodings are not supported by native reader yet
|
||||
# TODO read all columns once implemented
|
||||
${CLICKHOUSE_CLIENT} --query="select a, c from file('${DATA_FILE_USER_PATH}', Parquet) order by c SETTINGS input_format_parquet_use_native_reader=false;"
|
||||
${CLICKHOUSE_CLIENT} --query="select a, c from file('${DATA_FILE_USER_PATH}', Parquet) order by c SETTINGS input_format_parquet_use_native_reader=true;"
|
@ -31,6 +31,7 @@ AnyEvent
|
||||
AppleClang
|
||||
Approximative
|
||||
ArrayJoin
|
||||
ArrowCompression
|
||||
ArrowStream
|
||||
AsyncInsertCacheSize
|
||||
AsynchronousHeavyMetricsCalculationTimeSpent
|
||||
@ -123,6 +124,7 @@ CMPLNT
|
||||
CMake
|
||||
CMakeLists
|
||||
CODECS
|
||||
CORS
|
||||
COVID
|
||||
CPUFrequencyMHz
|
||||
CPUs
|
||||
@ -138,11 +140,13 @@ CacheDictionaryThreadsActive
|
||||
CacheDictionaryUpdateQueueBatches
|
||||
CacheDictionaryUpdateQueueKeys
|
||||
CacheFileSegments
|
||||
CacheWarmer
|
||||
CamelCase
|
||||
Cap'n
|
||||
CapContains
|
||||
CapUnion
|
||||
CapnProto
|
||||
CapnProtoEnumComparingMode
|
||||
CatBoost
|
||||
CellAreaM
|
||||
CellAreaRads
|
||||
@ -209,6 +213,7 @@ DDLWORKER
|
||||
DDLWorker
|
||||
DDLWorkerThreads
|
||||
DDLWorkerThreadsActive
|
||||
DDLs
|
||||
DECRYPT
|
||||
DELETEs
|
||||
DESC
|
||||
@ -217,6 +222,7 @@ DOGEFI
|
||||
Damerau
|
||||
DataGrip
|
||||
DataLens
|
||||
DataPacket
|
||||
DataTime
|
||||
DataTypes
|
||||
DatabaseCatalog
|
||||
@ -228,11 +234,15 @@ DatabaseOnDiskThreadsActive
|
||||
DatabaseOrdinaryThreads
|
||||
DatabaseOrdinaryThreadsActive
|
||||
DateTime
|
||||
DateTimeInputFormat
|
||||
DateTimeOutputFormat
|
||||
DateTimeOverflowBehavior
|
||||
DateTimes
|
||||
DbCL
|
||||
Decrypted
|
||||
Deduplicate
|
||||
Deduplication
|
||||
DefaultTableEngine
|
||||
DelayedInserts
|
||||
DeliveryTag
|
||||
DeltaLake
|
||||
@ -248,7 +258,11 @@ DiskSpaceReservedForMerge
|
||||
DiskTotal
|
||||
DiskUnreserved
|
||||
DiskUsed
|
||||
DistributedCacheLogMode
|
||||
DistributedCachePoolBehaviourOnLimit
|
||||
DistributedDDLOutputMode
|
||||
DistributedFilesToInsert
|
||||
DistributedProductMode
|
||||
DistributedSend
|
||||
DockerHub
|
||||
DoubleDelta
|
||||
@ -257,6 +271,7 @@ Dresseler
|
||||
Durre
|
||||
ECMA
|
||||
ETag
|
||||
EachRow
|
||||
Ecto
|
||||
EdgeAngle
|
||||
EdgeLengthKm
|
||||
@ -269,6 +284,7 @@ Enum
|
||||
Enums
|
||||
Eoan
|
||||
EphemeralNode
|
||||
EscapingRule
|
||||
Ethereum
|
||||
ExactEdgeLengthKm
|
||||
ExactEdgeLengthM
|
||||
@ -373,6 +389,7 @@ IMDS
|
||||
INFILE
|
||||
INSERTed
|
||||
INSERTs
|
||||
INVOKER
|
||||
IOPrefetchThreads
|
||||
IOPrefetchThreadsActive
|
||||
IOThreads
|
||||
@ -384,7 +401,10 @@ IOWriterThreadsActive
|
||||
IPTrie
|
||||
IProcessor
|
||||
IPv
|
||||
ITION
|
||||
Identifiant
|
||||
IdentifierQuotingRule
|
||||
IdentifierQuotingStyle
|
||||
Incrementing
|
||||
IndexesAreNeighbors
|
||||
InfluxDB
|
||||
@ -403,6 +423,7 @@ IntervalMilliseconds
|
||||
IntervalMinute
|
||||
IntervalMonth
|
||||
IntervalNanosecond
|
||||
IntervalOutputFormat
|
||||
IntervalQuarter
|
||||
IntervalSecond
|
||||
IntervalWeek
|
||||
@ -464,6 +485,8 @@ Jepsen
|
||||
JetBrains
|
||||
Jitter
|
||||
Joda
|
||||
JoinAlgorithm
|
||||
JoinStrictness
|
||||
JumpConsistentHash
|
||||
Jupyter
|
||||
KDevelop
|
||||
@ -512,10 +535,16 @@ LinfNorm
|
||||
LinfNormalize
|
||||
LinksDeployment
|
||||
Linq
|
||||
ListObject
|
||||
ListObjects
|
||||
LoadAverage
|
||||
LoadBalancing
|
||||
LocalFSReadMethod
|
||||
LocalThread
|
||||
LocalThreadActive
|
||||
LogQL
|
||||
LogQueriesType
|
||||
LogsLevel
|
||||
Logstash
|
||||
LookML
|
||||
LoongArch
|
||||
@ -552,6 +581,7 @@ MaxDDLEntryID
|
||||
MaxMind
|
||||
MaxPartCountForPartition
|
||||
MaxPushedDDLEntryID
|
||||
MaxThreads
|
||||
Mbps
|
||||
McNeal
|
||||
Memcheck
|
||||
@ -559,6 +589,7 @@ MemoryCode
|
||||
MemoryDataAndStack
|
||||
MemoryResident
|
||||
MemoryResidentMax
|
||||
MemorySample
|
||||
MemorySanitizer
|
||||
MemoryShared
|
||||
MemoryTracking
|
||||
@ -594,6 +625,7 @@ Mongo
|
||||
Mongodb
|
||||
Monotonicity
|
||||
MsgPack
|
||||
MsgPackUUIDRepresentation
|
||||
MultiLineString
|
||||
MultiPolygon
|
||||
Multiline
|
||||
@ -602,6 +634,7 @@ Multithreading
|
||||
Multiword
|
||||
MurmurHash
|
||||
MySQLConnection
|
||||
MySQLDataTypesSupport
|
||||
MySQLDump
|
||||
MySQLThreads
|
||||
NATS
|
||||
@ -637,6 +670,7 @@ NetworkSendPackets
|
||||
Noaa
|
||||
NodeJs
|
||||
NonMonotonic
|
||||
NonZeroUInt
|
||||
NuRaft
|
||||
NumHexagons
|
||||
NumPy
|
||||
@ -649,6 +683,7 @@ NumberOfTables
|
||||
OFNS
|
||||
OLAP
|
||||
OLTP
|
||||
ORCCompression
|
||||
OSContextSwitches
|
||||
OSGuestNiceTime
|
||||
OSGuestNiceTimeCPU
|
||||
@ -715,6 +750,8 @@ OrDefault
|
||||
OrNull
|
||||
OrZero
|
||||
OvercommitTracker
|
||||
OverflowMode
|
||||
OverflowModeGroupBy
|
||||
PAAMAYIM
|
||||
PCRE
|
||||
PRCP
|
||||
@ -728,8 +765,11 @@ ParallelFormattingOutputFormatThreadsActive
|
||||
ParallelParsingInputFormat
|
||||
ParallelParsingInputFormatThreads
|
||||
ParallelParsingInputFormatThreadsActive
|
||||
ParallelReplicasMode
|
||||
Parametrized
|
||||
ParquetCompression
|
||||
ParquetMetadata
|
||||
ParquetVersion
|
||||
Parsers
|
||||
PartMutation
|
||||
Partitioner
|
||||
@ -746,6 +786,7 @@ PartsWide
|
||||
PeerDB
|
||||
PendingAsyncInsert
|
||||
Percona
|
||||
PerfEventInfo
|
||||
PhpStorm
|
||||
PlantUML
|
||||
Poess
|
||||
@ -797,6 +838,8 @@ QueryCacheBytes
|
||||
QueryCacheEntries
|
||||
QueryCacheHits
|
||||
QueryCacheMisses
|
||||
QueryCacheNondeterministicFunctionHandling
|
||||
QueryCacheSystemTableHandling
|
||||
QueryPreempted
|
||||
QueryThread
|
||||
QuickAssist
|
||||
@ -805,6 +848,7 @@ QuoteMeta
|
||||
RBAC
|
||||
RClickHouse
|
||||
RHEL
|
||||
RIPEMD
|
||||
ROLLUP
|
||||
RWLock
|
||||
RWLockActiveReaders
|
||||
@ -857,7 +901,7 @@ RestartReplicaThreads
|
||||
RestartReplicaThreadsActive
|
||||
RestoreThreads
|
||||
RestoreThreadsActive
|
||||
RIPEMD
|
||||
RetryStrategy
|
||||
RoaringBitmap
|
||||
RocksDB
|
||||
Rollup
|
||||
@ -881,6 +925,7 @@ SQLAlchemy
|
||||
SQLConsoleDetail
|
||||
SQLInsert
|
||||
SQLSTATE
|
||||
SQLSecurityType
|
||||
SSDCache
|
||||
SSDComplexKeyCache
|
||||
SSDs
|
||||
@ -893,6 +938,7 @@ Sankey
|
||||
Scalable
|
||||
Scatterplot
|
||||
Schaefer
|
||||
SchemaInferenceMode
|
||||
Schemas
|
||||
Schwartzian
|
||||
SeasClick
|
||||
@ -901,8 +947,12 @@ SelfManaged
|
||||
Sematext
|
||||
SendExternalTables
|
||||
SendScalars
|
||||
SetOperationMode
|
||||
ShareAlike
|
||||
ShareSet
|
||||
SharedJoin
|
||||
SharedMergeTree
|
||||
ShortCircuitFunctionEvaluation
|
||||
Shortkeys
|
||||
Signup
|
||||
SimHash
|
||||
@ -953,6 +1003,7 @@ SystemReplicasThreadsActive
|
||||
TABLUM
|
||||
TAVG
|
||||
TCPConnection
|
||||
TCPHandler
|
||||
TCPThreads
|
||||
TDigest
|
||||
TINYINT
|
||||
@ -1020,8 +1071,10 @@ TotalPrimaryKeyBytesInMemory
|
||||
TotalPrimaryKeyBytesInMemoryAllocated
|
||||
TotalRowsOfMergeTreeTables
|
||||
TotalTemporaryFiles
|
||||
TotalsMode
|
||||
Tradeoff
|
||||
Transactional
|
||||
TransactionsWaitCSNMode
|
||||
Tsai
|
||||
Tukey
|
||||
TwoColumnList
|
||||
@ -1043,6 +1096,7 @@ URLHash
|
||||
URLHierarchy
|
||||
URLPathHierarchy
|
||||
USearch
|
||||
USearch
|
||||
UTCTimestamp
|
||||
UUIDNumToString
|
||||
UUIDStringToNum
|
||||
@ -1146,6 +1200,7 @@ aggregatio
|
||||
aggretate
|
||||
aggthrow
|
||||
aiochclient
|
||||
alloc
|
||||
allocator
|
||||
alphaTokens
|
||||
amplab
|
||||
@ -1428,6 +1483,7 @@ config
|
||||
configs
|
||||
conformant
|
||||
congruential
|
||||
conjuctive
|
||||
connectionId
|
||||
const
|
||||
contrib
|
||||
@ -1437,9 +1493,11 @@ corrMatrix
|
||||
corrStable
|
||||
corrmatrix
|
||||
corrstable
|
||||
cors
|
||||
cosineDistance
|
||||
countDigits
|
||||
countEqual
|
||||
countIf
|
||||
countMatches
|
||||
countMatchesCaseInsensitive
|
||||
countSubstrings
|
||||
@ -1561,7 +1619,9 @@ denormalizing
|
||||
denormals
|
||||
dequeued
|
||||
dequeues
|
||||
dereference
|
||||
deserialization
|
||||
deserialize
|
||||
deserialized
|
||||
deserializing
|
||||
dest
|
||||
@ -1604,6 +1664,7 @@ domainWithoutWWW
|
||||
domainWithoutWWWRFC
|
||||
dont
|
||||
dotProduct
|
||||
dotall
|
||||
downsampling
|
||||
dplyr
|
||||
dragonbox
|
||||
@ -1707,6 +1768,7 @@ formatReadableSize
|
||||
formatReadableTimeDelta
|
||||
formatRow
|
||||
formatRowNoNewline
|
||||
formatdatetime
|
||||
formatschema
|
||||
formatter
|
||||
formatters
|
||||
@ -1850,6 +1912,7 @@ heredocs
|
||||
hilbertDecode
|
||||
hilbertEncode
|
||||
hiveHash
|
||||
hnsw
|
||||
holistics
|
||||
homebrew
|
||||
hopEnd
|
||||
@ -1880,6 +1943,7 @@ ilike
|
||||
incrementing
|
||||
indexHint
|
||||
indexOf
|
||||
inequal
|
||||
infi
|
||||
inflight
|
||||
infty
|
||||
@ -1956,6 +2020,7 @@ kRing
|
||||
kafka
|
||||
kafkaMurmurHash
|
||||
kafkacat
|
||||
keepalive
|
||||
keepermap
|
||||
kerberized
|
||||
kerberos
|
||||
@ -2148,15 +2213,19 @@ multiSearchFirstPosition
|
||||
multiSearchFirstPositionCaseInsensitive
|
||||
multiSearchFirstPositionCaseInsensitiveUTF
|
||||
multiSearchFirstPositionUTF
|
||||
multibuffer
|
||||
multibyte
|
||||
multidirectory
|
||||
multiif
|
||||
multiline
|
||||
multilinestring
|
||||
multiplyDecimal
|
||||
multipolygon
|
||||
multiread
|
||||
multisearchany
|
||||
multisets
|
||||
multithread
|
||||
multithreading
|
||||
multiword
|
||||
munmap
|
||||
murmurHash
|
||||
@ -2208,6 +2277,7 @@ ngrambf
|
||||
ngrams
|
||||
noaa
|
||||
nonNegativeDerivative
|
||||
nonconst
|
||||
noop
|
||||
normalizeQuery
|
||||
normalizeQueryKeepNames
|
||||
@ -2229,6 +2299,7 @@ nullIf
|
||||
nullability
|
||||
nullable
|
||||
nullables
|
||||
nullptr
|
||||
num
|
||||
numerics
|
||||
nypd
|
||||
@ -2258,6 +2329,7 @@ pageviews
|
||||
parallelization
|
||||
parallelize
|
||||
parallelized
|
||||
param
|
||||
params
|
||||
parseDateTime
|
||||
parseDateTimeBestEffort
|
||||
@ -2276,13 +2348,16 @@ parseReadableSizeOrNull
|
||||
parseReadableSizeOrZero
|
||||
parseTimeDelta
|
||||
parseable
|
||||
parsedatetime
|
||||
parsers
|
||||
partitionID
|
||||
partitionId
|
||||
pathFull
|
||||
pclmulqdq
|
||||
pcre
|
||||
perf
|
||||
performant
|
||||
perkey
|
||||
perl
|
||||
persistency
|
||||
phpclickhouse
|
||||
@ -2317,6 +2392,7 @@ positionUTF
|
||||
positiveModulo
|
||||
postfix
|
||||
postfixes
|
||||
postgres
|
||||
postgresql
|
||||
pre
|
||||
pread
|
||||
@ -2326,7 +2402,11 @@ prebuilt
|
||||
preemptable
|
||||
preferServerCiphers
|
||||
prefetch
|
||||
prefetched
|
||||
prefetches
|
||||
prefetching
|
||||
prefetchsize
|
||||
preimage
|
||||
preloaded
|
||||
prem
|
||||
prepend
|
||||
@ -2480,6 +2560,7 @@ reinterpretAsInt
|
||||
reinterpretAsString
|
||||
reinterpretAsUInt
|
||||
reinterpretAsUUID
|
||||
remerge
|
||||
remoteSecure
|
||||
repivot
|
||||
replaceAll
|
||||
@ -2487,6 +2568,7 @@ replaceOne
|
||||
replaceRegexpAll
|
||||
replaceRegexpOne
|
||||
replacingmergetree
|
||||
replcase
|
||||
replicatable
|
||||
replicatedmergetree
|
||||
replxx
|
||||
@ -2494,6 +2576,7 @@ repo
|
||||
representable
|
||||
requestor
|
||||
requireTLSv
|
||||
rerange
|
||||
resharding
|
||||
reshards
|
||||
resolvers
|
||||
@ -2525,6 +2608,7 @@ rowbinary
|
||||
rowbinarywithdefaults
|
||||
rowbinarywithnames
|
||||
rowbinarywithnamesandtypes
|
||||
rowlist
|
||||
rsync
|
||||
rsyslog
|
||||
runnable
|
||||
@ -2713,6 +2797,7 @@ subtrees
|
||||
subtype
|
||||
sudo
|
||||
sumCount
|
||||
sumIf
|
||||
sumKahan
|
||||
sumMap
|
||||
sumMapFiltered
|
||||
@ -2758,6 +2843,7 @@ theilsu
|
||||
themself
|
||||
threadpool
|
||||
throwIf
|
||||
throwif
|
||||
timeDiff
|
||||
timeSeriesData
|
||||
timeSeriesMetrics
|
||||
@ -2923,9 +3009,11 @@ typename
|
||||
ubuntu
|
||||
uint
|
||||
ulid
|
||||
unacked
|
||||
unary
|
||||
unbin
|
||||
uncomment
|
||||
undelete
|
||||
undrop
|
||||
unencoded
|
||||
unencrypted
|
||||
@ -2954,6 +3042,7 @@ uniqthetasketch
|
||||
unix
|
||||
unixODBC
|
||||
unixodbc
|
||||
unmerged
|
||||
unoptimized
|
||||
unparsed
|
||||
unpooled
|
||||
@ -3060,90 +3149,3 @@ znode
|
||||
znodes
|
||||
zookeeperSessionUptime
|
||||
zstd
|
||||
postgres
|
||||
ArrowCompression
|
||||
CapnProtoEnumComparingMode
|
||||
DateTimeInputFormat
|
||||
DateTimeOutputFormat
|
||||
DateTimeOverflowBehavior
|
||||
deserialize
|
||||
dotall
|
||||
EachRow
|
||||
EscapingRule
|
||||
IdentifierQuotingRule
|
||||
IdentifierQuotingStyle
|
||||
IntervalOutputFormat
|
||||
MsgPackUUIDRepresentation
|
||||
ORCCompression
|
||||
ParquetCompression
|
||||
ParquetVersion
|
||||
SchemaInferenceMode
|
||||
alloc
|
||||
CacheWarmer
|
||||
conjuctive
|
||||
cors
|
||||
CORS
|
||||
countIf
|
||||
DefaultTableEngine
|
||||
dereference
|
||||
DistributedDDLOutputMode
|
||||
DistributedProductMode
|
||||
formatdatetime
|
||||
inequal
|
||||
INVOKER
|
||||
ITION
|
||||
JoinAlgorithm
|
||||
JoinStrictness
|
||||
keepalive
|
||||
ListObject
|
||||
ListObjects
|
||||
LoadBalancing
|
||||
LocalFSReadMethod
|
||||
LogQueriesType
|
||||
LogsLevel
|
||||
MaxThreads
|
||||
MemorySample
|
||||
multibuffer
|
||||
multiif
|
||||
multiread
|
||||
multithreading
|
||||
MySQLDataTypesSupport
|
||||
nonconst
|
||||
NonZeroUInt
|
||||
nullptr
|
||||
OverflowMode
|
||||
OverflowModeGroupBy
|
||||
ParallelReplicasMode
|
||||
param
|
||||
parsedatetime
|
||||
perf
|
||||
PerfEventInfo
|
||||
perkey
|
||||
prefetched
|
||||
prefetches
|
||||
prefetching
|
||||
preimage
|
||||
QueryCacheNondeterministicFunctionHandling
|
||||
QueryCacheSystemTableHandling
|
||||
remerge
|
||||
replcase
|
||||
rerange
|
||||
RetryStrategy
|
||||
rowlist
|
||||
SetOperationMode
|
||||
ShortCircuitFunctionEvaluation
|
||||
SQLSecurityType
|
||||
sumIf
|
||||
TCPHandler
|
||||
throwif
|
||||
TotalsMode
|
||||
TransactionsWaitCSNMode
|
||||
undelete
|
||||
unmerged
|
||||
DataPacket
|
||||
DDLs
|
||||
DistributedCacheLogMode
|
||||
DistributedCachePoolBehaviourOnLimit
|
||||
SharedJoin
|
||||
ShareSet
|
||||
unacked
|
||||
|