Merge branch 'master' into allow-skip-empty-files

commit 39ba925f8b
@ -74,6 +74,7 @@ ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
DerivePointerAlignment: false
DisableFormat: false
IndentRequiresClause: false
IndentWidth: 4
IndentWrappedFunctionNames: false
MacroBlockBegin: ''
contrib/capnproto (vendored)
@ -1 +1 @@
Subproject commit dc8b50b999777bcb23c89bb5907c785c3f654441
Subproject commit 976209a6d18074804f60d18ef99b6a809d27dadf
@ -15,6 +15,9 @@ dpkg -i package_folder/clickhouse-client_*.deb

ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test

# shellcheck disable=SC1091
source /usr/share/clickhouse-test/ci/attach_gdb.lib

# install test configs
/usr/share/clickhouse-test/config/install.sh

@ -85,6 +88,8 @@ fi

sleep 5

attach_gdb_to_clickhouse

function run_tests()
{
    set -x
@ -1,104 +1,142 @@
|
||||
# Approximate Nearest Neighbor Search Indexes [experimental] {#table_engines-ANNIndex}
|
||||
|
||||
Nearest neighborhood search refers to the problem of finding the point(s) with the smallest distance to a given point in an n-dimensional
|
||||
space. Since exact search is in practice usually typically too slow, the task is often solved with approximate algorithms. A popular use
|
||||
case of of neighbor search is finding similar pictures (texts) for a given picture (text). Pictures (texts) can be decomposed into
|
||||
[embeddings](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning), and instead of
|
||||
comparing pictures (texts) pixel-by-pixel (character-by-character), only the embeddings are compared.
|
||||
Nearest neighborhood search is the problem of finding the M closest points for a given point in an N-dimensional vector space. The most
|
||||
straightforward approach to solve this problem is a brute force search where the distance between all points in the vector space and the
|
||||
reference point is computed. This method guarantees perfect accuracy but it is usually too slow for practical applications. Thus, nearest
|
||||
neighborhood search problems are often solved with [approximative algorithms](https://github.com/erikbern/ann-benchmarks). Approximative
|
||||
nearest neighborhood search techniques, in conjunction with [embedding
|
||||
methods](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning) allow to search huge
|
||||
amounts of media (pictures, songs, articles, etc.) in milliseconds.
|
||||
|
||||
In terms of SQL, the problem can be expressed as follows:
|
||||
Blogs:
|
||||
- [Vector Search with ClickHouse - Part 1](https://clickhouse.com/blog/vector-search-clickhouse-p1)
|
||||
- [Vector Search with ClickHouse - Part 2](https://clickhouse.com/blog/vector-search-clickhouse-p2)
|
||||
|
||||
|
||||
In terms of SQL, the nearest neighborhood problem can be expressed as follows:
|
||||
|
||||
``` sql
|
||||
SELECT *
|
||||
FROM table
|
||||
WHERE L2Distance(column, Point) < MaxDistance
|
||||
ORDER BY Distance(vectors, Point)
|
||||
LIMIT N
|
||||
```
|
||||
|
||||
`vectors` contains N-dimensional values of type [Array](../../../sql-reference/data-types/array.md) or
|
||||
[Tuple](../../../sql-reference/data-types/tuple.md), for example embeddings. Function `Distance` computes the distance between two vectors.
|
||||
Often, the the Euclidean (L2) distance is chosen as distance function but [other
|
||||
distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17,
|
||||
0.33, ...)`, and `N` limits the number of search results.
|
||||
|
||||
An alternative formulation of the nearest neighborhood search problem looks as follows:
|
||||
|
||||
``` sql
|
||||
SELECT *
|
||||
FROM table
|
||||
ORDER BY L2Distance(column, Point)
|
||||
WHERE Distance(vectors, Point) < MaxDistance
|
||||
LIMIT N
|
||||
```
|
||||
|
||||
The queries are expensive because the L2 (Euclidean) distance between `Point` and all points in `column` and must be computed. To speed this process up, Approximate Nearest Neighbor Search Indexes (ANN indexes) store a compact representation of the search space (using clustering, search trees, etc.) which allows to compute an approximate answer quickly.
|
||||
While the first query returns the top-`N` closest points to the reference point, the second query returns all points closer to the reference
|
||||
point than a maximally allowed radius `MaxDistance`. Parameter `N` limits the number of returned values which is useful for situations where
|
||||
`MaxDistance` is difficult to determine in advance.

# Creating ANN Indexes
With brute force search, both queries are expensive (linear in the number of points) because the distance between all points in `vectors` and
`Point` must be computed. To speed this process up, Approximate Nearest Neighbor Search Indexes (ANN indexes) store a compact representation
of the search space (using clustering, search trees, etc.) which allows computing an approximate answer much more quickly (in sub-linear time).

As long as ANN indexes are experimental, you first need to `SET allow_experimental_annoy_index = 1`.
# Creating and Using ANN Indexes

Syntax to create an ANN index over an `Array` column:
Syntax to create an ANN index over an [Array](../../../sql-reference/data-types/array.md) column:

```sql
CREATE TABLE table
(
    `id` Int64,
    `embedding` Array(Float32),
    INDEX <ann_index_name> embedding TYPE <ann_index_type>(<ann_index_parameters>) GRANULARITY <N>
    `vectors` Array(Float32),
    INDEX <ann_index_name> vectors TYPE <ann_index_type>(<ann_index_parameters>) [GRANULARITY <N>]
)
ENGINE = MergeTree
ORDER BY id;
```

Syntax to create an ANN index over a `Tuple` column:
Syntax to create an ANN index over a [Tuple](../../../sql-reference/data-types/tuple.md) column:

```sql
CREATE TABLE table
(
    `id` Int64,
    `embedding` Tuple(Float32[, Float32[, ...]]),
    INDEX <ann_index_name> embedding TYPE <ann_index_type>(<ann_index_parameters>) GRANULARITY <N>
    `vectors` Tuple(Float32[, Float32[, ...]]),
    INDEX <ann_index_name> vectors TYPE <ann_index_type>(<ann_index_parameters>) [GRANULARITY <N>]
)
ENGINE = MergeTree
ORDER BY id;
```

ANN indexes are built during column insertion and merge, so `INSERT` and `OPTIMIZE` statements will be slower than for ordinary tables. ANN indexes are ideally used only with immutable or rarely changed data, respectively, when there are far more read requests than write requests.

Similar to regular skip indexes, ANN indexes are constructed over granules and each indexed block consists of `GRANULARITY = <N>`-many
granules. For example, if the primary index granularity of the table is 8192 (setting `index_granularity = 8192`) and `GRANULARITY = 2`,
then each indexed block will consist of 16384 rows. However, unlike skip indexes, ANN indexes are not only able to skip the entire indexed
block, they are able to skip individual granules in indexed blocks. As a result, the `GRANULARITY` parameter has a different meaning in ANN
indexes than in normal skip indexes. Basically, the bigger `GRANULARITY` is chosen, the more data is provided to a single ANN index, and the
higher the chance that with the right hyperparameters, the index will remember the data structure better.

# Using ANN Indexes
ANN indexes are built during column insertion and merge. As a result, `INSERT` and `OPTIMIZE` statements will be slower than for ordinary
tables. ANN indexes are ideally used only with immutable or rarely changed data, respectively, when there are far more read requests than write
requests.

ANN indexes support two types of queries:

- WHERE queries:

  ``` sql
  SELECT *
  FROM table
  WHERE DistanceFunction(column, Point) < MaxDistance
  LIMIT N
  ```

- ORDER BY queries:

  ``` sql
  SELECT *
  FROM table
  [WHERE ...]
  ORDER BY DistanceFunction(column, Point)
  ORDER BY Distance(vectors, Point)
  LIMIT N
  ```

`DistanceFunction` is a [distance function](/docs/en/sql-reference/functions/distance-functions.md), `Point` is a reference vector (e.g. `(0.17, 0.33, ...)`) and `MaxDistance` is a floating point value which restricts the size of the neighbourhood.
- WHERE queries:

  ``` sql
  SELECT *
  FROM table
  WHERE Distance(vectors, Point) < MaxDistance
  LIMIT N
  ```

:::tip
To avoid writing out large vectors, you can use [query parameters](/docs/en/interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g.
To avoid writing out large vectors, you can use [query
parameters](/docs/en/interfaces/cli.md#queries-with-parameters-cli-queries-with-parameters), e.g.

```bash
clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(embedding, {vec: Array(Float32)}) < 1.0"
clickhouse-client --param_vec='hello' --query="SELECT * FROM table WHERE L2Distance(vectors, {vec: Array(Float32)}) < 1.0"
```
:::

ANN indexes cannot speed up queries that contain both a `WHERE DistanceFunction(column, Point) < MaxDistance` and an `ORDER BY DistanceFunction(column, Point)` clause. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries that use an ANN index must have a `LIMIT` clause.
**Restrictions**: Queries that contain both a `WHERE Distance(vectors, Point) < MaxDistance` and an `ORDER BY Distance(vectors, Point)`
clause cannot use ANN indexes. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries
without a `LIMIT` clause cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting
`max_limit_for_ann_queries` (default: 1 million rows). This is a safeguard to prevent large memory allocations by external libraries for
approximate neighbor search.

**Differences to Skip Indexes** Similar to regular [skip indexes](https://clickhouse.com/docs/en/optimize/skipping-indexes), ANN indexes are
constructed over granules and each indexed block consists of `GRANULARITY = <N>`-many granules (`<N>` = 1 by default for normal skip
indexes). For example, if the primary index granularity of the table is 8192 (setting `index_granularity = 8192`) and `GRANULARITY = 2`,
then each indexed block will contain 16384 rows. However, data structures and algorithms for approximate neighborhood search (usually
provided by external libraries) are inherently row-oriented. They store a compact representation of a set of rows and also return rows for
ANN queries. This causes some rather unintuitive differences in the way ANN indexes behave compared to normal skip indexes.

When a user defines an ANN index on a column, ClickHouse internally creates an ANN "sub-index" for each index block. The sub-index is "local"
in the sense that it only knows about the rows of its containing index block. In the previous example and assuming that a column has 65536
rows, we obtain four index blocks (spanning eight granules) and an ANN sub-index for each index block. A sub-index is theoretically able to
return the rows with the N closest points within its index block directly. However, since ClickHouse loads data from disk to memory at the
granularity of granules, sub-indexes extrapolate matching rows to granule granularity. This is different from regular skip indexes which
skip data at the granularity of index blocks.

The `GRANULARITY` parameter determines how many ANN sub-indexes are created. Bigger `GRANULARITY` values mean fewer but larger ANN
sub-indexes, up to the point where a column (or a column's data part) has only a single sub-index. In that case, the sub-index has a
"global" view of all column rows and can directly return all granules of the column (part) with relevant rows (there are at most `LIMIT
<N>`-many such granules). In a second step, ClickHouse will load these granules and identify the actually best rows by performing a
brute-force distance calculation over all rows of the granules. With a small `GRANULARITY` value, each of the sub-indexes returns up to
`LIMIT N`-many granules. As a result, more granules need to be loaded and post-filtered. Note that the search accuracy is equally good in
both cases, only the processing performance differs. It is generally recommended to use a large `GRANULARITY` for ANN indexes and fall
back to smaller `GRANULARITY` values only in case of problems like excessive memory consumption of the ANN structures. If no `GRANULARITY`
was specified for ANN indexes, the default value is 100 million.
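
To make the trade-off concrete, here is a minimal sketch, with invented table, column, and index names, of an index definition that follows the large-`GRANULARITY` recommendation:

```sql
-- Hypothetical example: with index_granularity = 8192 and GRANULARITY = 100000000,
-- any realistically sized data part gets a single "global" sub-index, so at most
-- LIMIT-many granules have to be loaded and post-filtered.
CREATE TABLE images
(
    id Int64,
    vectors Array(Float32),
    INDEX ann_idx vectors TYPE annoy('L2Distance') GRANULARITY 100000000
)
ENGINE = MergeTree
ORDER BY id;
```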

An ANN index is only used if the query has a `LIMIT` value smaller than setting `max_limit_for_ann_queries` (default: 1 million rows). This is a safety measure which helps to avoid large memory consumption by external libraries for approximate neighbor search.

# Available ANN Indexes

@ -106,51 +144,68 @@ An ANN index is only used if the query has a `LIMIT` value smaller than setting

## Annoy {#annoy}

(currently disabled on ARM due to memory safety problems with the algorithm)
Annoy indexes are currently experimental; to use them, you first need to `SET allow_experimental_annoy_index = 1`. They are also currently
disabled on ARM due to memory safety problems with the algorithm.

This type of ANN index implements [the Annoy algorithm](https://github.com/spotify/annoy) which uses a recursive division of the space into random linear surfaces (lines in 2D, planes in 3D etc.).
This type of ANN index implements [the Annoy algorithm](https://github.com/spotify/annoy) which is based on a recursive division of the
space into random linear surfaces (lines in 2D, planes in 3D etc.).

Syntax to create an Annoy index over an `Array` column:
<div class='vimeo-container'>
  <iframe src="//www.youtube.com/watch?v=QkCCyLW0ehU"
    width="640"
    height="360"
    frameborder="0"
    allow="autoplay;
    fullscreen;
    picture-in-picture"
    allowfullscreen>
  </iframe>
</div>

Syntax to create an Annoy index over an [Array](../../../sql-reference/data-types/array.md) column:

```sql
CREATE TABLE table
(
    id Int64,
    embedding Array(Float32),
    INDEX <ann_index_name> embedding TYPE annoy([DistanceName[, NumTrees]]) GRANULARITY N
    vectors Array(Float32),
    INDEX <ann_index_name> vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```

Syntax to create an Annoy index over a `Tuple` column:
Syntax to create an ANN index over a [Tuple](../../../sql-reference/data-types/tuple.md) column:

```sql
CREATE TABLE table
(
    id Int64,
    embedding Tuple(Float32[, Float32[, ...]]),
    INDEX <ann_index_name> embedding TYPE annoy([DistanceName[, NumTrees]]) GRANULARITY N
    vectors Tuple(Float32[, Float32[, ...]]),
    INDEX <ann_index_name> vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N]
)
ENGINE = MergeTree
ORDER BY id;
```

Parameter `DistanceName` is the name of a distance function (default `L2Distance`). Annoy currently supports `L2Distance` and `cosineDistance` as distance functions. Parameter `NumTrees` (default: 100) is the number of trees which the algorithm will create. Higher values of `NumTrees` mean slower `CREATE` and `SELECT` statements (approximately linearly), but increase the accuracy of search results.
Annoy currently supports `L2Distance` and `cosineDistance` as distance function `Distance`. If no distance function was specified during
index creation, `L2Distance` is used by default. Parameter `NumTrees` is the number of trees which the algorithm creates (default if not
specified: 100). Higher values of `NumTrees` mean more accurate search results but slower index creation / query times (approximately
linearly) as well as larger index sizes.
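
To make the parameters concrete, here is a minimal sketch, with invented table and index names, that pins both the distance function and the tree count explicitly:

```sql
-- Hypothetical example: cosine distance and 200 trees. More trees give more
-- accurate results but slower index creation and a larger index on disk.
CREATE TABLE songs
(
    id Int64,
    vectors Array(Float32),
    INDEX ann_idx vectors TYPE annoy('cosineDistance', 200)
)
ENGINE = MergeTree
ORDER BY id;
```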

:::note
Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have the same length. Use [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1 CHECK length(embedding) = 256`.
Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have the same length. Use
[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1
CHECK length(vectors) = 256`.
:::
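
A brief sketch of how such a constraint fits into a table definition (the names and the dimension 256 are illustrative):

```sql
-- Hypothetical example: reject inserts whose vector is not 256-dimensional,
-- so the ANN index never sees arrays of mixed length.
CREATE TABLE images
(
    id Int64,
    vectors Array(Float32),
    CONSTRAINT check_dim CHECK length(vectors) = 256,
    INDEX ann_idx vectors TYPE annoy('L2Distance')
)
ENGINE = MergeTree
ORDER BY id;
```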

Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. It can be used to
balance runtime and accuracy.

Example:
Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger
values mean more accurate results at the cost of longer query runtime:

``` sql
SELECT *
FROM table_name [WHERE ...]
ORDER BY L2Distance(column, Point)
FROM table_name
ORDER BY L2Distance(vectors, Point)
LIMIT N
SETTINGS annoy_index_search_k_nodes=100
```

@ -491,7 +491,7 @@ Syntax: `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, ran

#### Special-purpose

- An experimental index to support approximate nearest neighbor (ANN) search. See [here](annindexes.md) for details.
- Experimental indexes to support approximate nearest neighbor (ANN) search. See [here](annindexes.md) for details.
- An experimental inverted index to support full-text search. See [here](invertedindexes.md) for details.

### Functions Support {#functions-support}

@ -1138,7 +1138,7 @@ These parameters define the cache layer:

Cache parameters:
- `path` — The path where metadata for the cache is stored.
- `max_size` — The size (amount of memory) that the cache can grow to.
- `max_size` — The size (amount of disk space) that the cache can grow to.

:::tip
There are several other cache parameters that you can use to tune your storage, see [using local cache](/docs/en/operations/storing-data.md/#using-local-cache) for the details.

@ -193,6 +193,7 @@ SELECT * FROM nestedt FORMAT TSV
- [output_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_tsv_crlf_end_of_line) - if it is set to true, end of line in TSV output format will be `\r\n` instead of `\n`. Default value - `false`.
- [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`.
- [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`.
- [input_format_tsv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
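
A quick sketch of how these settings combine during an import (the file name and values are illustrative):

```sql
-- Hypothetical example: the TSV file starts with a two-line banner and ends
-- with stray empty lines; both settings apply for the current session.
SET input_format_tsv_skip_first_lines = 2;
SET input_format_tsv_skip_trailing_empty_lines = 1;

INSERT INTO nestedt FROM INFILE 'data.tsv' FORMAT TSV;
```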

## TabSeparatedRaw {#tabseparatedraw}

@ -467,6 +468,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe
- [output_format_csv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_csv_crlf_end_of_line) - if it is set to true, end of line in CSV output format will be `\r\n` instead of `\n`. Default value - `false`.
- [input_format_csv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`.
- [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`.
- [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`.
- [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`.
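
The CSV settings are applied the same way; a minimal sketch (table and file names are illustrative):

```sql
-- Hypothetical example: keep padding inside unquoted fields verbatim while
-- still dropping trailing empty lines at the end of the file.
SET input_format_csv_trim_whitespaces = 0;
SET input_format_csv_skip_trailing_empty_lines = 1;

INSERT INTO target_table FROM INFILE 'data.csv' FORMAT CSV;
```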

## CSVWithNames {#csvwithnames}
@ -495,7 +497,9 @@ the types from input data will be compared with the types of the corresponding c

Similar to [Template](#format-template), but it prints or reads all names and types of columns and uses escaping rule from [format_custom_escaping_rule](/docs/en/operations/settings/settings-formats.md/#format_custom_escaping_rule) setting and delimiters from [format_custom_field_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_field_delimiter), [format_custom_row_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_before_delimiter), [format_custom_row_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_after_delimiter), [format_custom_row_between_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_between_delimiter), [format_custom_result_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_before_delimiter) and [format_custom_result_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_after_delimiter) settings, not from format strings.

If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any.
If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect header with names and types if any.

If setting [input_format_custom_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_custom_skip_trailing_empty_lines) is enabled, trailing empty lines at the end of file will be skipped.

There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [TemplateIgnoreSpaces](#templateignorespaces).

@ -329,8 +329,8 @@ SELECT count() FROM system.schema_inference_cache WHERE storage='S3'
## Text formats {#text-formats}

For text formats, ClickHouse reads the data row by row, extracts column values according to the format,
and then uses some recursive parsers and heuristics to determine the type for each value. The maximum number of rows read from the data during schema inference
is controlled by the setting `input_format_max_rows_to_read_for_schema_inference` with default value 25000.
and then uses some recursive parsers and heuristics to determine the type for each value. The maximum number of rows and bytes read from the data during schema inference
is controlled by the settings `input_format_max_rows_to_read_for_schema_inference` (25000 by default) and `input_format_max_bytes_to_read_for_schema_inference` (32 MB by default).
By default, all inferred types are [Nullable](../sql-reference/data-types/nullable.md), but you can change this by setting `schema_inference_make_columns_nullable` (see examples in the [settings](#settings-for-text-formats) section).
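
As a sketch of how these limits come into play (the file name is illustrative), schema inference can be told to examine more data when the first rows are unrepresentative:

```sql
-- Hypothetical example: infer the schema from up to 100000 rows instead of the
-- default 25000, e.g. when the leading rows contain many NULLs.
DESC file('data.tsv')
SETTINGS input_format_max_rows_to_read_for_schema_inference = 100000;
```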

### JSON formats {#json-formats}
@ -1144,13 +1144,15 @@ Line: value_1=2, value_2="Some string 2", value_3="[4, 5, NULL]"$$)

### Settings for text formats {#settings-for-text-formats}

#### input_format_max_rows_to_read_for_schema_inference
#### input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference

This setting controls the maximum number of rows to be read during schema inference.
The more rows are read, the more time is spent on schema inference, but the greater the chance to
These settings control the amount of data to be read during schema inference.
The more rows/bytes are read, the more time is spent on schema inference, but the greater the chance to
correctly determine the types (especially when the data contains a lot of nulls).

Default value: `25000`.
Default values:
- `25000` for `input_format_max_rows_to_read_for_schema_inference`.
- `33554432` (32 MB) for `input_format_max_bytes_to_read_for_schema_inference`.

#### column_names_for_schema_inference

@ -1643,7 +1645,7 @@ In schema inference for CapnProto format ClickHouse uses the following type matc
## Strong-typed binary formats {#strong-typed-binary-formats}

In such formats, each serialized value contains information about its type (and possibly about its name), but there is no information about the whole table.
In schema inference for such formats, ClickHouse reads data row by row (up to `input_format_max_rows_to_read_for_schema_inference` rows) and extracts
In schema inference for such formats, ClickHouse reads data row by row (up to `input_format_max_rows_to_read_for_schema_inference` rows or `input_format_max_bytes_to_read_for_schema_inference` bytes) and extracts
the type (and possibly name) for each value from the data and then converts these types to ClickHouse types.

### MsgPack {#msgpack}

@ -137,6 +137,12 @@ The maximum rows of data to read for automatic schema inference.

Default value: `25'000`.

## input_format_max_bytes_to_read_for_schema_inference {#input_format_max_bytes_to_read_for_schema_inference}

The maximum amount of data in bytes to read for automatic schema inference.

Default value: `33554432` (32 MB).

## column_names_for_schema_inference {#column_names_for_schema_inference}

The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'
@ -728,6 +734,12 @@ My NULL
My NULL
```

### input_format_tsv_skip_trailing_empty_lines {#input_format_tsv_skip_trailing_empty_lines}

When enabled, trailing empty lines at the end of TSV file will be skipped.

Disabled by default.

## CSV format settings {#csv-format-settings}

### format_csv_delimiter {#format_csv_delimiter}
@ -882,6 +894,12 @@ My NULL
My NULL
```

### input_format_csv_skip_trailing_empty_lines {#input_format_csv_skip_trailing_empty_lines}

When enabled, trailing empty lines at the end of CSV file will be skipped.

Disabled by default.

### input_format_csv_trim_whitespaces {#input_format_csv_trim_whitespaces}

Trims spaces and tabs in non-quoted CSV strings.
@ -1475,6 +1493,12 @@ Sets the character that is interpreted as a suffix after the result set for [Cus

Default value: `''`.

### input_format_custom_skip_trailing_empty_lines {#input_format_custom_skip_trailing_empty_lines}

When enabled, trailing empty lines at the end of file in CustomSeparated format will be skipped.

Disabled by default.

## Regexp format settings {#regexp-format-settings}

### format_regexp_escaping_rule {#format_regexp_escaping_rule}

@ -1957,6 +1957,10 @@ Default value: empty string (disabled)
For replicated tables, by default only the 100 most recent inserts for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md/#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)).
For non-replicated tables, see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window).

:::note
`insert_deduplication_token` works on a partition level (the same as `insert_deduplication` checksum). Multiple partitions can have the same `insert_deduplication_token`.
:::

Example:

```sql
@ -122,7 +122,7 @@ public:
    size_t size;
    readVarUInt(size, in);

    static constexpr size_t max_size = 1_GiB;
    static constexpr size_t max_size = 100_GiB;

    if (size == 0)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect size (0) in groupBitmap.");

@ -157,8 +157,8 @@ public:
    void read(DB::ReadBuffer & buf)
    {
        size_t size = 0;
        DB::readIntBinary<size_t>(size, buf);
        DB::readIntBinary<size_t>(total_values, buf);
        readBinaryLittleEndian(size, buf);
        readBinaryLittleEndian(total_values, buf);

        /// Compatibility with old versions.
        if (size > total_values)
@ -171,16 +171,16 @@ public:

        samples.resize(size);
        for (size_t i = 0; i < size; ++i)
            DB::readPODBinary(samples[i], buf);
            readBinaryLittleEndian(samples[i], buf);

        sorted = false;
    }

    void write(DB::WriteBuffer & buf) const
    {
        size_t size = samples.size();
        DB::writeIntBinary<size_t>(size, buf);
        DB::writeIntBinary<size_t>(total_values, buf);
        const size_t size = samples.size();
        writeBinaryLittleEndian(size, buf);
        writeBinaryLittleEndian(total_values, buf);

        for (size_t i = 0; i < size; ++i)
        {
@ -190,12 +190,12 @@ public:
            /// Here we ensure that padding is zero without changing the protocol.
            /// TODO: After implementation of "versioning aggregate function state",
            /// change the serialization format.

            Element elem;
            memset(&elem, 0, sizeof(elem));
            elem = samples[i];

            DB::writePODBinary(elem, buf);
            DB::transformEndianness<std::endian::little>(elem);
            DB::writeString(reinterpret_cast<const char*>(&elem), sizeof(elem), buf);
        }
    }

@ -144,6 +144,7 @@ void BackupImpl::open(const ContextPtr & context)
    if (!uuid)
        uuid = UUIDHelpers::generateV4();
    lock_file_name = use_archive ? (archive_params.archive_name + ".lock") : ".lock";
    lock_file_before_first_file_checked = false;
    writing_finalized = false;

    /// Check that we can write a backup there and create the lock file to own this destination.
@ -833,13 +834,10 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry)
    if (writing_finalized)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is already finalized");

    bool should_check_lock_file = false;
    {
        std::lock_guard lock{mutex};
        ++num_files;
        total_size += info.size;
        if (!num_entries)
            should_check_lock_file = true;
    }

    auto src_disk = entry->getDisk();
@ -859,7 +857,7 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry)
        return;
    }

    if (!should_check_lock_file)
    if (!lock_file_before_first_file_checked.exchange(true))
        checkLockFile(true);

    /// NOTE: `mutex` must be unlocked during copying otherwise writing will be in one thread maximum and hence slow.

@ -141,6 +141,7 @@ private:
    std::shared_ptr<IArchiveReader> archive_reader;
    std::shared_ptr<IArchiveWriter> archive_writer;
    String lock_file_name;
    std::atomic<bool> lock_file_before_first_file_checked = false;

    bool writing_finalized = false;
    bool deduplicate_files = true;

@ -727,6 +727,68 @@ void AsynchronousMetrics::update(TimePoint update_time)
        }
    }

    Float64 max_cpu_cgroups = 0;
    if (cgroupcpu_max)
    {
        try
        {
            cgroupcpu_max->rewind();

            uint64_t quota = 0;
            uint64_t period = 0;

            std::string line;
            readText(line, *cgroupcpu_max);

            auto space = line.find(' ');

            if (line.rfind("max", space) == std::string::npos)
            {
                auto field1 = line.substr(0, space);
                quota = std::stoull(field1);
            }

            if (space != std::string::npos)
            {
                auto field2 = line.substr(space + 1);
                period = std::stoull(field2);
            }

            if (quota > 0 && period > 0)
                max_cpu_cgroups = static_cast<Float64>(quota) / period;
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }
    else if (cgroupcpu_cfs_quota && cgroupcpu_cfs_period)
    {
        try
        {
            cgroupcpu_cfs_quota->rewind();
            cgroupcpu_cfs_period->rewind();

            uint64_t quota = 0;
            uint64_t period = 0;

            tryReadText(quota, *cgroupcpu_cfs_quota);
            tryReadText(period, *cgroupcpu_cfs_period);

            if (quota > 0 && period > 0)
                max_cpu_cgroups = static_cast<Float64>(quota) / period;
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }

    if (max_cpu_cgroups > 0)
    {
        new_values["CGroupMaxCPU"] = { max_cpu_cgroups, "The maximum number of CPU cores according to CGroups."};
    }

    if (proc_stat)
    {
        try
@ -871,36 +933,38 @@ void AsynchronousMetrics::update(TimePoint update_time)
    /// Also write values normalized to 0..1 by dividing by the number of CPUs.
    /// These values are good to be averaged across the cluster of non-uniform servers.

    if (num_cpus)
    Float64 num_cpus_to_normalize = max_cpu_cgroups > 0 ? max_cpu_cgroups : num_cpus;

    if (num_cpus_to_normalize > 0)
    {
        new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus,
        new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
        new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus,
        new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
        new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus,
        new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
        new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus,
        new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
        new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus,
        new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
        new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus,
        new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
        new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus,
        new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
        new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus,
        new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
        new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus,
        new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
        new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus,
        new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus_to_normalize,
            "The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
            " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
    }

@ -937,60 +1001,6 @@ void AsynchronousMetrics::update(TimePoint update_time)
        }
    }

    if (cgroupcpu_max)
    {
        try {
            cgroupcpu_max->rewind();

            uint64_t quota = 0;
            uint64_t period = 0;

            std::string line;
            readText(line, *cgroupcpu_max);

            auto space = line.find(' ');

            if (line.rfind("max", space) == std::string::npos)
            {
                auto field1 = line.substr(0, space);
                quota = std::stoull(field1);
            }

            if (space != std::string::npos)
            {
                auto field2 = line.substr(space + 1);
                period = std::stoull(field2);
            }

            new_values["CGroupCpuCfsPeriod"] = { period, "The CFS period of CPU cgroup."};
            new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."};
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }
    else if (cgroupcpu_cfs_quota && cgroupcpu_cfs_period)
    {
        try {
            cgroupcpu_cfs_quota->rewind();
            cgroupcpu_cfs_period->rewind();

            uint64_t quota = 0;
            uint64_t period = 0;

            tryReadText(quota, *cgroupcpu_cfs_quota);
            tryReadText(period, *cgroupcpu_cfs_period);

            new_values["CGroupCpuCfsPeriod"] = { period, "The CFS period of CPU cgroup."};
            new_values["CGroupCpuCfsQuota"] = { quota, "The CFS quota of CPU cgroup. If stated zero, the quota is max."};
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }

    if (meminfo)
    {
        try
62
src/Common/TransformEndianness.hpp
Normal file
62
src/Common/TransformEndianness.hpp
Normal file
@ -0,0 +1,62 @@
|
||||
#pragma once
|
||||
|
||||
#include <base/Decimal_fwd.h>
|
||||
#include <base/extended_types.h>
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
template <std::endian endian, typename T>
|
||||
requires std::is_integral_v<T>
|
||||
inline void transformEndianness(T & value)
|
||||
{
|
||||
if constexpr (endian != std::endian::native)
|
||||
value = std::byteswap(value);
|
||||
}
|
||||
|
||||
template <std::endian endian, typename T>
|
||||
requires is_big_int_v<T>
|
||||
inline void transformEndianness(T & x)
|
||||
{
|
||||
if constexpr (std::endian::native != endian)
|
||||
{
|
||||
auto & items = x.items;
|
||||
std::transform(std::begin(items), std::end(items), std::begin(items), [](auto & item) { return std::byteswap(item); });
|
||||
std::reverse(std::begin(items), std::end(items));
|
||||
}
|
||||
}
|
||||
|
||||
template <std::endian endian, typename T>
|
||||
requires is_decimal<T>
|
||||
inline void transformEndianness(T & x)
|
||||
{
|
||||
transformEndianness<endian>(x.value);
|
||||
}
|
||||
|
||||
template <std::endian endian, typename T>
|
||||
requires std::is_floating_point_v<T>
|
||||
inline void transformEndianness(T & value)
|
||||
{
|
||||
if constexpr (std::endian::native != endian)
|
||||
{
|
||||
auto * start = reinterpret_cast<std::byte *>(&value);
|
||||
std::reverse(start, start + sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
template <std::endian endian, typename T>
|
||||
requires std::is_scoped_enum_v<T>
|
||||
inline void transformEndianness(T & x)
|
||||
{
|
||||
using UnderlyingType = std::underlying_type_t<T>;
|
||||
transformEndianness<endian>(reinterpret_cast<UnderlyingType &>(x));
|
||||
}
|
||||
|
||||
template <std::endian endian, typename A, typename B>
|
||||
inline void transformEndianness(std::pair<A, B> & pair)
|
||||
{
|
||||
transformEndianness<endian>(pair.first);
|
||||
transformEndianness<endian>(pair.second);
|
||||
}
|
||||
}
|
@ -860,6 +860,7 @@ class IColumn;
    M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \
    M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \
    M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
    M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \
    M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
    M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
    M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
@ -890,6 +891,9 @@ class IColumn;
    M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
    M(UInt64, input_format_csv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in CSV format", 0) \
    M(UInt64, input_format_tsv_skip_first_lines, 0, "Skip specified number of lines at the beginning of data in TSV format", 0) \
    M(Bool, input_format_csv_skip_trailing_empty_lines, false, "Skip trailing empty lines in CSV format", 0) \
    M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \
    M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \
    \
    M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \
    \
@ -982,7 +986,7 @@ class IColumn;
    M(Bool, output_format_orc_string_as_string, false, "Use ORC String type instead of Binary for String columns", 0) \
    M(ORCCompression, output_format_orc_compression_method, "lz4", "Compression method for ORC output format. Supported codecs: lz4, snappy, zlib, zstd, none (uncompressed)", 0) \
    \
    M(EnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::EnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \
    M(CapnProtoEnumComparingMode, format_capn_proto_enum_comparising_mode, FormatSettings::CapnProtoEnumComparingMode::BY_VALUES, "How to map ClickHouse Enum and CapnProto Enum", 0) \
    \
    M(String, input_format_mysql_dump_table_name, "", "Name of the table in MySQL dump from which to read data", 0) \
    M(Bool, input_format_mysql_dump_map_column_names, true, "Match columns from table in MySQL dump and columns from ClickHouse table by names", 0) \

@ -144,10 +144,10 @@ IMPLEMENT_SETTING_ENUM(TransactionsWaitCSNMode, ErrorCodes::BAD_ARGUMENTS,
    {"wait", TransactionsWaitCSNMode::WAIT},
    {"wait_unknown", TransactionsWaitCSNMode::WAIT_UNKNOWN}})

IMPLEMENT_SETTING_ENUM(EnumComparingMode, ErrorCodes::BAD_ARGUMENTS,
    {{"by_names", FormatSettings::EnumComparingMode::BY_NAMES},
    {"by_values", FormatSettings::EnumComparingMode::BY_VALUES},
    {"by_names_case_insensitive", FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE}})
IMPLEMENT_SETTING_ENUM(CapnProtoEnumComparingMode, ErrorCodes::BAD_ARGUMENTS,
    {{"by_names", FormatSettings::CapnProtoEnumComparingMode::BY_NAMES},
    {"by_values", FormatSettings::CapnProtoEnumComparingMode::BY_VALUES},
    {"by_names_case_insensitive", FormatSettings::CapnProtoEnumComparingMode::BY_NAMES_CASE_INSENSITIVE}})

IMPLEMENT_SETTING_ENUM(EscapingRule, ErrorCodes::BAD_ARGUMENTS,
    {{"None", FormatSettings::EscapingRule::None},

@ -188,7 +188,7 @@ enum class TransactionsWaitCSNMode

DECLARE_SETTING_ENUM(TransactionsWaitCSNMode)

DECLARE_SETTING_ENUM_WITH_RENAME(EnumComparingMode, FormatSettings::EnumComparingMode)
DECLARE_SETTING_ENUM_WITH_RENAME(CapnProtoEnumComparingMode, FormatSettings::CapnProtoEnumComparingMode)

DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule)

@ -1038,34 +1038,6 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep()

    current_file_segment_counters.increment(ProfileEvents::FileSegmentUsedBytes, available());

    // Not necessary because of the SCOPE_EXIT above, but useful for logging below.
    if (download_current_segment)
        file_segment.completePartAndResetDownloader();

    chassert(!file_segment.isDownloader());

    LOG_TEST(
        log,
        "Key: {}. Returning with {} bytes, buffer position: {} (offset: {}, predownloaded: {}), "
        "buffer available: {}, current range: {}, file offset of buffer end: {}, impl offset: {}, file segment state: {}, "
        "current write offset: {}, read_type: {}, reading until position: {}, started with offset: {}, "
        "remaining ranges: {}",
        cache_key.toString(),
        working_buffer.size(),
        getPosition(),
        offset(),
        needed_to_predownload,
        available(),
        current_read_range.toString(),
        file_offset_of_buffer_end,
        implementation_buffer->getFileOffsetOfBufferEnd(),
        FileSegment::stateToString(file_segment.state()),
        file_segment.getCurrentWriteOffset(false),
        toString(read_type),
        read_until_position,
        first_offset,
        file_segments->toString());

    if (size == 0 && file_offset_of_buffer_end < read_until_position)
    {
        size_t cache_file_size = getFileSizeFromReadBuffer(*implementation_buffer);
@ -1086,6 +1058,33 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep()
            file_segment.getInfoForLog());
    }

    // Not necessary because of the SCOPE_EXIT above, but useful for logging below.
    if (download_current_segment)
        file_segment.completePartAndResetDownloader();

    chassert(!file_segment.isDownloader());

    LOG_TEST(
        log,
        "Key: {}. Returning with {} bytes, buffer position: {} (offset: {}, predownloaded: {}), "
        "buffer available: {}, current range: {}, file offset of buffer end: {}, file segment state: {}, "
        "current write offset: {}, read_type: {}, reading until position: {}, started with offset: {}, "
        "remaining ranges: {}",
        cache_key.toString(),
        working_buffer.size(),
        getPosition(),
        offset(),
        needed_to_predownload,
        available(),
        current_read_range.toString(),
        file_offset_of_buffer_end,
        FileSegment::stateToString(file_segment.state()),
        file_segment.getCurrentWriteOffset(false),
        toString(read_type),
        read_until_position,
        first_offset,
        file_segments->toString());

    return result;
}

@ -48,7 +48,9 @@ void registerDiskCache(DiskFactory & factory, bool /* global_skip_access_check *
    auto cache = FileCacheFactory::instance().getOrCreate(name, file_cache_settings);
    auto disk = disk_it->second;
    if (!dynamic_cast<const DiskObjectStorage *>(disk.get()))
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cached disk is allowed only on top of object storage");
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Cannot wrap disk `{}` with cache layer `{}`: cached disk is allowed only on top of object storage",
            disk_name, name);

    auto disk_object_storage = disk->createDiskObjectStorage();

298
src/Formats/CapnProtoSchema.cpp
Normal file
298
src/Formats/CapnProtoSchema.cpp
Normal file
@ -0,0 +1,298 @@
|
||||
#include <Formats/CapnProtoSchema.h>
|
||||
|
||||
#if USE_CAPNP
|
||||
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeEnum.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <Common/StringUtils/StringUtils.h>
|
||||
#include <boost/algorithm/string/join.hpp>
|
||||
#include <capnp/schema.h>
|
||||
#include <capnp/schema-parser.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int CANNOT_PARSE_CAPN_PROTO_SCHEMA;
|
||||
extern const int BAD_TYPE_OF_FIELD;
|
||||
extern const int FILE_DOESNT_EXIST;
|
||||
extern const int UNKNOWN_EXCEPTION;
|
||||
extern const int CAPN_PROTO_BAD_TYPE;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info)
|
||||
{
|
||||
capnp::ParsedSchema schema;
|
||||
try
|
||||
{
|
||||
int fd;
|
||||
KJ_SYSCALL(fd = open(schema_info.schemaDirectory().data(), O_RDONLY)); // NOLINT(bugprone-suspicious-semicolon)
|
||||
auto schema_dir = kj::newDiskDirectory(kj::OsFileHandle(fd));
|
||||
schema = impl.parseFromDirectory(*schema_dir, kj::Path::parse(schema_info.schemaPath()), {});
|
||||
}
|
||||
catch (const kj::Exception & e)
|
||||
{
|
||||
/// That's not good to determine the type of error by its description, but
|
||||
/// this is the only way to do it here, because kj doesn't specify the type of error.
|
||||
auto description = std::string_view(e.getDescription().cStr());
|
||||
if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos || description.find("no such file") != String::npos)
|
||||
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exists", schema_info.absoluteSchemaPath());
|
||||
|
||||
if (description.find("Parse error") != String::npos)
|
||||
throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, "Cannot parse CapnProto schema {}:{}", schema_info.schemaPath(), e.getLine());
|
||||
|
||||
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
|
||||
"Unknown exception while parsing CapnProto schema: {}, schema dir and file: {}, {}",
|
||||
description, schema_info.schemaDirectory(), schema_info.schemaPath());
|
||||
}
|
||||
|
||||
auto message_maybe = schema.findNested(schema_info.messageName());
|
||||
auto * message_schema = kj::_::readMaybe(message_maybe);
|
||||
if (!message_schema)
|
||||
throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA,
|
||||
"CapnProto schema doesn't contain message with name {}", schema_info.messageName());
|
||||
return message_schema->asStruct();
|
||||
}
|
||||
|
||||
bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema)
|
||||
{
|
||||
return struct_schema.getFields().size() != struct_schema.getNonUnionFields().size();
|
||||
}
|
||||
|
||||
bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema)
|
||||
{
|
||||
return struct_schema.getFields().size() == struct_schema.getUnionFields().size();
|
||||
}
|
||||
|
||||
/// Get full name of type for better exception messages.
String getCapnProtoFullTypeName(const capnp::Type & type)
{
    static const std::map<capnp::schema::Type::Which, String> capnp_simple_type_names =
    {
        {capnp::schema::Type::Which::BOOL, "Bool"},
        {capnp::schema::Type::Which::VOID, "Void"},
        {capnp::schema::Type::Which::INT8, "Int8"},
        {capnp::schema::Type::Which::INT16, "Int16"},
        {capnp::schema::Type::Which::INT32, "Int32"},
        {capnp::schema::Type::Which::INT64, "Int64"},
        {capnp::schema::Type::Which::UINT8, "UInt8"},
        {capnp::schema::Type::Which::UINT16, "UInt16"},
        {capnp::schema::Type::Which::UINT32, "UInt32"},
        {capnp::schema::Type::Which::UINT64, "UInt64"},
        {capnp::schema::Type::Which::FLOAT32, "Float32"},
        {capnp::schema::Type::Which::FLOAT64, "Float64"},
        {capnp::schema::Type::Which::TEXT, "Text"},
        {capnp::schema::Type::Which::DATA, "Data"},
        {capnp::schema::Type::Which::INTERFACE, "Interface"},
        {capnp::schema::Type::Which::ANY_POINTER, "AnyPointer"},
    };

    switch (type.which())
    {
        case capnp::schema::Type::Which::STRUCT:
        {
            auto struct_schema = type.asStruct();

            auto non_union_fields = struct_schema.getNonUnionFields();
            std::vector<String> non_union_field_names;
            for (auto nested_field : non_union_fields)
                non_union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType()));

            auto union_fields = struct_schema.getUnionFields();
            std::vector<String> union_field_names;
            for (auto nested_field : union_fields)
                union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType()));

            String union_name = "Union(" + boost::algorithm::join(union_field_names, ", ") + ")";
            /// Check if the struct is a named union.
            if (non_union_field_names.empty())
                return union_name;

            String type_name = "Struct(" + boost::algorithm::join(non_union_field_names, ", ");
            /// Check if the struct contains an unnamed union.
            if (!union_field_names.empty())
                type_name += ", " + union_name;
            type_name += ")";
            return type_name;
        }
        case capnp::schema::Type::Which::LIST:
            return "List(" + getCapnProtoFullTypeName(type.asList().getElementType()) + ")";
        case capnp::schema::Type::Which::ENUM:
        {
            auto enum_schema = type.asEnum();
            String enum_name = "Enum(";
            auto enumerants = enum_schema.getEnumerants();
            for (unsigned i = 0; i != enumerants.size(); ++i)
            {
                enum_name += String(enumerants[i].getProto().getName()) + " = " + std::to_string(enumerants[i].getOrdinal());
                if (i + 1 != enumerants.size())
                    enum_name += ", ";
            }
            enum_name += ")";
            return enum_name;
        }
        default:
            auto it = capnp_simple_type_names.find(type.which());
            if (it == capnp_simple_type_names.end())
                throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unknown CapnProto type");
            return it->second;
    }
}

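/// Example output (schema hypothetical): for
///     struct Point { x @0 :Float64; y @1 :Float64; }
/// this returns "Struct(x Float64, y Float64)", and for a list of such structs
/// "List(Struct(x Float64, y Float64))".
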
namespace
{

template <typename ValueType>
DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants)
{
    std::vector<std::pair<String, ValueType>> values;
    for (auto enumerant : enumerants)
        values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal()));
    return std::make_shared<DataTypeEnum<ValueType>>(std::move(values));
}

DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema)
{
    auto enumerants = enum_schema.getEnumerants();
    if (enumerants.size() < 128)
        return getEnumDataTypeFromEnumerants<Int8>(enumerants);
    if (enumerants.size() < 32768)
        return getEnumDataTypeFromEnumerants<Int16>(enumerants);

    throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums");
}

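/// Example: enum Color { red @0; green @1; blue @2; } (hypothetical) becomes
/// Enum8('red' = 0, 'green' = 1, 'blue' = 2); enums with 128 or more
/// enumerants fall back to Enum16.
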
DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields)
{
    switch (capnp_type.which())
    {
        case capnp::schema::Type::INT8:
            return std::make_shared<DataTypeInt8>();
        case capnp::schema::Type::INT16:
            return std::make_shared<DataTypeInt16>();
        case capnp::schema::Type::INT32:
            return std::make_shared<DataTypeInt32>();
        case capnp::schema::Type::INT64:
            return std::make_shared<DataTypeInt64>();
        case capnp::schema::Type::BOOL: [[fallthrough]];
        case capnp::schema::Type::UINT8:
            return std::make_shared<DataTypeUInt8>();
        case capnp::schema::Type::UINT16:
            return std::make_shared<DataTypeUInt16>();
        case capnp::schema::Type::UINT32:
            return std::make_shared<DataTypeUInt32>();
        case capnp::schema::Type::UINT64:
            return std::make_shared<DataTypeUInt64>();
        case capnp::schema::Type::FLOAT32:
            return std::make_shared<DataTypeFloat32>();
        case capnp::schema::Type::FLOAT64:
            return std::make_shared<DataTypeFloat64>();
        case capnp::schema::Type::DATA: [[fallthrough]];
        case capnp::schema::Type::TEXT:
            return std::make_shared<DataTypeString>();
        case capnp::schema::Type::ENUM:
            return getEnumDataTypeFromEnumSchema(capnp_type.asEnum());
        case capnp::schema::Type::LIST:
        {
            auto list_schema = capnp_type.asList();
            auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields);
            if (!nested_type)
                return nullptr;
            return std::make_shared<DataTypeArray>(nested_type);
        }
        case capnp::schema::Type::STRUCT:
        {
            auto struct_schema = capnp_type.asStruct();

            if (struct_schema.getFields().size() == 0)
            {
                if (skip_unsupported_fields)
                    return nullptr;
                throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported");
            }

            /// Check if it can be Nullable.
            if (checkIfStructIsNamedUnion(struct_schema))
            {
                auto fields = struct_schema.getUnionFields();
                if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid()))
                {
                    if (skip_unsupported_fields)
                        return nullptr;
                    throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported");
                }
                auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType();
                if (value_type.isStruct() || value_type.isList())
                {
                    if (skip_unsupported_fields)
                        return nullptr;
                    throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable");
                }

                auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields);
                if (!nested_type)
                    return nullptr;
                return std::make_shared<DataTypeNullable>(nested_type);
            }

            if (checkIfStructContainsUnnamedUnion(struct_schema))
                throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported");

            /// Treat Struct as Tuple.
            DataTypes nested_types;
            Names nested_names;
            for (auto field : struct_schema.getNonUnionFields())
            {
                auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields);
                if (!nested_type)
                    continue;
                nested_names.push_back(field.getProto().getName());
                nested_types.push_back(nested_type);
            }
            if (nested_types.empty())
                return nullptr;
            return std::make_shared<DataTypeTuple>(std::move(nested_types), std::move(nested_names));
        }
        default:
        {
            if (skip_unsupported_fields)
                return nullptr;
            throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProto type: {}", getCapnProtoFullTypeName(capnp_type));
        }
    }
}

}

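/// A worked example (schema hypothetical): for
///     struct Message { id @0 :UInt64; tags @1 :List(Text); }
/// the mapping above yields UInt64 -> UInt64 and List(Text) -> Array(String),
/// so the function below infers the table schema "id UInt64, tags Array(String)".
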
NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields)
{
    if (checkIfStructContainsUnnamedUnion(schema))
        throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported");

    NamesAndTypesList names_and_types;
    for (auto field : schema.getNonUnionFields())
    {
        auto name = field.getProto().getName();
        auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields);
        if (type)
            names_and_types.emplace_back(name, type);
    }
    if (names_and_types.empty())
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types");

    return names_and_types;
}

}

#endif
@@ -30,17 +30,14 @@ public:
    capnp::StructSchema getMessageSchema(const FormatSchemaInfo & schema_info);
};

std::pair<String, String> splitCapnProtoFieldName(const String & name);
bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema);
bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema);

bool compareEnumNames(const String & first, const String & second, FormatSettings::EnumComparingMode mode);

std::pair<capnp::DynamicStruct::Builder, capnp::StructSchema::Field> getStructBuilderAndFieldByColumnName(capnp::DynamicStruct::Builder struct_builder, const String & name);

capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Reader & struct_reader, const String & name);

void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode);
/// Get full name of type for better exception messages.
String getCapnProtoFullTypeName(const capnp::Type & type);

NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields);

}

#endif
1538
src/Formats/CapnProtoSerializer.cpp
Normal file
File diff suppressed because it is too large
30
src/Formats/CapnProtoSerializer.h
Normal file
@@ -0,0 +1,30 @@
#pragma once

#if USE_CAPNP

#include <Core/Block.h>
#include <capnp/dynamic.h>
#include <Formats/FormatSettings.h>

namespace DB
{

class CapnProtoSerializer
{
public:
    CapnProtoSerializer(const DataTypes & data_types, const Names & names, const capnp::StructSchema & schema, const FormatSettings::CapnProto & settings);

    void writeRow(const Columns & columns, capnp::DynamicStruct::Builder builder, size_t row_num);

    void readRow(MutableColumns & columns, capnp::DynamicStruct::Reader & reader);

    ~CapnProtoSerializer();

private:
    class Impl;
    std::unique_ptr<Impl> serializer_impl;
};

}

#endif
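/// A minimal usage sketch (assuming schema, header and settings are prepared
/// elsewhere; the capnp dynamic-API calls are the standard ones):
///     CapnProtoSerializer serializer(header.getDataTypes(), header.getNames(), schema, format_settings.capn_proto);
///     capnp::MallocMessageBuilder message;
///     auto root = message.initRoot<capnp::DynamicStruct>(schema);
///     serializer.writeRow(columns, root, row_num);
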
@@ -1,734 +0,0 @@
#include <Formats/CapnProtoUtils.h>

#if USE_CAPNP

#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/IDataType.h>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/join.hpp>
#include <capnp/schema.h>
#include <capnp/schema-parser.h>
#include <fcntl.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int CANNOT_PARSE_CAPN_PROTO_SCHEMA;
    extern const int THERE_IS_NO_COLUMN;
    extern const int BAD_TYPE_OF_FIELD;
    extern const int CAPN_PROTO_BAD_CAST;
    extern const int FILE_DOESNT_EXIST;
    extern const int UNKNOWN_EXCEPTION;
    extern const int INCORRECT_DATA;
    extern const int CAPN_PROTO_BAD_TYPE;
    extern const int BAD_ARGUMENTS;
}

std::pair<String, String> splitCapnProtoFieldName(const String & name)
{
    const auto * begin = name.data();
    const auto * end = name.data() + name.size();
    const auto * it = find_first_symbols<'_', '.'>(begin, end);
    String first = String(begin, it);
    String second = it == end ? "" : String(it + 1, end);
    return {first, second};
}

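/// Example: "tuple_field" splits into {"tuple", "field"} and "a.b.c" into
/// {"a", "b.c"}: only the first '_' or '.' acts as a separator, the rest of
/// the name is handled recursively by the lookup functions further down.
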
capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info)
{
    capnp::ParsedSchema schema;
    try
    {
        int fd;
        KJ_SYSCALL(fd = open(schema_info.schemaDirectory().data(), O_RDONLY)); // NOLINT(bugprone-suspicious-semicolon)
        auto schema_dir = kj::newDiskDirectory(kj::OsFileHandle(fd));
        schema = impl.parseFromDirectory(*schema_dir, kj::Path::parse(schema_info.schemaPath()), {});
    }
    catch (const kj::Exception & e)
    {
        /// It's not ideal to determine the type of error by its description, but
        /// this is the only way to do it here, because kj doesn't specify the type of error.
        auto description = std::string_view(e.getDescription().cStr());
        if (description.find("No such file or directory") != String::npos || description.find("no such directory") != String::npos)
            throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot open CapnProto schema, file {} doesn't exist", schema_info.absoluteSchemaPath());

        if (description.find("Parse error") != String::npos)
            throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA, "Cannot parse CapnProto schema {}:{}", schema_info.schemaPath(), e.getLine());

        throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
            "Unknown exception while parsing CapnProto schema: {}, schema dir and file: {}, {}",
            description, schema_info.schemaDirectory(), schema_info.schemaPath());
    }

    auto message_maybe = schema.findNested(schema_info.messageName());
    auto * message_schema = kj::_::readMaybe(message_maybe);
    if (!message_schema)
        throw Exception(ErrorCodes::CANNOT_PARSE_CAPN_PROTO_SCHEMA,
            "CapnProto schema doesn't contain message with name {}", schema_info.messageName());
    return message_schema->asStruct();
}

bool compareEnumNames(const String & first, const String & second, FormatSettings::EnumComparingMode mode)
{
    if (mode == FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE)
        return boost::algorithm::to_lower_copy(first) == boost::algorithm::to_lower_copy(second);
    return first == second;
}

static const std::map<capnp::schema::Type::Which, String> capnp_simple_type_names =
{
    {capnp::schema::Type::Which::BOOL, "Bool"},
    {capnp::schema::Type::Which::VOID, "Void"},
    {capnp::schema::Type::Which::INT8, "Int8"},
    {capnp::schema::Type::Which::INT16, "Int16"},
    {capnp::schema::Type::Which::INT32, "Int32"},
    {capnp::schema::Type::Which::INT64, "Int64"},
    {capnp::schema::Type::Which::UINT8, "UInt8"},
    {capnp::schema::Type::Which::UINT16, "UInt16"},
    {capnp::schema::Type::Which::UINT32, "UInt32"},
    {capnp::schema::Type::Which::UINT64, "UInt64"},
    {capnp::schema::Type::Which::FLOAT32, "Float32"},
    {capnp::schema::Type::Which::FLOAT64, "Float64"},
    {capnp::schema::Type::Which::TEXT, "Text"},
    {capnp::schema::Type::Which::DATA, "Data"},
    {capnp::schema::Type::Which::INTERFACE, "Interface"},
    {capnp::schema::Type::Which::ANY_POINTER, "AnyPointer"},
};

static bool checkIfStructContainsUnnamedUnion(const capnp::StructSchema & struct_schema)
{
    return struct_schema.getFields().size() != struct_schema.getNonUnionFields().size();
}

static bool checkIfStructIsNamedUnion(const capnp::StructSchema & struct_schema)
{
    return struct_schema.getFields().size() == struct_schema.getUnionFields().size();
}

/// Get full name of type for better exception messages.
static String getCapnProtoFullTypeName(const capnp::Type & type)
{
    switch (type.which())
    {
        case capnp::schema::Type::Which::STRUCT:
        {
            auto struct_schema = type.asStruct();

            auto non_union_fields = struct_schema.getNonUnionFields();
            std::vector<String> non_union_field_names;
            for (auto nested_field : non_union_fields)
                non_union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType()));

            auto union_fields = struct_schema.getUnionFields();
            std::vector<String> union_field_names;
            for (auto nested_field : union_fields)
                union_field_names.push_back(String(nested_field.getProto().getName()) + " " + getCapnProtoFullTypeName(nested_field.getType()));

            String union_name = "Union(" + boost::algorithm::join(union_field_names, ", ") + ")";
            /// Check if the struct is a named union.
            if (non_union_field_names.empty())
                return union_name;

            String type_name = "Struct(" + boost::algorithm::join(non_union_field_names, ", ");
            /// Check if the struct contains an unnamed union.
            if (!union_field_names.empty())
                type_name += ", " + union_name;
            type_name += ")";
            return type_name;
        }
        case capnp::schema::Type::Which::LIST:
            return "List(" + getCapnProtoFullTypeName(type.asList().getElementType()) + ")";
        case capnp::schema::Type::Which::ENUM:
        {
            auto enum_schema = type.asEnum();
            String enum_name = "Enum(";
            auto enumerants = enum_schema.getEnumerants();
            for (unsigned i = 0; i != enumerants.size(); ++i)
            {
                enum_name += String(enumerants[i].getProto().getName()) + " = " + std::to_string(enumerants[i].getOrdinal());
                if (i + 1 != enumerants.size())
                    enum_name += ", ";
            }
            enum_name += ")";
            return enum_name;
        }
        default:
            auto it = capnp_simple_type_names.find(type.which());
            if (it == capnp_simple_type_names.end())
                throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unknown CapnProto type");
            return it->second;
    }
}

template <typename Type>
static bool checkEnums(const capnp::Type & capnp_type, const DataTypePtr column_type, FormatSettings::EnumComparingMode mode, UInt64 max_value, String & error_message)
{
    if (!capnp_type.isEnum())
        return false;

    auto enum_schema = capnp_type.asEnum();
    bool to_lower = mode == FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE;
    const auto * enum_type = assert_cast<const DataTypeEnum<Type> *>(column_type.get());
    const auto & enum_values = dynamic_cast<const EnumValues<Type> &>(*enum_type);

    auto enumerants = enum_schema.getEnumerants();
    if (mode == FormatSettings::EnumComparingMode::BY_VALUES)
    {
        /// In CapnProto, Enum fields are numbered sequentially starting from zero.
        if (enumerants.size() > max_value)
        {
            error_message += "Enum from CapnProto schema contains values that are out of range for ClickHouse Enum";
            return false;
        }

        auto values = enum_values.getSetOfAllValues();
        std::unordered_set<Type> capn_enum_values;
        for (auto enumerant : enumerants)
            capn_enum_values.insert(Type(enumerant.getOrdinal()));
        auto result = values == capn_enum_values;
        if (!result)
            error_message += "The set of values in Enum from CapnProto schema is different from the set of values in ClickHouse Enum";
        return result;
    }

    auto names = enum_values.getSetOfAllNames(to_lower);
    std::unordered_set<String> capn_enum_names;

    for (auto enumerant : enumerants)
    {
        String name = enumerant.getProto().getName();
        capn_enum_names.insert(to_lower ? boost::algorithm::to_lower_copy(name) : name);
    }

    auto result = names == capn_enum_names;
    if (!result)
        error_message += "The set of names in Enum from CapnProto schema is different from the set of names in ClickHouse Enum";
    return result;
}

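/// Example: ClickHouse Enum8('a' = 0, 'b' = 1) matches capnp "enum E { a @0; b @1; }"
/// under every mode; if the capnp enumerant were renamed to "A", BY_NAMES would
/// fail while BY_NAMES_CASE_INSENSITIVE and BY_VALUES would still match.
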
static bool checkCapnProtoType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name);

static bool checkNullableType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name)
{
    if (!capnp_type.isStruct())
        return false;

    /// Check that struct is a named union of type VOID and one arbitrary type.
    auto struct_schema = capnp_type.asStruct();
    if (!checkIfStructIsNamedUnion(struct_schema))
        return false;

    auto union_fields = struct_schema.getUnionFields();
    if (union_fields.size() != 2)
        return false;

    auto first = union_fields[0];
    auto second = union_fields[1];

    auto nested_type = assert_cast<const DataTypeNullable *>(data_type.get())->getNestedType();
    if (first.getType().isVoid())
        return checkCapnProtoType(second.getType(), nested_type, mode, error_message, column_name);
    if (second.getType().isVoid())
        return checkCapnProtoType(first.getType(), nested_type, mode, error_message, column_name);
    return false;
}

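/// Example (hypothetical schema): Nullable(Int64) is accepted for a field like
///     value :union { null @0 :Void; some @1 :Int64; }
/// i.e. a named union with exactly two members, one of which is Void.
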
static bool checkTupleType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message)
{
    if (!capnp_type.isStruct())
        return false;
    auto struct_schema = capnp_type.asStruct();

    if (checkIfStructIsNamedUnion(struct_schema))
        return false;

    if (checkIfStructContainsUnnamedUnion(struct_schema))
    {
        error_message += "CapnProto struct contains unnamed union";
        return false;
    }

    const auto * tuple_data_type = assert_cast<const DataTypeTuple *>(data_type.get());
    auto nested_types = tuple_data_type->getElements();
    if (nested_types.size() != struct_schema.getFields().size())
    {
        error_message += "Tuple and Struct types have different sizes";
        return false;
    }

    bool have_explicit_names = tuple_data_type->haveExplicitNames();
    const auto & nested_names = tuple_data_type->getElementNames();
    for (uint32_t i = 0; i != nested_names.size(); ++i)
    {
        if (have_explicit_names)
        {
            KJ_IF_MAYBE (field, struct_schema.findFieldByName(nested_names[i]))
            {
                if (!checkCapnProtoType(field->getType(), nested_types[tuple_data_type->getPositionByName(nested_names[i])], mode, error_message, nested_names[i]))
                    return false;
            }
            else
            {
                error_message += "CapnProto struct doesn't contain a field with name " + nested_names[i];
                return false;
            }
        }
        else if (!checkCapnProtoType(struct_schema.getFields()[i].getType(), nested_types[tuple_data_type->getPositionByName(nested_names[i])], mode, error_message, nested_names[i]))
            return false;
    }

    return true;
}

static bool checkArrayType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name)
{
    if (!capnp_type.isList())
        return false;
    auto list_schema = capnp_type.asList();
    auto nested_type = assert_cast<const DataTypeArray *>(data_type.get())->getNestedType();

    auto [field_name, nested_name] = splitCapnProtoFieldName(column_name);
    if (!nested_name.empty() && list_schema.getElementType().isStruct())
    {
        auto struct_schema = list_schema.getElementType().asStruct();
        KJ_IF_MAYBE(field, struct_schema.findFieldByName(nested_name))
            return checkCapnProtoType(field->getType(), nested_type, mode, error_message, nested_name);

        error_message += "Element type of List " + field_name + " doesn't contain field with name " + nested_name;
        return false;
    }

    return checkCapnProtoType(list_schema.getElementType(), nested_type, mode, error_message, column_name);
}

static bool checkMapType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message)
{
    /// We output/input the Map type with the following CapnProto schema:
    ///
    /// struct Map {
    ///     struct Entry {
    ///         key @0: Key;
    ///         value @1: Value;
    ///     }
    ///     entries @0 :List(Entry);
    /// }

    if (!capnp_type.isStruct())
        return false;
    auto struct_schema = capnp_type.asStruct();

    if (checkIfStructContainsUnnamedUnion(struct_schema))
    {
        error_message += "CapnProto struct contains unnamed union";
        return false;
    }

    if (struct_schema.getFields().size() != 1)
    {
        error_message += "CapnProto struct that represents Map type can contain only one field";
        return false;
    }

    const auto & field_type = struct_schema.getFields()[0].getType();
    if (!field_type.isList())
    {
        error_message += "Field of CapnProto struct that represents Map is not a list";
        return false;
    }

    auto list_element_type = field_type.asList().getElementType();
    if (!list_element_type.isStruct())
    {
        error_message += "Field of CapnProto struct that represents Map is not a list of structs";
        return false;
    }

    auto key_value_struct = list_element_type.asStruct();
    if (checkIfStructContainsUnnamedUnion(key_value_struct))
    {
        error_message += "CapnProto struct contains unnamed union";
        return false;
    }

    if (key_value_struct.getFields().size() != 2)
    {
        error_message += "Key-value structure for Map struct should have exactly 2 fields";
        return false;
    }

    const auto & map_type = assert_cast<const DataTypeMap &>(*data_type);
    DataTypes types = {map_type.getKeyType(), map_type.getValueType()};
    Names names = {"key", "value"};

    for (size_t i = 0; i != types.size(); ++i)
    {
        KJ_IF_MAYBE(field, key_value_struct.findFieldByName(names[i]))
        {
            if (!checkCapnProtoType(field->getType(), types[i], mode, error_message, names[i]))
                return false;
        }
        else
        {
            error_message += R"(Key-value structure for Map struct should have exactly 2 fields with names "key" and "value")";
            return false;
        }
    }

    return true;
}

static bool isCapnInteger(const capnp::Type & capnp_type)
{
    return capnp_type.isInt8() || capnp_type.isUInt8() || capnp_type.isInt16() || capnp_type.isUInt16() || capnp_type.isInt32()
        || capnp_type.isUInt32() || capnp_type.isInt64() || capnp_type.isUInt64();
}

static bool checkCapnProtoType(const capnp::Type & capnp_type, const DataTypePtr & data_type, FormatSettings::EnumComparingMode mode, String & error_message, const String & column_name)
{
    switch (data_type->getTypeId())
    {
        case TypeIndex::UInt8:
            return capnp_type.isBool() || isCapnInteger(capnp_type);
        case TypeIndex::Int8: [[fallthrough]];
        case TypeIndex::Int16: [[fallthrough]];
        case TypeIndex::UInt16: [[fallthrough]];
        case TypeIndex::Int32: [[fallthrough]];
        case TypeIndex::UInt32: [[fallthrough]];
        case TypeIndex::Int64: [[fallthrough]];
        case TypeIndex::UInt64:
            /// Allow integer conversions during input/output.
            return isCapnInteger(capnp_type);
        case TypeIndex::Date:
            return capnp_type.isUInt16();
        case TypeIndex::DateTime: [[fallthrough]];
        case TypeIndex::IPv4:
            return capnp_type.isUInt32();
        case TypeIndex::Date32: [[fallthrough]];
        case TypeIndex::Decimal32:
            return capnp_type.isInt32() || capnp_type.isUInt32();
        case TypeIndex::DateTime64: [[fallthrough]];
        case TypeIndex::Decimal64:
            return capnp_type.isInt64() || capnp_type.isUInt64();
        case TypeIndex::Float32: [[fallthrough]];
        case TypeIndex::Float64:
            /// Allow converting between Float32 and Float64.
            return capnp_type.isFloat32() || capnp_type.isFloat64();
        case TypeIndex::Enum8:
            return checkEnums<Int8>(capnp_type, data_type, mode, INT8_MAX, error_message);
        case TypeIndex::Enum16:
            return checkEnums<Int16>(capnp_type, data_type, mode, INT16_MAX, error_message);
        case TypeIndex::Int128: [[fallthrough]];
        case TypeIndex::UInt128: [[fallthrough]];
        case TypeIndex::Int256: [[fallthrough]];
        case TypeIndex::UInt256: [[fallthrough]];
        case TypeIndex::Decimal128: [[fallthrough]];
        case TypeIndex::Decimal256:
            return capnp_type.isData();
        case TypeIndex::Tuple:
            return checkTupleType(capnp_type, data_type, mode, error_message);
        case TypeIndex::Nullable:
        {
            auto result = checkNullableType(capnp_type, data_type, mode, error_message, column_name);
            if (!result)
                error_message += "Nullable can be represented only as a named union of type Void and nested type";
            return result;
        }
        case TypeIndex::Array:
            return checkArrayType(capnp_type, data_type, mode, error_message, column_name);
        case TypeIndex::LowCardinality:
            return checkCapnProtoType(capnp_type, assert_cast<const DataTypeLowCardinality *>(data_type.get())->getDictionaryType(), mode, error_message, column_name);
        case TypeIndex::FixedString: [[fallthrough]];
        case TypeIndex::IPv6: [[fallthrough]];
        case TypeIndex::String:
            return capnp_type.isText() || capnp_type.isData();
        case TypeIndex::Map:
            return checkMapType(capnp_type, data_type, mode, error_message);
        default:
            return false;
    }
}

capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Reader & struct_reader, const String & name)
{
    auto [field_name, nested_name] = splitCapnProtoFieldName(name);
    KJ_IF_MAYBE(field, struct_reader.getSchema().findFieldByName(field_name))
    {
        capnp::DynamicValue::Reader field_reader;
        try
        {
            field_reader = struct_reader.get(*field);
        }
        catch (const kj::Exception & e)
        {
            throw Exception(ErrorCodes::INCORRECT_DATA,
                "Cannot extract field value from struct by provided schema, error: "
                "{} Perhaps the data was generated by another schema", String(e.getDescription().cStr()));
        }

        if (nested_name.empty())
            return field_reader;

        /// Support reading Nested as List of Structs.
        if (field_reader.getType() == capnp::DynamicValue::LIST)
        {
            auto list_schema = field->getType().asList();
            if (!list_schema.getElementType().isStruct())
                throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name);

            auto struct_schema = list_schema.getElementType().asStruct();
            KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name))
                return field_reader;

            throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name);
        }

        if (field_reader.getType() != capnp::DynamicValue::STRUCT)
            throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name);

        return getReaderByColumnName(field_reader.as<capnp::DynamicStruct>(), nested_name);
    }

    throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto struct doesn't contain field with name {}", field_name);
}

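/// Example: for a ClickHouse column named "point.x" the lookup above first
/// resolves field "point" and then recurses into its struct value to fetch "x";
/// for a Nested column backed by List(Struct) the list reader itself is returned.
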
std::pair<capnp::DynamicStruct::Builder, capnp::StructSchema::Field> getStructBuilderAndFieldByColumnName(capnp::DynamicStruct::Builder struct_builder, const String & name)
{
    auto [field_name, nested_name] = splitCapnProtoFieldName(name);
    KJ_IF_MAYBE(field, struct_builder.getSchema().findFieldByName(field_name))
    {
        if (nested_name.empty())
            return {struct_builder, *field};

        auto field_builder = struct_builder.get(*field);

        /// Support writing Nested as List of Structs.
        if (field_builder.getType() == capnp::DynamicValue::LIST)
        {
            auto list_schema = field->getType().asList();
            if (!list_schema.getElementType().isStruct())
                throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name);

            auto struct_schema = list_schema.getElementType().asStruct();
            KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name))
                return {struct_builder, *field};

            throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name);
        }

        if (field_builder.getType() != capnp::DynamicValue::STRUCT)
            throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name);

        return getStructBuilderAndFieldByColumnName(field_builder.as<capnp::DynamicStruct>(), nested_name);
    }

    throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto struct doesn't contain field with name {}", field_name);
}

static std::pair<capnp::StructSchema::Field, String> getFieldByName(const capnp::StructSchema & schema, const String & name)
{
    auto [field_name, nested_name] = splitCapnProtoFieldName(name);
    KJ_IF_MAYBE(field, schema.findFieldByName(field_name))
    {
        if (nested_name.empty())
            return {*field, name};

        /// Support reading Nested as List of Structs.
        if (field->getType().isList())
        {
            auto list_schema = field->getType().asList();
            if (!list_schema.getElementType().isStruct())
                throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Element type of List {} is not a struct", field_name);

            auto struct_schema = list_schema.getElementType().asStruct();
            KJ_IF_MAYBE(nested_field, struct_schema.findFieldByName(nested_name))
                return {*field, name};

            throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Element type of List {} doesn't contain field with name \"{}\"", field_name, nested_name);
        }

        if (!field->getType().isStruct())
            throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Field {} is not a struct", field_name);

        return getFieldByName(field->getType().asStruct(), nested_name);
    }

    throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Capnproto schema doesn't contain field with name {}", field_name);
}

void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode)
{
    /// First, check that the struct doesn't contain an unnamed union, because we don't support it.
    if (checkIfStructContainsUnnamedUnion(schema))
        throw Exception(ErrorCodes::CAPN_PROTO_BAD_CAST, "Schema contains unnamed union that is not supported");
    auto names_and_types = header.getNamesAndTypesList();
    String additional_error_message;
    for (auto & [name, type] : names_and_types)
    {
        auto [field, field_name] = getFieldByName(schema, name);
        if (!checkCapnProtoType(field.getType(), type, mode, additional_error_message, field_name))
        {
            auto e = Exception(
                ErrorCodes::CAPN_PROTO_BAD_CAST,
                "Cannot convert ClickHouse type {} to CapnProto type {}",
                type->getName(),
                getCapnProtoFullTypeName(field.getType()));
            if (!additional_error_message.empty())
                e.addMessage(additional_error_message);
            throw std::move(e);
        }
    }
}

template <typename ValueType>
static DataTypePtr getEnumDataTypeFromEnumerants(const capnp::EnumSchema::EnumerantList & enumerants)
{
    std::vector<std::pair<String, ValueType>> values;
    for (auto enumerant : enumerants)
        values.emplace_back(enumerant.getProto().getName(), ValueType(enumerant.getOrdinal()));
    return std::make_shared<DataTypeEnum<ValueType>>(std::move(values));
}

static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_schema)
{
    auto enumerants = enum_schema.getEnumerants();
    if (enumerants.size() < 128)
        return getEnumDataTypeFromEnumerants<Int8>(enumerants);
    if (enumerants.size() < 32768)
        return getEnumDataTypeFromEnumerants<Int16>(enumerants);

    throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums");
}

static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields)
{
    switch (capnp_type.which())
    {
        case capnp::schema::Type::INT8:
            return std::make_shared<DataTypeInt8>();
        case capnp::schema::Type::INT16:
            return std::make_shared<DataTypeInt16>();
        case capnp::schema::Type::INT32:
            return std::make_shared<DataTypeInt32>();
        case capnp::schema::Type::INT64:
            return std::make_shared<DataTypeInt64>();
        case capnp::schema::Type::BOOL: [[fallthrough]];
        case capnp::schema::Type::UINT8:
            return std::make_shared<DataTypeUInt8>();
        case capnp::schema::Type::UINT16:
            return std::make_shared<DataTypeUInt16>();
        case capnp::schema::Type::UINT32:
            return std::make_shared<DataTypeUInt32>();
        case capnp::schema::Type::UINT64:
            return std::make_shared<DataTypeUInt64>();
        case capnp::schema::Type::FLOAT32:
            return std::make_shared<DataTypeFloat32>();
        case capnp::schema::Type::FLOAT64:
            return std::make_shared<DataTypeFloat64>();
        case capnp::schema::Type::DATA: [[fallthrough]];
        case capnp::schema::Type::TEXT:
            return std::make_shared<DataTypeString>();
        case capnp::schema::Type::ENUM:
            return getEnumDataTypeFromEnumSchema(capnp_type.asEnum());
        case capnp::schema::Type::LIST:
        {
            auto list_schema = capnp_type.asList();
            auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields);
            if (!nested_type)
                return nullptr;
            return std::make_shared<DataTypeArray>(nested_type);
        }
        case capnp::schema::Type::STRUCT:
        {
            auto struct_schema = capnp_type.asStruct();

            if (struct_schema.getFields().size() == 0)
            {
                if (skip_unsupported_fields)
                    return nullptr;
                throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported");
            }

            /// Check if it can be Nullable.
            if (checkIfStructIsNamedUnion(struct_schema))
            {
                auto fields = struct_schema.getUnionFields();
                if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid()))
                {
                    if (skip_unsupported_fields)
                        return nullptr;
                    throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported");
                }
                auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType();
                if (value_type.isStruct() || value_type.isList())
                {
                    if (skip_unsupported_fields)
                        return nullptr;
                    throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable");
                }

                auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields);
                if (!nested_type)
                    return nullptr;
                return std::make_shared<DataTypeNullable>(nested_type);
            }

            if (checkIfStructContainsUnnamedUnion(struct_schema))
                throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported");

            /// Treat Struct as Tuple.
            DataTypes nested_types;
            Names nested_names;
            for (auto field : struct_schema.getNonUnionFields())
            {
                auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields);
                if (!nested_type)
                    continue;
                nested_names.push_back(field.getProto().getName());
                nested_types.push_back(nested_type);
            }
            if (nested_types.empty())
                return nullptr;
            return std::make_shared<DataTypeTuple>(std::move(nested_types), std::move(nested_names));
        }
        default:
        {
            if (skip_unsupported_fields)
                return nullptr;
            throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProto type: {}", getCapnProtoFullTypeName(capnp_type));
        }
    }
}

NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields)
{
    if (checkIfStructContainsUnnamedUnion(schema))
        throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported");

    NamesAndTypesList names_and_types;
    for (auto field : schema.getNonUnionFields())
    {
        auto name = field.getProto().getName();
        auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields);
        if (type)
            names_and_types.emplace_back(name, type);
    }
    if (names_and_types.empty())
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types");

    return names_and_types;
}

}

#endif
@@ -408,9 +408,10 @@ DataTypes getDefaultDataTypeForEscapingRules(const std::vector<FormatSettings::E
String getAdditionalFormatInfoForAllRowBasedFormats(const FormatSettings & settings)
{
    return fmt::format(
        "schema_inference_hints={}, max_rows_to_read_for_schema_inference={}, schema_inference_make_columns_nullable={}",
        "schema_inference_hints={}, max_rows_to_read_for_schema_inference={}, max_bytes_to_read_for_schema_inference={}, schema_inference_make_columns_nullable={}",
        settings.schema_inference_hints,
        settings.max_rows_to_read_for_schema_inference,
        settings.max_bytes_to_read_for_schema_inference,
        settings.schema_inference_make_columns_nullable);
}

@@ -69,6 +69,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.csv.use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
    format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines;
    format_settings.csv.try_detect_header = settings.input_format_csv_detect_header;
    format_settings.csv.skip_trailing_empty_lines = settings.input_format_csv_skip_trailing_empty_lines;
    format_settings.csv.trim_whitespaces = settings.input_format_csv_trim_whitespaces;
    format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
    format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
@@ -81,6 +82,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter;
    format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter;
    format_settings.custom.try_detect_header = settings.input_format_custom_detect_header;
    format_settings.custom.skip_trailing_empty_lines = settings.input_format_custom_skip_trailing_empty_lines;
    format_settings.date_time_input_format = settings.date_time_input_format;
    format_settings.date_time_output_format = settings.date_time_output_format;
    format_settings.input_format_ipv4_default_on_conversion_error = settings.input_format_ipv4_default_on_conversion_error;
@@ -150,6 +152,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.tsv.use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
    format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines;
    format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header;
    format_settings.tsv.skip_trailing_empty_lines = settings.input_format_tsv_skip_trailing_empty_lines;
    format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
    format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
    format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
@@ -183,6 +186,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns;
    format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation;
    format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
    format_settings.max_bytes_to_read_for_schema_inference = settings.input_format_max_bytes_to_read_for_schema_inference;
    format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference;
    format_settings.schema_inference_hints = settings.schema_inference_hints;
    format_settings.schema_inference_make_columns_nullable = settings.schema_inference_make_columns_nullable;

@@ -36,7 +36,8 @@ struct FormatSettings
    bool defaults_for_omitted_fields = true;

    bool seekable_read = true;
    UInt64 max_rows_to_read_for_schema_inference = 100;
    UInt64 max_rows_to_read_for_schema_inference = 25000;
    UInt64 max_bytes_to_read_for_schema_inference = 32 * 1024 * 1024;

    String column_names_for_schema_inference;
    String schema_inference_hints;
@@ -136,6 +137,7 @@ struct FormatSettings
        UInt64 skip_first_lines = 0;
        String custom_delimiter;
        bool try_detect_header = true;
        bool skip_trailing_empty_lines = false;
        bool trim_whitespaces = true;
    } csv;

@@ -157,6 +159,7 @@ struct FormatSettings
        std::string field_delimiter;
        EscapingRule escaping_rule = EscapingRule::Escaped;
        bool try_detect_header = true;
        bool skip_trailing_empty_lines = false;
    } custom;

    struct
@@ -292,6 +295,7 @@ struct FormatSettings
        bool use_best_effort_in_schema_inference = true;
        UInt64 skip_first_lines = 0;
        bool try_detect_header = true;
        bool skip_trailing_empty_lines = false;
    } tsv;

    struct
@@ -324,16 +328,16 @@ struct FormatSettings

    /// For the CapnProto format we should determine how to
    /// compare ClickHouse Enum and Enum from schema.
    enum class EnumComparingMode
    enum class CapnProtoEnumComparingMode
    {
        BY_NAMES, // Names in enums should be the same, values can be different.
        BY_NAMES_CASE_INSENSITIVE, // Case-insensitive name comparison.
        BY_VALUES, // Values should be the same, names can be different.
    };

    struct
    struct CapnProto
    {
        EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES;
        CapnProtoEnumComparingMode enum_comparing_mode = CapnProtoEnumComparingMode::BY_VALUES;
        bool skip_fields_with_unsupported_types_in_schema_inference = false;
    } capn_proto;

@@ -75,6 +75,8 @@ ColumnsDescription readSchemaFromFormat(
    SchemaReaderPtr schema_reader;
    size_t max_rows_to_read = format_settings ? format_settings->max_rows_to_read_for_schema_inference
                                              : context->getSettingsRef().input_format_max_rows_to_read_for_schema_inference;
    size_t max_bytes_to_read = format_settings ? format_settings->max_bytes_to_read_for_schema_inference
                                               : context->getSettingsRef().input_format_max_bytes_to_read_for_schema_inference;
    size_t iterations = 0;
    ColumnsDescription cached_columns;
    while (true)
@@ -120,7 +122,7 @@ ColumnsDescription readSchemaFromFormat(
        try
        {
            schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf, context, format_settings);
            schema_reader->setMaxRowsToRead(max_rows_to_read);
            schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read);
            names_and_types = schema_reader->readSchema();
            break;
        }
@@ -132,10 +134,14 @@ ColumnsDescription readSchemaFromFormat(
            size_t rows_read = schema_reader->getNumRowsRead();
            assert(rows_read <= max_rows_to_read);
            max_rows_to_read -= schema_reader->getNumRowsRead();
            if (rows_read != 0 && max_rows_to_read == 0)
            size_t bytes_read = buf->count();
            /// We could exceed max_bytes_to_read a bit to complete row parsing.
            max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read);
            if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0))
            {
                exception_message += "\nTo increase the maximum number of rows to read for structure determination, use setting "
                                     "input_format_max_rows_to_read_for_schema_inference";
                exception_message += "\nTo increase the maximum number of rows/bytes to read for structure determination, use setting "
                                     "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference";

                if (iterations > 1)
                {
                    exception_messages += "\n" + exception_message;

@@ -580,7 +580,7 @@ private:
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        unalignedStoreLittleEndian<UInt64>(buf + 8, 0x00000000FFFF0000ull | (static_cast<UInt64>(ntohl(in)) << 32));
#else
        unalignedStoreLittleEndian<UInt64>(buf + 8, 0x00000000FFFF0000ull | (static_cast<UInt64>(__builtin_bswap32(in)) << 32));
        unalignedStoreLittleEndian<UInt64>(buf + 8, 0x00000000FFFF0000ull | (static_cast<UInt64>(std::byteswap(in)) << 32));
#endif
    }
};

@@ -25,6 +25,12 @@ namespace ErrorCodes

void UserDefinedSQLFunctionVisitor::visit(ASTPtr & ast)
{
    if (!ast)
    {
        chassert(false);
        return;
    }

    const auto visit_child_with_shared_ptr = [&](ASTPtr & child)
    {
        if (!child)
@@ -16,6 +16,7 @@
#include <Common/DateLUT.h>
#include <Common/LocalDate.h>
#include <Common/LocalDateTime.h>
#include <Common/TransformEndianness.hpp>
#include <base/StringRef.h>
#include <base/arithmeticOverflow.h>
#include <base/sort.h>
@@ -1092,30 +1093,11 @@ inline void readBinary(Decimal128 & x, ReadBuffer & buf) { readPODBinary(x, buf)
inline void readBinary(Decimal256 & x, ReadBuffer & buf) { readPODBinary(x.value, buf); }
inline void readBinary(LocalDate & x, ReadBuffer & buf) { readPODBinary(x, buf); }

template <std::endian endian, typename T>
requires is_arithmetic_v<T> && (sizeof(T) <= 8)
inline void readBinaryEndian(T & x, ReadBuffer & buf)
{
    readPODBinary(x, buf);
    if constexpr (std::endian::native != endian)
        x = std::byteswap(x);
}

template <std::endian endian, typename T>
requires is_big_int_v<T>
inline void readBinaryEndian(T & x, ReadBuffer & buf)
{
    if constexpr (std::endian::native == endian)
    {
        for (size_t i = 0; i != std::size(x.items); ++i)
            readBinaryEndian<endian>(x.items[i], buf);
    }
    else
    {
        for (size_t i = 0; i != std::size(x.items); ++i)
            readBinaryEndian<endian>(x.items[std::size(x.items) - i - 1], buf);
    }
    transformEndianness<endian>(x);
}

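/// Example: readBinaryLittleEndian(x, buf) instantiates
/// readBinaryEndian<std::endian::little>; on a little-endian host the
/// endianness transform compiles to nothing, while on a big-endian host the
/// bytes are swapped into native order after the raw read.
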
template <typename T>
@@ -13,6 +13,7 @@
#include <Common/DateLUT.h>
#include <Common/LocalDate.h>
#include <Common/LocalDateTime.h>
#include <Common/TransformEndianness.hpp>
#include <base/find_symbols.h>
#include <base/StringRef.h>
#include <base/DecomposedFloat.h>
@@ -1174,32 +1175,13 @@ inline void writeNullTerminatedString(const String & s, WriteBuffer & buffer)
    buffer.write(s.c_str(), s.size() + 1);
}

template <std::endian endian, typename T>
requires is_arithmetic_v<T> && (sizeof(T) <= 8)
inline void writeBinaryEndian(T x, WriteBuffer & buf)
{
    if constexpr (std::endian::native != endian)
        x = std::byteswap(x);
    transformEndianness<endian>(x);
    writePODBinary(x, buf);
}

template <std::endian endian, typename T>
requires is_big_int_v<T>
inline void writeBinaryEndian(const T & x, WriteBuffer & buf)
{
    if constexpr (std::endian::native == endian)
    {
        for (size_t i = 0; i != std::size(x.items); ++i)
            writeBinaryEndian<endian>(x.items[i], buf);
    }
    else
    {
        for (size_t i = 0; i != std::size(x.items); ++i)
            writeBinaryEndian<endian>(x.items[std::size(x.items) - i - 1], buf);
    }
}

template <typename T>
inline void writeBinaryLittleEndian(T x, WriteBuffer & buf)
{
@@ -477,7 +477,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
    /// Check support for JOIN for parallel replicas with custom key
    if (joined_tables.tablesCount() > 1 && !settings.parallel_replicas_custom_key.value.empty())
    {
        LOG_WARNING(log, "JOINs are not supported with parallel_replicas_custom_key. Query will be executed without using them.");
        LOG_DEBUG(log, "JOINs are not supported with parallel_replicas_custom_key. Query will be executed without using them.");
        context->setSetting("parallel_replicas_custom_key", String{""});
    }

@@ -487,7 +487,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
    {
        if (settings.allow_experimental_parallel_reading_from_replicas == 1)
        {
            LOG_WARNING(log, "FINAL modifier is not supported with parallel replicas. Query will be executed without using them.");
            LOG_DEBUG(log, "FINAL modifier is not supported with parallel replicas. Query will be executed without using them.");
            context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
            context->setSetting("parallel_replicas_custom_key", String{""});
        }
@@ -503,7 +503,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
    {
        if (settings.allow_experimental_parallel_reading_from_replicas == 1)
        {
            LOG_WARNING(log, "To use parallel replicas with plain MergeTree tables please enable setting `parallel_replicas_for_non_replicated_merge_tree`. For now query will be executed without using them.");
            LOG_DEBUG(log, "To use parallel replicas with plain MergeTree tables please enable setting `parallel_replicas_for_non_replicated_merge_tree`. For now query will be executed without using them.");
            context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
        }
        else if (settings.allow_experimental_parallel_reading_from_replicas == 2)
@@ -12,6 +12,9 @@ class ASTFunction;
class ASTIndexDeclaration : public IAST
{
public:
    static const auto DEFAULT_INDEX_GRANULARITY = 1uz;
    static const auto DEFAULT_ANNOY_INDEX_GRANULARITY = 100'000'000uz;

    String name;
    IAST * expr;
    ASTFunction * type;
@@ -46,7 +46,16 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected
    index->part_of_create_index_query = true;
    index->set(index->expr, expr);
    index->set(index->type, type);
    index->granularity = granularity ? granularity->as<ASTLiteral &>().value.safeGet<UInt64>() : 1;

    if (granularity)
        index->granularity = granularity->as<ASTLiteral &>().value.safeGet<UInt64>();
    else
    {
        if (index->type->name == "annoy")
            index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
        else
            index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;
    }
    node = index;

    return true;
@@ -141,7 +141,17 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe
    index->name = name->as<ASTIdentifier &>().name();
    index->set(index->expr, expr);
    index->set(index->type, type);
    index->granularity = granularity ? granularity->as<ASTLiteral &>().value.safeGet<UInt64>() : 1;

    if (granularity)
        index->granularity = granularity->as<ASTLiteral &>().value.safeGet<UInt64>();
    else
    {
        if (index->type->name == "annoy")
            index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
        else
            index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;
    }

    node = index;

    return true;
@@ -63,11 +63,15 @@ void checkFinalInferredType(
}

IIRowSchemaReader::IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_)
    : ISchemaReader(in_), default_type(default_type_), hints_str(format_settings_.schema_inference_hints), format_settings(format_settings_)
    : ISchemaReader(in_)
    , max_rows_to_read(format_settings_.max_rows_to_read_for_schema_inference)
    , max_bytes_to_read(format_settings_.max_bytes_to_read_for_schema_inference)
    , default_type(default_type_)
    , hints_str(format_settings_.schema_inference_hints)
    , format_settings(format_settings_)
{
}


void IIRowSchemaReader::setContext(ContextPtr & context)
{
    ColumnsDescription columns;
@@ -105,11 +109,11 @@ IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, const FormatSettings & form

NamesAndTypesList IRowSchemaReader::readSchema()
{
    if (max_rows_to_read == 0)
    if (max_rows_to_read == 0 || max_bytes_to_read == 0)
        throw Exception(
            ErrorCodes::BAD_ARGUMENTS,
            "Cannot read rows to determine the schema, the maximum number of rows to read is set to 0. "
            "Most likely setting input_format_max_rows_to_read_for_schema_inference is set to 0");
            "Cannot read rows to determine the schema, the maximum number of rows (or bytes) to read is set to 0. "
            "Most likely setting input_format_max_rows_to_read_for_schema_inference or input_format_max_bytes_to_read_for_schema_inference is set to 0");

    DataTypes data_types = readRowAndGetDataTypes();

@@ -149,7 +153,7 @@ NamesAndTypesList IRowSchemaReader::readSchema()
            data_types[i] = hint_it->second;
    }

    for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read)
    for (rows_read = 1; rows_read < max_rows_to_read && in.count() < max_bytes_to_read; ++rows_read)
    {
        DataTypes new_data_types = readRowAndGetDataTypes();
        if (new_data_types.empty())
@@ -226,11 +230,11 @@ IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, const For

NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
{
    if (max_rows_to_read == 0)
    if (max_rows_to_read == 0 || max_bytes_to_read == 0)
        throw Exception(
            ErrorCodes::BAD_ARGUMENTS,
            "Cannot read rows to determine the schema, the maximum number of rows to read is set to 0. "
            "Most likely setting input_format_max_rows_to_read_for_schema_inference is set to 0");
            "Cannot read rows to determine the schema, the maximum number of rows (or bytes) to read is set to 0. "
            "Most likely setting input_format_max_rows_to_read_for_schema_inference or input_format_max_bytes_to_read_for_schema_inference is set to 0");

    bool eof = false;
    auto names_and_types = readRowAndGetNamesAndDataTypes(eof);
@@ -251,7 +255,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema()
        names_order.push_back(name);
    }

    for (rows_read = 1; rows_read < max_rows_to_read; ++rows_read)
    for (rows_read = 1; rows_read < max_rows_to_read && in.count() < max_bytes_to_read; ++rows_read)
    {
        auto new_names_and_types = readRowAndGetNamesAndDataTypes(eof);
        if (eof)
@@ -32,7 +32,7 @@ public:
    virtual bool needContext() const { return false; }
    virtual void setContext(ContextPtr &) {}

    virtual void setMaxRowsToRead(size_t) {}
    virtual void setMaxRowsAndBytesToRead(size_t, size_t) {}
    virtual size_t getNumRowsRead() const { return 0; }

    virtual ~ISchemaReader() = default;
@@ -54,12 +54,17 @@ public:
    virtual void transformTypesIfNeeded(DataTypePtr & type, DataTypePtr & new_type);

protected:
    void setMaxRowsToRead(size_t max_rows) override { max_rows_to_read = max_rows; }
    void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override
    {
        max_rows_to_read = max_rows;
        max_bytes_to_read = max_bytes;
    }
    size_t getNumRowsRead() const override { return rows_read; }

    virtual void transformFinalTypeIfNeeded(DataTypePtr &) {}

    size_t max_rows_to_read;
    size_t max_bytes_to_read;
    size_t rows_read = 0;
    DataTypePtr default_type;
    String hints_str;
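The schema-inference changes above bound the sampling loop by bytes as well as rows. A toy model of that double limit, assuming a reader that tracks consumed bytes (all names here are illustrative, not the ClickHouse classes):

``` cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for a format reader that knows how many bytes it has consumed.
struct ToyReader
{
    std::vector<std::string> rows;
    size_t pos = 0;
    size_t bytes_consumed = 0;

    bool readRow(std::string & out)
    {
        if (pos == rows.size())
            return false;
        out = rows[pos++];
        bytes_consumed += out.size() + 1; // +1 for the row delimiter
        return true;
    }
};

int main()
{
    ToyReader reader{{"1,foo", "2,bar", "3,baz", "4,qux"}};
    const size_t max_rows_to_read = 100;
    const size_t max_bytes_to_read = 12; // stops sampling after ~2 rows

    size_t rows_read = 0;
    std::string row;
    // Same shape as the patched loop: stop on either limit, whichever hits first.
    while (rows_read < max_rows_to_read && reader.bytes_consumed < max_bytes_to_read
           && reader.readRow(row))
    {
        ++rows_read;
        std::cout << "sampled: " << row << '\n';
    }
    std::cout << "rows sampled: " << rows_read << '\n';
    return 0;
}
```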
@@ -325,6 +325,20 @@ void CSVFormatReader::setReadBuffer(ReadBuffer & in_)
    FormatWithNamesAndTypesReader::setReadBuffer(*buf);
}

bool CSVFormatReader::checkForSuffix()
{
    if (!format_settings.csv.skip_trailing_empty_lines)
        return buf->eof();

    PeekableReadBufferCheckpoint checkpoint(*buf);
    while (checkChar('\n', *buf) || checkChar('\r', *buf));
    if (buf->eof())
        return true;

    buf->rollbackToCheckpoint();
    return false;
}

CSVSchemaReader::CSVSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
    : FormatWithNamesAndTypesSchemaReader(
        buf,
@@ -75,6 +75,7 @@ public:
    std::vector<String> readRow() { return readRowImpl<false>(); }
    std::vector<String> readRowForHeaderDetection() override { return readHeaderRow(); }

    bool checkForSuffix() override;

    template <bool is_header>
    std::vector<String> readRowImpl();
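The CSV reader above, and the TSV and CustomSeparated readers later in this diff, all implement the same checkpoint/rollback idea: consume any trailing `\n`/`\r`, succeed only at EOF, otherwise rewind. A buffer-agnostic sketch of the pattern on a plain string cursor (hypothetical types, not the ClickHouse PeekableReadBuffer API):

``` cpp
#include <cstddef>
#include <iostream>
#include <string>

struct Cursor
{
    const std::string & data;
    size_t pos = 0;

    bool eof() const { return pos == data.size(); }
    bool checkChar(char c) { return !eof() && data[pos] == c ? (++pos, true) : false; }
};

// Returns true iff nothing but empty lines remains; otherwise leaves the cursor untouched.
bool checkForSuffix(Cursor & cur)
{
    size_t checkpoint = cur.pos;          // analogous to PeekableReadBufferCheckpoint
    while (cur.checkChar('\n') || cur.checkChar('\r'))
        ;
    if (cur.eof())
        return true;
    cur.pos = checkpoint;                 // rollbackToCheckpoint
    return false;
}

int main()
{
    std::string payload = "a,b\n\r\n\n";
    Cursor cur{payload, 4};               // positioned right after the last data row
    std::cout << std::boolalpha << checkForSuffix(cur) << '\n'; // true: only empty lines left
    return 0;
}
```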
@@ -9,42 +9,22 @@
#include <capnp/dynamic.h>
#include <capnp/common.h>

#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnMap.h>

#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int INCORRECT_DATA;
}

CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header, Params params_, const FormatSchemaInfo & info, const FormatSettings & format_settings_)
    : IRowInputFormat(std::move(header), in_, std::move(params_))
CapnProtoRowInputFormat::CapnProtoRowInputFormat(ReadBuffer & in_, Block header_, Params params_, const FormatSchemaInfo & info, const FormatSettings & format_settings)
    : IRowInputFormat(std::move(header_), in_, std::move(params_))
    , parser(std::make_shared<CapnProtoSchemaParser>())
    , format_settings(format_settings_)
    , column_types(getPort().getHeader().getDataTypes())
    , column_names(getPort().getHeader().getNames())
{
    // Parse the schema and fetch the root object
    root = parser->getMessageSchema(info);
    checkCapnProtoSchemaStructure(root, getPort().getHeader(), format_settings.capn_proto.enum_comparing_mode);
    schema = parser->getMessageSchema(info);
    const auto & header = getPort().getHeader();
    serializer = std::make_unique<CapnProtoSerializer>(header.getDataTypes(), header.getNames(), schema, format_settings.capn_proto);
}

kj::Array<capnp::word> CapnProtoRowInputFormat::readMessage()
@@ -82,213 +62,6 @@ kj::Array<capnp::word> CapnProtoRowInputFormat::readMessage()
    return msg;
}

static void insertInteger(IColumn & column, const DataTypePtr & column_type, UInt64 value)
{
    switch (column_type->getTypeId())
    {
        case TypeIndex::Int8:
            assert_cast<ColumnInt8 &>(column).insertValue(value);
            break;
        case TypeIndex::UInt8:
            assert_cast<ColumnUInt8 &>(column).insertValue(value);
            break;
        case TypeIndex::Int16:
            assert_cast<ColumnInt16 &>(column).insertValue(value);
            break;
        case TypeIndex::Date: [[fallthrough]];
        case TypeIndex::UInt16:
            assert_cast<ColumnUInt16 &>(column).insertValue(value);
            break;
        case TypeIndex::Int32:
            assert_cast<ColumnInt32 &>(column).insertValue(static_cast<Int32>(value));
            break;
        case TypeIndex::DateTime: [[fallthrough]];
        case TypeIndex::UInt32:
            assert_cast<ColumnUInt32 &>(column).insertValue(static_cast<UInt32>(value));
            break;
        case TypeIndex::IPv4:
            assert_cast<ColumnIPv4 &>(column).insertValue(IPv4(static_cast<UInt32>(value)));
            break;
        case TypeIndex::Int64:
            assert_cast<ColumnInt64 &>(column).insertValue(value);
            break;
        case TypeIndex::UInt64:
            assert_cast<ColumnUInt64 &>(column).insertValue(value);
            break;
        case TypeIndex::DateTime64:
            assert_cast<ColumnDecimal<DateTime64> &>(column).insertValue(value);
            break;
        case TypeIndex::Decimal32:
            assert_cast<ColumnDecimal<Decimal32> &>(column).insertValue(static_cast<Int32>(value));
            break;
        case TypeIndex::Decimal64:
            assert_cast<ColumnDecimal<Decimal64> &>(column).insertValue(value);
            break;
        default:
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Column type {} cannot be parsed from integer", column_type->getName());
    }
}

static void insertFloat(IColumn & column, const DataTypePtr & column_type, Float64 value)
{
    switch (column_type->getTypeId())
    {
        case TypeIndex::Float32:
            assert_cast<ColumnFloat32 &>(column).insertValue(static_cast<Float32>(value));
            break;
        case TypeIndex::Float64:
            assert_cast<ColumnFloat64 &>(column).insertValue(value);
            break;
        default:
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Column type is not a float.");
    }
}

template <typename Value>
static void insertData(IColumn & column, const DataTypePtr & column_type, Value value)
{
    if (column_type->haveMaximumSizeOfValue() && value.size() != column_type->getSizeOfValueInMemory())
        throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of {} value: {}", column_type->getName(), value.size());

    column.insertData(reinterpret_cast<const char *>(value.begin()), value.size());
}

template <typename ValueType>
static void insertEnum(IColumn & column, const DataTypePtr & column_type, const capnp::DynamicEnum & enum_value, FormatSettings::EnumComparingMode enum_comparing_mode)
{
    auto enumerant = *kj::_::readMaybe(enum_value.getEnumerant());
    auto enum_type = assert_cast<const DataTypeEnum<ValueType> *>(column_type.get());
    DataTypePtr nested_type = std::make_shared<DataTypeNumber<ValueType>>();
    switch (enum_comparing_mode)
    {
        case FormatSettings::EnumComparingMode::BY_VALUES:
            insertInteger(column, nested_type, Int64(enumerant.getOrdinal()));
            return;
        case FormatSettings::EnumComparingMode::BY_NAMES:
            insertInteger(column, nested_type, Int64(enum_type->getValue(String(enumerant.getProto().getName()))));
            return;
        case FormatSettings::EnumComparingMode::BY_NAMES_CASE_INSENSITIVE:
        {
            /// Find the same enum name case insensitive.
            String enum_name = enumerant.getProto().getName();
            for (auto & name : enum_type->getAllRegisteredNames())
            {
                if (compareEnumNames(name, enum_name, enum_comparing_mode))
                {
                    insertInteger(column, nested_type, Int64(enum_type->getValue(name)));
                    break;
                }
            }
        }
    }
}

static void insertValue(IColumn & column, const DataTypePtr & column_type, const String & column_name, const capnp::DynamicValue::Reader & value, FormatSettings::EnumComparingMode enum_comparing_mode)
{
    if (column_type->lowCardinality())
    {
        auto & lc_column = assert_cast<ColumnLowCardinality &>(column);
        auto tmp_column = lc_column.getDictionary().getNestedColumn()->cloneEmpty();
        auto dict_type = assert_cast<const DataTypeLowCardinality *>(column_type.get())->getDictionaryType();
        insertValue(*tmp_column, dict_type, column_name, value, enum_comparing_mode);
        lc_column.insertFromFullColumn(*tmp_column, 0);
        return;
    }

    switch (value.getType())
    {
        case capnp::DynamicValue::Type::INT:
            insertInteger(column, column_type, value.as<Int64>());
            break;
        case capnp::DynamicValue::Type::UINT:
            insertInteger(column, column_type, value.as<UInt64>());
            break;
        case capnp::DynamicValue::Type::FLOAT:
            insertFloat(column, column_type, value.as<Float64>());
            break;
        case capnp::DynamicValue::Type::BOOL:
            insertInteger(column, column_type, UInt64(value.as<bool>()));
            break;
        case capnp::DynamicValue::Type::DATA:
            insertData(column, column_type, value.as<capnp::Data>());
            break;
        case capnp::DynamicValue::Type::TEXT:
            insertData(column, column_type, value.as<capnp::Text>());
            break;
        case capnp::DynamicValue::Type::ENUM:
            if (column_type->getTypeId() == TypeIndex::Enum8)
                insertEnum<Int8>(column, column_type, value.as<capnp::DynamicEnum>(), enum_comparing_mode);
            else
                insertEnum<Int16>(column, column_type, value.as<capnp::DynamicEnum>(), enum_comparing_mode);
            break;
        case capnp::DynamicValue::LIST:
        {
            auto list_value = value.as<capnp::DynamicList>();
            auto & column_array = assert_cast<ColumnArray &>(column);
            auto & offsets = column_array.getOffsets();
            offsets.push_back(offsets.back() + list_value.size());

            auto & nested_column = column_array.getData();
            auto nested_type = assert_cast<const DataTypeArray *>(column_type.get())->getNestedType();
            for (const auto & nested_value : list_value)
                insertValue(nested_column, nested_type, column_name, nested_value, enum_comparing_mode);
            break;
        }
        case capnp::DynamicValue::Type::STRUCT:
        {
            auto struct_value = value.as<capnp::DynamicStruct>();
            if (column_type->isNullable())
            {
                auto & nullable_column = assert_cast<ColumnNullable &>(column);
                auto field = *kj::_::readMaybe(struct_value.which());
                if (field.getType().isVoid())
                    nullable_column.insertDefault();
                else
                {
                    auto & nested_column = nullable_column.getNestedColumn();
                    auto nested_type = assert_cast<const DataTypeNullable *>(column_type.get())->getNestedType();
                    auto nested_value = struct_value.get(field);
                    insertValue(nested_column, nested_type, column_name, nested_value, enum_comparing_mode);
                    nullable_column.getNullMapData().push_back(0);
                }
            }
            else if (isTuple(column_type))
            {
                auto & tuple_column = assert_cast<ColumnTuple &>(column);
                const auto * tuple_type = assert_cast<const DataTypeTuple *>(column_type.get());
                bool have_explicit_names = tuple_type->haveExplicitNames();
                auto struct_schema = struct_value.getSchema();
                for (uint32_t i = 0; i != tuple_column.tupleSize(); ++i)
                    insertValue(
                        tuple_column.getColumn(i),
                        tuple_type->getElements()[i],
                        tuple_type->getElementNames()[i],
                        struct_value.get(have_explicit_names ? struct_schema.getFieldByName(tuple_type->getElementNames()[i]) : struct_schema.getFields()[i]),
                        enum_comparing_mode);
            }
            else if (isMap(column_type))
            {
                const auto & map_type = assert_cast<const DataTypeMap &>(*column_type);
                DataTypes key_value_types = {map_type.getKeyType(), map_type.getValueType()};
                Names key_value_names = {"key", "value"};
                auto entries_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(key_value_types, key_value_names));
                auto & entries_column = assert_cast<ColumnMap &>(column).getNestedColumn();
                auto entries_field = struct_value.getSchema().getFields()[0];
                insertValue(entries_column, entries_type, column_name, struct_value.get(entries_field), enum_comparing_mode);
            }
            else
            {
                /// It can be nested column from Nested type.
                auto [field_name, nested_name] = splitCapnProtoFieldName(column_name);
                insertValue(column, column_type, nested_name, struct_value.get(nested_name), enum_comparing_mode);
            }
            break;
        }
        default:
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CapnProto value type.");
    }
}

bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &)
{
    if (in->eof())
@@ -298,12 +71,8 @@ bool CapnProtoRowInputFormat::readRow(MutableColumns & columns, RowReadExtension
    {
        auto array = readMessage();
        capnp::FlatArrayMessageReader msg(array);
        auto root_reader = msg.getRoot<capnp::DynamicStruct>(root);
        for (size_t i = 0; i != columns.size(); ++i)
        {
            auto value = getReaderByColumnName(root_reader, column_names[i]);
            insertValue(*columns[i], column_types[i], column_names[i], value, format_settings.capn_proto.enum_comparing_mode);
        }
        auto root_reader = msg.getRoot<capnp::DynamicStruct>(schema);
        serializer->readRow(columns, root_reader);
    }
    catch (const kj::Exception & e)
    {
@@ -343,7 +112,14 @@ void registerInputFormatCapnProto(FormatFactory & factory)
    factory.markFormatSupportsSubsetOfColumns("CapnProto");
    factory.registerFileExtension("capnp", "CapnProto");
    factory.registerAdditionalInfoForSchemaCacheGetter(
        "CapnProto", [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); });
        "CapnProto",
        [](const FormatSettings & settings)
        {
            return fmt::format(
                "format_schema={}, skip_fields_with_unsupported_types_in_schema_inference={}",
                settings.schema.format_schema,
                settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference);
        });
}

void registerCapnProtoSchemaReader(FormatFactory & factory)
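The net effect of this hunk is architectural: the per-row `switch` over `capnp::DynamicValue::Type` (the ~200 removed lines) moves behind a single `CapnProtoSerializer` built once in the constructor, so `readRow` no longer re-dispatches on column types. A rough sketch of that shape, with illustrative stand-in types rather than the real classes:

``` cpp
#include <iostream>
#include <memory>
#include <vector>

// Illustrative stand-ins for the format's building blocks.
struct Column { std::vector<long> values; };
struct RootReader { long payload; };

// The serializer is constructed once from the header and reused for every row,
// instead of re-dispatching on column types inside readRow().
class Serializer
{
public:
    explicit Serializer(size_t num_columns) : num_columns_(num_columns) {}

    void readRow(std::vector<Column> & columns, const RootReader & reader) const
    {
        for (size_t i = 0; i != num_columns_; ++i)
            columns[i].values.push_back(reader.payload + static_cast<long>(i));
    }

private:
    size_t num_columns_;
};

int main()
{
    std::vector<Column> columns(2);
    auto serializer = std::make_unique<Serializer>(columns.size()); // constructor-time setup
    serializer->readRow(columns, RootReader{10});                   // hot path stays branch-free
    std::cout << columns[0].values[0] << ' ' << columns[1].values[0] << '\n'; // 10 11
    return 0;
}
```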
@@ -4,7 +4,8 @@
#if USE_CAPNP

#include <Core/Block.h>
#include <Formats/CapnProtoUtils.h>
#include <Formats/CapnProtoSchema.h>
#include <Formats/CapnProtoSerializer.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/ISchemaReader.h>

@@ -33,10 +34,8 @@ private:
    kj::Array<capnp::word> readMessage();

    std::shared_ptr<CapnProtoSchemaParser> parser;
    capnp::StructSchema root;
    const FormatSettings format_settings;
    DataTypes column_types;
    Names column_names;
    capnp::StructSchema schema;
    std::unique_ptr<CapnProtoSerializer> serializer;
};

class CapnProtoSchemaReader : public IExternalSchemaReader
@@ -1,37 +1,16 @@
#include <Processors/Formats/Impl/CapnProtoRowOutputFormat.h>
#if USE_CAPNP

#include <Formats/CapnProtoUtils.h>
#include <Formats/CapnProtoSchema.h>
#include <Formats/FormatSettings.h>
#include <Formats/CapnProtoSerializer.h>
#include <IO/WriteBuffer.h>
#include <capnp/dynamic.h>
#include <capnp/serialize-packed.h>

#include <Columns/ColumnArray.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnMap.h>

#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}


CapnProtoOutputStream::CapnProtoOutputStream(WriteBuffer & out_) : out(out_)
{
}
@@ -45,252 +24,25 @@ CapnProtoRowOutputFormat::CapnProtoRowOutputFormat(
    WriteBuffer & out_,
    const Block & header_,
    const FormatSchemaInfo & info,
    const FormatSettings & format_settings_)
    : IRowOutputFormat(header_, out_), column_names(header_.getNames()), column_types(header_.getDataTypes()), output_stream(std::make_unique<CapnProtoOutputStream>(out_)), format_settings(format_settings_)
    const FormatSettings & format_settings)
    : IRowOutputFormat(header_, out_)
    , column_names(header_.getNames())
    , column_types(header_.getDataTypes())
    , output_stream(std::make_unique<CapnProtoOutputStream>(out_))
{
    schema = schema_parser.getMessageSchema(info);
    checkCapnProtoSchemaStructure(schema, getPort(PortKind::Main).getHeader(), format_settings.capn_proto.enum_comparing_mode);
}

template <typename EnumValue>
static capnp::DynamicEnum getDynamicEnum(
    const ColumnPtr & column,
    const DataTypePtr & data_type,
    size_t row_num,
    const capnp::EnumSchema & enum_schema,
    FormatSettings::EnumComparingMode mode)
{
    const auto * enum_data_type = assert_cast<const DataTypeEnum<EnumValue> *>(data_type.get());
    EnumValue enum_value = column->getInt(row_num);
    if (mode == FormatSettings::EnumComparingMode::BY_VALUES)
        return capnp::DynamicEnum(enum_schema, enum_value);

    auto enum_name = enum_data_type->getNameForValue(enum_value);
    for (const auto enumerant : enum_schema.getEnumerants())
    {
        if (compareEnumNames(String(enum_name), enumerant.getProto().getName(), mode))
            return capnp::DynamicEnum(enumerant);
    }

    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert ClickHouse Enum value to CapnProto Enum");
}

static capnp::DynamicValue::Builder initStructFieldBuilder(const ColumnPtr & column, size_t row_num, capnp::DynamicStruct::Builder & struct_builder, capnp::StructSchema::Field field)
{
    if (const auto * array_column = checkAndGetColumn<ColumnArray>(*column))
    {
        size_t size = array_column->getOffsets()[row_num] - array_column->getOffsets()[row_num - 1];
        return struct_builder.init(field, static_cast<unsigned>(size));
    }

    if (field.getType().isStruct())
        return struct_builder.init(field);

    return struct_builder.get(field);
}

static std::optional<capnp::DynamicValue::Reader> convertToDynamicValue(
    const ColumnPtr & column,
    const DataTypePtr & data_type,
    size_t row_num,
    const String & column_name,
    capnp::DynamicValue::Builder builder,
    FormatSettings::EnumComparingMode enum_comparing_mode,
    std::vector<std::unique_ptr<String>> & temporary_text_data_storage)
{
    /// Here we don't do any types validation, because we did it in CapnProtoRowOutputFormat constructor.

    if (data_type->lowCardinality())
    {
        const auto * lc_column = assert_cast<const ColumnLowCardinality *>(column.get());
        const auto & dict_type = assert_cast<const DataTypeLowCardinality *>(data_type.get())->getDictionaryType();
        size_t index = lc_column->getIndexAt(row_num);
        return convertToDynamicValue(lc_column->getDictionary().getNestedColumn(), dict_type, index, column_name, builder, enum_comparing_mode, temporary_text_data_storage);
    }

    switch (builder.getType())
    {
        case capnp::DynamicValue::Type::INT:
            return capnp::DynamicValue::Reader(column->getInt(row_num));
        case capnp::DynamicValue::Type::UINT:
        {
            /// IPv4 column doesn't support getUInt method.
            if (isIPv4(data_type))
                return capnp::DynamicValue::Reader(assert_cast<const ColumnIPv4 *>(column.get())->getElement(row_num));
            return capnp::DynamicValue::Reader(column->getUInt(row_num));
        }
        case capnp::DynamicValue::Type::BOOL:
            return capnp::DynamicValue::Reader(column->getBool(row_num));
        case capnp::DynamicValue::Type::FLOAT:
            return capnp::DynamicValue::Reader(column->getFloat64(row_num));
        case capnp::DynamicValue::Type::ENUM:
        {
            auto enum_schema = builder.as<capnp::DynamicEnum>().getSchema();
            if (data_type->getTypeId() == TypeIndex::Enum8)
                return capnp::DynamicValue::Reader(
                    getDynamicEnum<Int8>(column, data_type, row_num, enum_schema, enum_comparing_mode));
            return capnp::DynamicValue::Reader(
                getDynamicEnum<Int16>(column, data_type, row_num, enum_schema, enum_comparing_mode));
        }
        case capnp::DynamicValue::Type::DATA:
        {
            auto data = column->getDataAt(row_num);
            return capnp::DynamicValue::Reader(capnp::Data::Reader(reinterpret_cast<const kj::byte *>(data.data), data.size));
        }
        case capnp::DynamicValue::Type::TEXT:
        {
            /// In TEXT type data should be null-terminated, but ClickHouse String data could not be.
            /// To make data null-terminated we should copy it to temporary String object, but
            /// capnp::Text::Reader works only with pointer to the data and it's size, so we should
            /// guarantee that new String object life time is longer than capnp::Text::Reader life time.
            /// To do this we store new String object in a temporary storage, passed in this function
            /// by reference. We use unique_ptr<String> instead of just String to avoid pointers
            /// invalidation on vector reallocation.
            temporary_text_data_storage.push_back(std::make_unique<String>(column->getDataAt(row_num)));
            auto & data = temporary_text_data_storage.back();
            return capnp::DynamicValue::Reader(capnp::Text::Reader(data->data(), data->size()));
        }
        case capnp::DynamicValue::Type::STRUCT:
        {
            auto struct_builder = builder.as<capnp::DynamicStruct>();
            auto nested_struct_schema = struct_builder.getSchema();
            /// Struct can represent Tuple, Nullable (named union with two fields) or single column when it contains one nested column.
            if (data_type->isNullable())
            {
                const auto * nullable_type = assert_cast<const DataTypeNullable *>(data_type.get());
                const auto * nullable_column = assert_cast<const ColumnNullable *>(column.get());
                auto fields = nested_struct_schema.getUnionFields();
                if (nullable_column->isNullAt(row_num))
                {
                    auto null_field = fields[0].getType().isVoid() ? fields[0] : fields[1];
                    struct_builder.set(null_field, capnp::Void());
                }
                else
                {
                    auto value_field = fields[0].getType().isVoid() ? fields[1] : fields[0];
                    struct_builder.clear(value_field);
                    const auto & nested_column = nullable_column->getNestedColumnPtr();
                    auto value_builder = initStructFieldBuilder(nested_column, row_num, struct_builder, value_field);
                    auto value = convertToDynamicValue(nested_column, nullable_type->getNestedType(), row_num, column_name, value_builder, enum_comparing_mode, temporary_text_data_storage);
                    if (value)
                        struct_builder.set(value_field, *value);
                }
            }
            else if (isTuple(data_type))
            {
                const auto * tuple_data_type = assert_cast<const DataTypeTuple *>(data_type.get());
                const auto & nested_types = tuple_data_type->getElements();
                const auto & nested_names = tuple_data_type->getElementNames();
                const auto & nested_columns = assert_cast<const ColumnTuple *>(column.get())->getColumns();
                bool have_explicit_names = tuple_data_type->haveExplicitNames();
                for (uint32_t i = 0; i != nested_names.size(); ++i)
                {
                    capnp::StructSchema::Field nested_field = have_explicit_names ? nested_struct_schema.getFieldByName(nested_names[i]) : nested_struct_schema.getFields()[i];
                    auto field_builder = initStructFieldBuilder(nested_columns[i], row_num, struct_builder, nested_field);
                    auto value = convertToDynamicValue(nested_columns[i], nested_types[i], row_num, nested_names[i], field_builder, enum_comparing_mode, temporary_text_data_storage);
                    if (value)
                        struct_builder.set(nested_field, *value);
                }
            }
            else if (isMap(data_type))
            {
                /// We output Map type as follow CapnProto schema
                ///
                /// struct Map {
                ///     struct Entry {
                ///         key @0: Key;
                ///         value @1: Value;
                ///     }
                ///     entries @0 :List(Entry);
                /// }
                ///
                /// And we don't need to check that struct have this form here because we checked it before.
                const auto & map_type = assert_cast<const DataTypeMap &>(*data_type);
                DataTypes key_value_types = {map_type.getKeyType(), map_type.getValueType()};
                Names key_value_names = {"key", "value"};
                auto entries_type = std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(key_value_types, key_value_names));

                /// Nested column in Map is actually Array(Tuple), so we can output it according to "entries" field schema.
                const auto & entries_column = assert_cast<const ColumnMap *>(column.get())->getNestedColumnPtr();

                auto entries_field = nested_struct_schema.getFields()[0];
                auto field_builder = initStructFieldBuilder(entries_column, row_num, struct_builder, entries_field);
                auto entries_value = convertToDynamicValue(entries_column, entries_type, row_num, column_name, field_builder, enum_comparing_mode, temporary_text_data_storage);
                if (entries_value)
                    struct_builder.set(entries_field, *entries_value);
            }
            else
            {
                /// It can be nested column from Nested type.
                auto [field_name, nested_name] = splitCapnProtoFieldName(column_name);
                auto nested_field = nested_struct_schema.getFieldByName(nested_name);
                auto field_builder = initStructFieldBuilder(column, row_num, struct_builder, nested_field);
                auto value = convertToDynamicValue(column, data_type, row_num, nested_name, field_builder, enum_comparing_mode, temporary_text_data_storage);
                if (value)
                    struct_builder.set(nested_field, *value);
            }
            return std::nullopt;
        }
        case capnp::DynamicValue::Type::LIST:
        {
            auto list_builder = builder.as<capnp::DynamicList>();
            const auto * array_column = assert_cast<const ColumnArray *>(column.get());
            const auto & nested_column = array_column->getDataPtr();
            const auto & nested_type = assert_cast<const DataTypeArray *>(data_type.get())->getNestedType();
            const auto & offsets = array_column->getOffsets();
            auto offset = offsets[row_num - 1];
            size_t size = offsets[row_num] - offset;

            const auto * nested_array_column = checkAndGetColumn<ColumnArray>(*nested_column);
            for (unsigned i = 0; i != static_cast<unsigned>(size); ++i)
            {
                capnp::DynamicValue::Builder value_builder;
                /// For nested arrays we need to initialize nested list builder.
                if (nested_array_column)
                {
                    const auto & nested_offset = nested_array_column->getOffsets();
                    size_t nested_array_size = nested_offset[offset + i] - nested_offset[offset + i - 1];
                    value_builder = list_builder.init(i, static_cast<unsigned>(nested_array_size));
                }
                else
                    value_builder = list_builder[i];

                auto value = convertToDynamicValue(nested_column, nested_type, offset + i, column_name, value_builder, enum_comparing_mode, temporary_text_data_storage);
                if (value)
                    list_builder.set(i, *value);
            }
            return std::nullopt;
        }
        default:
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CapnProto type.");
    }
    const auto & header = getPort(PortKind::Main).getHeader();
    serializer = std::make_unique<CapnProtoSerializer>(header.getDataTypes(), header.getNames(), schema, format_settings.capn_proto);
    capnp::MallocMessageBuilder message;
}

void CapnProtoRowOutputFormat::write(const Columns & columns, size_t row_num)
{
    capnp::MallocMessageBuilder message;
    /// Temporary storage for data that will be outputted in fields with CapnProto type TEXT.
    /// See comment in convertToDynamicValue() for more details.
    std::vector<std::unique_ptr<String>> temporary_text_data_storage;
    capnp::DynamicStruct::Builder root = message.initRoot<capnp::DynamicStruct>(schema);

    /// Some columns can share same field builder. For example when we have
    /// column with Nested type that was flattened into several columns.
    std::unordered_map<size_t, capnp::DynamicValue::Builder> field_builders;
    for (size_t i = 0; i != columns.size(); ++i)
    {
        auto [struct_builder, field] = getStructBuilderAndFieldByColumnName(root, column_names[i]);
        if (!field_builders.contains(field.getIndex()))
        {
            auto field_builder = initStructFieldBuilder(columns[i], row_num, struct_builder, field);
            field_builders[field.getIndex()] = field_builder;
        }
        auto value = convertToDynamicValue(columns[i], column_types[i], row_num, column_names[i], field_builders[field.getIndex()], format_settings.capn_proto.enum_comparing_mode, temporary_text_data_storage);
        if (value)
            struct_builder.set(field, *value);
    }

    serializer->writeRow(columns, std::move(root), row_num);
    capnp::writeMessage(*output_stream, message);

}

void registerOutputFormatCapnProto(FormatFactory & factory)
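The removed `convertToDynamicValue` contained one subtlety worth keeping: `capnp::Text::Reader` is a non-owning view over null-terminated data, so the code parked owned copies in a `std::vector<std::unique_ptr<String>>` for the duration of the message. The same idiom, reduced to standard types (`TextView` is a stand-in for `capnp::Text::Reader`):

``` cpp
#include <cstddef>
#include <iostream>
#include <memory>
#include <string>
#include <string_view>
#include <vector>

// Stand-in for a non-owning, null-terminated view such as capnp::Text::Reader.
struct TextView
{
    const char * data;
    size_t size;
};

// Copy possibly-non-terminated input into owned storage; unique_ptr keeps the
// heap address stable even when the vector reallocates, so views stay valid.
TextView makeStableView(std::string_view raw, std::vector<std::unique_ptr<std::string>> & storage)
{
    storage.push_back(std::make_unique<std::string>(raw)); // std::string is null-terminated
    const auto & owned = *storage.back();
    return TextView{owned.data(), owned.size()};
}

int main()
{
    std::vector<std::unique_ptr<std::string>> storage; // must outlive every view
    std::vector<TextView> views;
    for (std::string_view s : {"alpha", "beta", "gamma"})
        views.push_back(makeStableView(s, storage));

    for (const auto & v : views)
        std::cout << v.data << '\n'; // safe: storage is still alive here
    return 0;
}
```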
@@ -3,15 +3,17 @@
#include "config.h"
#if USE_CAPNP

#include <Processors/Formats/IRowOutputFormat.h>
#include <Formats/FormatSchemaInfo.h>
#include <Formats/CapnProtoUtils.h>
#include <capnp/schema.h>
#include <capnp/dynamic.h>
#include <kj/io.h>
#    include <Formats/CapnProtoSchema.h>
#    include <Formats/CapnProtoSerializer.h>
#    include <Formats/FormatSchemaInfo.h>
#    include <Processors/Formats/IRowOutputFormat.h>
#    include <capnp/dynamic.h>
#    include <capnp/schema.h>
#    include <kj/io.h>

namespace DB
{

class CapnProtoOutputStream : public kj::OutputStream
{
public:
@@ -43,8 +45,9 @@ private:
    DataTypes column_types;
    capnp::StructSchema schema;
    std::unique_ptr<CapnProtoOutputStream> output_stream;
    const FormatSettings format_settings;
    CapnProtoSchemaParser schema_parser;
    std::unique_ptr<CapnProtoSerializer> serializer;

};

}
@@ -283,6 +283,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof)

        /// Allow optional \n before eof.
        checkChar('\n', *buf);
        if (format_settings.custom.skip_trailing_empty_lines)
            while (checkChar('\n', *buf) || checkChar('\r', *buf));
        return buf->eof();
    }

@@ -294,6 +296,8 @@ bool CustomSeparatedFormatReader::checkForSuffixImpl(bool check_eof)

        /// Allow optional \n before eof.
        checkChar('\n', *buf);
        if (format_settings.custom.skip_trailing_empty_lines)
            while (checkChar('\n', *buf) || checkChar('\r', *buf));
        if (buf->eof())
            return true;
    }
@@ -55,7 +55,7 @@ void registerJSONColumnsSchemaReader(FormatFactory & factory)
    );
    factory.registerAdditionalInfoForSchemaCacheGetter("JSONColumns", [](const FormatSettings & settings)
    {
        return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
        return getAdditionalFormatInfoForAllRowBasedFormats(settings) + getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
    });
}

@@ -176,6 +176,8 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase(
    , hints_str(format_settings_.schema_inference_hints)
    , reader(std::move(reader_))
    , column_names_from_settings(splitColumnNames(format_settings_.column_names_for_schema_inference))
    , max_rows_to_read(format_settings_.max_rows_to_read_for_schema_inference)
    , max_bytes_to_read(format_settings_.max_bytes_to_read_for_schema_inference)
{
}

@@ -196,12 +198,12 @@ void JSONColumnsSchemaReaderBase::transformTypesIfNeeded(DataTypePtr & type, Dat

NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
{
    size_t total_rows_read = 0;
    std::unordered_map<String, DataTypePtr> names_to_types;
    std::vector<String> names_order;
    /// Read data block by block and determine the type for each column
    /// until max_rows_to_read_for_schema_inference is reached.
    while (total_rows_read < format_settings.max_rows_to_read_for_schema_inference)
    /// until max_rows_to_read/max_bytes_to_read is reached.
    /// Note that we can exceed max_bytes_to_read to complete block parsing.
    while (total_rows_read < max_rows_to_read && in.count() < max_bytes_to_read)
    {
        if (in.eof())
            break;
@@ -268,7 +270,7 @@ NamesAndTypesList JSONColumnsSchemaReaderBase::readSchema()
    return result;
}

DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read)
DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows)
{
    /// Check for empty column.
    if (reader->checkColumnEnd())
@@ -279,7 +281,7 @@ DataTypePtr JSONColumnsSchemaReaderBase::readColumnAndGetDataType(const String &
    do
    {
        /// If we reached max_rows_to_read, skip the rest part of this column.
        if (rows_read == max_rows_to_read)
        if (rows_read == max_rows)
        {
            reader->skipColumn();
            break;
@@ -82,11 +82,19 @@ public:
    bool needContext() const override { return !hints_str.empty(); }
    void setContext(ContextPtr & ctx) override;

    void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override
    {
        max_rows_to_read = max_rows;
        max_bytes_to_read = max_bytes;
    }

    size_t getNumRowsRead() const override { return total_rows_read; }

private:
    NamesAndTypesList readSchema() override;

    /// Read whole column in the block (up to max_rows_to_read rows) and extract the data type.
    DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows_to_read);
    /// Read whole column in the block (up to max_rows rows) and extract the data type.
    DataTypePtr readColumnAndGetDataType(const String & column_name, size_t & rows_read, size_t max_rows);

    const FormatSettings format_settings;
    String hints_str;
@@ -95,6 +103,10 @@ private:
    std::unique_ptr<JSONColumnsReaderBase> reader;
    Names column_names_from_settings;
    JSONInferenceInfo inference_info;

    size_t total_rows_read = 0;
    size_t max_rows_to_read;
    size_t max_bytes_to_read;
};

}
@@ -53,7 +53,7 @@ void registerJSONCompactColumnsSchemaReader(FormatFactory & factory)
    );
    factory.registerAdditionalInfoForSchemaCacheGetter("JSONCompactColumns", [](const FormatSettings & settings)
    {
        auto result = getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
        auto result = getAdditionalFormatInfoForAllRowBasedFormats(settings) + getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON);
        return result + fmt::format(", column_names_for_schema_inference={}", settings.column_names_for_schema_inference);
    });
}
@@ -88,7 +88,14 @@ void registerInputFormatProtobufList(FormatFactory & factory)
    });
    factory.markFormatSupportsSubsetOfColumns("ProtobufList");
    factory.registerAdditionalInfoForSchemaCacheGetter(
        "ProtobufList", [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); });
        "ProtobufList",
        [](const FormatSettings & settings)
        {
            return fmt::format(
                "format_schema={}, skip_fields_with_unsupported_types_in_schema_inference={}",
                settings.schema.format_schema,
                settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference);
        });
}

void registerProtobufListSchemaReader(FormatFactory & factory)
@@ -128,7 +128,14 @@ void registerProtobufSchemaReader(FormatFactory & factory)

    for (const auto & name : {"Protobuf", "ProtobufSingle"})
        factory.registerAdditionalInfoForSchemaCacheGetter(
            name, [](const FormatSettings & settings) { return fmt::format("format_schema={}", settings.schema.format_schema); });
            name,
            [](const FormatSettings & settings)
            {
                return fmt::format(
                    "format_schema={}, skip_fields_with_unsupported_types_in_schema_inference={}",
                    settings.schema.format_schema,
                    settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference);
            });
}

}
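The three `registerAdditionalInfoForSchemaCacheGetter` hunks above (CapnProto, ProtobufList, Protobuf) share one motive: any setting that can change the inferred schema must be folded into the schema-cache key, otherwise a cached schema leaks across configurations. A minimal model of such a key builder (the settings struct here is a hypothetical subset):

``` cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Hypothetical subset of the settings that influence schema inference.
struct Settings
{
    std::string format_schema;
    bool skip_unsupported_fields;
};

// The cache key must mention every schema-affecting setting.
std::string cacheKey(const std::string & format, const Settings & s)
{
    return format + ":format_schema=" + s.format_schema
         + ", skip_fields_with_unsupported_types_in_schema_inference="
         + (s.skip_unsupported_fields ? "1" : "0");
}

int main()
{
    std::unordered_map<std::string, std::string> schema_cache;
    Settings a{"events.capnp:Event", false};
    Settings b{"events.capnp:Event", true}; // same schema file, different inference result

    schema_cache[cacheKey("CapnProto", a)] = "a UInt64, b String";
    // Distinct keys => the second configuration cannot pick up the first one's schema.
    std::cout << std::boolalpha
              << (cacheKey("CapnProto", a) == cacheKey("CapnProto", b)) << '\n'; // false
    return 0;
}
```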
@@ -286,6 +286,20 @@ void TabSeparatedFormatReader::setReadBuffer(ReadBuffer & in_)
    FormatWithNamesAndTypesReader::setReadBuffer(*buf);
}

bool TabSeparatedFormatReader::checkForSuffix()
{
    if (!format_settings.tsv.skip_trailing_empty_lines)
        return buf->eof();

    PeekableReadBufferCheckpoint checkpoint(*buf);
    while (checkChar('\n', *buf) || checkChar('\r', *buf));
    if (buf->eof())
        return true;

    buf->rollbackToCheckpoint();
    return false;
}

TabSeparatedSchemaReader::TabSeparatedSchemaReader(
    ReadBuffer & in_, bool with_names_, bool with_types_, bool is_raw_, const FormatSettings & format_settings_)
    : FormatWithNamesAndTypesSchemaReader(
@@ -75,6 +75,8 @@ public:

    void setReadBuffer(ReadBuffer & in_) override;

    bool checkForSuffix() override;

private:
    template <bool is_header>
    std::vector<String> readRowImpl();
@@ -88,7 +88,7 @@ std::vector<float> ApproximateNearestNeighborCondition::getReferenceVector() con
    throw Exception(ErrorCodes::LOGICAL_ERROR, "Reference vector was requested for useless or uninitialized index.");
}

size_t ApproximateNearestNeighborCondition::getNumOfDimensions() const
size_t ApproximateNearestNeighborCondition::getDimensions() const
{
    if (index_is_useful && query_information.has_value())
        return query_information->reference_vector.size();
@@ -90,8 +90,8 @@ public:
    /// Distance should be calculated regarding to referenceVector
    std::vector<float> getReferenceVector() const;

    /// Reference vector's dimension size
    size_t getNumOfDimensions() const;
    /// Reference vector's dimension count
    size_t getDimensions() const;

    String getColumnName() const;

@@ -99,6 +99,7 @@

#include <fmt/format.h>
#include <Poco/Logger.h>
#include <Poco/Net/NetException.h>

template <>
struct fmt::formatter<DB::DataPartPtr> : fmt::formatter<std::string>
@@ -1254,6 +1255,14 @@ MergeTreeData::LoadPartResult MergeTreeData::loadDataPart(
        mark_broken();
        return res;
    }
    catch (const Poco::Net::NetException &)
    {
        throw;
    }
    catch (const Poco::TimeoutException &)
    {
        throw;
    }
    catch (...)
    {
        mark_broken();
@@ -27,13 +27,13 @@ namespace ErrorCodes


template <typename Distance>
AnnoyIndexWithSerialization<Distance>::AnnoyIndexWithSerialization(uint64_t dim)
    : Base::AnnoyIndex(dim)
AnnoyIndexWithSerialization<Distance>::AnnoyIndexWithSerialization(size_t dimensions)
    : Base::AnnoyIndex(dimensions)
{
}

template<typename Distance>
void AnnoyIndexWithSerialization<Distance>::serialize(WriteBuffer& ostr) const
void AnnoyIndexWithSerialization<Distance>::serialize(WriteBuffer & ostr) const
{
    chassert(Base::_built);
    writeIntBinary(Base::_s, ostr);
@@ -43,11 +43,11 @@ void AnnoyIndexWithSerialization<Distance>::serialize(WriteBuffer& ostr) const
    writeIntBinary(Base::_K, ostr);
    writeIntBinary(Base::_seed, ostr);
    writeVectorBinary(Base::_roots, ostr);
    ostr.write(reinterpret_cast<const char*>(Base::_nodes), Base::_s * Base::_n_nodes);
    ostr.write(reinterpret_cast<const char *>(Base::_nodes), Base::_s * Base::_n_nodes);
}

template<typename Distance>
void AnnoyIndexWithSerialization<Distance>::deserialize(ReadBuffer& istr)
void AnnoyIndexWithSerialization<Distance>::deserialize(ReadBuffer & istr)
{
    chassert(!Base::_built);
    readIntBinary(Base::_s, istr);
@@ -69,7 +69,7 @@ void AnnoyIndexWithSerialization<Distance>::deserialize(ReadBuffer& istr)
}

template<typename Distance>
uint64_t AnnoyIndexWithSerialization<Distance>::getNumOfDimensions() const
size_t AnnoyIndexWithSerialization<Distance>::getDimensions() const
{
    return Base::get_f();
}
@@ -97,14 +97,14 @@ void MergeTreeIndexGranuleAnnoy<Distance>::serializeBinary(WriteBuffer & ostr) c
{
    /// Number of dimensions is required in the index constructor,
    /// so it must be written and read separately from the other part
    writeIntBinary(index->getNumOfDimensions(), ostr); // write dimension
    writeIntBinary(static_cast<UInt64>(index->getDimensions()), ostr); // write dimension
    index->serialize(ostr);
}

template <typename Distance>
void MergeTreeIndexGranuleAnnoy<Distance>::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
{
    uint64_t dimension;
    UInt64 dimension;
    readIntBinary(dimension, istr);
    index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(dimension);
    index->deserialize(istr);
@@ -114,7 +114,7 @@ template <typename Distance>
MergeTreeIndexAggregatorAnnoy<Distance>::MergeTreeIndexAggregatorAnnoy(
    const String & index_name_,
    const Block & index_sample_block_,
    uint64_t trees_)
    UInt64 trees_)
    : index_name(index_name_)
    , index_sample_block(index_sample_block_)
    , trees(trees_)
@@ -251,10 +251,10 @@ std::vector<size_t> MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI

    const AnnoyIndexWithSerializationPtr<Distance> annoy = granule->index;

    if (ann_condition.getNumOfDimensions() != annoy->getNumOfDimensions())
    if (ann_condition.getDimensions() != annoy->getDimensions())
        throw Exception(ErrorCodes::INCORRECT_QUERY, "The dimension of the space in the request ({}) "
            "does not match the dimension in the index ({})",
            ann_condition.getNumOfDimensions(), annoy->getNumOfDimensions());
            ann_condition.getDimensions(), annoy->getDimensions());

    std::vector<UInt64> neighbors; /// indexes of dots which were closest to the reference vector
    std::vector<Float32> distances;
@@ -281,7 +281,7 @@ std::vector<size_t> MergeTreeIndexConditionAnnoy::getUsefulRangesImpl(MergeTreeI
    return granule_numbers;
}

MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String & distance_function_)
MergeTreeIndexAnnoy::MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_)
    : IMergeTreeIndex(index_)
    , trees(trees_)
    , distance_function(distance_function_)
@@ -320,9 +320,9 @@ MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index)
    if (!index.arguments.empty())
        distance_function = index.arguments[0].get<String>();

    uint64_t trees = default_trees;
    UInt64 trees = default_trees;
    if (index.arguments.size() > 1)
        trees = index.arguments[1].get<uint64_t>();
        trees = index.arguments[1].get<UInt64>();

    return std::make_shared<MergeTreeIndexAnnoy>(index, trees, distance_function);
}
@@ -338,7 +338,7 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
        throw Exception(ErrorCodes::INCORRECT_QUERY, "Distance function argument of Annoy index must be of type String");

    if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::UInt64)
        throw Exception(ErrorCodes::INCORRECT_QUERY, "Number of trees argument of Annoy index must be UInt64");
        throw Exception(ErrorCodes::INCORRECT_QUERY, "Number of trees argument of Annoy index must be of type UInt64");

    /// Check that the index is created on a single column

@@ -351,17 +351,16 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
    {
        String distance_name = index.arguments[0].get<String>();
        if (distance_name != "L2Distance" && distance_name != "cosineDistance")
            throw Exception(ErrorCodes::INCORRECT_DATA, "Annoy index supports only distance functions 'L2Distance' and 'cosineDistance'. Given distance function: {}", distance_name);
            throw Exception(ErrorCodes::INCORRECT_DATA, "Annoy index only supports distance functions 'L2Distance' and 'cosineDistance'");
    }

    /// Check data type of indexed column:

    auto throw_unsupported_underlying_column_exception = [](DataTypePtr data_type)
    auto throw_unsupported_underlying_column_exception = []()
    {
        throw Exception(
            ErrorCodes::ILLEGAL_COLUMN,
            "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32). Given type: {}",
            data_type->getName());
            "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
    };

    DataTypePtr data_type = index.sample_block.getDataTypes()[0];
@@ -370,7 +369,7 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
    {
        TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId();
        if (!WhichDataType(nested_type_index).isFloat32())
            throw_unsupported_underlying_column_exception(data_type);
            throw_unsupported_underlying_column_exception();
    }
    else if (const auto * data_type_tuple = typeid_cast<const DataTypeTuple *>(data_type.get()))
    {
@@ -379,11 +378,11 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
        {
            TypeIndex nested_type_index = inner_type->getTypeId();
            if (!WhichDataType(nested_type_index).isFloat32())
                throw_unsupported_underlying_column_exception(data_type);
                throw_unsupported_underlying_column_exception();
        }
    }
    else
        throw_unsupported_underlying_column_exception(data_type);
        throw_unsupported_underlying_column_exception();
}

}
@@ -16,10 +16,10 @@ class AnnoyIndexWithSerialization : public Annoy::AnnoyIndex<UInt64, Float32, Di
    using Base = Annoy::AnnoyIndex<UInt64, Float32, Distance, Annoy::Kiss64Random, Annoy::AnnoyIndexMultiThreadedBuildPolicy>;

public:
    explicit AnnoyIndexWithSerialization(uint64_t dim);
    void serialize(WriteBuffer& ostr) const;
    void deserialize(ReadBuffer& istr);
    uint64_t getNumOfDimensions() const;
    explicit AnnoyIndexWithSerialization(size_t dimensions);
    void serialize(WriteBuffer & ostr) const;
    void deserialize(ReadBuffer & istr);
    size_t getDimensions() const;
};

template <typename Distance>
@@ -46,7 +46,7 @@ struct MergeTreeIndexGranuleAnnoy final : public IMergeTreeIndexGranule
template <typename Distance>
struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator
{
    MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, uint64_t trees);
    MergeTreeIndexAggregatorAnnoy(const String & index_name_, const Block & index_sample_block, UInt64 trees);
    ~MergeTreeIndexAggregatorAnnoy() override = default;

    bool empty() const override { return !index || index->get_n_items() == 0; }
@@ -55,7 +55,7 @@ struct MergeTreeIndexAggregatorAnnoy final : IMergeTreeIndexAggregator

    const String index_name;
    const Block index_sample_block;
    const uint64_t trees;
    const UInt64 trees;
    AnnoyIndexWithSerializationPtr<Distance> index;
};

@@ -89,7 +89,7 @@ class MergeTreeIndexAnnoy : public IMergeTreeIndex
{
public:

    MergeTreeIndexAnnoy(const IndexDescription & index_, uint64_t trees_, const String & distance_function_);
    MergeTreeIndexAnnoy(const IndexDescription & index_, UInt64 trees_, const String & distance_function_);

    ~MergeTreeIndexAnnoy() override = default;

@@ -100,7 +100,7 @@ public:
    bool mayBenefitFromIndexForIn(const ASTPtr & /*node*/) const override { return false; }

private:
    const uint64_t trees;
    const UInt64 trees;
    const String distance_function;
};

@@ -67,8 +67,8 @@ struct Settings;
    M(Bool, remove_rolled_back_parts_immediately, 1, "Setting for an incomplete experimental feature.", 0) \
    M(CleanDeletedRows, clean_deleted_rows, CleanDeletedRows::Never, "Is the Replicated Merge cleanup has to be done automatically at each merge or manually (possible values are 'Always'/'Never' (default))", 0) \
    M(UInt64, replicated_max_mutations_in_one_entry, 10000, "Max number of mutation commands that can be merged together and executed in one MUTATE_PART entry (0 means unlimited)", 0) \
-    M(UInt64, number_of_mutations_to_delay, 0, "If table has at least that many unfinished mutations, artificially slow down mutations of table. Disabled if set to 0", 0) \
-    M(UInt64, number_of_mutations_to_throw, 0, "If table has at least that many unfinished mutations, throw 'Too many mutations' exception. Disabled if set to 0", 0) \
+    M(UInt64, number_of_mutations_to_delay, 500, "If table has at least that many unfinished mutations, artificially slow down mutations of table. Disabled if set to 0", 0) \
+    M(UInt64, number_of_mutations_to_throw, 1000, "If table has at least that many unfinished mutations, throw 'Too many mutations' exception. Disabled if set to 0", 0) \
    M(UInt64, min_delay_to_mutate_ms, 10, "Min delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \
    M(UInt64, max_delay_to_mutate_ms, 1000, "Max delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \
    \
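Note that these two thresholds are no longer disabled by default (0 becomes 500 and 1000), so workloads that deliberately pile up mutations now have to opt out per table. A minimal sketch of the opt-out, which is exactly what the many_mutations tests further down in this diff do (the table name is arbitrary):

    #!/usr/bin/env bash
    # Restore the old unthrottled behaviour for a single table.
    clickhouse-client --query "
        CREATE TABLE t(x UInt32) ENGINE = MergeTree ORDER BY x
        SETTINGS number_of_mutations_to_delay = 0, number_of_mutations_to_throw = 0"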
@@ -1358,7 +1358,7 @@ size_t StorageMergeTree::getNumberOfUnfinishedMutations() const
    size_t count = 0;
    for (const auto & [version, _] : current_mutations_by_version | std::views::reverse)
    {
-        auto status = getIncompleteMutationsStatusUnlocked(version, lock);
+        auto status = getIncompleteMutationsStatusUnlocked(version, lock, nullptr, true);
        if (!status)
            continue;
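The count computed here is what the new delay/throw thresholds above compare against. The same notion of an unfinished mutation is visible from SQL via the system.mutations table; a quick sketch for eyeballing it:

    #!/usr/bin/env bash
    # Roughly the per-table figure the throttling logic keys off.
    clickhouse-client --query "
        SELECT database, table, count() AS unfinished
        FROM system.mutations
        WHERE NOT is_done
        GROUP BY database, table"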
42
tests/ci/attach_gdb.lib
Normal file
@@ -0,0 +1,42 @@
#!/bin/bash

function attach_gdb_to_clickhouse()
{
    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
    # and clickhouse-server can do fork-exec, for example, to run some bridge.
    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
    # explicitly ignore non-fatal signals that are used by server.
    # Number of SIGRTMIN can be determined only in runtime.
    RTMIN=$(kill -l SIGRTMIN)
    echo "
set follow-fork-mode parent
handle SIGHUP nostop noprint pass
handle SIGINT nostop noprint pass
handle SIGQUIT nostop noprint pass
handle SIGPIPE nostop noprint pass
handle SIGTERM nostop noprint pass
handle SIGUSR1 nostop noprint pass
handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
backtrace full
thread apply all backtrace full
info registers
disassemble /s
up
disassemble /s
up
disassemble /s
p \"done\"
detach
quit
" > script.gdb

    # FIXME Hung check may work incorrectly because of attached gdb
    # We cannot attach another gdb to get stacktraces if some queries hung
    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
    sleep 5
    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
}
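The helper is meant to be sourced by the test images and invoked once the server is up; a minimal usage sketch (paths as used elsewhere in this diff, not a prescribed invocation):

    #!/bin/bash
    # Attach gdb to a running clickhouse-server, run the workload,
    # then inspect the collected backtraces.
    source tests/ci/attach_gdb.lib
    attach_gdb_to_clickhouse
    # ... run the test workload ...
    tail -n 50 /test_output/gdb.log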
@@ -378,34 +378,16 @@ def main():

    print(f"::notice:: {check_name} Report url: {report_url}")
    if args.post_commit_status == "commit_status":
-        if "parallelreplicas" in check_name.lower():
-            post_commit_status(
-                commit,
-                "success",
-                report_url,
-                description,
-                check_name_with_group,
-                pr_info,
-            )
-        else:
-            post_commit_status(
-                commit, state, report_url, description, check_name_with_group, pr_info
-            )
+        post_commit_status(
+            commit, state, report_url, description, check_name_with_group, pr_info
+        )
    elif args.post_commit_status == "file":
-        if "parallelreplicas" in check_name.lower():
-            post_commit_status_to_file(
-                post_commit_path,
-                description,
-                "success",
-                report_url,
-            )
-        else:
-            post_commit_status_to_file(
-                post_commit_path,
-                description,
-                state,
-                report_url,
-            )
+        post_commit_status_to_file(
+            post_commit_path,
+            description,
+            state,
+            report_url,
+        )
    else:
        raise Exception(
            f'Unknown post_commit_status option "{args.post_commit_status}"'
@@ -423,11 +405,7 @@ def main():
    ch_helper.insert_events_into(db="default", table="checks", events=prepared_events)

    if state != "success":
-        # Parallel replicas are always green for now
-        if (
-            FORCE_TESTS_LABEL in pr_info.labels
-            or "parallelreplicas" in check_name.lower()
-        ):
+        if FORCE_TESTS_LABEL in pr_info.labels:
            print(f"'{FORCE_TESTS_LABEL}' enabled, will report success")
        else:
            sys.exit(1)
@@ -9,6 +9,8 @@ FAIL="\tFAIL\t\\N\t"
FAILURE_CONTEXT_LINES=100
FAILURE_CONTEXT_MAX_LINE_WIDTH=300
+
+source attach_gdb.lib

function escaped()
{
    # That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
@@ -184,44 +186,7 @@ function start()
        counter=$((counter + 1))
    done

-    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
-    # and clickhouse-server can do fork-exec, for example, to run some bridge.
-    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
-    # explicitly ignore non-fatal signals that are used by server.
-    # Number of SIGRTMIN can be determined only in runtime.
-    RTMIN=$(kill -l SIGRTMIN)
-    echo "
-set follow-fork-mode parent
-handle SIGHUP nostop noprint pass
-handle SIGINT nostop noprint pass
-handle SIGQUIT nostop noprint pass
-handle SIGPIPE nostop noprint pass
-handle SIGTERM nostop noprint pass
-handle SIGUSR1 nostop noprint pass
-handle SIGUSR2 nostop noprint pass
-handle SIG$RTMIN nostop noprint pass
-info signals
-continue
-backtrace full
-thread apply all backtrace full
-info registers
-disassemble /s
-up
-disassemble /s
-up
-disassemble /s
-p \"done\"
-detach
-quit
-" > script.gdb
-
-    # FIXME Hung check may work incorrectly because of attached gdb
-    # 1. False positives are possible
-    # 2. We cannot attach another gdb to get stacktraces if some queries hung
-    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
-    sleep 5
-    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
-    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
+    attach_gdb_to_clickhouse
}

function check_server_start()
@@ -340,9 +340,28 @@ def get_transactions_list(args):
        return f"Cannot get list of transactions: {e}"


+def kill_gdb_if_any():
+    # Check if we have running gdb.
+    code = subprocess.call("pidof gdb", shell=True)
+    if code != 0:
+        return
+
+    for i in range(5):
+        code = subprocess.call("kill -TERM $(pidof gdb)", shell=True, timeout=30)
+        if code != 0:
+            sleep(i)
+        else:
+            break
+
+
# collect server stacktraces using gdb
def get_stacktraces_from_gdb(server_pid):
    try:
+        # We could attach gdb to clickhouse-server before running some tests
+        # to print stacktraces of all crashes even if clickhouse cannot print it for some reason.
+        # We should kill existing gdb if any before starting new one.
+        kill_gdb_if_any()
+
        cmd = f"gdb -batch -ex 'thread apply all backtrace' -p {server_pid}"
        return subprocess.check_output(cmd, shell=True).decode("utf-8")
    except Exception as e:
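For reference, the retry loop in kill_gdb_if_any translates almost one-to-one into shell; a sketch for illustration only, not part of the diff:

    #!/bin/bash
    # Terminate any leftover gdb before attaching a new one; back off and
    # retry if the TERM signal does not get through.
    for i in 1 2 3 4 5; do
        pidof gdb > /dev/null || break           # nothing left to kill
        kill -TERM "$(pidof gdb)" || sleep "$i"  # on failure, wait and retry
    done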
@@ -1050,6 +1050,8 @@ def select_without_columns(clickhouse_node, mysql_node, service_name):


def insert_with_modify_binlog_checksum(clickhouse_node, mysql_node, service_name):
    clickhouse_node.query("DROP DATABASE IF EXISTS test_checksum")
    mysql_node.query("DROP DATABASE IF EXISTS test_checksum")
    mysql_node.query("CREATE DATABASE test_checksum")
    mysql_node.query("CREATE TABLE test_checksum.t (a INT PRIMARY KEY, b varchar(200))")
    clickhouse_node.query(
@@ -1081,6 +1083,21 @@ def insert_with_modify_binlog_checksum(clickhouse_node, mysql_node, service_name
        "1\t1111\n2\t2222\n3\t3333\n",
    )

+    clickhouse_node.query("DROP DATABASE test_checksum")
+    mysql_node.query("SET GLOBAL binlog_checksum=NONE")
+    clickhouse_node.query(
+        "CREATE DATABASE test_checksum ENGINE = MaterializeMySQL('{}:3306', 'test_checksum', 'root', 'clickhouse')".format(
+            service_name
+        )
+    )
+    check_query(clickhouse_node, "SHOW TABLES FROM test_checksum FORMAT TSV", "t\n")
+    mysql_node.query("INSERT INTO test_checksum.t VALUES(4, '4444')")
+    check_query(
+        clickhouse_node,
+        "SELECT * FROM test_checksum.t ORDER BY a FORMAT TSV",
+        "1\t1111\n2\t2222\n3\t3333\n4\t4444\n",
+    )
+
    clickhouse_node.query("DROP DATABASE test_checksum")
    mysql_node.query("DROP DATABASE test_checksum")
@@ -12,6 +12,9 @@
\N [NULL,NULL,42] (NULL)
1 [1,NULL,2] (1)
\N [NULL,NULL,42] (NULL)
OK
OK
OK
one
two
tHrEe
@@ -21,6 +24,14 @@ threE
first
second
third
first
second
third
OK
one
two
tHrEe
OK
OK
OK
OK
@@ -71,16 +71,25 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE capnp_nullable"


$CLICKHOUSE_CLIENT --query="SELECT CAST(number, 'Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message'" > $CAPN_PROTO_FILE
$CLICKHOUSE_CLIENT --query="SELECT CAST(number % 2, 'Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 4)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';
$CLICKHOUSE_CLIENT --query="SELECT CAST(number % 2, 'Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 4)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names_case_insensitive'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';
$CLICKHOUSE_CLIENT --query="SELECT CAST(number % 2, 'Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 4)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';

$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 1, \'two\' = 2, \'tHrEe\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'"
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'oNe\' = 1, \'tWo\' = 2, \'threE\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names_case_insensitive'"
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'second\' = 1, \'third\' = 2)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'"

$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'second\' = 1, \'third\' = 2)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'"
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 0, \'two\' = 1, \'three\' = 2)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2, \'four\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'"
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'one\' = 1, \'two\' = 2, \'tHrEe\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 1, \'two\' = 2, \'three\' = 3)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names_case_insensitive'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';

$CLICKHOUSE_CLIENT --query="SELECT CAST(number % 2, 'Enum(\'one\' = 0, \'two\' = 1)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_enum:Message'" > $CAPN_PROTO_FILE
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'two\' = 1)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'two\' = 1)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'two\' = 1)') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_names_case_insensitive'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';


$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS capnp_low_cardinality"
$CLICKHOUSE_CLIENT --query="CREATE TABLE capnp_low_cardinality (lc1 LowCardinality(String), lc2 LowCardinality(Nullable(String)), lc3 Array(LowCardinality(Nullable(String)))) ENGINE=Memory"
$CLICKHOUSE_CLIENT --query="INSERT INTO capnp_low_cardinality VALUES ('one', 'two', ['one', Null, 'two', Null]), ('two', Null, [Null])"
@@ -96,8 +105,8 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a_b U

$CLICKHOUSE_CLIENT --query="SELECT number AS a_b, number + 1 AS a_c_d, number + 2 AS a_c_e_f FROM numbers(5) FORMAT CapnProto SETTINGS format_schema='$CLIENT_SCHEMADIR/02030_capnp_nested_tuples:Message'" > $CAPN_PROTO_FILE
$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(f UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'"
-$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(bb UInt64, c Tuple(d UInt64, e Tuple(f UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';
-$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(ff UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "CAPN_PROTO_BAD_CAST" && echo 'OK' || echo 'FAIL';
+$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(bb UInt64, c Tuple(d UInt64, e Tuple(f UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "THERE_IS_NO_COLUMN" && echo 'OK' || echo 'FAIL';
+$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'a Tuple(b UInt64, c Tuple(d UInt64, e Tuple(ff UInt64)))') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_nested_tuples:Message'" 2>&1 | grep -F -q "THERE_IS_NO_COLUMN" && echo 'OK' || echo 'FAIL';


$CLICKHOUSE_CLIENT --query="SELECT * FROM file('data.capnp', 'CapnProto', 'string String') SETTINGS format_schema='$SERVER_SCHEMADIR/02030_capnp_simple_types:Message'" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL';
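The mode names tested above mean: by_names matches ClickHouse Enum elements to Cap'n Proto enumerants by name, by_names_case_insensitive does the same ignoring case, and by_values matches by numeric value only, so the reading side may use entirely different names. A minimal sketch outside the test harness (it assumes the 02030_capnp_enum schema shipped with the tests is reachable on the local path):

    #!/usr/bin/env bash
    # Write three enum values, then read them back matching purely by value.
    clickhouse-local --query "SELECT CAST(number, 'Enum(\'one\' = 0, \'two\' = 1, \'tHrEe\' = 2)') AS value FROM numbers(3) FORMAT CapnProto SETTINGS format_schema='02030_capnp_enum:Message'" > data.capnp
    clickhouse-local --query "SELECT * FROM file('data.capnp', 'CapnProto', 'value Enum(\'first\' = 0, \'second\' = 1, \'third\' = 2)') SETTINGS format_schema='02030_capnp_enum:Message', format_capn_proto_enum_comparising_mode='by_values'"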
@@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

-$CLICKHOUSE_CLIENT -q "create table many_mutations (x UInt32, y UInt32) engine = MergeTree order by x"
+$CLICKHOUSE_CLIENT -q "create table many_mutations (x UInt32, y UInt32) engine = MergeTree order by x settings number_of_mutations_to_delay = 0, number_of_mutations_to_throw = 0"
$CLICKHOUSE_CLIENT -q "insert into many_mutations values (0, 0), (1, 1)"
$CLICKHOUSE_CLIENT -q "system stop merges many_mutations"
@@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

-$CLICKHOUSE_CLIENT -q "create table many_mutations (x UInt32, y UInt32) engine = MergeTree order by x"
+$CLICKHOUSE_CLIENT -q "create table many_mutations (x UInt32, y UInt32) engine = MergeTree order by x settings number_of_mutations_to_delay = 0, number_of_mutations_to_throw = 0"
$CLICKHOUSE_CLIENT -q "insert into many_mutations select number, number + 1 from numbers(2000)"
$CLICKHOUSE_CLIENT -q "system stop merges many_mutations"
@@ -1,118 +1,144 @@
--- Test with Array ---
WHERE type, L2Distance
1 [0,0,10]
2 [0,0,10.5]
3 [0,0,9.5]
4 [0,0,9.7]
5 [0,0,10.2]
ORDER BY type, L2Distance
1 [0,0,10]
5 [0,0,10.2]
4 [0,0,9.7]
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: annoy_index
Description: annoy GRANULARITY 1
Parts: 1/1
Granules: 1/3
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: annoy_index
Description: annoy GRANULARITY 1
Parts: 1/1
Granules: 3/3
parameter annoy_index_search_k_nodes
parameter max_limit_for_ann_queries
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
--- Test with Tuple ---
WHERE type, L2Distance
1 (0,0,10)
2 (0,0,10.5)
3 (0,0,9.5)
4 (0,0,9.7)
5 (0,0,10.2)
ORDER BY type, L2Distance
1 (0,0,10)
5 (0,0,10.2)
4 (0,0,9.7)
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: annoy_index
Description: annoy GRANULARITY 1
Parts: 1/1
Granules: 1/3
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: annoy_index
Description: annoy GRANULARITY 1
Parts: 1/1
Granules: 3/3
parameter annoy_index_search_k_nodes
parameter max_limit_for_ann_queries
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
--- Test alternative metric (cosine distance) and non-default NumTrees ---
WHERE type, L2Distance
1 [0,0,10]
2 [0,0,10.5]
3 [0,0,9.5]
4 [0,0,9.7]
5 [0,0,10.2]
ORDER BY type, L2Distance
1 [0,0,10]
5 [0,0,10.2]
4 [0,0,9.7]
--- Negative tests ---
--- Test default GRANULARITY (should be 100 mio. for annoy)---
CREATE TABLE default.tab\n(\n    `id` Int32,\n    `vector` Array(Float32),\n    INDEX annoy_index vector TYPE annoy GRANULARITY 100000000\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192
CREATE TABLE default.tab\n(\n    `id` Int32,\n    `vector` Array(Float32),\n    INDEX annoy_index vector TYPE annoy GRANULARITY 100000000\n)\nENGINE = MergeTree\nORDER BY id\nSETTINGS index_granularity = 8192
--- Test with Array, GRANULARITY = 1, index_granularity = 5 ---
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: annoy_index
Description: annoy GRANULARITY 1
Parts: 1/1
Granules: 1/3
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: annoy_index
Description: annoy GRANULARITY 1
Parts: 1/1
Granules: 3/3
Reference ARRAYs with non-matching dimension are rejected
Special case: MaximumDistance is negative
WHERE type, L2Distance
Special case: setting annoy_index_search_k_nodes
Special case: setting max_limit_for_ann_queries
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
--- Test with Tuple, GRANULARITY = 1, index_granularity = 5 ---
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: annoy_index
Description: annoy GRANULARITY 1
Parts: 1/1
Granules: 1/3
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 3/3
Skip
Name: annoy_index
Description: annoy GRANULARITY 1
Parts: 1/1
Granules: 3/3
--- Test non-default metric (cosine distance) + non-default NumTrees (200) ---
--- Test with Array, GRANULARITY = 2, index_granularity = 4 ---
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: annoy_index
Description: annoy GRANULARITY 2
Parts: 0/1
Granules: 2/4
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: annoy_index
Description: annoy GRANULARITY 2
Parts: 1/1
Granules: 4/4
--- Test with Array, GRANULARITY = 4, index_granularity = 4 ---
WHERE type, L2Distance, check that index is used
Expression ((Projection + Before ORDER BY))
Limit (preliminary LIMIT (without OFFSET))
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: annoy_index
Description: annoy GRANULARITY 4
Parts: 0/1
Granules: 3/4
ORDER BY type, L2Distance, check that index is used
Expression (Projection)
Limit (preliminary LIMIT (without OFFSET))
Sorting (Sorting for ORDER BY)
Expression (Before ORDER BY)
ReadFromMergeTree (default.tab)
Indexes:
PrimaryKey
Condition: true
Parts: 1/1
Granules: 4/4
Skip
Name: annoy_index
Description: annoy GRANULARITY 4
Parts: 1/1
Granules: 4/4
@@ -1,150 +1,251 @@
-- Tags: disabled, no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-upgrade-check

SET allow_experimental_annoy_index = 1;

SELECT '--- Test with Array ---';

DROP TABLE IF EXISTS tab;
CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]);

SELECT 'WHERE type, L2Distance';
SELECT *
FROM tab
WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0
LIMIT 5;

SELECT 'ORDER BY type, L2Distance';
SELECT *
FROM tab
ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0])
LIMIT 3;

-- Produces different error code with analyzer, TODO: check
-- SELECT 'Reference ARRAYs with non-matching dimension are rejected';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(embedding, [0.0, 0.0])
-- LIMIT 3; -- { serverError INCORRECT_QUERY }

SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0
LIMIT 5;

SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0])
LIMIT 3;

SELECT 'parameter annoy_index_search_k_nodes';
SELECT *
FROM tab
ORDER BY L2Distance(embedding, [5.3, 7.3, 2.1])
LIMIT 5
SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results

SELECT 'parameter max_limit_for_ann_queries';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(embedding, [5.3, 7.3, 2.1])
LIMIT 5
SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index

DROP TABLE tab;

SELECT '--- Test with Tuple ---';

CREATE TABLE tab(id Int32, embedding Tuple(Float32, Float32, Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5;
INSERT INTO tab VALUES (1, (0.0, 0.0, 10.0)), (2, (0.0, 0.0, 10.5)), (3, (0.0, 0.0, 9.5)), (4, (0.0, 0.0, 9.7)), (5, (0.0, 0.0, 10.2)), (6, (10.0, 0.0, 0.0)), (7, (9.5, 0.0, 0.0)), (8, (9.7, 0.0, 0.0)), (9, (10.2, 0.0, 0.0)), (10, (10.5, 0.0, 0.0)), (11, (0.0, 10.0, 0.0)), (12, (0.0, 9.5, 0.0)), (13, (0.0, 9.7, 0.0)), (14, (0.0, 10.2, 0.0)), (15, (0.0, 10.5, 0.0));

SELECT 'WHERE type, L2Distance';
SELECT *
FROM tab
WHERE L2Distance(embedding, (0.0, 0.0, 10.0)) < 1.0
LIMIT 5;

SELECT 'ORDER BY type, L2Distance';
SELECT *
FROM tab
ORDER BY L2Distance(embedding, (0.0, 0.0, 10.0))
LIMIT 3;

SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(embedding, (0.0, 0.0, 10.0)) < 1.0
LIMIT 5;

SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(embedding, (0.0, 0.0, 10.0))
LIMIT 3;

SELECT 'parameter annoy_index_search_k_nodes';
SELECT *
FROM tab
ORDER BY L2Distance(embedding, (5.3, 7.3, 2.1))
LIMIT 5
SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results

SELECT 'parameter max_limit_for_ann_queries';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(embedding, (5.3, 7.3, 2.1))
LIMIT 5
SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index

DROP TABLE tab;

SELECT '--- Test alternative metric (cosine distance) and non-default NumTrees ---';

CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('cosineDistance', 200)) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity=5;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]);

SELECT 'WHERE type, L2Distance';
SELECT *
FROM tab
WHERE L2Distance(embedding, [0.0, 0.0, 10.0]) < 1.0
LIMIT 5;

SELECT 'ORDER BY type, L2Distance';
SELECT *
FROM tab
ORDER BY L2Distance(embedding, [0.0, 0.0, 10.0])
LIMIT 3;

DROP TABLE tab;
SET allow_experimental_analyzer = 0;

SELECT '--- Negative tests ---';

DROP TABLE IF EXISTS tab;

-- must have at most 2 arguments
CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy('too', 'many', 'arguments')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }

-- first argument (distance_function) must be String
CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }

-- 2nd argument (number of trees) must be UInt64
CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('L2Distance', 'not an UInt64')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }

-- reject unsupported distance functions
CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index embedding TYPE annoy('wormholeDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy('L2Distance', 'not an UInt64')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }

-- must be created on single column
CREATE TABLE tab(id Int32, embedding Array(Float32), INDEX annoy_index (embedding, id) TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS }
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index (vector, id) TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS }

-- reject unsupported distance functions
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy('wormholeDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }

-- must be created on Array/Tuple(Float32) columns
SET allow_suspicious_low_cardinality_types = 1;
CREATE TABLE tab(id Int32, embedding Float32, INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, embedding Array(Float64), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, embedding LowCardinality(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, embedding Nullable(Float32), INDEX annoy_index embedding TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vector Float32, INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vector Array(Float64), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vector Tuple(Float64), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vector LowCardinality(Float32), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }
CREATE TABLE tab(id Int32, vector Nullable(Float32), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN }

SELECT '--- Test default GRANULARITY (should be 100 mio. for annoy)---';

CREATE TABLE tab (id Int32, vector Array(Float32), INDEX annoy_index(vector) TYPE annoy) ENGINE=MergeTree ORDER BY id;
SHOW CREATE TABLE tab;
DROP TABLE tab;

CREATE TABLE tab (id Int32, vector Array(Float32)) ENGINE=MergeTree ORDER BY id;
ALTER TABLE tab ADD INDEX annoy_index(vector) TYPE annoy;
SHOW CREATE TABLE tab;

DROP TABLE tab;

SELECT '--- Test with Array, GRANULARITY = 1, index_granularity = 5 ---';

DROP TABLE IF EXISTS tab;
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy() GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]);

-- rows = 15, index_granularity = 5, GRANULARITY = 1 gives 3 annoy-indexed blocks (each comprising a single granule)
-- condition 'L2Distance(vector, reference_vector) < 1.0' ensures that only one annoy-indexed block produces results --> "Granules: 1/3"

-- See (*) why commented out
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0
-- LIMIT 3;

SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0
LIMIT 3;

-- See (*) why commented out
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, [0.0, 0.0, 10.0])
-- LIMIT 3;

SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, [0.0, 0.0, 10.0])
LIMIT 3;

-- Test special cases. Corresponding special case tests are omitted from later tests.

SELECT 'Reference ARRAYs with non-matching dimension are rejected';
SELECT *
FROM tab
ORDER BY L2Distance(vector, [0.0, 0.0])
LIMIT 3; -- { serverError INCORRECT_QUERY }

SELECT 'Special case: MaximumDistance is negative';
SELECT 'WHERE type, L2Distance';
SELECT *
FROM tab
WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < -1.0
LIMIT 3; -- { serverError INCORRECT_QUERY }

SELECT 'Special case: setting annoy_index_search_k_nodes';
SELECT *
FROM tab
ORDER BY L2Distance(vector, [5.3, 7.3, 2.1])
LIMIT 3
SETTINGS annoy_index_search_k_nodes=0; -- searches zero nodes --> no results

SELECT 'Special case: setting max_limit_for_ann_queries';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, [5.3, 7.3, 2.1])
LIMIT 3
SETTINGS max_limit_for_ann_queries=2; -- doesn't use the ann index

DROP TABLE tab;

-- Test Tuple embeddings. Triggers different logic than Array inside MergeTreeIndexAnnoy but the same logic as Array above MergeTreeIndexAnnoy.
-- Therefore test Tuple case just once.

SELECT '--- Test with Tuple, GRANULARITY = 1, index_granularity = 5 ---';

CREATE TABLE tab(id Int32, vector Tuple(Float32, Float32, Float32), INDEX annoy_index vector TYPE annoy() GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5;
INSERT INTO tab VALUES (1, (0.0, 0.0, 10.0)), (2, (0.0, 0.0, 10.5)), (3, (0.0, 0.0, 9.5)), (4, (0.0, 0.0, 9.7)), (5, (0.0, 0.0, 10.2)), (6, (10.0, 0.0, 0.0)), (7, (9.5, 0.0, 0.0)), (8, (9.7, 0.0, 0.0)), (9, (10.2, 0.0, 0.0)), (10, (10.5, 0.0, 0.0)), (11, (0.0, 10.0, 0.0)), (12, (0.0, 9.5, 0.0)), (13, (0.0, 9.7, 0.0)), (14, (0.0, 10.2, 0.0)), (15, (0.0, 10.5, 0.0));

-- See (*) why commented out
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, (0.0, 0.0, 10.0)) < 1.0
-- LIMIT 3;

SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(vector, (0.0, 0.0, 10.0)) < 1.0
LIMIT 3;

-- See (*) why commented out
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, (0.0, 0.0, 10.0))
-- LIMIT 3;

SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, (0.0, 0.0, 10.0))
LIMIT 3;

DROP TABLE tab;

-- Not a systematic test, just to make sure no bad things happen
SELECT '--- Test non-default metric (cosine distance) + non-default NumTrees (200) ---';

CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy('cosineDistance', 200) GRANULARITY 1) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 5;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0]), (2, [0.0, 0.0, 10.5]), (3, [0.0, 0.0, 9.5]), (4, [0.0, 0.0, 9.7]), (5, [0.0, 0.0, 10.2]), (6, [10.0, 0.0, 0.0]), (7, [9.5, 0.0, 0.0]), (8, [9.7, 0.0, 0.0]), (9, [10.2, 0.0, 0.0]), (10, [10.5, 0.0, 0.0]), (11, [0.0, 10.0, 0.0]), (12, [0.0, 9.5, 0.0]), (13, [0.0, 9.7, 0.0]), (14, [0.0, 10.2, 0.0]), (15, [0.0, 10.5, 0.0]);

-- See (*) why commented out
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, [0.0, 0.0, 10.0]) < 1.0
-- LIMIT 3;

-- See (*) why commented out
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, [0.0, 0.0, 10.0])
-- LIMIT 3;

DROP TABLE tab;

SELECT '--- Test with Array, GRANULARITY = 2, index_granularity = 4 ---';

CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0, 0.0]), (2, [0.0, 0.0, 10.5, 0.0]), (3, [0.0, 0.0, 9.5, 0.0]), (4, [0.0, 0.0, 9.7, 0.0]), (5, [10.0, 0.0, 0.0, 0.0]), (6, [9.5, 0.0, 0.0, 0.0]), (7, [9.7, 0.0, 0.0, 0.0]), (8, [10.2, 0.0, 0.0, 0.0]), (9, [0.0, 10.0, 0.0, 0.0]), (10, [0.0, 9.5, 0.0, 0.0]), (11, [0.0, 9.7, 0.0, 0.0]), (12, [0.0, 9.7, 0.0, 0.0]), (13, [0.0, 0.0, 0.0, 10.3]), (14, [0.0, 0.0, 0.0, 9.5]), (15, [0.0, 0.0, 0.0, 10.0]), (16, [0.0, 0.0, 0.0, 10.5]);

-- rows = 16, index_granularity = 4, GRANULARITY = 2 gives 2 annoy-indexed blocks (each comprising two granules)
-- condition 'L2Distance(vector, reference_vector) < 1.0' ensures that only one annoy-indexed block produces results --> "Granules: 2/4"

-- See (*) why commented out
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0
-- LIMIT 3;

SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0
LIMIT 3;

-- See (*) why commented out
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0])
-- LIMIT 3;

SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0])
LIMIT 3;

DROP TABLE tab;

SELECT '--- Test with Array, GRANULARITY = 4, index_granularity = 4 ---';

CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy() GRANULARITY 4) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 4;
INSERT INTO tab VALUES (1, [0.0, 0.0, 10.0, 0.0]), (2, [0.0, 0.0, 10.5, 0.0]), (3, [0.0, 0.0, 9.5, 0.0]), (4, [0.0, 0.0, 9.7, 0.0]), (5, [10.0, 0.0, 0.0, 0.0]), (6, [9.5, 0.0, 0.0, 0.0]), (7, [9.7, 0.0, 0.0, 0.0]), (8, [10.2, 0.0, 0.0, 0.0]), (9, [0.0, 10.0, 0.0, 0.0]), (10, [0.0, 9.5, 0.0, 0.0]), (11, [0.0, 9.7, 0.0, 0.0]), (12, [0.0, 9.7, 0.0, 0.0]), (13, [0.0, 0.0, 0.0, 10.3]), (14, [0.0, 0.0, 0.0, 9.5]), (15, [0.0, 0.0, 0.0, 10.0]), (16, [0.0, 0.0, 0.0, 10.5]);

-- rows = 16, index_granularity = 4, GRANULARITY = 4 gives a single annoy-indexed block (comprising all granules)
-- no two matches happen to be located in the same granule, so with LIMIT = 3, we'll get "Granules: 2/4"

-- See (*) why commented out
-- SELECT 'WHERE type, L2Distance';
-- SELECT *
-- FROM tab
-- WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0
-- LIMIT 3;

SELECT 'WHERE type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
WHERE L2Distance(vector, [10.0, 0.0, 10.0, 0.0]) < 5.0
LIMIT 3;

-- See (*) why commented out
-- SELECT 'ORDER BY type, L2Distance';
-- SELECT *
-- FROM tab
-- ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0])
-- LIMIT 3;

SELECT 'ORDER BY type, L2Distance, check that index is used';
EXPLAIN indexes=1
SELECT *
FROM tab
ORDER BY L2Distance(vector, [10.0, 0.0, 10.0, 0.0])
LIMIT 3;

DROP TABLE tab;

-- (*) Storage and search in Annoy indexes is inherently random. Tests which check for exact row matches would be unstable. Therefore,
-- comment them out.
@@ -0,0 +1 @@
42 (42,42)
10
tests/queries/0_stateless/02735_capnp_case_insensitive_names_matching.sh
Executable file
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Tags: no-fasttest, no-parallel, no-replicated-database

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

SCHEMADIR=$CURDIR/format_schemas
$CLICKHOUSE_LOCAL -q "select 42 as Field1, (42, 42)::Tuple(Field1 UInt32, Field2 UInt32) as Nested format CapnProto settings format_schema='$SCHEMADIR/02735_case_insensitive_names_matching:Message'" | $CLICKHOUSE_LOCAL --input-format CapnProto --structure "Field1 UInt32, Nested Tuple(Field1 UInt32, Field2 UInt32)" -q "select * from table" --format_schema="$SCHEMADIR/02735_case_insensitive_names_matching:Message"
@@ -0,0 +1,3 @@
(42,(42,42),[(42,42),(24,24)]) [(42,(42,42),[(42,42),(24,24)]),(24,(24,24),[(24,24),(42,42)])]
42 42 42
[42,24] [42,24] [42,24] [[42,24],[24,42]] [[42,24],[24,42]]
24
tests/queries/0_stateless/02736_reading_and_writing_structure_fields.sh
Executable file
@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Tags: no-fasttest, no-parallel, no-replicated-database

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

SCHEMADIR=$CURDIR/format_schemas
DATA_FILE=02736_$CLICKHOUSE_TEST_UNIQUE_NAME.bin

$CLICKHOUSE_LOCAL -q "select tuple(42, tuple(42, 42), [tuple(42, 42), tuple(24, 24)]) as nested, [tuple(42, tuple(42, 42), [tuple(42, 42), tuple(24, 24)]), tuple(24, tuple(24, 24), [tuple(24, 24), tuple(42, 42)])] as nestedList format CapnProto settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" > $DATA_FILE

$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CapnProto) settings format_schema='$SCHEMADIR/02736_nested_structures:Message'"

$CLICKHOUSE_LOCAL -q "select 42 as nested_field1, 42 as nested_nested_field1, 42 as nested_nested_field2 format CapnProto settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" > $DATA_FILE

$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CapnProto, 'nested_field1 UInt32, nested_nested_field1 UInt32, nested_nested_field2 UInt32') settings format_schema='$SCHEMADIR/02736_nested_structures:Message'"

$CLICKHOUSE_LOCAL -q "select [42, 24] as nestedList_field1, [42, 24] as nestedList_nested_field1, [42, 24] as nestedList_nested_field2, [[42, 24], [24, 42]] as nestedList_nestedList_field1, [[42, 24], [24, 42]] as nestedList_nestedList_field2 format CapnProto settings format_schema='$SCHEMADIR/02736_nested_structures:Message'" > $DATA_FILE

$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CapnProto, 'nestedList_field1 Array(UInt32), nestedList_nested_field1 Array(UInt32), nestedList_nested_field2 Array(UInt32), nestedList_nestedList_field1 Array(Array(UInt32)), nestedList_nestedList_field2 Array(Array(UInt32))') settings format_schema='$SCHEMADIR/02736_nested_structures:Message'"

rm $DATA_FILE
@@ -0,0 +1,3 @@
1 2
1 2
1 2
@@ -0,0 +1,12 @@
select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n') settings input_format_tsv_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n') settings input_format_tsv_skip_trailing_empty_lines=1;
select * from format(TSV, 'x UInt32, y UInt32', '1\t2\n\n1\t2\n') settings input_format_tsv_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}

select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n') settings input_format_csv_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n') settings input_format_csv_skip_trailing_empty_lines=1;
select * from format(CSV, 'x UInt32, y UInt32', '1,2\n\n1,2\n') settings input_format_csv_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}

select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=0; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=1;
select * from format(CustomSeparated, 'x UInt32, y UInt32', '1\t2\n\n\n1\t2\n\n\n') settings input_format_custom_skip_trailing_empty_lines=1; -- {serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED}
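These cases pin down the semantics: trailing empty lines are skipped only at the very end of the input; an empty line followed by more rows is still a parse error. A quick sketch for trying the new setting interactively:

    #!/usr/bin/env bash
    # "1\t2\n\n" parses cleanly once the setting is on; without it the
    # trailing empty line trips the parser.
    printf '1\t2\n\n' | clickhouse-local \
        --input-format TSV \
        --structure 'x UInt32, y UInt32' \
        --input_format_tsv_skip_trailing_empty_lines 1 \
        --query 'SELECT * FROM table'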
@@ -0,0 +1 @@
a Nullable(Int64)
@@ -0,0 +1,4 @@
set input_format_max_rows_to_read_for_schema_inference=2;
desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=10; -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA}
desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=20;
@@ -0,0 +1,13 @@
@0x9ef128e10a8010b8;

struct Nested
{
    field1 @0 : UInt32;
    field2 @1 : UInt32;
}

struct Message
{
    field1 @0 : UInt32;
    nested @1 : Nested;
}
@@ -0,0 +1,21 @@
@0x9ef128e10a8010b8;

struct Nested2
{
    field1 @0 : UInt32;
    field2 @1 : UInt32;
}

struct Nested
{
    field1 @0 : UInt32;
    nested @1 : Nested2;
    nestedList @2 : List(Nested2);
}

struct Message
{
    nested @0 : Nested;
    nestedList @1 : List(Nested);
}
@@ -1,4 +1 @@
--- Tags: no-parallel-replicas
-
SELECT ParsedParams.Key1 FROM test.visits FINAL WHERE VisitID != 0 AND notEmpty(ParsedParams.Key1) ORDER BY VisitID LIMIT 10
@@ -1,5 +1,3 @@
--- Tags: no-parallel-replicas
-
DROP TABLE IF EXISTS test.merge_hits;
CREATE TABLE IF NOT EXISTS test.merge_hits AS test.hits ENGINE = Merge(test, '^hits$');
SELECT count() FROM test.merge_hits WHERE AdvEngineID = 2;
@@ -1,5 +1,3 @@
--- Tags: no-parallel-replicas
-
SELECT
    EventDate,
    hits,
@@ -1,5 +1,3 @@
--- Tags: no-parallel-replicas
-
SELECT
    EventDate,
    count() AS hits,
@@ -1,5 +1,3 @@
--- Tags: no-parallel-replicas
-
SELECT
    domain,
    hits,
@@ -1,5 +1,3 @@
--- Tags: no-parallel-replicas
-
SET any_join_distinct_right_table_keys = 1;
SET joined_subquery_requires_alias = 0;
@@ -1,5 +1,3 @@
--- Tags: no-parallel-replicas
-
USE test;

DROP TABLE IF EXISTS join;
@@ -1,5 +1,3 @@
--- Tags: no-parallel-replicas
-
set any_join_distinct_right_table_keys = 1;
set joined_subquery_requires_alias = 0;
@@ -1,4 +1,2 @@
--- Tags: no-parallel-replicas
-
SELECT UserID, EventTime::DateTime('Asia/Dubai'), pp.Key1, pp.Key2, ParsedParams.Key1 FROM test.hits ARRAY JOIN ParsedParams AS pp WHERE CounterID = 1704509 ORDER BY UserID, EventTime, pp.Key1, pp.Key2 LIMIT 100;
SELECT UserID, EventTime::DateTime('Asia/Dubai'), pp.Key1, pp.Key2, ParsedParams.Key1 FROM test.hits LEFT ARRAY JOIN ParsedParams AS pp WHERE CounterID = 1704509 ORDER BY UserID, EventTime, pp.Key1, pp.Key2 LIMIT 100;
@@ -1,5 +1,3 @@
--- Tags: no-parallel-replicas
-
SELECT PP.Key1 AS `ym:s:paramsLevel1`, sum(arrayAll(`x_1` -> `x_1`= '', ParsedParams.Key2)) AS `ym:s:visits` FROM test.hits ARRAY JOIN ParsedParams AS `PP` WHERE CounterID = 1704509 GROUP BY `ym:s:paramsLevel1` ORDER BY PP.Key1, `ym:s:visits` LIMIT 0, 100;
SELECT PP.Key1 AS x1, ParsedParams.Key2 AS x2 FROM test.hits ARRAY JOIN ParsedParams AS PP WHERE CounterID = 1704509 ORDER BY x1, x2 LIMIT 10;
SELECT ParsedParams.Key2 AS x FROM test.hits ARRAY JOIN ParsedParams AS PP ORDER BY x DESC LIMIT 10;
@@ -1,3 +1 @@
--- Tags: no-parallel-replicas
-
SELECT count() FROM (SELECT Goals.ID FROM test.visits ARRAY JOIN Goals WHERE CounterID = 842440 LIMIT 10 UNION ALL SELECT Goals.ID FROM test.visits ARRAY JOIN Goals WHERE CounterID = 842440 LIMIT 10);
@@ -1,5 +1,3 @@
--- Tags: no-random-settings, no-parallel-replicas
-
SET max_bytes_before_external_group_by = 200000000;

SET max_memory_usage = 1500000000;
@@ -1,6 +1,5 @@
#!/usr/bin/env bash
-# Tags: no-parallel-replicas
# clickhouse-local may not work with parallel replicas


CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
@@ -1,6 +1,4 @@
#!/usr/bin/env bash
-# Tags: no-parallel-replicas
-# clickhouse-local may not work with parallel replicas

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
@@ -1,5 +1,3 @@
--- Tags: no-parallel-replicas
-
SELECT RegionID, uniqHLL12(WatchID) AS X FROM remote('127.0.0.{1,2}', test, hits) GROUP BY RegionID HAVING X > 100000 ORDER BY RegionID ASC;
SELECT RegionID, uniqCombined(WatchID) AS X FROM remote('127.0.0.{1,2}', test, hits) GROUP BY RegionID HAVING X > 100000 ORDER BY RegionID ASC;
SELECT abs(uniq(WatchID) - uniqExact(WatchID)) FROM test.hits;
@@ -1,4 +1,4 @@
--- Tags: distributed, no-parallel-replicas
+-- Tags: distributed

SELECT sum(cityHash64(*)) FROM (SELECT CounterID, quantileTiming(0.5)(SendTiming), count() FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10}', test.hits) WHERE SendTiming != -1 GROUP BY CounterID);
SELECT sum(cityHash64(*)) FROM (SELECT CounterID, quantileTiming(0.5)(SendTiming), count() FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10}', test.hits) WHERE SendTiming != -1 GROUP BY CounterID) SETTINGS optimize_aggregation_in_order = 1;
@@ -1,4 +1,4 @@
--- Tags: no-tsan, no-replicated-database, no-parallel, no-parallel-replicas
+-- Tags: no-tsan, no-replicated-database, no-parallel
-- Tag no-replicated-database: Fails due to additional replicas or shards

DROP TABLE IF EXISTS fixed_granularity_table;
@@ -1,6 +1,3 @@
--- Tags: no-parallel-replicas
--- Merge tables doesn't work with parallel replicas currently
-
SET max_execution_speed = 4000000, timeout_before_checking_execution_speed = 0;

CREATE TEMPORARY TABLE times (t DateTime);
Some files were not shown because too many files have changed in this diff.